[net.sources] C++ Lexical Scanner

nmyers@mntgfx.MENTOR.COM (Nathan Myers) (01/15/87)

C++ Lexical Scanner (in C++)

Following this note is a shar-format file containing a
lexical scanner for C++.  Use it however you like.  It
should be adequate to drop into a full compiler.
For such a use, error reporting would probably need 
some refinement.  Also, if the compiler was to have
a built-in preprocessor (i.e. not using cpp) the handler
for "#" lines would need a bit of work.

I will be interested in bug reports, significant improvements.
Nathan Myers   tektronix!sequent!mntgfx!nmyers

Share and enjoy!
-------------------- cut here -----------------------------
#! /bin/sh
#  This is a shar format file.  Extract with sh, not csh.
#
echo "x - read.me"
sed -e 's/^_ //' >read.me <<'%%%EOF%%%'
_ 
_ The following source files are included:
_ 
_     clex.c
_     clex.h
_     clex_sym.h
_     clex_test.c
_     kwhash.c
_     Makefile
_ 
_ They implement a self-contained lexical scanner class for C++.
_ It is extensible by derivation primarily in the area of
_ processing "#" compiler directives (currently, it only
_ interprets the "#line" construct).
_ 
_ It has one other degree of flexibility, in its handling
_ of bracket-enclosed expressions "[]".  These may be treated
_ as a normal sequence of tokens or as delimited strings;
_ the former is of greater use in a traditional parser, while
_ the latter is favored for extraction of declarations by
_ a code browser.
_ 
_ To allay some confusion, I should point out here that
_ clex_sym.h is used in an unusual way: it is included
_ twice in the module clex.c; once for declaration
_ part, once for the (static) definition part.  It is
_ built this way to keep all knowledge of keywords
_ in a single place.
_ 
_ The file kwhash.c is a standard C standalone program used
_ to arrive at the collision-free hash function used to
_ recognize C++ keywords.  Any new keyword stands about
_ one chance in 3 of colliding with an existing keyword,
_ thus requiring that a new hash function be generated.
_ 
_ The file clex_test.c compiles to a program which reads
_ C or C++ code from standard input and emits token names
_ on the standard output.  Try it with different values
_ in the constructor's second argument.
%%%EOF%%%
echo "x - Makefile"
sed -e 's/^_ //' >Makefile <<'%%%EOF%%%'
_ # Makefile for clex/cparse
_ INCLUDE=/usr/local/include/CC
_ CCC=/user/local/CC
_ CCOPTS= -g -O
_ # -g: include debug info  -O: optimize
_ 
_ all: kwhash clex_test
_ 
_ kwhash: kwhash.c /usr/include/stdio.h
_ 		/bin/cc ${CCOPTS} -o kwhash kwhash.c
_ 
_ clex_test: clex_test.o clex.o
_ 		${CCC} -g -o clex_test clex_test.o clex.o
_ 
_ .c.o:
_ 		/usr/lib/cpp -I${INCLUDE} -Dc_plusplus=1 $*.c >$*.cpp
_ 		/user/mentor/local/cfront +L +f$*.c <$*.cpp >$*..c && /bin/rm $*.cpp
_ 		/bin/cc -c ${CCOPTS} $*..c && /bin/mv $*..o $*.o
_ 
_ clex.o: ${INCLUDE}/stdio.h \
_          ${INCLUDE}/stream.h \
_          ${INCLUDE}/string.h \
_          ${INCLUDE}/stdlib.h \
_          ${INCLUDE}/ctype.h \
_          ${INCLUDE}/assert.h \
_          clex.h clex_sym.h
_ 
_ clex_test.o:    ${INCLUDE}/stdio.h \
_                 clex.h clex_sym.h
_ 
%%%EOF%%%
echo "x - clex.h"
sed -e 's/^_ //' >clex.h <<'%%%EOF%%%'
_ 
_ 
_ #ifndef INCLUDED_CLEX
_ #define INCLUDED_CLEX 1
_ 
_ #ifndef INCLUDED_STDIO
_ #include <stdio.h>
_ #endif
_ 
_ enum Boolean { FALSE, TRUE };
_ 
_ #include "clex_sym.h"
_ 
_ class Clex
_     { 
_  friend class Cparse;
_     enum Clex_mode
_         { CL_NONE=0, CL_COMMENT=1, CL_QUOTE=2, CL_POUND=4, CL_BRACK=8 };
_ 
_  protected:
_     short   look;           // a one-char lookahead
_     FILE*   fp;
_     Boolean block_brack;    // if TRUE, treat contents of "[]" as a string
_     long    line_num;       // line number in original source file
_     char    filename[256];  // name of original source file
_     short   bufsiz;         // number of chars currently in buf
_     char    buf[256];
_ 
_     void eat_one()          { look = short(getc(fp)); }
_     void put_in_buf(char c) { if (bufsiz < sizeof(buf)-1) buf[bufsiz++] = c; }
_     void buf_one()          { put_in_buf(look); eat_one(); }
_     Clex_sym terminate(Clex_sym s)  { buf[bufsiz] = '\0'; return s; }
_     Clex_sym eat_return(Clex_sym);
_     Clex_sym num(char);
_     Clex_sym ident(char);
_     Clex_sym lbrack(Clex_mode);
_     Clex_sym quote(char, Clex_sym, Clex_mode);
_     void block_comment(Clex_mode);
_     void line_comment();
_     void eoln(Clex_mode);
_ 
_     virtual Boolean pound(Clex_mode, char*, short len);
_ 
_  public:
_     Clex_sym    next();
_     const char* str()           { return buf; }
_     short       strlen()        { return bufsiz; }
_     long        line_no()       { return line_num; }
_     const char* fname()         { return filename; }
_     const char* debug(Clex_sym);
_ 
_     Clex(FILE*, Boolean block_brack);
_     };
_ 
_ #endif
_ 
%%%EOF%%%
echo "x - clex_sym.h"
sed -e 's/^_ //' >clex_sym.h <<'%%%EOF%%%'
_ 
_ /* clex_sym.h:
_ // This file defines both an enum {} type named "symbol", and
_ //  a variable sym_str[] with string representations of the
_ //  symbols.  It is intended to maintain an exact
_ //  correspondence between array entries and symbol values.
_ */
_ 
_ /*
_     This file is #include'd twice: once for the enum
_     (with CLEX_IMPLEMENTATION turned off) and once for
_     the array initialization (with it turned on).  The
_     lower-numbered symbols have uppercase name strings,
_     but the keyword symbol strings are stored separately.
_ 
_     If a keyword is to be added, add it first to the
_     standalone program kwhash.c and generate a new
_     perfect hash function for the new set.  Then add
_     it to both places below and modify the hash function
_     and table size in clex.c.
_ */
_ 
_ #ifndef CLEX_IMPLEMENTATION
_ 
_ #define CLEX_S(sym) sym
_ #define CLEX_S2(sym1,sym2) sym1
_ enum Clex_sym
_     {
_ 
_ #else /* CLEX_IMPLEMENTATION */
_ 
_ #undef  CLEX_S
_ #undef  CLEX_S2
_ #define CLEX_S(sym) "sym"
_ #define CLEX_S2(sym1,sym2) sym2
_ static char* sym_str[] =
_     {
_ 
_ #endif /* CLEX_IMPLEMENTATION */
_ 
_     CLEX_S(NONE_S = 0),    /* should never get this */
_ 
_     CLEX_S(ERROR_S),
_     CLEX_S(  ERROR_EOLN_S),
_     CLEX_S(  ERROR_EOF_S),
_     CLEX_S(  ERROR_UNKN_S),
_ #ifndef CLEX_IMPLEMENTATION
_     CLEX_S(ERROR_MAX = ERROR_UNKN_S),
_ #endif
_ 
_     CLEX_S(EOF_S),        
_     CLEX_S(EOLN_S),         // \n
_ 
_     CLEX_S(BANG_S),         // !
_     CLEX_S(  NE_S),         // !=
_     CLEX_S(QUOTE_S),        // "
_     CLEX_S(POUND_S),        // #
_     CLEX_S(MOD_S),          // %
_     CLEX_S(  MODAS_S),      // %=
_     CLEX_S(AMPER_S),        // &
_     CLEX_S(  LAND_S),       // &&
_     CLEX_S(  ANDAS_S),      // &=
_     CLEX_S(APOS_S),         // '
_     CLEX_S(LPAR_S),         // (
_     CLEX_S(RPAR_S),         // )
_     CLEX_S(STAR_S),         // *
_     CLEX_S(  MULAS_S),      // *=
_     CLEX_S(PLUS_S),         // +
_     CLEX_S(  INCRE_S),      // ++
_     CLEX_S(  ADDAS_S),      // +=
_     CLEX_S(COMMA_S),        // ),
_     CLEX_S(MINUS_S),        // -
_     CLEX_S(  DECRE_S),      // --
_     CLEX_S(  SUBAS_S),      // -=
_     CLEX_S(  DEREF_S),      // ->
_     CLEX_S(DOT_S),          // .
_     CLEX_S(  ELLIP_S),      // ...
_     CLEX_S(SLASH_S),        // / 
_     CLEX_S(  DIVAS_S),      // /=
_     CLEX_S(COLON_S),        // :
_     CLEX_S(  SCOPE_S),      // ::
_     CLEX_S(SEMI_S),         // ;
_     CLEX_S(LT_S),           // <
_     CLEX_S(  LE_S),         // <=
_     CLEX_S(  SHL_S),        // <<
_     CLEX_S(  SHLAS_S),      // <<=
_     CLEX_S(AS_S),           // =
_     CLEX_S(  EQ_S),         // ==
_     CLEX_S(GT_S),           // >
_     CLEX_S(  GE_S),         // >=
_     CLEX_S(  SHR_S),        // >>
_     CLEX_S(  SHRAS_S),      // >>=
_     CLEX_S(QUEST_S),        // ?
_     CLEX_S(AT_S),           // @ (undefined)
_     CLEX_S(LBRACK_S),       // [
_     CLEX_S(BSLASH_S),       // \ 
_     CLEX_S(RBRACK_S),       // ]
_     CLEX_S(CARET_S),        // ^
_     CLEX_S(  XORAS_S),      // ^=
_     CLEX_S(GRAVE_S),        // ` (undefined)
_     CLEX_S(LBRACE_S),       // {
_     CLEX_S(VBAR_S),         // |
_     CLEX_S(  LOR_S),        // ||
_     CLEX_S(  ORAS_S),       // |=
_     CLEX_S(RBRACE_S),       // }
_     CLEX_S(TILDE_S),        // ~
_ 
_     CLEX_S(IDENT_S),        // a name, or string that could be a name
_     CLEX_S(NUM_S),          // a numeric string
_     CLEX_S(FLOATNUM_S)      // a recognizably floating-point num
_ 
_ #ifndef CLEX_IMPLEMENTATION
_     , CLEX_S(KEYWORD_S),
_ 
_ #else
_ 
_     };
_  static char *keywords[] =
_     {
_ 
_ #endif
_ 
_     CLEX_S2(ASM_S = KEYWORD_S, "asm"),
_     CLEX_S2(AUTO_S,     "auto"),
_     CLEX_S2(BREAK_S,    "break"),
_     CLEX_S2(CASE_S,     "case"),
_     CLEX_S2(CHAR_S,     "char"),
_     CLEX_S2(CLASS_S,    "class"),
_     CLEX_S2(CONST_S,    "const"),
_     CLEX_S2(CONTINUE_S, "continue"),
_     CLEX_S2(DEFAULT_S,  "default"),
_     CLEX_S2(DELETE_S,   "delete"),
_     CLEX_S2(DO_S,       "do"),
_     CLEX_S2(DOUBLE_S,   "double"),
_     CLEX_S2(ELSE_S,     "else"),
_     CLEX_S2(ENUM_S,     "enum"),
_     CLEX_S2(EXTERN_S,   "extern"),
_     CLEX_S2(FLOAT_S,    "float"),
_     CLEX_S2(FOR_S,      "for"),
_     CLEX_S2(FRIEND_S,   "friend"),
_     CLEX_S2(GOTO_S,     "goto"),
_     CLEX_S2(IF_S,       "if"),
_     CLEX_S2(INLINE_S,   "inline"),
_     CLEX_S2(INT_S,      "int"),
_     CLEX_S2(LONG_S,     "long"),
_     CLEX_S2(NEW_S,      "new"),
_     CLEX_S2(OPERATOR_S, "operator"),
_     CLEX_S2(OVERLOAD_S, "overload"),
_     CLEX_S2(PRIVATE_S,  "private"),
_     CLEX_S2(PROTECTED_S,"protected"),
_     CLEX_S2(PUBLIC_S,   "public"),
_     CLEX_S2(REGISTER_S, "register"),
_     CLEX_S2(RETURN_S,   "return"),
_     CLEX_S2(SHORT_S,    "short"),
_     CLEX_S2(SIGNED_S,   "signed"),
_     CLEX_S2(SIZEOF_S,   "sizeof"),
_     CLEX_S2(STATIC_S,   "static"),
_     CLEX_S2(STRUCT_S,   "struct"),
_     CLEX_S2(SWITCH_S,   "switch"),
_     CLEX_S2(THIS_S,     "this"),
_     CLEX_S2(TYPEDEF_S,  "typedef"),
_     CLEX_S2(UNION_S,    "union"),
_     CLEX_S2(UNSIGNED_S, "unsigned"),
_     CLEX_S2(VIRTUAL_S,  "virtual"),
_     CLEX_S2(VOLATILE_S, "volatile"),
_     CLEX_S2(VOID_S,     "void"),
_     CLEX_S2(WHILE_S,    "while"),
_ 
_     CLEX_S2(END_OF_SYMBOLS_S, NULL)
_     };
_ 
_ #ifndef CLEX_IMPLEMENTATION
_ const CLEX_NUMKEYS = (END_OF_SYMBOLS_S - KEYWORD_S);
_ #endif
_ 
%%%EOF%%%
echo "x - clex.c"
sed -e 's/^_ //' >clex.c <<'%%%EOF%%%'
_ 
_ #ifndef INCLUDED_STREAM
_ #include <stream.h>
_ #endif
_ #ifndef INCLUDED_STRING
_ #include <string.h>
_ #endif
_ #ifndef INCLUDED_STDLIB
_ #include <stdlib.h>
_ #endif
_ #ifndef INCLUDED_ASSERT
_ #include <assert.h>
_ #endif
_ #ifndef INCLUDED_CTYPE
_ #include <ctype.h>
_ #endif
_ 
_ #include "clex.h"
_ 
_ // get string value tables, sym_str[] and keyword[] :
_ #define CLEX_IMPLEMENTATION 1
_ #include "clex_sym.h"
_ 
_ /******************************************************************************
_ *                                                                             *
_ *  KWTABLE -- keyword hash table (internal use only)                          *
_ *     KWtable implements a collision-free hash table of C++ keywords.  The    *
_ *     table size and hash function are computed by use of a standalone C      *
_ *     program, kwhash.c, included in this directory.                          *
_ *                                                                             *
_ ******************************************************************************/
_ 
_ #define U_short unsigned short
_ #define U_char  unsigned char
_ 
_ struct KWtable
_     {
_     enum { HASHSIZE = 131 };  // as computed by kwhash.c, for a=9,b=2,c=2
_ 
_     struct  {
_             char* kwp;
_             Clex_sym sym;
_             } kwhash[HASHSIZE];
_ 
_     KWtable(char**);
_     U_short hash(const U_char*, U_short len);
_     void insert(char*, Clex_sym);
_     Clex_sym lookup(char*, short len);
_     };
_ 
_ static KWtable kwt = KWtable(keywords); // keywords[] defined in Clex_sym.h
_ 
_ KWtable::
_ KWtable (char** kwl)
_     {
_     short int i;
_     for (i = 0; i < HASHSIZE; ++i)
_         kwhash[i].kwp = NULL;
_     for (i = 0; i < CLEX_NUMKEYS; ++i)
_         insert(kwl[i], KEYWORD_S + i);
_     // rely on assert() to prevent hash collisions -- may need
_     //  a new hash function or table size when keyword added.
_     }
_ 
_ // the values used in the following hash function, and HASHSIZE, were
_ // determined by use of the standalone C program kwhash.c, to
_ // ensure that no collisions occur.
_ 
_ inline
_ U_short KWtable::
_ hash (const U_char* cp, U_short len)
_     {
_     return (((U_short)cp[0]         ) ^
_             ((U_short)cp[1]     << 9) ^
_             ((U_short)cp[len-1] << 2) ^
_             (len                << 2) ) % HASHSIZE;
_     }
_ 
_ void KWtable::
_ insert (char* cp, Clex_sym s)
_     {
_     U_short h = hash(cp, strlen(cp));
_     assert(kwt.kwhash[h].kwp == NULL);  // collisions not permitted.
_     kwt.kwhash[h].kwp = cp;
_     kwt.kwhash[h].sym = s;
_     }
_ 
_ Clex_sym KWtable::
_ lookup (char* cp, short len)
_     {
_     if (len < 2 || len > 9) return (IDENT_S);
_     short h = hash(cp, len);
_     if (kwt.kwhash[h].kwp == NULL) return (IDENT_S);
_     if (strcmp(kwt.kwhash[h].kwp, cp)) return (IDENT_S);
_     return (kwt.kwhash[h].sym);
_     }
_ 
_ /******************************************************************************
_ *                                                                             *
_ *  CLEX -- c++ lexical scanner                                               *
_ *                                                                             *
_ ******************************************************************************/
_ 
_ // CONSTRUCTOR Clex:
_ //   The argument block_brack, if TRUE, dictates that the contents
_ //   of square brackets "[]" be returned as a string in the string
_ //   buffer.  If false, square brackets are treated as simple tokens.
_ 
_ Clex::
_ Clex (FILE* f, Boolean b)
_     {
_     fp = f;
_     block_brack = b;
_     filename[0] = '\0';
_     bufsiz = 0; buf[0] = '\0';
_     // prime the pipeline:
_     line_num = 0;
_     look = '\n';    // be prepared to handle '#' as first char
_     }
_ 
_ Clex_sym Clex::
_ num (char c)
_     {
_     Clex_sym s = NUM_S;
_ 
_     bufsiz = 0;
_     put_in_buf(c);
_     while (isdigit(look))
_         buf_one();
_ 
_     // hexadecimal
_     if (bufsiz == 1 && *buf == '0' && (look == 'x' || look == 'X'))
_         {
_         do { buf_one(); }
_             while (isxdigit(look));
_         if (look == 'L' || look == 'l' || look == 'U' || look == 'u')
_             buf_one();
_         return terminate(s);
_         }
_ 
_     // long or unsigned
_     if (look == 'L' || look == 'l' || look == 'U' || look == 'u')
_         { buf_one(); return terminate(NUM_S); }
_ 
_     // floating point
_     else if (look == '.')
_         {
_         s = FLOATNUM_S;
_         do { buf_one(); }
_             while (isdigit(look));
_         }
_ 
_     // scientific notation
_     if (look == 'e' || look == 'E')
_          {
_          s = FLOATNUM_S;
_          do { buf_one(); }
_             while (isdigit(look));
_          }
_     else
_         return terminate(s);
_ 
_     if (look == '+' || look == '-')
_          do { buf_one(); }
_             while (isdigit(look));
_     return terminate(s);
_     }
_ 
_ Clex_sym Clex::
_ ident (char first)
_     {
_     register Boolean maybe_kw = TRUE;
_     register short bs = 0;
_     buf[bs++] = first;
_     while (isalnum(look) || look == '_' || look == '$')
_         {
_         // note: this function accounts for 30% of the total scan time
_         if (maybe_kw && (isupper(look) || look == '_' ))
_             maybe_kw = FALSE;
_         buf[bs++] = look;       // don't worry about overflow
_         eat_one();
_         }
_     buf[bs] = '\0';
_     bufsiz = bs;
_ 
_     if (maybe_kw)
_         return kwt.lookup(buf, bufsiz);
_     return IDENT_S;
_     }
_ 
_ Clex_sym Clex::
_ quote (char c, Clex_sym s, Clex_mode m)
_     {
_     if (m == CL_NONE)
_         bufsiz = 0;
_     while (look != c)
_         {
_         if (look == EOF)
_             { return terminate(ERROR_EOF_S); }
_         else if (look == '\n')
_             { return terminate(ERROR_EOLN_S); }
_         else if (look == '\\')
_             {
_             eat_one();
_             if (look == '\n')
_                 { eat_one(); eoln(m|CL_QUOTE); continue; }
_             else if (look == EOF)
_                 { return terminate(ERROR_EOF_S); }
_             else
_                 put_in_buf('\\');   // this handles \' and \" too.
_             }
_         buf_one();
_         }
_     eat_one();  // eat the closing quote
_     return terminate(s);
_     }
_ 
_ 
_ // lbrack() accumulates the contents between "[" and "]" into
_ //  the string buffer, handling syntactically quoted strings,
_ //  comments, and nested brackets.  Note that lbrack() is
_ //  called recursively in the case of nested brackets.
_ 
_ Clex_sym Clex::
_ lbrack (Clex_mode m)
_     {
_     if (m == CL_NONE)
_         bufsiz = 0;
_     while (look != ']')
_         {
_         if (look == EOF)
_             return terminate(ERROR_EOF_S);
_ 
_         else if (look == '\n')
_             { eat_one(); eoln(m|CL_BRACK); }
_         else if (look == '[')
_             {
_             buf_one();
_             if (lbrack(m|CL_BRACK) == ERROR_EOF_S)
_                 return ERROR_EOF_S;     // already cleaned up.
_             else put_in_buf(']');
_             }
_         else if (look == '\'' || look == '"')
_             {
_             char c = look;
_             buf_one();
_             (void) quote(c, NONE_S, m|CL_BRACK);
_             put_in_buf(c);
_             }
_         else if (look == '/')           // maybe a comment
_             {
_             eat_one();
_             if (look == '/')
_                 line_comment();
_             else if (look == '*')
_                 {
_                 block_comment(m|CL_BRACK);
_                 if (look == EOF) return terminate(ERROR_EOF_S);
_                 }
_             else                        // stash the '/' and the char after
_                 { put_in_buf('/'); buf_one(); }
_             }
_         else                            // just a character to save
_             buf_one();
_         }
_ 
_     eat_one(); // eat the ']'.
_     return terminate(LBRACK_S);
_     }
_ 
_ 
_ void Clex::
_ block_comment(Clex_mode m)
_     {
_     eat_one(); // eat the '*'
_     while (! (look == '*' && (eat_one(), look == '/')) )
_         {
_         if (look == EOF) return;
_         if (look == '\n') { eat_one(); eoln(m|CL_COMMENT); }
_         else if (look != '*') eat_one();
_         }
_     eat_one(); // eat the '/'
_     }
_ 
_ void Clex::
_ line_comment()
_     {
_     do { eat_one(); }
_      while (look != '\n' && look != EOF);
_     }
_ 
_ // eat_return() is intended to save space in Clex::next() -- the
_ //  inline function eat_one() produces quite a lot of code.
_ Clex_sym Clex::
_ eat_return(Clex_sym s)
_     { eat_one(); return s; }
_ 
_ Clex_sym Clex::
_ next()
_     {
_     short val;
_     while (val = look, eat_one(), val != EOF)
_         {
_         char ch = char(val);
_         switch (ch)
_             {
_         case ' ' : continue;
_ 
_         case '_' :
_         case '$' : return ident(ch);
_ 
_         case '0' : case '1' : case '2' : case '3' : case '4' :
_         case '5' : case '6' : case '7' : case '8' : case '9' :
_                    return num(ch);
_ 
_         case ',' : return COMMA_S;
_         case ';' : return SEMI_S;
_         case '[' : if (block_brack) return lbrack(CL_NONE);
_                    else             return LBRACK_S;
_         case ']' : return RBRACK_S;
_         case '{' : return LBRACE_S;
_         case '}' : return RBRACE_S;
_         case '(' : return LPAR_S;
_         case ')' : return RPAR_S;
_         case '~' : return TILDE_S;
_         case '?' : return QUEST_S;
_         case '"' : return quote(ch, QUOTE_S, CL_NONE);
_         case '\'': return quote(ch, APOS_S, CL_NONE);
_ 
_         case '=' :                              // '=', '=='
_             if (look != '=') return AS_S;
_             else  return eat_return(EQ_S);
_ 
_         case ':' :                              // ":", "::"
_             if (look != ':') return COLON_S;
_             else  return eat_return(SCOPE_S);
_ 
_         case '!' :                              // "!", "!="
_             if (look != '=') return BANG_S;
_             else  return eat_return(NE_S);
_ 
_         case '^' :                              // "^", "^="
_             if (look != '=') return CARET_S;
_             else  return eat_return(XORAS_S);
_ 
_         case '*' :                              // '*', '*='
_             if (look != '=') return STAR_S;
_             else  return eat_return(MULAS_S);
_ 
_         case '%' :                              // '%', '%='
_             if (look != '=') return MOD_S;
_             else  return eat_return(MODAS_S);
_ 
_         case '|' :                              //  "|=", "||", "|"
_             if      (look == '|') return eat_return(LOR_S);
_             else if (look == '=') return eat_return(ORAS_S);
_             else                             return VBAR_S;
_ 
_         case '&' :                              // "&", "&=", "&&"
_             if      (look == '&') return eat_return(LAND_S);
_             else if (look == '=') return eat_return(ANDAS_S);
_             else                             return AMPER_S;
_ 
_         case '+' :                              // '+', '++', '+='
_             if      (look == '+') return eat_return(INCRE_S);
_             else if (look == '=') return eat_return(ADDAS_S);
_             else                             return PLUS_S;
_ 
_         case '-' :                              // '--', '-=', '->', '-', 
_             if      (look == '-') return eat_return(DECRE_S);
_             else if (look == '=') return eat_return(SUBAS_S);
_             else if (look == '>') return eat_return(DEREF_S);
_             else                             return MINUS_S;
_ 
_         case '/' :                              // '/*', '//', '/=', '/'
_             if (look == '*')
_                 {
_                 block_comment(CL_NONE);
_                 if (look == EOF)       // almost certainly a mistake:
_                     return ERROR_EOF_S;
_                 else continue;
_                 }
_             else if (look == '/')
_                 { line_comment(); continue; }
_             else if (look == '=') return eat_return(DIVAS_S);
_             else                             return SLASH_S;
_ 
_         case '.' :                              // ".", "..."
_             if (isdigit(look))      return num(ch);
_             else if (look == '.')
_                 {
_                 eat_one();          // check for "..", undefined.
_                 if (look != '.')    return ERROR_UNKN_S;
_                 else    return  eat_return(ELLIP_S);
_                 }
_             else                    return DOT_S;
_ 
_         case '<' :                              // '<=', '<', '<<', '<<='
_             if      (look == '=')   return eat_return(LE_S);
_             else if (look == '<')
_                 {
_                 eat_one();
_                 if  (look != '=')   return SHL_S;
_                 else     return eat_return(SHLAS_S);
_                 }
_             else                    return LT_S;
_ 
_         case '>' :                              // '>=', '>', '>>', '>>='
_             if      (look == '=')   return eat_return(GE_S);
_             else if (look == '>')
_                 {
_                 eat_one();
_                 if  (look != '=')   return SHR_S;
_                 else     return eat_return(SHRAS_S);
_                 }
_             else                    return GT_S;
_ 
_         default:
_             if (isalpha(ch))
_                 return ident(ch);
_             if (ch == '\n')
_                 eoln(CL_NONE);
_             else if (iscntrl(ch))
_                 continue;
_             else
_                 return ERROR_UNKN_S;
_             }
_         }
_ 
_     return EOF_S;
_     }
_ 
_ struct Quickbuf
_     {
_     short len;
_     char line[10240];
_     void put_in(char c) { if (len < sizeof(line)-1) line[len++] = c; }
_     void terminate()    { line[len] = '\0'; }
_     Quickbuf() { len = 0; }
_     };
_ 
_ void Clex::
_ eoln(Clex_mode m)
_     {
_     // assume NL character already eaten.
_     ++line_num;
_     // don't process '#' lines in quotes, comments, or '#' continuations.
_     if (m & (CL_QUOTE|CL_POUND|CL_COMMENT))
_         return;
_ 
_     // eat whitespace
_     while (look != EOF && look != '\n')
_         {
_         if (look == ' ' || iscntrl(char(look))) eat_one();
_         else break;
_         }
_     if (look != '#')
_         return;
_ 
_     // eat the '#' and subsequent whitespace
_     do { eat_one(); if (look == EOF || look == '\n') break; }
_        while (look == ' ' || iscntrl(char(look)));
_ 
_     // collect the '#' line
_     Quickbuf b;
_     do  {   // record line
_         if (look == '\\')       // check for continuation line
_             {
_             eat_one();
_             if (look == '\n') { eat_one(); eoln(m|CL_POUND); }
_             else { b.put_in('\\'); }
_             }
_         else if (look == '/')   // check for comment in '#' line
_             {
_             eat_one();
_             if (look == '*')
_                 {
_                 block_comment(m|CL_POUND);
_                 if (look == EOF) break;
_                 }
_             else if (look == '/') line_comment();
_             else { b.put_in('/'); }
_             }
_         else
_             {
_             if (iscntrl(char(look))) look = ' ';
_             b.put_in(look);
_             eat_one();
_             }
_  
_         } while (look != '\n' && look != EOF);
_     b.terminate();
_ 
_     (void) pound(m, b.line, b.len);     // call virtual handler
_     }
_ 
_ Boolean Clex::
_ pound (Clex_mode m, char* line, short len)
_     {
_     void(m);                // to keep cfront blissful
_     char* cp = line;
_     if (!isdigit(*cp))
_         {
_         if (len < 5) return FALSE;
_         if (strncmp(cp, "line ", 5) != 0)
_             return FALSE;   // don't know what it is
_         cp += 4;
_         while (*cp == ' ') ++cp;
_         if (!isdigit(*cp))
_             return FALSE;
_         }
_ 
_     // # <line> "<filename>"   or    #line <line> "<filename>"
_     line_num = atoi(cp) - 1;    // will be incremented by eoln() later
_     while (isdigit(*cp)) ++cp;
_     while (*cp == ' ')   ++cp;
_     if (*cp == '"')
_         {
_         char* cpq = cp;
_         do { ++cpq; }
_            while (*cpq != '"' && *cpq != '\0');
_         strncpy(filename, cp+1, cpq - cp - 1);
_         filename[cpq - cp - 1] = '\0';
_         }
_ 
_     return TRUE;
_     }
_ 
_ const char* Clex::
_ debug (Clex_sym s)
_     {
_     return (s >= KEYWORD_S) ? keywords[s - KEYWORD_S] : sym_str[s] ;
_     }
%%%EOF%%%
echo "x - kwhash.c"
sed -e 's/^_ //' >kwhash.c <<'%%%EOF%%%'
_ 
_ /* this is a C program */
_ 
_ #include <stdio.h>
_ 
_ static char *keywords[] =
_     {
_     "asm",
_     "auto",
_     "break",
_     "case",
_     "char",
_     "class",
_     "const",
_     "continue",
_     "default",
_     "delete",
_     "do",
_     "double",
_     "else",
_     "enum",
_     "extern",
_     "float",
_     "for",
_     "friend",
_     "goto",
_     "if",
_     "inline",
_     "int",
_     "long",
_     "new",
_     "operator",
_     "overload",
_     "private",
_     "protected",
_     "public",
_     "register",
_     "return",
_     "short",
_     "signed",
_     "sizeof",
_     "static",
_     "struct",
_     "switch",
_     "this",
_     "typedef",
_     "union",
_     "unsigned",
_     "virtual",
_     "volatile",
_     "void",
_     "while"
_     };
_ 
_ #define KW_NUMKEYS (sizeof(keywords)/sizeof(*keywords))
_ 
_ unsigned int hashsize = 137;
_ char** kwhash;
_ typedef unsigned short u_short;
_ 
_ u_short
_ hash(cp, len, a, b, c)
_     unsigned char* cp;
_     u_short len;
_     u_short a, b, c;
_     {
_     return (((u_short)cp[0]         ) ^
_             ((u_short)cp[1]     << a) ^
_             ((u_short)cp[len-1] << b) ^
_              (len               << c)  ) % hashsize;
_     }
_ 
_ int
_ insert(cp, a, b, c)
_     char *cp;
_     u_short a, b, c;
_     {
_     short h;
_ 
_     h = hash(cp, strlen(cp), a, b, c);
_     if (kwhash[h] != NULL)
_         {
_ /*
_         printf("Keyword hash collision: %s, %s\n", kwhash[h], cp);
_ */
_         return 0;
_         }
_     else
_         kwhash[h] = cp;
_     return 1;
_     }
_ 
_ int
_ try(a, b, c)
_     short a, b, c;
_     {
_     short int i;
_     int collisions;
_ 
_     collisions = 0;
_     for (i = 0; i < hashsize; ++i)
_         kwhash[i] = NULL;
_     for (i = 0; i < KW_NUMKEYS; ++i)
_         if (!insert(keywords[i], a, b, c))
_             ++collisions;
_     return collisions;
_     }
_ 
_ main(argc, argv)
_     int argc;
_     char **argv;
_     {
_     int min_collisions;
_     int min_abc = 0;
_     short a, b, c;
_ 
_     if (argc > 1) hashsize = atoi(argv[1]);
_     else
_         {
_         printf("usage: %s <hash_size>\n\t<hash_size> should be prime.\n",
_                 argv[0]);
_         exit(-1);
_         }
_ 
_     if (hashsize < KW_NUMKEYS)
_         {
_         printf("Hash table is too small.\n");
_         exit(-1);
_         }
_ 
_     kwhash = (char**) malloc(hashsize * sizeof(char*));
_     min_collisions = hashsize + 1;
_     for (a = 0; a <= 10; ++a)
_         {
_         for (b = 0; b <= 10; ++b)
_             {
_             for (c = 0; c <= 10; ++c)
_                 {
_                 int collisions;
_ 
_                 collisions = try(a, b, c);
_                 if (collisions <= min_collisions)
_                     {
_                     printf("abc: %03x  Collisions: %2d ",
_                            ((a<<8)|(b<<4)|c), collisions);
_                     min_collisions = collisions;
_                     if (collisions == 0) putchar('*');
_                     putchar('\n');
_                     }
_                 }
_             }
_         }
_     }
%%%EOF%%%
echo "x - clex_test.c"
sed -e 's/^_ //' >clex_test.c <<'%%%EOF%%%'
_ 
_ // clex_test -- test code for clex.o
_ 
_ #include "clex.h"
_ 
_ main()
_     {
_     Clex cl = Clex(stdin, TRUE);
_     Clex_sym s;
_     do  {
_         s = cl.next();
_         printf("%5D ", cl.line_no());
_         if (s >= KEYWORD_S)
_             printf(" %s\n", cl.debug(s));
_         else if (s == IDENT_S ||
_                  s == NUM_S ||
_                  s == FLOATNUM_S ||
_                  s == LBRACK_S ||
_                  s == APOS_S ||
_                  s == QUOTE_S )
_             printf( "      %s \"%s\"\n", cl.debug(s), cl.str());
_         else
_             printf( "      %s\n", cl.debug(s));
_         } while (s > EOF_S);
_ 
_     exit(0);
_     }
%%%EOF%%%
echo "Done."
exit 0