news@awdprime.UUCP (USENET News) (03/23/90)
In article <1435@io.UUCP> jar@io.UUCP (Jim Roskind x5570) writes:

Thanks for the grammars.  Some people requested a lexer.  Here is mine that
I quickly hacked up.  I'm sure that someone can use this to build a complete
one for your grammars.

Things to fix:
  o It doesn't return the same token values per Jim's grammars (easy fix).
  o You'll need to fix it to return different values for identifiers,
    enums, and typedefs (see comment at {ident}).
  o It doesn't support floating point yet either (easy fix, I have
    {float} and {exp} defined but not used or tested).
  o You'll want to handle cases like "test \" foobar" in the string
    handling section (look for STRING and CHAR_CONST).  I couldn't
    figure out how to do it in 30 seconds or less so I'll leave it
    up to you.

With this I was able to parse a fairly large C program that I had removed
all the typedefs from (no enums) using the grammar right out of the back of
_The C Programming Language_ Second Edition.

ENJOY!!!!!

--- don't forget to remove the .sig at the bottom :-)
-------------- clexer.l for ANSI C ------------------
%{
/*
 * clexer.l -- lex scanner for ANSI C, written to feed a yacc grammar.
 *
 * Token codes and yylval come from y.tab.h.  The rules below store
 * heap-allocated text in yylval.strval (STRING, CHAR_CONST, IDENTIFIER)
 * and the converted value in yylval.intval (INTEGER_CONST).  Nothing in
 * this file frees the strval copies.
 *
 * Named patterns defined after this code block:
 *   alpha, digit, special  single-character classes (special is underscore)
 *   ident                  C identifier
 *   int                    run of decimal digits
 *   exp, float             defined for floating point support, not yet
 *                          used by any rule
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "y.tab.h"

static char *save_text(const char *text, size_t len);

/* Kept for existing uses: duplicate a whole NUL-terminated string. */
#define STRDUP(X) save_text((X), strlen(X))

extern int yychar;

/* Position of the scanner in the input; updated by counter() and
 * skipcomments(), reported by yyerror(). */
static int column = 0;
static int linenum = 1;

/* Defined in the user code section after the rules. */
int skipcomments(void);
int counter(int notwhite);

#define count(x) counter(x)

#ifndef YYDEBUG
int yydebug = 0;
#else
int yydebug = 1;
#endif

/*
 * save_text: return a freshly allocated, NUL-terminated copy of the first
 * len bytes of text.  Running out of memory is reported on stderr and
 * terminates the program, so callers never see a null pointer.
 */
static char *
save_text(const char *text, size_t len)
{
    char *copy = malloc(len + 1);

    if (copy == NULL) {
        fprintf(stderr, "clexer: out of memory\n");
        exit(1);
    }
    memcpy(copy, text, len);
    copy[len] = '\0';
    return copy;
}
%}

alpha   [a-zA-Z]
digit   [0-9]
special [\_]
ident   (({alpha}|{special})({alpha}|{digit}|{special})*)
int     ({digit}+)
exp     ([Ee][-+]?{digit}+)
float   ([-+]?{digit}+\.?{digit}*)

%p 3000

%%

^\#.*           { count(0); /* skip cpp lines */ }
[\ \n\t\v\f]+   { count(0); /* skip white space */ }

"/*"            { count(1); skipcomments(); }

"..."           { count(1); return DOTDOTDOT; }
">="            { count(1); return GE; }
"<="            { count(1); return LE; }
"!="            { count(1); return NOTEQU; }
"=="            { count(1); return EQU; }
"*="            { count(1); return MULTEQU; }
"/="            { count(1); return DIVEQU; }
"%="            { count(1); return MODEQU; }
"+="            { count(1); return INCEQU; }
"-="            { count(1); return DECEQU; }
"<<="           { count(1); return SHIFTLEFTEQU; }
">>="           { count(1); return SHIFTRIGHTEQU; }
"&="            { count(1); return ANDEQU; }
"|="            { count(1); return OREQU; }
"^="            { count(1); return XOREQU; }
"<<"            { count(1); return SHIFTLEFT; }
">>"            { count(1); return SHIFTRIGHT; }
"++"            { count(1); return INC; }
"--"            { count(1); return DEC; }
"->"            { count(1); return POINTS; }
"&&"            { count(1); return LOGICALAND; }
"||"            { count(1); return LOGICALOR; }

"("             { count(1); return '('; }
","             { count(1); return ','; }
")"             { count(1); return ')'; }
";"             { count(1); return ';'; }
"{"             { count(1); return '{'; }
"}"             { count(1); return '}'; }
"["             { count(1); return '['; }
"]"             { count(1); return ']'; }
"*"             { count(1); return '*'; }
"/"             { count(1); return '/'; }
"+"             { count(1); return '+'; }
"-"             { count(1); return '-'; }
"%"             { count(1); return '%'; }
"^"             { count(1); return '^'; }
"&"             { count(1); return '&'; }
"?"             { count(1); return '?'; }
":"             { count(1); return ':'; }
"!"             { count(1); return '!'; }
"."             { count(1); return '.'; }
"~"             { count(1); return '~'; }
"<"             { count(1); return '<'; }
">"             { count(1); return '>'; }
"|"             { count(1); return '|'; /* this slot used to hold a second, unreachable rule for the dot */ }
"="             { count(1); return '='; }

if              { count(1); return IF; }
else            { count(1); return ELSE; }
while           { count(1); return WHILE; }
do              { count(1); return DO; }
for             { count(1); return FOR; }
switch          { count(1); return SWITCH; }
case            { count(1); return CASE; }
default         { count(1); return DEFAULT; }
goto            { count(1); return GOTO; }
continue        { count(1); return CONTINUE; }
break           { count(1); return BREAK; }
return          { count(1); return RETURN; }
sizeof          { count(1); return SIZEOF; }

auto            { count(1); return AUTO; }
register        { count(1); return REGISTER; }
static          { count(1); return STATIC; }
extern          { count(1); return EXTERN; }
typedef         { count(1); return TYPEDEF; }

void            { count(1); return VOID; }
char            { count(1); return CHAR; }
short           { count(1); return SHORT; }
int             { count(1); return INT; }
long            { count(1); return LONG; }
float           { count(1); return FLOAT; }
double          { count(1); return DOUBLE; }
unsigned        { count(1); return UNSIGNED; }
enum            { count(1); return ENUM; }
const           { count(1); return CONST; }
volatile        { count(1); return VOLATILE; }
struct          { count(1); return STRUCT; }
union           { count(1); return UNION; }

\'(\\.|[^\\\'\n])*\'    {
                    count(1);
                    /*
                     * The body is any run of backslash escapes or plain
                     * characters, so the match ends at the first unescaped
                     * closing quote instead of the last quote on the line.
                     * yylval.strval gets the text without the quotes.
                     */
                    yylval.strval = save_text(yytext + 1, (size_t)yyleng - 2);
                    return CHAR_CONST;
                }

{int}           {
                    count(1);
                    yylval.intval = atoi(yytext);
                    return INTEGER_CONST;
                }

\"(\\.|[^\\\"\n])*\"    {
                    count(1);
                    /*
                     * Same scheme as CHAR_CONST: an escaped quote stays
                     * inside the literal and two literals on one line are
                     * returned as two tokens.  This covers the escaped
                     * quote item in the to-do list of the post.
                     * yylval.strval gets the text without the quotes.
                     */
                    yylval.strval = save_text(yytext + 1, (size_t)yyleng - 2);
                    return STRING;
                }

{ident}         {
                    count(1);
                    yylval.strval = save_text(yytext, (size_t)yyleng);
                    /*
                     * We need to hash this and keep a structure that tells
                     * us what the name is (an enum, a typedef, or a plain
                     * identifier) so the right token can be returned.
                     *
                     * BTW: this has to be done in cooperation with the
                     * grammar.
                     */
                    return IDENTIFIER;
                }

.               { count(1); return ERROR; }

%%

/*
 * yywrap: called by the scanner at end of input.  Returning 1 tells it
 * there is no further input to switch to.
 */
int
yywrap(void)
{
    return 1;
}

/* Skip over comments.
*/ skipcomments() { char c; while (1) { while ((c = input()) != '*') if (c == '\n') { column = 0; linenum++; } else if (c == '\t') column += 8 - (column % 8); else column++; if ((c = input()) == '/') { column++; #ifdef LEXDEBUG printf ("symbol found: %s\n", "*/"); #endif return; } unput(c); } } /*ARGSUSED*/ counter (notwhite) { register char *s; #ifdef LEXDEBUG if (notwhite) printf ("symbol found: %s\n", yytext); #endif for (s = yytext; *s; s++) if (*s == '\n') { column = 0; linenum++; } else if (*s == '\t') column += 8 - (column % 8); else column++; } yyerror (s) char *s; { fprintf (stderr, "YYERROR: %s: line %d col %d\n", s, linenum, column); fprintf (stderr, "YYERROR: yytext=`%s' symbol was (%d)\n", yytext, yychar); } -- sanders For every message of the day, a new improved message will arise to overcome it. Reply-To: cs.utexas.edu!ibmaus!auschs!sanders.austin.ibm.com!sanders (ugh!)