loo@mister-curious.sw.mcc.com (Joel Loo) (03/29/89)
Is there a C syntax definition in lex which can be obtained easily? It would be useful for creating C-language-related tools (e.g. the recent "C comment stripper in lex" discussion would have been trivial if we had a standard C syntax in lex to start with). -------------------------------------------------------------------- Joel Loo Peing Ling composed on Tue Mar 28 11:05:10 CST 1989 -------------------------------------------------------------------- MCC | Email: loo@sw.mcc.com 3500 West Balcones Centre Dr. | Voice: (512)338-3680 (O) Austin, TX 78759 | (512)343-1780 (H) [ Disclaimer: The above article reflects only my own opinion; my employer has nothing to do with it. ]
henry@utzoo.uucp (Henry Spencer) (03/30/89)
In article <2188@mister-curious.sw.mcc.com> loo@mister-curious.sw.mcc.com (Joel Loo) writes: >Is there a C syntax definition in lex which can be obtained easily? Um, surely you are thinking of yacc, not lex? It's easy enough to write a lex description of the syntax of C's tokens, if you ignore one or two preprocessor oddities, but a full syntax of C can't possibly be done in lex -- you need something like yacc for that. I did a lex description of C tokens a couple of years ago which I can post if people are interested. It's not up-to-date in a couple of small respects, I think, but it's close. -- Welcome to Mars! Your | Henry Spencer at U of Toronto Zoology passport and visa, comrade? | uunet!attcan!utzoo!henry henry@zoo.toronto.edu
henry@utzoo.uucp (Henry Spencer) (03/31/89)
In article <1989Mar29.224649.5766@utzoo.uucp> I wrote: >I did a lex description of C tokens a couple of years ago which I can post >if people are interested. It's not up-to-date... Enough people have already expressed interest for me to post it. I did a small update on it at the same time, so it is reasonably current. Read the comment at the top before getting too confident, though. Note also that it implements *exactly* ANSI C and makes no attempt at clean error recovery. I personally don't consider it a useful base for major software work -- you just cannot analyze C properly without a full preprocessor -- but it is useful for things like statistics gathering.
----------
%{
/*
 * ctokens - print tokens of a C or C++ program
 *
 * Full ANSI C (draft of 31 Oct 1988) except: no trigraphs; copes with
 * backslash-newline stripping only inside strings; imperfect understanding
 * of the context-dependent rule that makes <bletch.h> a single token
 * inside a #include. The only C++ issues are the "::" operator and "//"
 * comments.
 *
 * There are some limitations inherent in not doing preprocessing. In
 * ANSI C, characters that look illegal at first glance can disappear
 * from the source during preprocessing, either by being #ifdefed out
 * or by vanishing into a string. This code does not consider that.
 * Preprocessor numbers can also do strange things, again not considered.
 *
 * There are also some implementation-dependent decisions in areas like
 * the exact syntax of header names; we don't try to be smart about this.
 *
 * Output is one token per line.  A newline is printed as "\n" and,
 * except for newlines, any white-space character is printed as "\t".
 * It would be more sensible to make the white-space expression [ \t\v\f]+
 * instead of just [ \t\v\f], but some old lexes have problems with that.
 * A character that starts no token is printed followed by " ???".
 *
 * Note that this program uses one (sigh) undocumented feature of Unix lex:
 * the ability to override the choice of input stream by assigning to yyin.
 * Avoiding this requires reimplementing lex's input functions, which is a
 * pain because getc/ungetc isn't good enough.
 *
 * $Log$
 */

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>

/* String equality, with a cheap first-character test before strcmp. */
#define	STREQ(a, b)	(*(a) == *(b) && strcmp((a), (b)) == 0)

#ifndef lint
static char RCSid[] = "$Header$";
#endif

int debug = 0;			/* bumped by -d; not otherwise consulted here */
char *progname;			/* program name, for diagnostics */

extern void error(), exit();
#ifdef UTZOOERR
extern char *mkprogname();
#else
#define	mkprogname(a)	(a)
#endif

/* Print the current token on a line of its own. */
#define	PRINTIT	printf("%s\n", yytext)

/*
 * The traditional lex input() returns 0 at end of input, but some
 * scanner generators hand back EOF instead; accept either so that the
 * comment-skipping loops below always terminate.
 */
#define	ATEOF(c)	((c) == '\0' || (c) == EOF)

int cflag = 0;			/* C only. */

/* stuff for stupid context-dependent #include <name> */
#define	SAWNL	0		/* at the start of a line */
#define	SAWNUM	1		/* saw "#" at the start of a line */
#define	SAWINC	2		/* saw "include" after that "#" */
#define	OTHER	3		/* anywhere else */
int state = SAWNL;
/* #define PS printf("state %d\n", state) */
#define	PS	/* */

/*
 - unterminated - complain about a comment that runs into end of input
 *
 * Does not return; exits with a failure status.
 */
static void
unterminated()
{
	fprintf(stderr, "unterminated comment!\n");
	exit(1);
}
%}

EXP	([eE][+-]?[0-9]+)
FS	[flFL]
IS	([uU][lL]?|[lL][uU]?)

%%

[_a-zA-Z][_a-zA-Z0-9]*	{
		/* identifier; "include" right after a line-initial "#" */
		/* arms recognition of a <header> name */
		PRINTIT;
		if (strcmp(yytext, "include") == 0 && state == SAWNUM)
			state = SAWINC;
		else
			state = OTHER;
		PS;
	}

[0-9]+"."[0-9]*{EXP}?{FS}?	|
"."[0-9]+{EXP}?{FS}?		|
[0-9]+{EXP}{FS}?		|
[1-9][0-9]*{IS}?		|
0[0-7]*{IS}?			|
0[xX][0-9a-fA-F]+{IS}?		{
		PRINTIT;	/* number */
	}

L?\'([^'\\\n]|\\(['"?\\abfnrtv]|[0-7]{1,3}|[xX][0-9a-fA-F]+))+\'	{
		PRINTIT;	/* character constant */
	}

L?\"([^"\\\n]|\\(['"?\\abfnrtv\n]|[0-7]{1,3}|[xX][0-9a-fA-F]+))*\"	{
		/* string -- remove backslashed newlines */
		register char *p;

		for (p = yytext; *p != '\0'; p++)
			if (*p == '\\' && *(p+1) == '\n')
				p++;	/* skip the backslash and the newline */
			else
				putchar(*p);
		putchar('\n');
	}

"#"	{
		/* only a "#" at the start of a line can begin #include */
		if (state == SAWNL)
			state = SAWNUM;
		PRINTIT;
		PS;
	}

"<"[^>\n]*">"	{
		/* header name, but only right after "# include"; */
		/* otherwise let the operator rules have the "<" */
		PS;
		if (state != SAWINC) {
			REJECT;
		} else
			PRINTIT;
		state = OTHER;
	}

[-()&*+~!/%<>^|,.=;:{}?]	|
"["				|
"]"				|
"->"				|
"++"				|
"--"				|
"<<"				|
">>"				|
"<="				|
">="				|
"=="				|
"!="				|
"&&"				|
"||"				|
"##"				|
"..."				|
[-*/%+&^|]"="			|
"<<="				|
">>="				{
		PRINTIT;	/* misc. tokens */
	}

"::"	{
		/* C++ scope operator; with -C it is just two ":" tokens */
		if (cflag) {
			REJECT;
		} else
			PRINTIT;
	}

\n	{
		state = SAWNL;
		PS;
		printf("\\n\n");
	}

[ \t\v\f]	printf("\\t\n");

"/*"	{
		/* comment: print a summary showing at most ten newlines */
		register int ch;
		register int nnl = 0;

		printf("/* ");
		for (;;) {
			ch = input();
			if (ch == '*') {
				ch = input();
				if (ch == '/')
					break;
				if (ATEOF(ch))
					unterminated();
				/* could itself begin the terminator */
				unput(ch);
			} else if (ch == '\n') {
				nnl++;
				if (nnl <= 10)
					printf("\\n");
				if (nnl == 10)
					printf("...");
			} else if (ATEOF(ch))
				unterminated();
		}
		printf(" */\n");
	}

"//"	{
		/* C++ comment to end of line; with -C it is two "/" tokens */
		register int ch;

		if (cflag) {
			REJECT;
		} else {
			printf("//\n");
			while ((ch = input()) != '\n')
				if (ATEOF(ch))
					unterminated();
			/* leave the newline for the newline rule */
			unput(ch);
		}
	}

.	printf("%c ???\n", yytext[0]);

%%

/*
 - main - parse arguments and handle options
 *
 * Options: -C restricts the scanner to C (no "::" or "//"), -d bumps
 * the debug counter.  Each remaining argument is a file to tokenize,
 * "-" meaning standard input; with no file arguments standard input
 * is read.  Exits 0 on success, 2 on a usage error.
 */
int
main(argc, argv)
int argc;
char *argv[];
{
	int c;
	int errflg = 0;
	FILE *in;
	struct stat statbuf;
	extern int optind;
	extern char *optarg;
	extern int getopt();
	extern FILE *efopen();
	void process();

	progname = mkprogname(argv[0]);

	while ((c = getopt(argc, argv, "dC")) != EOF)
		switch (c) {
		case 'C':	/* C only, no C++. */
			cflag = 1;
			break;
		case 'd':	/* Debugging. */
			debug++;
			break;
		case '?':
		default:
			errflg++;
			break;
		}
	if (errflg) {
		fprintf(stderr, "usage: %s [-C] [file] ...\n", progname);
		exit(2);
	}

	if (optind >= argc)
		process(stdin, "stdin");
	else
		for (; optind < argc; optind++)
			if (STREQ(argv[optind], "-"))
				process(stdin, "-");
			else {
				in = efopen(argv[optind], "r");
				/* fopen can succeed on a directory; refuse it */
				if (fstat(fileno(in), &statbuf) < 0)
					error("can't fstat `%s'", argv[optind]);
				if ((statbuf.st_mode & S_IFMT) == S_IFDIR)
					error("`%s' is directory!", argv[optind]);
				process(in, argv[optind]);
				(void) fclose(in);
			}
	exit(0);
}

/*
 * process - process input file
 *
 * in is the stream to tokenize; inname is its name, currently unused.
 */
void
process(in, inname)
FILE *in;
char *inname;
{
	yyin = in;
	/* every file begins at the start of a line, however the last ended */
	state = SAWNL;
	(void) yylex();
}

/*
 - efopen - fopen with error check
 *
 * Returns the open stream; does not return if the open fails.
 */
FILE *
efopen(name, mode)
char *name;
char *mode;
{
	FILE *f;

	f = fopen(name, mode);
	if (f == NULL)
		error("can't open `%s'", name);
	return(f);
}

/*
 - error - report trouble
 *
 * s1 is a printf format taking at most one string argument, s2.  The
 * message is prefixed with the program name and the program exits
 * with status 1.
 */
void				/* does not return */
error(s1, s2)
char *s1;
char *s2;
{
	fprintf(stderr, "%s: ", progname);
	fprintf(stderr, s1, s2);
	fprintf(stderr, "\n");
	exit(1);
}
----------
-- Welcome to Mars! Your | Henry Spencer at U of Toronto Zoology passport and visa, comrade? | uunet!attcan!utzoo!henry henry@zoo.toronto.edu