johnl@ima.UUCP (07/22/87)
We are looking for Lex and Yacc inputs that describe the C language and the C pre-processor. While we would prefer separate inputs for C and the C pre-processor, a set that combines them would also work. The actions taken by the Yacc portion are not important, as this is not for a C compiler. Thank you in advance, Jack McGillis ...!trwrb!trwspp!spp2!mcgillis [I've never seen a real compiler that used lex, but you could probably use the bison parser from Gnu's GCC. -John] -- Send compilers articles to ima!compilers or, in a pinch, to Levine@YALE.ARPA Plausible paths are { ihnp4 | decvax | cbosgd | harvard | yale | cca}!ima Please send responses to the originator of the message -- I cannot forward mail accidentally sent back to compilers. Meta-mail to ima!compilers-request
johnl@ima.UUCP (08/06/87)
This isn't quite what was asked for, but still might be of general interest.
This is a lex program which tokenizes C source, with minor limitations as
described in the leading comment. (In fact it does C++, unless you give it
the -C option that restricts it to ANSI C only.) It's probably not useful as
a compiler front end; in particular, it accepts *exactly* the legal C
strings/numbers/etc. rather than accepting more general forms and giving
error messages for violations of the detailed rules. It is, however, of some
use for things like statistical analysis of C programs.

Henry Spencer @ U of Toronto Zoology
{allegra,ihnp4,decvax,pyramid}!utzoo!henry
----------------
%{
/*
 * ctokens - print tokens of a C or C++ program
 *
 * Full ANSI C (draft of 1 Oct 1986) except: no trigraphs; copes with
 * backslash-newline stripping only inside strings; does not understand
 * the context-dependent rule that makes <bletch.h> a single token
 * inside a #include.
 *
 * Except for newlines, any white-space character is printed as "\t".
 * It would be more sensible to make the white-space expression [ \t\v\f]+
 * instead of just [ \t\v\f], but our old lex has problems with that.
 *
 * Note that this program uses one (sigh) undocumented feature of Unix lex:
 * the ability to override the choice of input stream by assigning to yyin.
 * Avoiding this requires reimplementing lex's input functions, which is a
 * pain because getc/ungetc isn't good enough.
 *
 * Output format: one token per output line.
 *
 * Exit status: 0 on success, 1 if a comment is still open at end of
 * input, 2 on a usage error.
 *
 * $Log$
 */

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>

/* Cheap string equality: compare first characters before calling strcmp. */
#define	STREQ(a, b)	(*(a) == *(b) && strcmp((a), (b)) == 0)

#ifndef lint
static char RCSid[] = "$Header$";
#endif

int debug = 0;		/* incremented once per -d option */
char *progname;		/* set from argv[0] in main() */

/* error() is not defined in this file; it is supplied externally. */
extern void error(), exit();
#ifdef UTZOOERR
extern char *mkprogname();
#else
#define	mkprogname(a)	(a)
#endif

/* Print the current token text on a line of its own. */
#define	PRINTIT	printf("%s\n", yytext)

int cflag = 0;		/* C only. */
%}

EXP	([eE][+-]?[0-9]+)
FS	[flFL]
IS	([uU][lL]?|[lL][uU]?)

%%

[_a-zA-Z][_a-zA-Z0-9]*	{ PRINTIT;	/* identifier */ }

[0-9]+"."[0-9]*{EXP}?{FS}?	|
"."[0-9]+{EXP}?{FS}?	|
[0-9]+{EXP}{FS}?	|
[1-9][0-9]*{IS}?	|
0[0-7]*{IS}?	|
0[xX][0-9a-fA-F]+{IS}?	{ PRINTIT;	/* number */ }

\'([^'\\\n]|\\(['"?\\abfnrtv]|[0-7]{1,3}|[xX][0-9a-fA-F]{1,3}))+\'	{
		PRINTIT;	/* character constant */
	}

\"([^"\\\n]|\\(['"?\\abfnrtv\n]|[0-7]{1,3}|[xX][0-9a-fA-F]{1,3}))*\"	{
		/* string -- remove backslashed newlines */
		register char *p;

		for (p = yytext; *p != '\0'; p++)
			if (*p == '\\' && *(p+1) == '\n')
				p++;	/* skip both the backslash and the newline */
			else
				putchar(*p);
		putchar('\n');
	}

[-()&*+~!/%<>^|,.=;:{}?#]	|
"["	|
"]"	|
"->"	|
"++"	|
"--"	|
"<<"	|
">>"	|
"<="	|
">="	|
"=="	|
"!="	|
"&&"	|
"||"	|
"##"	|
"..."	|
[-*/%+&^|]"="	|
"<<="	|
">>="	{ PRINTIT;	/* misc. tokens */ }

"::"	{
		/* C++ only: under -C, REJECT so the input is rematched by other rules. */
		if (cflag) {
			REJECT;
		} else
			PRINTIT;
	}

\n	printf("\\n\n");

[ \t\v\f]	printf("\\t\n");

"/*"	{
		/*
		 * Block comment.  The text is not echoed: print the opener,
		 * a "\n" marker for each of the first ten newlines inside the
		 * comment (followed by "..." at the tenth), then the closer.
		 */
		register int ch;
		register int nnl = 0;

		printf("/* ");
		for (;;) {
			ch = input();
			if (ch == '*') {
				ch = input();
				if (ch == '/')
					break;
				else
					unput(ch);	/* may itself be the '*' of the closer */
			} else if (ch == '\n') {
				nnl++;
				if (nnl <= 10)
					printf("\\n");
				if (nnl == 10)
					printf("...");
			} else if (ch == '\0') {
				/* input() yielded 0: input ran out inside the comment. */
				fprintf(stderr, "unterminated comment!\n");
				exit(1);	/* fatal error: report failure, not success */
			}
		}
		printf(" */\n");
	}

"//"	{
		/*
		 * C++ line comment.  Under -C, REJECT so the input is rematched
		 * by other rules.  Otherwise print just "//" and discard the
		 * rest of the line, pushing the newline back so the \n rule
		 * reports it.
		 */
		register int ch;

		if (cflag) {
			REJECT;
		} else {
			printf("//\n");
			while ((ch = input()) != '\n')
				if (ch == '\0') {
					/* input() yielded 0 before any newline. */
					fprintf(stderr, "unterminated comment!\n");
					exit(1);	/* fatal error: report failure, not success */
				}
			unput(ch);
		}
	}

.	printf("%c ???\n", yytext[0]);

%%

/*
 - main - parse arguments and handle options
 *
 * Options: -C restricts recognition to C (sets cflag); -d increments debug.
 * With no file arguments, or for an argument of "-", standard input is
 * tokenized.  Each named file is opened with efopen() and rejected via
 * error() if it cannot be fstat'ed or is a directory.
 * NOTE(review): efopen() and error() are defined outside this file; the
 * code below does not check efopen()'s result and continues after error(),
 * so it presumably relies on neither returning on failure -- confirm.
 */
main(argc, argv)
int argc;
char *argv[];
{
	int c;
	int errflg = 0;
	FILE *in;
	struct stat statbuf;
	extern int optind;
	extern char *optarg;
	extern FILE *efopen();
	void process();

	progname = mkprogname(argv[0]);

	while ((c = getopt(argc, argv, "dC")) != EOF)
		switch (c) {
		case 'C':	/* C only, no C++. */
			cflag = 1;
			break;
		case 'd':	/* Debugging. */
			debug++;
			break;
		case '?':
		default:
			errflg++;
			break;
		}
	if (errflg) {
		fprintf(stderr, "usage: %s [-C] [file] ...\n", progname);
		exit(2);
	}

	if (optind >= argc)
		process(stdin, "stdin");
	else
		for (; optind < argc; optind++)
			if (STREQ(argv[optind], "-"))
				process(stdin, "-");
			else {
				in = efopen(argv[optind], "r");
				if (fstat(fileno(in), &statbuf) < 0)
					error("can't fstat `%s'", argv[optind]);
				if ((statbuf.st_mode & S_IFMT) == S_IFDIR)
					error("`%s' is directory!", argv[optind]);
				process(in, argv[optind]);
				(void) fclose(in);
			}
	exit(0);
}

/*
 * process - process input file
 *
 * Points the scanner at `in' by assigning to yyin and runs yylex() over it.
 * `inname' is accepted but not used here.
 */
void
process(in, inname)
FILE *in;
char *inname;
{
	yyin = in;
	(void) yylex();
}
--
Send compilers articles to ima!compilers or, in a pinch, to Levine@YALE.ARPA
Plausible paths are { ihnp4 | decvax | cbosgd | harvard | yale | cca}!ima
Please send responses to the originator of the message -- I cannot forward
mail accidentally sent back to compilers. Meta-mail to ima!compilers-request