[comp.compilers] Lex/Yacc inputs for C and C pre-processor

johnl@ima.UUCP (07/22/87)

We are looking for Lex and Yacc inputs that describe the C language and
that of the C pre-processor.  While we would prefer separate inputs for
C and the C pre-processor, a set that combines them would work also.
The actions taken by the Yacc portion are not important as this is not
for a C compiler.

			Thank you in advance,
   Jack McGillis
	...!trwrb!trwspp!spp2!mcgillis
[I've never seen a real compiler that used lex, but you could probably use
the bison parser from Gnu's GCC.  -John]
--
Send compilers articles to ima!compilers or, in a pinch, to Levine@YALE.ARPA
Plausible paths are { ihnp4 | decvax | cbosgd | harvard | yale | cca}!ima
Please send responses to the originator of the message -- I cannot forward
mail accidentally sent back to compilers.  Meta-mail to ima!compilers-request

johnl@ima.UUCP (08/06/87)

This isn't quite what was asked for, but still might be of general interest.
This is a lex program which tokenizes C source, with minor limitations as
described in the leading comment.  (In fact it does C++, unless you give it
the -C option that restricts it to ANSI C only.)  It's probably not useful
as a compiler front end; in particular, it accepts *exactly* the legal C
strings/numbers/etc. rather than accepting more general forms and giving
error messages for violations of the detailed rules.  It is, however, of
some use for things like statistical analysis of C programs.

				Henry Spencer @ U of Toronto Zoology
				{allegra,ihnp4,decvax,pyramid}!utzoo!henry

----------------
%{
/*
 * ctokens - print tokens of a C or C++ program
 *
 * Full ANSI C (draft of 1 Oct 1986) except:  no trigraphs; copes with
 * backslash-newline stripping only inside strings; does not understand
 * the context-dependent rule that makes <bletch.h> a single token
 * inside a #include.
 *
 * Except for newlines, any white-space character is printed as "\t".
 * It would be more sensible to make the white-space expression [ \t\v\f]+
 * instead of just [ \t\v\f], but our old lex has problems with that.
 *
 * Note that this program uses one (sigh) undocumented feature of Unix lex:
 * the ability to override the choice of input stream by assigning to yyin.
 * Avoiding this requires reimplementing lex's input functions, which is a
 * pain because getc/ungetc isn't good enough.
 *
 * $Log$
 */

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>

/*
 * String equality test.  The first characters are compared before
 * strcmp() is called; note that both arguments are evaluated more than once.
 */
#define	STREQ(a, b)	(*(a) == *(b) && strcmp((a), (b)) == 0)

#ifndef lint
static char RCSid[] = "$Header$";
#endif

int debug = 0;			/* bumped once for each -d option */
char *progname;			/* set in main() from argv[0] via mkprogname() */

extern void error(), exit();
/* Unless UTZOOERR is defined, mkprogname() simply yields its argument. */
#ifdef UTZOOERR
extern char *mkprogname();
#else
#define	mkprogname(a)	(a)
#endif

/* Common rule action: print the matched text on a line of its own. */
#define	PRINTIT	printf("%s\n", yytext)

int cflag = 0;			/* C only: set by -C; makes the :: and // rules REJECT */
%}

EXP	([eE][+-]?[0-9]+)
	/* EXP: exponent part of a floating constant -- e or E, optional sign, digits */
FS	[flFL]
	/* FS: floating-constant suffix */
IS	([uU][lL]?|[lL][uU]?)
	/* IS: integer-constant suffix -- u, l, ul or lu, in either case */

%%

[_a-zA-Z][_a-zA-Z0-9]*		{ PRINTIT;	/* identifier; there is no separate keyword rule, so keywords land here too */ }

[0-9]+"."[0-9]*{EXP}?{FS}?	|
"."[0-9]+{EXP}?{FS}?		|
[0-9]+{EXP}{FS}?		|
[1-9][0-9]*{IS}?		|
0[0-7]*{IS}?			|
0[xX][0-9a-fA-F]+{IS}?		{
		/*
		 * number -- the alternatives are, in order: floating with
		 * digits before the point, floating starting with the point,
		 * floating with an exponent but no point, decimal integer,
		 * octal integer (which includes a lone 0), hexadecimal integer
		 */
		PRINTIT;
	}

\'([^'\\\n]|\\(['"?\\abfnrtv]|[0-7]{1,3}|[xX][0-9a-fA-F]{1,3}))+\'	{
		/*
		 * character constant -- one or more plain characters or
		 * escape sequences between single quotes, printed as matched
		 */
		PRINTIT;
	}

\"([^"\\\n]|\\(['"?\\abfnrtv\n]|[0-7]{1,3}|[xX][0-9a-fA-F]{1,3}))*\"	{
		/*
		 * string literal -- echoed on a single output line; every
		 * backslash-newline pair inside it is dropped so that a
		 * continued string prints in one piece
		 */
		register char *cp = yytext;

		while (*cp != '\0') {
			if (cp[0] == '\\' && cp[1] == '\n')
				cp += 2;	/* skip the pair, print nothing */
			else
				putchar(*cp++);
		}
		putchar('\n');
	}

[-()&*+~!/%<>^|,.=;:{}?#]	|
"["				|
"]"				|
"->"				|
"++"				|
"--"				|
"<<"				|
">>"				|
"<="				|
">="				|
"=="				|
"!="				|
"&&"				|
"||"				|
"##"				|
"..."				|
[-*/%+&^|]"="			|
"<<="				|
">>="				{ PRINTIT;	/* misc. tokens: operators and punctuation, printed as matched */ }
"::"				{
					/*
					 * C++ scope operator.  Under -C (cflag set)
					 * the match is rejected, so lex falls back to
					 * its next-best match: the single ':' of the
					 * one-character rule above.
					 */
					if (cflag) {
						REJECT;
					} else
						PRINTIT;
				}

\n				printf("\\n\n");	/* a newline is shown as the two characters \n */
[ \t\v\f]			printf("\\t\n");	/* each blank, tab, VT or FF is shown, one at a time, as \t */

"/*"	{
		/*
		 * C comment: consume it and print a one-line summary.  Of
		 * the comment's contents only newlines are shown, each as a
		 * literal \n, and only the first ten; an ellipsis marks the
		 * point after which further newlines are dropped.
		 */
		register int ch;
		register int prev = 0;	/* previous character of the comment body */
		register int nnl = 0;	/* newlines seen so far */

		printf("/* ");
		for (;;) {
			ch = input();
			/*
			 * End of input.  Classic lex input() yields 0 here,
			 * but some implementations yield EOF instead; accept
			 * both so that the loop cannot spin forever.
			 */
			if (ch == '\0' || ch == EOF) {
				fprintf(stderr, "unterminated comment!\n");
				exit(1);	/* this is a failure, so do not report success */
			}
			/*
			 * Remembering the previous character finds the
			 * closing star-slash without ever pushing a
			 * character (or an end-of-input marker) back.
			 */
			if (prev == '*' && ch == '/')
				break;
			if (ch == '\n') {
				nnl++;
				if (nnl <= 10)
					printf("\\n");
				if (nnl == 10)
					printf("...");
			}
			prev = ch;
		}
		printf(" */\n");
	}

"//"	{
		/*
		 * C++ comment: print just the introducer and discard the
		 * rest of the line.  The terminating newline is pushed back
		 * so that the newline rule reports it.  Under -C the match
		 * is rejected and lex falls back to the single '/' token.
		 */
		register int ch;

		if (cflag) {
			REJECT;
		} else {
			printf("//\n");
			while ((ch = input()) != '\n')
				/* 0 or EOF, depending on the lex in use */
				if (ch == '\0' || ch == EOF) {
					fprintf(stderr, "unterminated comment!\n");
					exit(1);
				}
			unput(ch);
		}
	}

.				printf("%c ???\n", yytext[0]);	/* any character no other rule matched */

%%

/*
 - main - parse arguments and handle options
 *
 * Options:
 *	-C	restrict the scanner to C (sets cflag)
 *	-d	increment the debug level
 *
 * With no file operands the standard input is tokenized; otherwise each
 * operand is tokenized in turn, "-" standing for the standard input.
 * Exits with status 2 after a usage error and 0 after normal completion.
 */
int
main(argc, argv)
int argc;
char *argv[];
{
	int c;
	int errflg = 0;
	FILE *in;
	struct stat statbuf;
	extern int optind;
	extern int getopt();
	extern FILE *efopen();
	void process();

	progname = mkprogname(argv[0]);

	/* POSIX getopt() marks the end of the options with -1, which need not be EOF. */
	while ((c = getopt(argc, argv, "dC")) != -1) {
		switch (c) {
		case 'C':	/* C only, no C++. */
			cflag = 1;
			break;
		case 'd':	/* Debugging. */
			debug++;
			break;
		case '?':
		default:
			errflg++;
			break;
		}
	}
	if (errflg) {
		fprintf(stderr, "usage: %s [-C] [file] ...\n", progname);
		exit(2);
	}

	if (optind >= argc) {
		process(stdin, "stdin");
	} else {
		for (; optind < argc; optind++) {
			if (STREQ(argv[optind], "-")) {
				process(stdin, "-");
				continue;
			}

			/*
			 * efopen() and error() are defined outside this file;
			 * NOTE(review): the code below presumes that neither
			 * returns after a failure -- confirm.
			 */
			in = efopen(argv[optind], "r");
			if (fstat(fileno(in), &statbuf) < 0)
				error("can't fstat `%s'", argv[optind]);
			/* refuse to tokenize a directory */
			if ((statbuf.st_mode & S_IFMT) == S_IFDIR)
				error("`%s' is directory!", argv[optind]);
			process(in, argv[optind]);
			(void) fclose(in);
		}
	}
	exit(0);
	/* NOTREACHED */
}

/*
 * process - run the scanner over one open input stream
 *
 *	in	stream to tokenize; installed as the scanner's yyin
 *	inname	name the caller uses for that stream; not used here
 *
 * The stream is left open; closing it is the caller's business.
 */
void
process(in, inname)
FILE *in;
char *inname;
{
	(void) inname;		/* accepted but deliberately ignored */

	yyin = in;		/* redirect the scanner's input */
	(void) yylex();		/* scan until end of input; result ignored */
}
--
Send compilers articles to ima!compilers or, in a pinch, to Levine@YALE.ARPA
Plausible paths are { ihnp4 | decvax | cbosgd | harvard | yale | cca}!ima
Please send responses to the originator of the message -- I cannot forward
mail accidentally sent back to compilers.  Meta-mail to ima!compilers-request