[comp.lang.c] Want C syntax in lex

loo@mister-curious.sw.mcc.com (Joel Loo) (03/29/89)

Is there a C syntax definition in lex which can be obtained easily?
It would be useful for creating C-language-related tools (e.g. the
recent "C comment stripper in lex" discussion would have been trivial if
we had a standard C syntax in lex to start with).

--------------------------------------------------------------------
Joel Loo Peing Ling composed on Tue Mar 28 11:05:10 CST 1989
--------------------------------------------------------------------
MCC                            |   Email:  loo@sw.mcc.com
3500 West Balcones Centre Dr.  |   Voice:  (512)338-3680 (O)
Austin, TX 78759               |           (512)343-1780 (H)

[ Disclaimer: The above article reflects only my own opinion; my 
employer has nothing to do with it. ]

henry@utzoo.uucp (Henry Spencer) (03/30/89)

In article <2188@mister-curious.sw.mcc.com> loo@mister-curious.sw.mcc.com (Joel Loo) writes:
>Is there a C syntax definition in lex which can be obtained easily?

Um, surely you are thinking of yacc, not lex?  It's easy enough to write
a lex description of the syntax of C's tokens, if you ignore one or two
preprocessor oddities, but a full syntax of C can't possibly be done in
lex -- you need something like yacc for that.

I did a lex description of C tokens a couple of years ago which I can post
if people are interested.  It's not up-to-date in a couple of small respects,
I think, but it's close.
-- 
Welcome to Mars!  Your         |     Henry Spencer at U of Toronto Zoology
passport and visa, comrade?    | uunet!attcan!utzoo!henry henry@zoo.toronto.edu

henry@utzoo.uucp (Henry Spencer) (03/31/89)

In article <1989Mar29.224649.5766@utzoo.uucp> I wrote:
>I did a lex description of C tokens a couple of years ago which I can post
>if people are interested.  It's not up-to-date...

Enough people have already expressed interest for me to post it.  I did a
small update on it at the same time, so it is reasonably current.  Read
the comment at the top before getting too confident, though.  Note also
that it implements *exactly* ANSI C and makes no attempt at clean error
recovery.  I personally don't consider it a useful base for major software
work -- you just cannot analyze C properly without a full preprocessor --
but it is useful for things like statistics gathering.

----------
%{
/*
 * ctokens - print tokens of a C or C++ program
 *
 * Full ANSI C (draft of 31 Oct 1988) except:  no trigraphs; copes with
 * backslash-newline stripping only inside strings; imperfect understanding
 * of the context-dependent rule that makes <bletch.h> a single token
 * inside a #include.  The only C++ issues are the "::" operator and "//"
 * comments.
 *
 * There are some limitations inherent in not doing preprocessing.  In
 * ANSI C, characters that look illegal at first glance can disappear
 * from the source during preprocessing, either by being #ifdefed out
 * or by vanishing into a string.  This code does not consider that.
 * Preprocessor numbers can also do strange things, again not considered.
 *
 * There are also some implementation-dependent decisions in areas like
 * the exact syntax of header names; we don't try to be smart about this.
 *
 * Except for newlines, any white-space character is printed as "\t".
 * It would be more sensible to make the white-space expression [ \t\v\f]+
 * instead of just [ \t\v\f], but some old lexes have problems with that.
 *
 * Note that this program uses one (sigh) undocumented feature of Unix lex:
 * the ability to override the choice of input stream by assigning to yyin.
 * Avoiding this requires reimplementing lex's input functions, which is a
 * pain because getc/ungetc isn't good enough.
 *
 * $Log$
 */

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>

/* String equality: compares the first characters before calling strcmp(). */
#define	STREQ(a, b)	(*(a) == *(b) && strcmp((a), (b)) == 0)

#ifndef lint
static char RCSid[] = "$Header$";
#endif

int debug = 0;			/* incremented once per -d option in main() */
char *progname;			/* program name used in messages; set in main() */

extern void error(), exit();
/* Without UTZOOERR, mkprogname() is just the identity on its argument. */
#ifdef UTZOOERR
extern char *mkprogname();
#else
#define	mkprogname(a)	(a)
#endif

/* Print the text of the current token on a line of its own. */
#define	PRINTIT	printf("%s\n", yytext)

int cflag = 0;			/* C only: set by -C; makes the "::" and "//" rules REJECT. */

/*
 * stuff for stupid context-dependent #include <name>
 *
 * state is changed only by the identifier, "#", <name> and newline rules:
 *	SAWNL	initial value, and set again by every newline
 *	SAWNUM	a "#" was seen while in SAWNL
 *	SAWINC	the identifier "include" was seen while in SAWNUM
 *	OTHER	any other identifier, or a <name> token, was seen
 */
#define	SAWNL	0
#define	SAWNUM	1
#define	SAWINC	2
#define	OTHER	3
int state = SAWNL;
/* Debugging hook: swap in the commented-out definition to trace state. */
/* #define	PS	printf("state %d\n", state) */
#define	PS	/* */
%}

EXP	([eE][+-]?[0-9]+)
FS	[flFL]
IS	([uU][lL]?|[lL][uU]?)
	/*
	 * Named patterns used by the number rules:
	 *	EXP	exponent: e or E, optional sign, one or more digits
	 *	FS	floating suffix: one of f, l, F, L
	 *	IS	integer suffix: u optionally followed by l, or l
	 *		optionally followed by u, in either case
	 */

%%

[_a-zA-Z][_a-zA-Z0-9]*		{
					/*
					 * identifier -- the word include seen while
					 * state is SAWNUM moves state to SAWINC; every
					 * other identifier sets it to OTHER
					 */
					PRINTIT;
					state = (state == SAWNUM &&
						strcmp(yytext, "include") == 0)
							? SAWINC : OTHER;
					PS;
				}

[0-9]+"."[0-9]*{EXP}?{FS}?	|
"."[0-9]+{EXP}?{FS}?		|
[0-9]+{EXP}{FS}?		|
[1-9][0-9]*{IS}?		|
0[0-7]*{IS}?			|
0[xX][0-9a-fA-F]+{IS}?		{ PRINTIT;	/* number: the first three patterns
						 * are the floating forms (digits
						 * with a point, leading point,
						 * digits with an exponent); the
						 * last three are decimal, octal
						 * (including plain 0) and hex
						 * integers with optional suffix;
						 * state is left untouched */ }

L?\'([^'\\\n]|\\(['"?\\abfnrtv]|[0-7]{1,3}|[xX][0-9a-fA-F]+))+\'	{
		/*
		 * character constant, with optional L prefix: one or more
		 * plain characters or backslash escapes between single
		 * quotes; an escape is a simple escape letter, one to three
		 * octal digits, or x followed by hex digits; a raw newline
		 * never matches
		 */
		PRINTIT;
	}

L?\"([^"\\\n]|\\(['"?\\abfnrtv\n]|[0-7]{1,3}|[xX][0-9a-fA-F]+))*\"	{
		/*
		 * string literal -- echo the token text, dropping every
		 * backslash-newline pair, then end the output line
		 */
		register char *cp = yytext;

		while (*cp != '\0') {
			if (cp[0] == '\\' && cp[1] == '\n') {
				cp += 2;	/* skip the pair */
				continue;
			}
			putchar(*cp);
			cp++;
		}
		putchar('\n');
	}

"#"	{
		/*
		 * number sign -- always printed; it moves state to SAWNUM
		 * only when state is still SAWNL
		 */
		PRINTIT;
		if (state == SAWNL)
			state = SAWNUM;
		PS;
	}
"<"[^>\n]*">"	{
		/*
		 * <name> is printed as a single token only while state is
		 * SAWINC; otherwise the match is rejected so that other
		 * rules get to handle the text
		 */
		PS;
		if (state == SAWINC) {
			PRINTIT;
			state = OTHER;
		} else {
			REJECT;
		}
	}
[-()&*+~!/%<>^|,.=;:{}?]	|
"["				|
"]"				|
"->"				|
"++"				|
"--"				|
"<<"				|
">>"				|
"<="				|
">="				|
"=="				|
"!="				|
"&&"				|
"||"				|
"##"				|
"..."				|
[-*/%+&^|]"="			|
"<<="				|
">>="				{ PRINTIT;	/* misc. tokens: single-character
						 * punctuators, the multi-character
						 * operators, the ## and ...
						 * tokens, and the compound
						 * assignments; each is printed
						 * as matched and state is left
						 * untouched */ }
"::"				{
					/*
					 * scope operator -- a token of its own
					 * unless cflag is set, in which case
					 * the match is rejected
					 */
					if (!cflag)
						PRINTIT;
					else {
						REJECT;
					}
				}

\n				{ state = SAWNL; PS;  printf("\\n\n"); /* newline: printed as backslash-n; puts state back to SAWNL */ }
[ \t\v\f]			printf("\\t\n");	/* one blank, tab, VT or FF: printed as backslash-t; state untouched */

"/*"	{
		/*
		 * Block comment.  The body is consumed by hand with input()
		 * and is not echoed: only the opening marker, a backslash-n
		 * for each of the first ten newlines (with an ellipsis after
		 * the tenth) and the closing marker are printed.
		 *
		 * End of input is recognized both as 0 and as EOF, since lex
		 * implementations differ in what input() returns there.  The
		 * previous character is remembered so that nothing has to be
		 * pushed back with unput().  Running out of input inside a
		 * comment is fatal and exits with a nonzero status.
		 */
		register int ch;
		register int prev = '\0';	/* character consumed before ch */
		register int nnl = 0;		/* newlines seen in this comment */

		printf("/* ");
		for (;;) {
			ch = input();
			if (ch == '\0' || ch == EOF) {
				fprintf(stderr, "unterminated comment!\n");
				exit(1);
			}
			if (prev == '*' && ch == '/')
				break;
			if (ch == '\n') {
				nnl++;
				if (nnl <= 10)
					printf("\\n");
				if (nnl == 10)
					printf("...");
			}
			prev = ch;
		}
		printf(" */\n");
	}

"//"	{
		/*
		 * C++ line comment.  With cflag set the match is rejected so
		 * that the other rules handle the slashes.  Otherwise only
		 * the marker is printed and the rest of the line is thrown
		 * away.
		 *
		 * End of input is recognized both as 0 and as EOF, since lex
		 * implementations differ in what input() returns there.
		 * Reaching it before the newline is fatal and exits with a
		 * nonzero status.
		 */
		register int ch;

		if (cflag) {
			REJECT;
		} else {
			printf("//\n");
			for (;;) {
				ch = input();
				if (ch == '\n')
					break;
				if (ch == '\0' || ch == EOF) {
					fprintf(stderr, "unterminated comment!\n");
					exit(1);
				}
			}
			/* give the newline back so the newline rule sees it */
			unput(ch);
		}
	}

.				printf("%c ???\n", yytext[0]);	/* last rule: any other single character is flagged with ??? */

%%

/*
 - main - parse arguments and handle options
 *
 * Options: -C sets cflag (C only, no C++ tokens); -d increments debug.
 * An unrecognized option prints the usage message and exits with status 2.
 *
 * With no file arguments, standard input is scanned.  Otherwise each
 * argument is scanned in turn; "-" means standard input.  A named file
 * that cannot be opened or fstat'ed, or that is a directory, is reported
 * through error().
 */
int
main(argc, argv)
int argc;
char *argv[];
{
	int c;
	int errflg = 0;
	FILE *in;
	struct stat statbuf;
	extern int optind;
	extern int getopt();
	extern FILE *efopen();
	void process();

	progname = mkprogname(argv[0]);

	/* getopt() reports the end of the options as -1, not as EOF. */
	while ((c = getopt(argc, argv, "dC")) != -1)
		switch (c) {
		case 'C':	/* C only, no C++. */
			cflag = 1;
			break;
		case 'd':	/* Debugging. */
			debug++;
			break;
		case '?':
		default:
			errflg++;
			break;
		}
	if (errflg) {
		fprintf(stderr, "usage: %s [-C] [file] ...\n", progname);
		exit(2);
	}

	if (optind >= argc)
		process(stdin, "stdin");
	else
		for (; optind < argc; optind++)
			if (STREQ(argv[optind], "-"))
				process(stdin, "-");
			else {
				in = efopen(argv[optind], "r");
				if (fstat(fileno(in), &statbuf) < 0)
					error("can't fstat `%s'", argv[optind]);
				/* fopen can succeed on a directory; refuse it here */
				if ((statbuf.st_mode & S_IFMT) == S_IFDIR)
					error("`%s' is directory!", argv[optind]);
				process(in, argv[optind]);
				(void) fclose(in);
			}
	exit(0);
	/* NOTREACHED */
}

/*
 * process - process input file
 */
void
process(in, inname)
FILE *in;
char *inname;
{
	yyin = in;
	(void) yylex();
}

/*
 - efopen - open a stream with fopen, reporting failure through error()
 */
FILE *
efopen(name, mode)
char *name;
char *mode;
{
	FILE *fp = fopen(name, mode);

	if (fp != NULL)
		return(fp);

	error("can't open `%s'", name);
	return(fp);
}

/*
 - error - print "progname: message" on stderr and exit with status 1
 *
 * s1 is used as a printf format and s2 as its single argument.
 */
void				/* does not return */
error(s1, s2)
char *s1;
char *s2;
{
	(void) fprintf(stderr, "%s: ", progname);
	(void) fprintf(stderr, s1, s2);
	(void) putc('\n', stderr);
	exit(1);
}
----------
-- 
Welcome to Mars!  Your         |     Henry Spencer at U of Toronto Zoology
passport and visa, comrade?    | uunet!attcan!utzoo!henry henry@zoo.toronto.edu