[comp.lang.c] Lex Description

gee@dretor.dciem.dnd.ca (Tom Gee see wdf) (11/08/88)

A wee bit back someone was asking if anyone had a LEX description
of C that they were willing to share.  I don't remember the sender's
name, and there were no posted responses.  So, if anyone does have
such a thing, I would like a copy too. 

Thanks.

________
"If you know what a bubble sort is, |  Thomas Gee
 wipe it from your mind"            |  Aerospace Group
        -- Numerical Methods in C.  |  DCIEM
                                    |  Department of National Defence
               {watmath,utzoo}!dciem!zorac!dretor!gee

henry@utzoo.uucp (Henry Spencer) (11/12/88)

In article <1182@dretor.dciem.dnd.ca> gee@dretor (Tom Gee) writes:
>A wee bit back someone was asking if anyone had a LEX description
>of C that they were willing to share...

I don't remember the original query.  I have a lex program which tokenizes
C, with a couple of minor reservations.  If enough people squeak, and if
nobody comes up with a better one, I'll post it.
-- 
Sendmail is a bug,             |     Henry Spencer at U of Toronto Zoology
not a feature.                 | uunet!attcan!utzoo!henry henry@zoo.toronto.edu

bdb@becker.UUCP (Bruce Becker) (11/15/88)

In article <1988Nov11.201700.25003@utzoo.uucp> henry@utzoo.uucp (Henry Spencer) writes:
>In article <1182@dretor.dciem.dnd.ca> gee@dretor (Tom Gee) writes:
>>A wee bit back someone was asking if anyone had a LEX description
>>of C that they were willing to share...
>
>I don't remember the original query.  I have a lex program which tokenizes
>C, with a couple of minor reservations.  If enough people squeak, and if
>nobody comes up with a better one, I'll post it.
>-- 
>Sendmail is a bug,             |     Henry Spencer at U of Toronto Zoology
>not a feature.                 | uunet!attcan!utzoo!henry henry@zoo.toronto.edu


 #####   #####  #     # #######    #    #    #
#     # #     # #     # #         # #   #   #
#       #     # #     # #        #   #  #  #
 #####  #     # #     # #####   #     # ###
      # #   # # #     # #       ####### #  #
#     # #    #  #     # #       #     # #   #
 #####   #### #  #####  ####### #     # #    #


I think posting said lex script would be a distinct service to your fans...

P.S. How about Zmailer instead of sendmail?

Cheers,
-- 
Bruce Becker        Toronto, Ont.
Internet: bdb@becker.UUCP, bruce@gpu.utcs.toronto.edu, becker@ziebmef.UUCP
BitNet:   BECKER@HUMBER.BITNET
"...Now getting it ready..." - Guy Grand, commenting on the upcoming elections

henry@utzoo.uucp (Henry Spencer) (11/18/88)

In article <1988Nov11.201700.25003@utzoo.uucp> I wrote:
>... I have a lex program which tokenizes
>C, with a couple of minor reservations.  If enough people squeak, and if
>nobody comes up with a better one, I'll post it.

Well, I haven't seen any others, and I have heard a certain amount of
squeaking, so here it is.  It's a bit crude in spots, and error recovery
is minimal -- I built it partly as an exercise, and partly for some
statistics gathering on existing legal programs, so it accepts *exactly*
legal draft-ANSI C and nothing else.  Although this was based on a slightly
old draft, I think the lexical structure of the language is stable enough
that this is still current, with one exception:  hexadecimal string escapes
are no longer limited to three digits.

(If you want to make it more robust, the first thing you're going to have
to do is make the string part less fussy.)

(Oh yes, it uses a couple of local functions you may not have:  error()
prints a message and exits, efopen() does fopen() and calls error() if
it failed.)

-----------------
%{
/*
 * ctokens - print tokens of a C or C++ program
 *
 * Full ANSI C (draft of 1 Oct 1986) except:  no trigraphs; copes with
 * backslash-newline stripping only inside strings; imperfect understanding
 * of the context-dependent rule that makes <bletch.h> a single token
 * inside a #include.
 *
 * Except for newlines, any white-space character is printed as "\t".
 * It would be more sensible to make the white-space expression [ \t\v\f]+
 * instead of just [ \t\v\f], but our old lex has problems with that.
 *
 * Note that this program uses one (sigh) undocumented feature of Unix lex:
 * the ability to override the choice of input stream by assigning to yyin.
 * Avoiding this requires reimplementing lex's input functions, which is a
 * pain because getc/ungetc isn't good enough.
 *
 * $Log$
 */

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>

/*
 * STREQ - string equality test.  Compares the first characters before
 * calling strcmp(); note that both arguments are evaluated more than once.
 */
#define	STREQ(a, b)	(*(a) == *(b) && strcmp((a), (b)) == 0)

/* RCS identification string; left out when running lint. */
#ifndef lint
static char RCSid[] = "$Header$";
#endif

int debug = 0;			/* incremented once per -d option in main() */
char *progname;			/* set in main() from argv[0]; used in the usage message */

/* error() is defined elsewhere; exit() is declared here K&R-style. */
extern void error(), exit();
/*
 * With UTZOOERR defined, mkprogname() is an external function; otherwise
 * it is an identity macro, so progname is simply argv[0].
 */
#ifdef UTZOOERR
extern char *mkprogname();
#else
#define	mkprogname(a)	(a)
#endif

/* Print the text of the current token on a line of its own. */
#define	PRINTIT	printf("%s\n", yytext)

int cflag = 0;			/* C only: set by -C; makes the "::" and "//" rules REJECT */

/*
 * stuff for stupid context-dependent #include <name>
 *
 * state tracks how far into a possible "# include" the scanner is:
 *	SAWNL	a newline was just scanned (also the initial value)
 *	SAWNUM	a "#" was scanned while in SAWNL
 *	SAWINC	the identifier "include" was scanned while in SAWNUM
 *	OTHER	some other identifier was scanned, or a header name was accepted
 * Only in SAWINC is "<...>" taken as a single token.
 */
#define	SAWNL	0
#define	SAWNUM	1
#define	SAWINC	2
#define	OTHER	3
int state = SAWNL;
/*
 * PS is a debugging hook invoked from the state-changing rules.  As
 * defined below it expands to nothing; the commented-out definition
 * prints the current state instead.
 */
/* #define	PS	printf("state %d\n", state) */
#define	PS	/* */
%}

	/*
	 * Named sub-patterns used by the number rules:
	 *	EXP	exponent part: e or E, optional sign, one or more digits
	 *	FS	floating suffix: one of f, l, F, L
	 *	IS	integer suffix: u/U optionally followed by l/L, or
	 *		l/L optionally followed by u/U
	 * (This comment is indented so that lex copies it through instead of
	 * trying to read it as a definition.)
	 */
EXP	([eE][+-]?[0-9]+)
FS	[flFL]
IS	([uU][lL]?|[lL][uU]?)

%%

[_a-zA-Z][_a-zA-Z0-9]*		{
					/*
					 * identifier (keywords included) -- print
					 * it, then note whether it was the "include"
					 * of a "# include"; any other identifier
					 * ends a possible directive prefix.
					 */
					PRINTIT;
					state = (state == SAWNUM &&
						 strcmp(yytext, "include") == 0)
							? SAWINC : OTHER;
					PS;
				}

[0-9]+"."[0-9]*{EXP}?{FS}?	|
"."[0-9]+{EXP}?{FS}?		|
[0-9]+{EXP}{FS}?		|
[1-9][0-9]*{IS}?		|
0[0-7]*{IS}?			|
0[xX][0-9a-fA-F]+{IS}?		{
		/*
		 * number -- the alternatives are, in order:
		 *   floating constant with digits before the point,
		 *   floating constant starting with the point,
		 *   floating constant with an exponent but no point,
		 *   decimal integer (nonzero leading digit),
		 *   octal integer (a lone 0 matches here),
		 *   hexadecimal integer.
		 * Floating forms take an optional FS suffix, integer forms
		 * an optional IS suffix.  The token is printed unchanged.
		 */
		PRINTIT;
	}

\'([^'\\\n]|\\(['"?\\abfnrtv]|[0-7]{1,3}|[xX][0-9a-fA-F]+))+\'	{
		/*
		 * character constant -- one or more characters between
		 * single quotes.  Each is either an ordinary character
		 * (anything but quote, backslash or newline) or an escape:
		 * a simple escape, one to three octal digits, or x followed
		 * by hexadecimal digits.  The hexadecimal escape takes any
		 * number of digits, as ANSI C specifies; it was formerly
		 * capped at three here, following an older draft.
		 */
		PRINTIT;
	}

\"([^"\\\n]|\\(['"?\\abfnrtv\n]|[0-7]{1,3}|[xX][0-9a-fA-F]+))*\"	{
		/*
		 * string -- remove backslashed newlines
		 *
		 * The pattern accepts the same escapes as a character
		 * constant plus backslash-newline (line continuation).
		 * Hexadecimal escapes take any number of digits, as ANSI C
		 * specifies; they were formerly capped at three here.
		 * The token is printed with every backslash-newline pair
		 * dropped, followed by a newline.
		 */
		register char *p;

		for (p = yytext; *p != '\0'; p++) {
			if (*p == '\\' && *(p+1) == '\n')
				p++;		/* skip both characters of the pair */
			else
				putchar(*p);
		}
		putchar('\n');
	}

"#"	{
		/*
		 * "#" -- print it; when it is the first token scanned
		 * since a newline (white space does not change state),
		 * remember that a directive may be starting.
		 */
		PRINTIT;
		if (state == SAWNL) {
			state = SAWNUM;
		}
		PS;
	}
"<"[^>\n]*">"	{
		/*
		 * Header name in angle brackets -- a single token only
		 * right after "# include" (state SAWINC).  In any other
		 * state the match is handed back with REJECT so that "<"
		 * is scanned by the operator rules instead.
		 */
		PS;
		if (state != SAWINC) {
			REJECT;
		} else
			PRINTIT;
		/*
		 * NOTE(review): REJECT transfers control to the next-best
		 * rule in the usual lex implementations, so this assignment
		 * should only be reached on the accepted path -- confirm
		 * for the lex in use.
		 */
		state = OTHER;
	}
[-()&*+~!/%<>^|,.=;:{}?]	|
"["				|
"]"				|
"->"				|
"++"				|
"--"				|
"<<"				|
">>"				|
"<="				|
">="				|
"=="				|
"!="				|
"&&"				|
"||"				|
"##"				|
"..."				|
[-*/%+&^|]"="			|
"<<="				|
">>="				{
		/*
		 * misc. tokens -- single-character operators and
		 * punctuators, the multi-character operators, "##",
		 * the ellipsis, and the compound assignment operators.
		 * Each is printed unchanged on its own line; none of
		 * these rules touches the #include state.
		 */
		PRINTIT;
	}
"::"				{
					/*
					 * C++ scope operator -- a token of its
					 * own unless -C (C only) was given, in
					 * which case the match is rejected and
					 * the text is rescanned by other rules.
					 */
					if (!cflag) {
						PRINTIT;
					} else {
						REJECT;
					}
				}

\n				{
					/*
					 * newline -- a new line begins, so a
					 * following "#" may start a directive;
					 * printed as a backslash followed by n.
					 */
					state = SAWNL; PS;  printf("\\n\n");
				}
[ \t\v\f]			printf("\\t\n");	/* any other single white-space character prints as backslash-t */

"/*"	{
		/*
		 * Block comment -- consume everything up to the closing
		 * star-slash and print an abbreviated form: the opening
		 * delimiter, a backslash-n marker for each of the first ten
		 * newlines (followed by "..." once the tenth is reached),
		 * and the closing delimiter.
		 *
		 * End of input is recognised both as 0 (what classic lex
		 * and recent flex return from input()) and as EOF (what
		 * older flex returns), so an unterminated comment cannot
		 * loop forever, and it is reported with a failure status.
		 */
		register int ch;
		register int nnl = 0;

		printf("/* ");
		for (;;) {
			ch = input();
			if (ch == '*') {
				ch = input();
				if (ch == '/')
					break;
				if (ch != '\0' && ch != EOF) {
					/* not the terminator: look at it again */
					unput(ch);
					continue;
				}
				/* end of input right after the star: fall through */
			}
			if (ch == '\n') {
				nnl++;
				if (nnl <= 10)
					printf("\\n");
				if (nnl == 10)
					printf("...");
			} else if (ch == '\0' || ch == EOF) {
				fprintf(stderr, "unterminated comment!\n");
				exit(1);
			}
		}
		printf(" */\n");
	}

"//"	{
		/*
		 * C++ line comment -- rejected (rescanned by other rules)
		 * when -C was given.  Otherwise print just the introducer
		 * and discard the text up to, but not including, the
		 * newline, which is pushed back for the newline rule.
		 *
		 * End of input is recognised both as 0 and as EOF, since
		 * lex implementations differ in what input() returns, and
		 * a comment cut off by end of input is reported with a
		 * failure status.
		 */
		register int ch;

		if (cflag) {
			REJECT;
		} else {
			printf("//\n");
			while ((ch = input()) != '\n') {
				if (ch == '\0' || ch == EOF) {
					fprintf(stderr, "unterminated comment!\n");
					exit(1);
				}
			}
			unput(ch);
		}
	}

.				printf("%c ???\n", yytext[0]);	/* any character no other rule matched: print it flagged with ??? */

%%

/*
 - main - parse arguments and handle options
 *
 * Options:
 *	-C	C only: sets cflag, so "::" and "//" are not treated as
 *		C++ tokens
 *	-d	debugging: increments the debug counter
 * The remaining arguments are input files, tokenized in order; "-" names
 * standard input, and with no file arguments standard input is read.
 * A directory given as input is refused.  Exits with status 2 on a usage
 * error and 0 after all inputs have been processed.  efopen() and error()
 * are local library functions defined elsewhere (per the accompanying
 * note, error() prints a message and exits, and efopen() calls it when
 * the open fails).
 */
int
main(argc, argv)
int argc;
char *argv[];
{
	int c;
	int errflg = 0;
	FILE *in;
	struct stat statbuf;
	extern int optind;
	extern char *optarg;
	extern FILE *efopen();
	void process();

	progname = mkprogname(argv[0]);

	while ((c = getopt(argc, argv, "dC")) != EOF) {
		switch (c) {
		case 'C':	/* C only, no C++. */
			cflag = 1;
			break;
		case 'd':	/* Debugging. */
			debug++;
			break;
		case '?':
		default:
			errflg++;
			break;
		}
	}
	if (errflg) {
		/* list every option getopt() accepts, including -d */
		fprintf(stderr, "usage: %s [-dC] [file] ...\n", progname);
		exit(2);
	}

	if (optind >= argc) {
		process(stdin, "stdin");
	} else {
		for (; optind < argc; optind++) {
			if (STREQ(argv[optind], "-")) {
				process(stdin, "-");
			} else {
				in = efopen(argv[optind], "r");
				if (fstat(fileno(in), &statbuf) < 0)
					error("can't fstat `%s'", argv[optind]);
				if ((statbuf.st_mode & S_IFMT) == S_IFDIR)
					error("`%s' is directory!", argv[optind]);
				process(in, argv[optind]);
				(void) fclose(in);
			}
		}
	}
	exit(0);
}

/*
 - process - process input file
 *
 * Points the scanner at the stream "in" and runs yylex() over it; the
 * token listing goes to standard output from the rule actions.
 * "inname" is the name the input was given under; it is accepted for
 * the caller's convenience but not used here.
 */
void
process(in, inname)
FILE *in;
char *inname;
{
	/*
	 * Every file starts at the beginning of a line.  Without this
	 * reset, a previous file that did not end in a newline would
	 * leave a stale state behind, and "#include <name>" on the first
	 * line of this file would not be recognised as a directive.
	 */
	state = SAWNL;
	yyin = in;
	(void) yylex();
}
-----------------
-- 
Sendmail is a bug,             |     Henry Spencer at U of Toronto Zoology
not a feature.                 | uunet!attcan!utzoo!henry henry@zoo.toronto.edu