[comp.lang.c] yaccable grammars for C and C++

news@awdprime.UUCP (USENET News) (03/23/90)
In article <1435@io.UUCP> jar@io.UUCP (Jim Roskind x5570) writes:
Thanks for the grammars.  Some people requested a lexer.  Here is mine,
which I quickly hacked up.  I'm sure that someone can use this to build a
complete one for your grammars.

Things to fix:
 o  It doesn't return the same token values as Jim's grammars (easy fix).
 o  You'll need to fix it to return different values for identifiers,
    enums, and typedefs (see comment at {ident}).
 o  It doesn't support floating point yet either (easy fix, I have {float}
    and {exp} defined but not used or tested).
 o  You'll want to handle cases like "test \" foobar" in the
    string handling section (look for STRING and CHAR_CONST).
    I couldn't figure out how to do it in 30 seconds or less so I'll
    leave it up to you.

With this I was able to parse a fairly large C program from which I had
removed all the typedefs (no enums), using the grammar right out of the back
of _The C Programming Language_, Second Edition.

ENJOY!!!!!   --- don't forget to remove the .sig at the bottom :-)
-------------- clexer.l for ANSI C ------------------
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "y.tab.h"

/*
 * Helpers defined in the user-code section at the bottom of this file.
 * They are declared here because the rule actions call them before their
 * definitions are seen.
 */
int counter(int notwhite);
int skipcomments(void);

/*
 * Return a freshly malloc'ed copy of the NUL-terminated string s.
 * The caller owns the copy.  Running out of memory is treated as fatal:
 * a message is written to stderr and the process exits, so callers never
 * receive NULL.
 */
static char *
lex_strdup(const char *s)
{
    size_t len = strlen(s) + 1;
    char *copy = malloc(len);

    if (copy == NULL) {
	fprintf(stderr, "clexer: out of memory\n");
	exit(1);
    }
    memcpy(copy, s, len);
    return copy;
}

/* Duplicate a string; X is evaluated exactly once. */
#define STRDUP(X) lex_strdup(X)

/* Referenced by yyerror() when reporting the offending token number. */
extern int yychar;

/* Current position in the input, maintained by counter()/skipcomments(). */
static int column = 0;
static int linenum = 1;

/* Rule actions call count(); it simply forwards to counter(). */
#define count(x) counter(x)

/* yydebug is defined here: 1 when YYDEBUG is defined, 0 otherwise. */
#ifndef YYDEBUG
int     yydebug = 0;
#else
int     yydebug = 1;
#endif
%}
	/* Named sub-patterns used by the rules section below. */
alpha   [a-zA-Z]
digit   [0-9]
special [\_]
	/* An identifier: a letter or underscore, then letters, digits or underscores. */
ident   (({alpha}|{special})({alpha}|{digit}|{special})*)
	/* Decimal digits only; the {int} rule converts the text with atoi(). */
int     ({digit}+)
	/* exp and float are defined but not referenced by any rule in this file. */
exp     ([Ee][-+]?{digit}+)
float   ([-+]?{digit}+\.?{digit}*)
	/* %p resizes one of lex's internal tables (positions) -- presumably needed for the number of rules; confirm for your lex. */
%p 3000
%%
^\#.*		{ count(0); /* skip cpp lines */ }
[\ \n\t\v\f]+	{ count(0); /* skip white space */ }
"/*"		{ count(1); skipcomments(); }
"..."		{ count(1); return DOTDOTDOT; }
">="		{ count(1); return GE; }
"<="		{ count(1); return LE; }
"!="		{ count(1); return NOTEQU; }
"=="		{ count(1); return EQU; }

"*="		{ count(1); return MULTEQU; }
"/="		{ count(1); return DIVEQU; }
"%="		{ count(1); return MODEQU; }
"+="		{ count(1); return INCEQU; }
"-="		{ count(1); return DECEQU; }
"<<="		{ count(1); return SHIFTLEFTEQU; }
">>="		{ count(1); return SHIFTRIGHTEQU; }
"&="		{ count(1); return ANDEQU; }
"|="		{ count(1); return OREQU; }
"^="		{ count(1); return XOREQU; }

"<<"		{ count(1); return SHIFTLEFT; }
">>"		{ count(1); return SHIFTRIGHT; }
"++"		{ count(1); return INC; }
"--"		{ count(1); return DEC; }
"->"		{ count(1); return POINTS; }
"&&"		{ count(1); return LOGICALAND; }
"||"		{ count(1); return LOGICALOR; }

"("		{ count(1); return '('; }
","		{ count(1); return ','; }
")"		{ count(1); return ')'; }
";"		{ count(1); return ';'; }
"{"		{ count(1); return '{'; }
"}"		{ count(1); return '}'; }
"["		{ count(1); return '['; }
"]"		{ count(1); return ']'; }
"*"		{ count(1); return '*'; }
"/"		{ count(1); return '/'; }
"+"		{ count(1); return '+'; }
"-"		{ count(1); return '-'; }
"%"		{ count(1); return '%'; }
"^"		{ count(1); return '^'; }
"&"		{ count(1); return '&'; }
"?"		{ count(1); return '?'; }
":"		{ count(1); return ':'; }
"!"		{ count(1); return '!'; }
"."		{ count(1); return '.'; }
"~"		{ count(1); return '~'; }
"<"		{ count(1); return '<'; }
">"		{ count(1); return '>'; }
"."		{ count(1); return '.'; }
"="		{ count(1); return '='; }

if		{ count(1); return IF; }
else		{ count(1); return ELSE; }
while		{ count(1); return WHILE; }
do		{ count(1); return DO; }
for		{ count(1); return FOR; }
switch		{ count(1); return SWITCH; }
case		{ count(1); return CASE; }
default		{ count(1); return DEFAULT; }
goto		{ count(1); return GOTO; }
continue	{ count(1); return CONTINUE; }
break		{ count(1); return BREAK; }
return		{ count(1); return RETURN; }
sizeof		{ count(1); return SIZEOF; }
auto		{ count(1); return AUTO; }
register	{ count(1); return REGISTER; }
static		{ count(1); return STATIC; }
extern		{ count(1); return EXTERN; }
typedef		{ count(1); return TYPEDEF; }
void		{ count(1); return VOID; }
char		{ count(1); return CHAR; }
short		{ count(1); return SHORT; }
int		{ count(1); return INT; }
long		{ count(1); return LONG; }
float		{ count(1); return FLOAT; }
double		{ count(1); return DOUBLE; }
unsigned	{ count(1); return UNSIGNED; }
enum		{ count(1); return ENUM; }
const		{ count(1); return CONST; }
volatile	{ count(1); return VOLATILE; }
struct		{ count(1); return STRUCT; }
union		{ count(1); return UNION; }
\'.*\' {
	count(1);
	yylval.strval = STRDUP(yytext+1);
	yylval.strval[strlen(yylval.strval)-1] = 0;
	/* return STRING without quotes */
	return CHAR_CONST;
}
{int} {
	count(1);
	yylval.intval = atoi(yytext);
	return INTEGER_CONST;
}
\".*\" {
	count(1);
	yylval.strval = STRDUP(yytext+1);
	yylval.strval[strlen(yylval.strval)-1] = 0;
	/* return STRING without quotes */
	return STRING;
}
{ident} {
	count(1);
	yylval.strval = STRDUP(yytext);
	/*
	 * we need to hash this and make a structure that tells us
	 * what is going on so we know what to do with it
	 * is it an enum, typedef, or identifier
	 *
	 * BTW: you'll have to cooperate with the grammer to do this.
	 */
	return IDENTIFIER;
}
.			{ count(1); return ERROR; }
%%
/*
 * Called by the scanner when it reaches end of input.  Always returns 1,
 * telling the scanner there is no further input to switch to.
 */
int yywrap(void)
{
    return 1;
}

/*
 * Skip over a comment.  Called after the opening slash-star has already
 * been matched; consumes input up to and including the closing star-slash,
 * updating linenum and column for every character read (tabs advance to
 * the next multiple of 8).
 *
 * If the input ends before the comment is closed, a diagnostic is written
 * to stderr and the function returns instead of looping forever.
 *
 * Always returns 0; callers ignore the value.
 */
int skipcomments(void)
{
    int c;		/* int, not char, so the end-of-input value survives */
    int prev = 0;	/* previous character; 0 means "none yet" */

    for (;;) {
	c = input();
	/* lex's input() yields 0 at end of input; some flex versions yield EOF */
	if (c == 0 || c == EOF) {
	    fprintf (stderr, "YYERROR: unterminated comment: line %d col %d\n",
		     linenum, column);
	    return 0;
	}
	if (c == '\n') {
	    column = 0;
	    linenum++;
	}
	else if (c == '\t')
	    column += 8 - (column % 8);
	else
	    column++;
	/* prev starts at 0, so the star of the opener cannot close the comment */
	if (prev == '*' && c == '/') {
#ifdef LEXDEBUG
    printf ("symbol found: %s\n", "*/");
#endif
	    return 0;
	}
	prev = c;
    }
}

/*
 * Advance the line/column counters over the text of the token just matched
 * (yytext).  A newline resets column to 0 and bumps linenum; a tab moves
 * column to the next multiple of 8; any other character adds 1.
 *
 * notwhite is only consulted when LEXDEBUG is defined: if non-zero, the
 * matched text is echoed to stdout.
 *
 * Always returns 0; callers ignore the value.
 */
/*ARGSUSED*/
int counter (int notwhite)
{
    const char *p;

#ifdef LEXDEBUG
    if (notwhite)
	printf ("symbol found: %s\n", yytext);
#else
    (void) notwhite;	/* unused without LEXDEBUG */
#endif
    for (p = yytext; *p != '\0'; p++) {
	if (*p == '\n') {
	    column = 0;
	    linenum++;
	}
	else if (*p == '\t')
	    column += 8 - (column % 8);
	else
	    column++;
    }
    return 0;
}

/*
 * Error reporter.  Writes two lines to stderr: the message s together
 * with the current line and column, then the text of the current token
 * (yytext) and the token number (yychar).
 *
 * Always returns 0.
 */
int yyerror (char *s)
{
    fprintf (stderr, "YYERROR: %s: line %d col %d\n", s, linenum, column);
    fprintf (stderr, "YYERROR: yytext=`%s' symbol was (%d)\n", yytext, yychar);
    return 0;
}

-- sanders
For every message of the day, a new improved message will arise to overcome it.
Reply-To: cs.utexas.edu!ibmaus!auschs!sanders.austin.ibm.com!sanders     (ugh!)