[comp.sources.misc] REPOST: v19i090: wacco - A C++ LL parser generator, Part03/06

parag@hpsdeb.sde.hp.com (Parag Patel) (05/19/91)
Submitted-by: Parag Patel <parag@hpsdeb.sde.hp.com>
Posting-number: Volume 19, Issue 90
Archive-name: wacco/part03

#!/bin/sh
# do not concatenate these parts, unpack them in order with /bin/sh
# file table.h continued
#
if test ! -r _shar_seq_.tmp; then
	echo 'Please unpack part 1 first!'
	exit 1
fi
(read Scheck
 if test "$Scheck" != 3; then
	echo Please unpack part "$Scheck" next!
	exit 1
 else
	exit 0
 fi
) < _shar_seq_.tmp || exit 1
if test ! -f _shar_wnt_.tmp; then
	echo 'x - still skipping table.h'
else
echo 'x - continuing file table.h'
sed 's/^X//' << 'SHAR_EOF' >> 'table.h' &&
{ \
X	this->transfer(NULL); \
} \
inline table &iterator::operator=(table &t) \
{ \
X	this->transfer(&t); \
X	return t; \
} \
inline boolean iterator::operator==(iterator &i) \
{ \
X	return (curr == i.curr); \
} \
inline boolean iterator::operator!=(iterator &i) \
{ \
X	return (curr != i.curr); \
} \
inline iterator::operator boolean() \
{ \
X	return (curr != NULL); \
} \
inline symbol &iterator::get() \
{ \
X	if (curr != NULL) \
X		return curr->sym; \
X	else \
X		return *(symbol *)NULL; \
} \
inline symbol &iterator::operator()() { return get(); } \
inline symbol &iterator::operator*() { return get(); } \
inline symbol *iterator::operator->() { return &get(); } \
X
X
X
#define implement_table(table,iterator,symbol,key,hash_func) \
table::table(unsigned hs, TABLE_BUMP *bump_function) \
{ \
X	hash_size = 0; \
X	numnode = 0; \
X	vec = NULL; \
X	curr = NULL; \
X	bump = (bump_function == NULL) ? &table_bump : bump_function; \
X	resize(hs); \
} \
table::table(table& h) \
{ \
X	hash_size = 0; \
X	numnode = 0; \
X	vec = NULL; \
X	curr = NULL; \
X	bump = h.bump; \
X	resize(h.hash_size); \
X	*this = h; \
} \
table::~table() \
{ \
X	clear(); \
X	delete vec; \
} \
boolean table::find(key k) \
{ \
X	register table_node(table) *p = vec[hash_func(k) % hash_size]; \
X	for (; p != NULL; p = p->next) \
X		if (p->sym == k) \
X		{ \
X			curr = p; \
X			return TRUE; \
X		} \
X	curr = NULL; \
X	return FALSE; \
} \
boolean table::insert(key k) \
{ \
X	if (numnode >= threshold) \
X		resize(); \
X	register table_node(table) **q = vec + hash_func(k) % hash_size; \
X	register table_node(table) *p = *q; \
X	for (; p != NULL; p = p->next) \
X		if (p->sym == k) \
X		{ \
X			curr = p; \
X			return FALSE; \
X		} \
X	curr = *q = new table_node(table)(k, *q); \
X	numnode++; \
X	return TRUE; \
} \
boolean table::insertsym(symbol &s) \
{ \
X	if (numnode >= threshold) \
X		resize(); \
X	register table_node(table) **q = vec + \
X		hash_func((table_key(table))s) % hash_size; \
X	register table_node(table) *p = *q; \
X	for (; p != NULL; p = p->next) \
X		if (p->sym == (table_key(table))s) \
X		{ \
X			curr = p; \
X			return FALSE; \
X		} \
X	curr = *q = new table_node(table)(s, *q); \
X	numnode++; \
X	return TRUE; \
} \
boolean table::remove(key k) \
{ \
X	register table_node(table) **p = vec + hash_func(k) % hash_size; \
X	curr = NULL; \
X	for (; (*p) != NULL; p = &((**p).next)) \
X		if ((**p).sym == k) \
X		{ \
X			delnode(p); \
X			return TRUE; \
X		} \
X	return FALSE; \
} \
void table::delnode(table_node(table) **p) \
{ \
X	table_node(table) *next = (**p).next; \
X	delete *p; \
X	*p = next; \
X	--numnode; \
} \
void table::resize(unsigned hs) \
{ \
X	register table_node(table) **pos, **endpos; \
X	table_node(table) *chain = NULL; \
X	unsigned new_hash_size = (hs == 0) ? bump(hash_size) : hs; \
X	if (new_hash_size == 0) \
X		new_hash_size = 1; \
X	if (new_hash_size <= hash_size) \
X		return; \
X	if (vec != NULL) \
X	{ \
X		pos = vec; \
X		endpos = pos + hash_size; \
X		for (; pos < endpos; pos++) \
X		{ \
X			table_node(table) *p = *pos; \
X			if (p != NULL) \
X			{ \
X				while (p->next != NULL) \
X					p = p->next; \
X				p->next = chain; \
X				chain = *pos; \
X			} \
X		} \
X	} \
X	hash_size = new_hash_size; \
X	threshold = hash_size + (hash_size >> 2); \
X	delete vec; \
X	pos = vec = new table_node(table) *[hash_size]; \
X	endpos = pos + hash_size; \
X	while (pos < endpos) \
X		*(pos++) = NULL; \
X	while (chain != NULL) \
X	{ \
X		table_node(table) *entry = chain; \
X		chain = entry->next; \
X		pos = vec + \
X				hash_func((table_key(table))entry->sym) \
X				% hash_size; \
X		entry->next = *pos; \
X		*pos = entry; \
X	} \
} \
table& table::clear() \
{ \
X	for (int i = 0; i < hash_size; i++) \
X	{ \
X		table_node(table) *p = vec[i]; \
X		while (p != NULL) \
X		{ \
X			table_node(table) *next = p->next; \
X			delete p; \
X			p = next; \
X		} \
X		vec[i] = NULL; \
X	} \
X	numnode = 0; \
X	curr = NULL; \
X	return *this; \
} \
table& table::operator=(table& h) \
{ \
X	if (this == &h) \
X	    return *this; \
X	clear(); \
X	for (iterator i = h; i; i++) \
X		insertsym(i()); \
X	return *this; \
} \
boolean table::operator==(table& h) \
{ \
X	if (this == &h) \
X	    return TRUE; \
X	if (size() != h.size()) \
X	    return FALSE; \
X	for (iterator i = h; i; i++) \
X	    if (!find((table_key(table))i())) \
X		return FALSE; \
X	return TRUE; \
} \
table& table::operator+=(table& h) \
{ \
X	if (this == &h) \
X	    return *this; \
X	for (iterator i = h; i; i++) \
X		insert((table_key(table))i()); \
X	return *this; \
} \
table& table::operator-=(table& h) \
{ \
X	if (this == &h) \
X	    return clear(); \
X	for (iterator i = h; i; i++) \
X		remove((table_key(table))i()); \
X	return *this; \
} \
table& table::operator&=(table& h) \
{ \
X	if (this == &h) \
X	    return *this; \
X	for (iterator i = *this; i; ) \
X	    if (h.find((table_key(table))i())) \
X		i++; \
X	    else \
X		i.remove(); \
X	return *this; \
} \
symbol &iterator::operator++() \
{ \
X	if (curr != NULL) \
X	{ \
X		table_node(table) *prev = curr; \
X		for (curr = curr->next; \
X				curr == NULL && ++index < parent->hash_size; \
X				curr = parent->vec[index]) \
X			; \
X		return prev->sym; \
X	} \
X	else \
X	{ \
X		return *(symbol *)NULL; \
X	} \
} \
void iterator::remove() \
{ \
X	if (curr != NULL) \
X	{ \
X		register table_node(table) **p = parent->vec + index; \
X		for (; (*p) != curr; p = &((**p).next)) \
X			; \
X		parent->delnode(p); \
X	} \
} \
void iterator::transfer(table *newparent) \
{ \
X	if ((parent = newparent) != NULL) \
X	{ \
X		for (index = 0, curr = parent->vec[0]; \
X				curr == NULL && ++index < parent->hash_size; \
X				curr = parent->vec[index]) \
X			; \
X	} \
X	else \
X		curr = NULL; \
}
SHAR_EOF
echo 'File table.h is complete' &&
chmod 0444 table.h ||
echo 'restore of table.h failed'
Wc_c="`wc -c < 'table.h'`"
test 8290 -eq "$Wc_c" ||
	echo 'table.h: original size 8290, current size' "$Wc_c"
rm -f _shar_wnt_.tmp
fi
# ============= tgram.bad ==============
if test -f 'tgram.bad' -a X"$1" != X"-c"; then
	echo 'x - skipping tgram.bad (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting tgram.bad (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'tgram.bad' &&
program test(input);
X
var
X	x a: integer;
X	ara: array [1...10] of integer;
X
procedure try(a:integer):result integer;
X	u,v:integer;
X	begin
if not not a try:=u+ else try:=a;
X	end
X	end
X
begin
X	while not x a do
X		x = a-1;
X		a2:=5**a;
X		x:= ara[3]*(ara[x+x)
X	end
end
SHAR_EOF
chmod 0444 tgram.bad ||
echo 'restore of tgram.bad failed'
Wc_c="`wc -c < 'tgram.bad'`"
test 261 -eq "$Wc_c" ||
	echo 'tgram.bad: original size 261, current size' "$Wc_c"
rm -f _shar_wnt_.tmp
fi
# ============= tgram.good ==============
if test -f 'tgram.good' -a X"$1" != X"-c"; then
	echo 'x - skipping tgram.good (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting tgram.good (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'tgram.good' &&
program test(input);
X
var
X	x, a: integer;
X	ara: array [1..10] of integer;
X
procedure try(a:integer);
X	var u,v:integer;
X	begin
X		if not a then try:=u+3 else try:=a;
X	end;
X
begin
X	while a do
X	begin
X		x := a-1;
X		a:=5*a;
X	end;
end.
SHAR_EOF
chmod 0444 tgram.good ||
echo 'restore of tgram.good failed'
Wc_c="`wc -c < 'tgram.good'`"
test 229 -eq "$Wc_c" ||
	echo 'tgram.good: original size 229, current size' "$Wc_c"
rm -f _shar_wnt_.tmp
fi
# ============= tgram.w ==============
if test -f 'tgram.w' -a X"$1" != X"-c"; then
	echo 'x - skipping tgram.w (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting tgram.w (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'tgram.w' &&
// Copyright (c) 1991 by Parag Patel.  All Rights Reserved.
// $Header: tgram.w,v 1.5 91/02/22 16:05:33 hmgr Exp $
X
{
#include <stdio.h>
}
X
program
X	: "program" ID '(' idlist ')' semi
X			declarations subprogdecls compoundstat '.'
X	;
X
semi
X	: ';'[semi+]
X	;
X
idlist
X	: ID (',' ID | [])
X	;
X
declarations
X	: "var" (decllist | semi)
X	| []
X	;
X
decllist
X	: idlist ':' type semi (decllist | [])
X	;
X
type
X	: "integer"
X	| "array" '[' INT ".." INT ']' "of" "integer"
X	;
X
subprogdecls
X	: subproghead declarations subprogdecls
X			compoundstat semi subprogdecls
X	| []
X	;
X
subproghead
X	: "function" ID arguments ':' "integer" semi
X	| "procedure" ID arguments semi
X	;
X
arguments
X	: '(' paramlist ')'
X	| []
X	;
X
paramlist
X	: idlist ':' type (semi paramlist | [])
X	;
X
compoundstat
X	: "begin" statlist "end"[+semi+]
X	;
X
statlist
X	: statement semi statlist
X	| []
X	;
X
statement
X	: ID assignment
X	| compoundstat
X	| "if" expression "then" statement ("else" statement | [])
X	| "while" expression "do" statement
X	| "for" varref ":=" expression ("to" | "downto")
X			expression "do" statement
X	;
X
assignment
X	: '[' expression ']' ":=" expression
X	| '(' exprlist ')'
X	| ":=" expression
X	| []
X	;
X
varref
X	: ID ('[' expression ']' | [])
X	;
X
exprlist
X	: expression (',' exprlist | [])
X	;
X
expression
X	: term (('+'|'-') expression | [])
X	;
X
term
X	: expr (('*'|'/') term | [])
X	;
X
expr
X	: ID ('(' exprlist ')' | '[' expression ']' [')'] | [])
X	| INT
X	| '(' expression ')'
X	| "not" expression
X	| ('+'|'-') expression
X	;
X
{
X    int main()
X    {
X	    w_setfile(stdin);
X
X	    if (!program())
X		    return 1;
X
X	    return 0;
X    }
}
X
$$
X
D	[0-9]
L	[A-Za-z_$]
X
%%
X
$ID		{L}({L}|{D})*
$INT	{D}+
X
[ \t\v\n\f]	;
.			{ w_scanerr("Illegal character %d (%c)", yytext[0], yytext[0]); }
SHAR_EOF
chmod 0444 tgram.w ||
echo 'restore of tgram.w failed'
Wc_c="`wc -c < 'tgram.w'`"
test 1739 -eq "$Wc_c" ||
	echo 'tgram.w: original size 1739, current size' "$Wc_c"
rm -f _shar_wnt_.tmp
fi
# ============= toks.h ==============
if test -f 'toks.h' -a X"$1" != X"-c"; then
	echo 'x - skipping toks.h (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting toks.h (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'toks.h' &&
// Copyright (c) 1991 by Parag Patel.  All Rights Reserved.
// $Header: toks.h,v 1.9 91/02/22 16:04:47 hmgr Exp $
X
#ifndef _T_O_K_E_N_S_
#define _T_O_K_E_N_S_
X
#ifndef NULL
#include <stdio.h>
#endif
X
#define RETOK 1
#define RETERR 0
X
enum tokenids
{
X	EOI = 0,
X	_EMPTY = 256, /* nothing uses this yet */
X	DIRECTIVE = 257,
X	ID,
X	EXPORT,
X	NULLSYM,
X	STRING,
X	CHARACTER,
X	INT,
X	BLOCKCODE,
};
X
#ifdef __cplusplus
extern "C" {
extern int yylex();
extern int yylook();
extern int w_input();
extern int w_unput(int);
extern void w_output(int);
}
#else
extern int yylex();
extern int yylook();
extern int w_input();
extern int w_unput();
extern void w_output();
#endif
X
extern int w_numerrors;
#ifdef __cplusplus
extern "C" {
extern int w_scanerr(const char *, ...);
extern void w_flusherrs();
extern void w_closefile();
extern int w_openfile(char *);
extern FILE *w_getfile();
extern void w_setfile(FILE *);
extern int w_currcol();
extern int w_currline();
extern char *w_getcurrline();
extern int w_gettoken();
extern int w_nexttoken();
extern void w_skiptoken();
extern char *w_tokenname(int);
}
#else
extern int w_scanerr();
extern void w_flusherrs();
extern void w_closefile();
extern int w_openfile();
extern FILE *w_getfile();
extern void w_setfile();
extern int w_currcol();
extern int w_currline();
extern char *w_getcurrline();
extern int w_gettoken();
extern int w_nexttoken();
extern void w_skiptoken();
extern char *w_tokenname();
#endif
X
#ifdef __cplusplus
extern "C" { int program(); }
#else
extern int program();
#endif
X
X
#endif /* _T_O_K_E_N_S_ */
SHAR_EOF
chmod 0444 toks.h ||
echo 'restore of toks.h failed'
Wc_c="`wc -c < 'toks.h'`"
test 1555 -eq "$Wc_c" ||
	echo 'toks.h: original size 1555, current size' "$Wc_c"
rm -f _shar_wnt_.tmp
fi
# ============= version.C ==============
if test -f 'version.C' -a X"$1" != X"-c"; then
	echo 'x - skipping version.C (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting version.C (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'version.C' &&
// Copyright (c) 1991 by Parag Patel.  All Rights Reserved.
X
static char *const whatid = "@(#)wacco 1.1 (17 May 1991)";
SHAR_EOF
chmod 0444 version.C ||
echo 'restore of version.C failed'
Wc_c="`wc -c < 'version.C'`"
test 120 -eq "$Wc_c" ||
	echo 'version.C: original size 120, current size' "$Wc_c"
rm -f _shar_wnt_.tmp
fi
# ============= wacco.1 ==============
if test -f 'wacco.1' -a X"$1" != X"-c"; then
	echo 'x - skipping wacco.1 (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting wacco.1 (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'wacco.1' &&
.\" Copyright (c) 1991 by Parag Patel.  All Rights Reserved.
.\" $Header: wacco.1,v 1.13 91/02/22 16:04:11 hmgr Exp $
.TH WACCO 1 unsupported
.ad b
.SH NAME
wacco \- why another compiler-compiler?
.SH SYNOPSIS
.B wacco
.RB [ -dciOCL ]
.RB [ -h
header]
.RB [ -p
parser]
.RB [ -s
scanner]
[file]
.SH DESCRIPTION
.I Wacco
is another compiler-compiler.
(Why another compiler-compiler you may ask?  Why not!)
It has some rather convenient features with a lot of
syntactic sugar tossed on top over what
.IR yacc (1)
provides.
.PP
Unlike
.IR yacc (1),
.I wacco
generates a top-down recursive-descent LL(1) parser instead
of a bottom-up LALR parser.
Although
.I wacco
generated parsers handle a smaller class of grammars than
.IR yacc (1),
in practice, there is rarely any need for a full LALR parser.
It is much easier to deal with error recovery in a top-down parser.
It is also possible to re-direct and even completely alter the
parse on the fly, as well as perform attribute-driven parsing.
.PP
.I Wacco
generates a parser that automatically attempts to resync on
errors based on some heuristics on the first
and follow sets of non-terminals.
Admittedly this is a far from optimal error-handling system,
but it is much better that what
.IR yacc (1)
provides (skip X tokens, then continue!).
Future versions of
.I wacco
may provide much more intelligent error-recovery systems.
.PP
.I Wacco
also allows using its parser in an attribute-driven manner.
Information may be passed down to the right-hand side of an
expression even though that expression hasn't yet been parsed.
Different rules may have different types associated with them.
The C++ compiler will perform the type-checking for you!
No more funny unions and hoping that you didn't make a mistake!
.PP
Token values do not have to be explicitly defined.
String and character tokens may be specified implicitly as well,
rather than creating a dummy symbol for them.
.I Wacco
will generate a header file containing definitions for
all the tokens.
.PP
There is support for a somewhat smarter scanner.
Errors will be (hopefully) printed out in a clear and simple
manner.
.PP
.I Wacco
currently generates only C++ code.
Some day it may optionally generate straight C code as well
(but don't hold your breath).
.PP
The grammar format is described in the
.I wacco
documentation since it is too lengthy to repeat here.
See the
.I wacco.doc
files for more information.
.SS Options
.I Wacco
expects a grammar on stdin if
.I file
is not specified on the command line.
It will generate the files
.I parser.C
and
.I tokens.h
by default.
If there is a scanner section in the input file,
then the file
.I scanner.l
will also be generated.
.TP
.B -d
Dump mode.
Only prints (somewhat) interesting information about what
.I wacco
thinks the grammar looks like, first and follow sets, and other
miscellaneous stuff.
.TP
.B -i
Do not generate code for scanning case-insensitive strings.
If the
.B "string"
construct is used in the grammer source,
.I wacco
will normally generate code like
.BR [Ss][Tr][Ii][Nn][Gg] .
This option inhibits such behavior to allow exact matches.
.TP
.B -c
Normally,
.I wacco
will generate temporary output files and then compare them with
the originals.
The originals are replaced only if the new files are different.
This is very handy for use inside makefiles, where doing things
like this gets ugly.
This option turns off this feature and always generates the
output files.
.TP
.B -O
Turns off optimization.
Normally,
.I wacco
expands non-terminals that are only used once in the code rather
than creating functions for them.
If you use the "return" operator, you must use this option for now.
If you use the "$?" construct, optimization will be automatically
turned off.
.TP
.B -C
Do not output the imbedded user code within the grammer.
This generates a parser that either accepts or rejects
its input, only printing errors.
It is handy for verifying a grammer.
.TP
.B -L
Do not generate the "#line" entries for the original
.I wacco
source file within in the parser.
.TP
.BI "-h " header
Create a file named
.I header
instead of the default "tokens.h".
.TP
.BI "-p " parser
Create a file named
.I parser
instead of the default "parser.C".
.TP
.BI "-s " scanner
Create a file named
.I scanner
instead of the default "scanner.l".
.SH FILES
wacco.doc wacco.doc.iw wacco.doc.ps
.br
tokens.h scanner.l parser.C ./.wacco.tmp
.SH NOTES
The scanner generated may be ``compiled'' by either
.IR lex (1)
or
.IR flex (1),
although
.I flex
is highly recommended.
.SH AUTHOR
Copyright (c) 1991 by Parag Patel.  All Rights Reserved.
SHAR_EOF
chmod 0444 wacco.1 ||
echo 'restore of wacco.1 failed'
Wc_c="`wc -c < 'wacco.1'`"
test 4588 -eq "$Wc_c" ||
	echo 'wacco.1: original size 4588, current size' "$Wc_c"
rm -f _shar_wnt_.tmp
fi
# ============= wacco.doc ==============
if test -f 'wacco.doc' -a X"$1" != X"-c"; then
	echo 'x - skipping wacco.doc (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting wacco.doc (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'wacco.doc' &&
Copyright (c) 1991 by Parag Patel.  All Rights Reserved.
<< $Header: wacco.doc,v 1.25 91/02/22 16:04:23 hmgr Exp $ >>
X
<< Please see the wacco(1) man page for details on its usage. >>
<< Only the grammar format is described here.                 >>
X
X
The underlying philosophy in wacco is that the code generated should be
exactly like that someone would generate by hand, if they were writing a
recursive-descent compiler manually.
X
X
X
The basic grammar file format is:
X
X	/* C style comments */
X	%opt <directives>
X	{ <header> }
X	<rules>         // C++ style comments
X	$$
X	<scanner>
X
Wacco directives may be placed on the optional "%opt" line at the top of
the source grammer.  Only one such line is allowed in the grammer
source, and it MUST be first in the source.  The directives are actually
the command-line options for wacco!  Options may thus be set either on
the command line, or in the wacco source itself.  The entire "%opt" line
is parsed as if it were the command line.  Please see the man page for
descriptions of the command-line options.
X
The header section (which is optional) is a set of code in curly-braces
{} that is put at the top of the output parser.C file.  This a the place
to include files, define classes, or setup global variables.  Naturally,
there are no curlies {} if there is no need for a header section.
X
The scanner section (the two "$$" and everything after) is entirely
optional.  It is included in the grammar file to make it easy to refer
to the actual values of tokens without explicitly defining those values.
X
Without any of the optional parts, a grammer consists only of rules.
X
X
X
The rules look much like those of yacc at first glance but there are
some interesting differences.  A rule looks like:
X
X	ID <TYPE> : stuff ;
X
The ID on the left-hand side is a non-terminal and so is eventually
turned into a function.  The TYPE is the type that this function will
accept in and return as a reference argument.  It is optional and must
be in angle-brackets <> if present and assumed to be "int" if not.  It
can be used to pass information into a function (rule) or to get
information out of it.
X
X	ID : stuff ;
X	ID <TYPE> : stuff ;
X
A vertical-bar "|" may be used to avoid duplicating the left-hand side:
X
X	ID : stuff1 ;
X	ID : stuff2 ;
X
is equivalent to
X
X	ID : stuff1 | stuff2 ;
X
X
<< For the rest of this document, the conventions are that terminals will be
X   in uppercase and non-terminals in lowercase. >>
X
X
The "stuff" on the right-hand side can get kind of interesting.  Like,
yacc, this is basically a list of terminals or non-terminals that are
expected in sequence.
X
X	parenexpr : LPAREN expr RPAREN ;
X
X
Terminals can be described in several different ways.
X
Simple character tokens are straight-forward.  Their token value is
always that of the character they represent.  The null character '\0'
may not be used as a token - its value used for other things internally.
X
X	parenexpr : '(' expr ')' ;
X
For more complicated strings, just use the strings themselves!
X
X	parenexpr : "<<" expr ">>" ;
X
The same string may be used in other rules to refer to that token.
X
Also, any identifier name may be used to define a terminal.  If that id
does not appear on the left side of a colon `:', then it is assumed to
be a terminal symbol in the grammar.
X
Token codes for terminals are automatically assigned and stored in the
"tokens.h" header file.  The token value of a string is pretty much
inaccessible.  A character constant will be its own token.  Any other
terminal name like LPAREN above will be in the header file as an enum
with the same name.
X
X
X
Actions (code) is imbedded anywhere on the right-hand side within pairs
of curly-braces {}.
X
X	parenexpr: '(' expr { $$ = $expr; } ')'
X
This introduces some other features that wacco has which yacc doesn't.
First though, the value that the non-terminal returns is always "$$".
X
The values of the right-hand side are referred to directly via their
symbolic names.  Thus we use "$expr" instead of "$2" in yacc!  Also,
"expr"s must return "int"s or the C++ compiler will complain!
X
Wacco generates an appropriate temporary variable if and only if it is
used by referring to a "$$" inside some code for that rule.  Thus
parenexpr above will have an in/out argument defined for it.  If there
were no code in {}, then parenexpr wouldn't be passed anything at all.
X
X
Actually, The TYPE specifier of a non-terminal may actually be a lot
more complicated than just a simple type:
X
X	example <double d; int i, j> : ... { $$.d = 0.0; $$.i = 34; }
X
In this case, wacco creates a struct for this non-terminal instead of a
simple variable.  The contents of the <> are put into this struct.  This
allows passing more info in and out of a non-term without having to
create a dummy struct by hand.  It is also passed to the non-terminal
function by reference rather than copying, and thus is very efficient.
X
X	expr <int left, right> :  ...  ;
X	example : expr ';' { $$ = $expr.left + $expr.right } ;
X
Note that all exported non-terms MUST have simple types, to avoid bogus
structure naming conventions.  If you must have a complicated type
returned from a start-symbol, you should create a specially named struct
or class and use it instead.
X
Also, simple types must not be named.  The following is illegal as
well as redundant, and kind of silly anyway:
X
X	expr <int var> : ... ;
X
X
X
If we have 2 "expr"s on the right, things get a little messier:
X
X	example: '(' expr ',' expr ')'
X		{ $$ = $expr1 + $expr2; };
or
X	example: '(' expr=front ',' expr=back ')'
X		{ $$ = $front + $back; };
X
The second form introduces the ability to name (alias) one of the
right-hand side's non-terminal names!  Here we name "expr1" to be called
"front" and "expr2" to be "back" for just this particular right-hand side.
X
X
X
Since wacco generates a C++ recursive-descent parser, we can do even more
interesting things on the right.  Wacco passes the local vars to store
return values by reference.  Thus we can pass information into a rule as
well as get stuff out of it.
X
X	example: { $expr = $$; } '(' expr ')' { $$ = $expr; };
X
This initializes the temp-var used to store the return value from "expr"
to whatever was passed in to "example", then passes it to "expr".  If a
non-terminal never uses "$$", then it is assumed to not return anything,
and no temp-var will be declared nor passed into it.
X
Other things that one can do:
X
X	example: '(' { int v = 2; } expr ')' { v = $expr; };
X
and create temp vars anywhere you want.  Wacco carefully avoids
putting out unnecessary sets of blocks in the output parser file.
X
To generate incomplete blocks, and allow a wierd sort of free-form
grammar, the %{%} format may be used wherever a {} is normally used.
This allows creating incomplete blocks like so:
X
X	example: '(' %{ if (somevar) {  %} expr ')' %{ } %} ;
X
Curly-braces are not counted within %{%} blocks, and %{%} blocks
may be used wherever {} blocks are allowed.
X
X
The empty rule may not be implicitly specified is in yacc, but must
be defined with the special "[]" symbol:
X
X	null: [] ;
X	expr: '(' expr ')' | [] ;
X
An empty statement is an error in wacco to help protect against typos
and other mistakes.
X
X
X
Right-hand sides may have parentheses for grouping.  Basically, a
function must be generated for every parenthesized expression to
maintain the parsing semantics:
X
X	value: (ID | INT) | [];
X
is the equivalent of:
X
X	value: v1 | [];
X	v1: ID | INT;
X
Just like every other non-terminal, parenthesized expressions have
return values, types, aliases, and may be referred to in other parts of
the right-hand side.  The default type is the type of the enclosing
parens or left-hand side for the outer-most parens:
X
X	example (<long> ID | INT) { $$ = $_; };
X
Multiple sets of parens on the right may be refered to as "$_1", "$_2",
and so on.  They may be named as well:
X
X	example<float>: (ID | FLOAT)=num { $$ = $num; };
X
Here the parens inherit the type "float" from "example".
X
Since the left-hand side may be used on the right for recursive
functions, so may parenthesized expressions.  The names just get a
little strange.
X
X	strange: (ID (OP # #1 #2 #3 #* | []) | []);
X
The inner "#" refers to the inner-most set of parens enclosing the
"OP...".  The strings "#" and "#1" are equivalent and refer to this inner
most set of parens.  "#2" refers to the next outer parens starting the
"ID...".  "#3" and "#*" refer the the name of the left-hand side, just
for completeness.  These can be viewed as the outermost "parens" in the
expression.  Ugly but sometimes necessary.
X
X
X
Other things defined in "tokens.h" include the end-of-input token EOI
which has value 0, and the constants RETOK and RETERR, for appropriate
return values.  These have the values of TRUE (1) and FALSE (0)
respectively.  These may be used in the right-hand side of rules if it
is determined that further parsing of rules is un-necessary.
X
X	parenexpr: LPAREN expr { if ($expr == BOGUS) return RETERR; } RPAREN;
X
The return-code from various rules is always available as the magic
string "$?" directly after that particular rule is called:
X
X	parenexpr: LPAREN expr { if ($? != RETOK) return RETERR; } RPAREN;
X
The return code is overwritten with each call to a non-terminal on the
right-hand side, so if a previous return value is needed, you must save it
in some variable yourself.
X
The generated parser code does not look at the actual return value of
non-terminals (funtions), so other return values may be used if desired.
X
X
X
By default, the first rule in the grammar is considered to be the start
symbol.  Instead of calling "yyparse()" to initiate the parse, the
function to call is the name of the left-hand ID in the first rule.  It
is called with no arguments.  It returns either RETOK or RETERR
depending on whether the parse succeeded or not.
X
X	firstsymbol: . . . ;
X	. . .
X
X	main()
X	{
X		if (firstsymbol() == RETOK)
X			return OK;
X		return ERR;
X	}
X
But you don't have to have just one entry point!  Adding a "%export"
modifier after a non-terminal just before the ':' causes that symbol
to become callable from outside the grammer:
X
X	thing<mytype> %export :  . . .  ;
X
X	func() { mytype var;  return thing(var); }
X
The first non-terminal in the grammer is automatically exported unless
"%export" is used somewhere in the grammer.  Also, notice that if a
"type" is defined and used for a non-terminal, that type must be passed
in by reference to that function.
X
The "%export" feature lets you call several non-terminals in the
grammer.  This can be used to export parts of a grammer, say
sub-expression parsing, or let you put several different parsers into
one grammer file.  All exported non-terminals are also listed as
"extern"s in the "tokens.h" header file.
X
X
X
The scanner section is optional.  If there is a "$$" at the end of the
file, the rest is considered to be almost straight lex(1) source.  If
there is a "$$", every terminal must have a lex value associated with
it.  Character and string constants are self-defining.  Other
nonterminals are described in the lex section.
X
An example:
X
X	expr: LPAREN expr RPAREN | "id" | [];
X
X	$$
X
X	%%
X
X	"."		{ return (int)EOI; }
X
X	$LPAREN		"("|"["
X	$RPAREN		")"|"]"
X
X	[ \t\v\n\f]	;
X	.  { w_scanerr("Illegal character %d (%c)", yytext[0], yytext[0]); }
X
The string "id" naturally stands for itself.  LPAREN and RPAREN are
described in the lex section in a reverse order than normal.  Wacco will
convert those lines starting with a `$' into the appropriate lex output.
This is not only to make sure that all terminals are defined, but allows
defining a language without ever having to manually define token ids for
any terminal symbol!
X
The default scanner (located in -lwacco) maintains its own I/O file
pointer.  This is so that user code can implement the equivalent of
"#include" without too much work.  The functions in the scanner include:
X
X	int w_openfile(char *fname)	// open a file to the specified name
X
X	void w_closefile()		// close the last opened file
X
X	void w_setfile(FILE *f)		// set the current file to this
X
X	FILE *w_getfile()		// return the currently opened file
X
X	int w_currcol()			// the current column in the input
X
X	int w_currline()		// the current line in the input
X
X	char *w_getcurrline()		// the text of the current line
X
X	int w_input()			// basic I/O routines which are
X	int w_unput(int c)		// to be used by the scanner
X	void w_output(int c)
X
X
You should call either w_setfile() or w_openfile() before starting the
parse or the default scanner will probably dump core.
X
X
The functions that the parser expects to have available are:
X
X	int w_gettoken()	// get the next token - usually calls yylex()
X				// - must return EOI on end-of-input
X
X	int w_scanerr()		// printf-type error printing routine
X				// - must always returns RETERR
X				// - is called with a NULL argument
X				// when just skipping a token in the input
X
These are either are in the wacco library -lwacco, or must be provided
by the user.
X
The default w_scanerr() will try to print the line that had the error,
and underneath it print "^" where the error occurred and "*" where
tokens were skipped when re-syncing.  Because of some lex(1) funnies,
this doesn't always work as expected.  When I do away with the need for
lex, this won't be a problem anymore.
X
X
Some other convenient functions defined in parser.C include:
X
X	int w_nexttoken() // return the value of the next token but don't
X			// scan it yet - calls gettoken() at most once
X			// - useful for token lookahead
X
X	void w_skiptoken()	// scan the current token - the next call to
X				// nexttoken() will actually read another token
X
X	char *w_tokenname(int tokid)	// return the string name of a token
X					// whose id is tokid
X
These are only really useful if you are writing your own scanner instead
of using lex.  Nexttoken() and skiptoken() can also be used to somewhat
direct the parse.  If you provide your own infinite push-back stack of
tokens, you can completely alter the parse at run-time!
X
The program flex(1) may be used instead of lex(1) if desired, and is
highly recommended.
X
The extern for "yytext" is automatically declared in parser.C.
Unfortunately, it may be wrong for the scanner generator actually being
used.  To change the definition, the macro YYTEXT_DECL may be redefined
at the top of your wacco grammer if you wish to use flex:
X
X	{
X	#undef YYTEXT_DECL
X	#define YYTEXT_DECL char *yytext
X	}
X	...
X
X
X
My original plan was to write a scanner-generator directly into wacco,
but since flex(1) is now available, which is very fast and generates
excellent scanners, I now have no plans to do anything to the scanning
parts of wacco.
X
X
X
X		--  Parag Patel
X
X
X
================= E X A M P L E    G R A M M E R ==================
X
// This is the usual required calculator sample.  It can still use
// a LOT of work, but it illustrates the basics.  Note that the
// precedence of operators is all wrong.
X
{
#include <stdio.h>
#include <stdlib.h>
}
X
calc
X	:	%{
X			while (w_nexttoken() != EOI) {
X		%}
X	    expr ([] | '=' | ';' | ',')
X		%{
X			printf("%f\n", $expr);
X			}
X		%}
X	| []
X	;
X
expr<double>
X	:	term { $binop_expr = $term; } binop_expr { $$ = $binop_expr; }
X	;
X
binop_expr<double>
X	:	'+' expr { $$ += $expr; }
X	|	'-' expr { $$ -= $expr; }
X	|	'*' expr { $$ *= $expr; }
X	|	'/' expr { $$ /= $expr; }
X	|	'&' expr { $$ = (int)$$ & (int)$expr; }
X	|	'|' expr { $$ = (int)$$ | (int)$expr; }
X	|	'^' expr { $$ = (int)$$ ^ (int)$expr; }
X	|	"<<" expr { $$ = (int)$$ << (int)$expr; }
X	|	">>" expr { $$ = (int)$$ >> (int)$expr; }
X	|	"&&" expr { $$ = $$ && $expr; }
X	|	"||" expr { $$ = $$ || $expr; }
X	|	[]
X	;
X
term<double>
X	:	DOUBLE { $$ = atof((char *)yytext); }
X	|	'-' expr { $$ = -$expr; }
X	|	'~' expr { $$ = ~(int)$expr; }
X	|	'!' expr { $$ = !$expr; }
X	|	'(' expr ')' { $$ = $expr; }
X	;
X
{
X	main()
X	{
X		w_setfile(stdin);
X		calc();
X	}
}
X
$$
X
D	[0-9]
L	[_A-Za-z]
X
%%
X
"."		{ return (int)EOI; }
X
$DOUBLE		({D}+)|({D}+\.{D}+)|({D}+[Ee]-?{D}+)|({D}+\.{D}+[Ee]-?{D}+)
X
"#".*$		;
X
[ \t\v\n\f]	;
.	{ w_scanerr("Illegal character %d ($c)", yytext[0], yytext[0]); }
SHAR_EOF
chmod 0444 wacco.doc ||
echo 'restore of wacco.doc failed'
Wc_c="`wc -c < 'wacco.doc'`"
test 15987 -eq "$Wc_c" ||
	echo 'wacco.doc: original size 15987, current size' "$Wc_c"
rm -f _shar_wnt_.tmp
fi
# ============= wacco.doc.iw ==============
if test -f 'wacco.doc.iw' -a X"$1" != X"-c"; then
	echo 'x - skipping wacco.doc.iw (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting wacco.doc.iw (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'wacco.doc.iw' &&
104 pgscriptver 
X
100 DefSpaceEx 100 DefCharEx 1 DefNormalHyphenationOn 100 
DefTypeColor (Times-Roman) DefTypeFace ENGLISH DefLanguage 12 DefPointSize 
USE_POINTSIZE DefSetSize (@default) DefTypeResource 
X
JUSTIFYLEFT DefJustifyFlags 2 DefBeginParaLeadValue ABSOLUTE 
DefBeginParaLeadMode 2 DefEndParaLeadValue ABSOLUTE DefEndParaLeadMode 120 
DefLeadValue PROPORTIONAL DefLeadMode 1 46 0 TAB_LEFT  720 DefTab 1 46 0 
TAB_LEFT  2160 DefTab 1 46 0 TAB_LEFT  3600 DefTab 1 46 0 
TAB_LEFT  5040 DefTab 1 46 0 TAB_LEFT  6480 DefTab 1 46 0 
TAB_LEFT  7920 DefTab 1 46 0 TAB_LEFT  9360 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 0 46 0 TAB_LEFT  24480 DefTab 0 46 0 
TAB_LEFT  24480 DefTab 80 DefWSMN 100 DefWSNM 150 DefWSMX 110 
DefLSMX 100 DefLeaderEx 46 DefLeaderChar 0 DefFirstIndent 0 
DefLeftIndent 0 DefRightIndent 0 DefNumberingOn 0 DefNumberingType 0 
DefNumberingRestart 1 DefNumberingLevel 0 DefNumberingStyle 0 
DefNumberingTabAfter 1 DefNumberingShowAllLevels 1 DefNumberingStart 1 
DefNumberingIncrement () DefNumberingPrefix () DefNumberingSuffix (.) 
DefNumberingSeparator (*default) DefParaResource 
X
0 DefLineWidth TRANSPARENT DefPenColor TRANSPARENT DefFillColor 1 DefIG 300 
DefResolution 100 DefYScale 100 DefXScale (=default) DefPolyResource 
X
0 DefPageDimensions 12240 DefPageWidth 15840 DefPageHeight 1440 
DefInsideMargin 1080 DefOutsideMargin 1080 DefTopMargin 1080 
DefBottomMargin 0 DefOrientation 0 DefPageStyle 1 DefColumns 360 
DefGutter (%default) DefMasterPage 
X
0 DefPageDimensions 12240 DefPageWidth 15840 DefPageHeight 1440 
DefInsideMargin 1080 DefOutsideMargin 1080 DefTopMargin 1080 
DefBottomMargin 0 DefOrientation 0 DefPageStyle 1 DefColumns 360 
DefGutter (%code) DefMasterPage ResDefEnd 
X
0 DefFirstLeft 0 DefDocSetup 1 DefNumPages 1 AutoPage 1 
DefStartPageNum () DefPageNumPrefix 1 DefGraphicLocation document 
X
0 DefAutoPage 
0 (%default) 1 DefPage 
0 DefAutoPage 
0 (%default) 2 DefPage 
0 DefAutoPage 
0 (%default) 3 DefPage 
0 DefAutoPage 
0 (%default) 4 DefPage 
0 DefAutoPage 
0 (%default) 5 DefPage 
0 DefAutoPage 
0 (%default) 6 DefPage 
0 DefAutoPage 
0 (%default) 7 DefPage 
0 DefAutoPage 
0 (%default) 8 DefPage 
0 DefAutoPage 
0 (%default) 9 DefPage 
0 DefAutoPage 
0 (%default) 10 DefPage 
0 DefAutoPage 
0 (%default) 11 DefPage 
0 DefAutoPage 
0 (%default) 12 DefPage 
0 DefAutoPage 
0 (%default) 13 DefPage 
1 DefAutoPage 
0 (%default) 14 DefPage 
X
POLY_OBJECT POLY_EMPTY |  DefPolyType 
X
0 DefLineWidth TRANSPARENT DefPenColor TRANSPARENT DefFillColor 1 DefIG 300 
DefResolution 100 DefYScale 100 DefXScale (=default) DefPolyResId 0 
DefMasterRef 
MP_CPSUCC_LINK MP_CPPRED_LINK POLY_COLUMN | |  DefSLinksFlags 0 DefStreamSucc 0 
DefStreamPred 
1440 1080 11160 1080 11160 14760 1440 14760 4 
POLY_OBJECT POLY_EMPTY |  (%default) 0 1 TextPolygon 
X
POLY_OBJECT POLY_EMPTY |  DefPolyType 
X
0 DefLineWidth TRANSPARENT DefPenColor TRANSPARENT DefFillColor 1 DefIG 300 
DefResolution 100 DefYScale 100 DefXScale (=default) DefPolyResId 0 
DefMasterRef 
MP_CPSUCC_LINK MP_CPPRED_LINK POLY_COLUMN | |  DefSLinksFlags 0 DefStreamSucc 0 
DefStreamPred 
1440 1080 11160 1080 11160 14760 1440 14760 4 
POLY_OBJECT POLY_EMPTY |  (%code) 0 2 TextPolygon 
X
POLY_OBJECT POLY_TEXT |  DefPolyType 
X
0 DefLineWidth TRANSPARENT DefPenColor TRANSPARENT DefFillColor 1 DefIG 300 
DefResolution 100 DefYScale 100 DefXScale (=default) DefPolyResId 1 
DefMasterRef 
MP_CPSUCC_LINK MP_CPPRED_LINK MPREF_VALID POLY_COLUMN | | |  DefSLinksFlags 0 
DefStreamSucc 0 DefStreamPred 4 DefTextHandle 
1440 1080 11160 1080 11160 14760 1440 14760 4 
POLY_OBJECT POLY_TEXT |  (1) 0 3 TextPolygon 
X
4 asciitextstream 
<(AvantGarde) cf ><24 cs><eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<32 cs><ON bo><CENTER align><tab>WACCO<BO type_rev><eop>
<24 cs><ON bo><eop>
<eop>
<tab>W<BO type_rev>hy <ON bo>A<BO type_rev>nother <ON bo>C<BO type_rev>ompiler 
<ON bo>Co<BO type_rev>mpiler?<eop>
<eop>
<tab>Why not!!!<eop>
<eop>
<eop>
<14 cs>by<eop>
Parag Patel
<textstream_end> 
X
POLY_OBJECT POLY_TEXT |  DefPolyType 
X
0 DefLineWidth TRANSPARENT DefPenColor TRANSPARENT DefFillColor 1 DefIG 300 
DefResolution 100 DefYScale 100 DefXScale (=default) DefPolyResId 1 
DefMasterRef 
X
MP_CPSUCC_LINK MP_CPPRED_LINK LINK_OVERFLOW MPREF_VALID POLY_COLUMN AUTO_STREAM | | | | | 
DefSLinksFlags 8 DefStreamSucc 0 DefStreamPred 6 DefTextHandle 
1440 1080 11160 1080 11160 14760 1440 14760 4 
POLY_OBJECT POLY_TEXT |  (2) 0 5 TextPolygon 
X
6 asciitextstream 
<(Palatino) cf ><eop>
<16 cs><(AvantGarde) cf ><ON bo><CENTER align>Overview<SIZE type_rev><(Palatino
) cf ><BO type_rev><eop>
<ALIGN para_rev><eop>
<eop>
<1440 FirstIndent><1440 LeftIndent><1440 RightIndent>Wacco is another compiler 
compiler. It generates a C++ recursive-descent LL(1) parser rather than a botto
m-up style LALR parser. It also directly supports lexical scanning, making it v
ery easy to describe a complete language within a single wacco source file.<eop
>
<eop>
The underlying philosophy in wacco is that the code generated should be exactly
X like that someone would generate by hand, if they were writing a recursive-des
cent compiler the hard way.<eop>
<eop>
Wacco also generates some relatively simple-minded error recovery schemes using
X FIRST and FOLLOW sets. While it leaves a lot to be desired, it's seems to be m
uch better than <ON it>yacc(1)<IT type_rev>'s!  Indeed, wacco was written to  p
rovide a platform for experimenting with various error recovery schemes, but tu
rned out to be quite useful in its own right.<eop>
<eop>
For more details about how to run wacco, please see the <ON it>wacco(1)<IT type
_rev> manual page. This document is intended to describe the wacco grammar form
at.<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<eop>
<9 cs>Copyright ) 1991 by Parag Patel.  All Rights Reserved.<pe><FIRSTINDENT pa
ra_rev><LEFTINDENT para_rev><RIGHTINDENT para_rev><FIRSTINDENT para_rev><LEFTIN
DENT para_rev><RIGHTINDENT para_rev><FIRSTINDENT para_rev><LEFTINDENT para_rev>
<RIGHTINDENT para_rev><SIZE type_rev><eop>
<CENTER align><16 cs><(AvantGarde) cf ><ON bo>File format<SIZE type_rev><
(Palatino) cf ><BO type_rev><eop>
<ALIGN para_rev><eop>
<eop>
The basic wacco grammar file format is:<eop>
<eop>
<(Courier) cf ><tab>/* C style comments */<eop>
<tab>%opt <(Palatino) cf ><ON it>directives<IT type_rev><eop>
<tab><(Courier) cf >{ <(Palatino) cf ><ON it>header<
(Courier) cf ><IT type_rev> }<eop>
<(Palatino) cf ><tab><ON it>rules<(Courier) cf ><IT type_rev><tab>// C++ style 
comments<eop>
<tab>$$<eop>
<(Palatino) cf ><tab><ON it>scanner<IT type_rev><eop>
<eop>
Wacco directives may be placed on the optional <(Courier) cf >%opt<
(Palatino) cf > line at the top of the source grammar.  Only one such line is a
llowed in the grammar source, and it MUST be first in the source.  The directiv
es are actually the command-line options for wacco!  Options may thus be set ei
ther on the command line, or in the wacco source itself.  The entire "<(Courier
) cf >%opt<(Palatino) cf >" line is parsed as if it were the command line.  Ple
ase see the man page for descriptions of the command-line options.<eop>
<eop>
The header section (which is optional) is a set of code in curly-braces <(Couri
er) cf >{}<(Palatino) cf > that is put at the top of the output <ON it>parser.C
<IT type_rev> file.  This a the place to include files, define classes, or setu
p global variables.  Naturally, there are no curlies <
(Courier) cf >{}<(Palatino) cf > if there is no need for a header section.<eop>
X
<eop>
The scanner section (the two<(Courier) cf ><(Palatino) cf > "<(Courier) cf >$$<
X
(Palatino) cf >" and everything after) is entirely optional.  It is included in
X the grammar file to make it easy to refer to the actual values of tokens witho
ut explicitly defining those values by hand.<eop>
<eop>
Without any of the optional parts, a grammar consists only of rules.<eop>
<pe><eop>
<CENTER align><16 cs><
(AvantGarde) cf ><ON bo>Rules<SIZE type_rev><(Palatino) cf ><BO type_rev><eop>
<ALIGN para_rev><eop>
<eop>
The rules look much like those of yacc at first glance but there are some inter
esting differences.  A rule looks like:<eop>
<eop>
<(Courier) cf ><tab>id <la>TYPE<ra>  : <(Palatino) cf ><ON it>stuff<(Courier) c
f ><IT type_rev> ;<eop>
<
(Palatino) cf ><eop>
The <ON it>id <IT type_rev>on the left-hand side is a non-terminal and so  is e
ventually turned into a function.  The <ON it>TYPE <IT type_rev>is the type tha
t this function will accept in and return as a reference argument.  It is optio
nal and must be in angle-brackets  if present . The default <ON it>TYPE<IT type
_rev> is  "<(Courier) cf >int<(Palatino) cf >". This <ON it>TYPE<IT type_rev> i
s used to pass information into and out of the rule, as will be seen later.<eop
>
<eop>
<(Courier) cf ><tab>id : <(Palatino) cf ><ON it>stuff<
(Courier) cf ><IT type_rev> ;<eop>
<tab>id <la>TYPE<ra> : <(Palatino) cf ><ON it>stuff<(Courier) cf ><IT type_rev>
X ;<eop>
<(Palatino) cf ><eop>
A vertical-bar "<(Courier) cf >|<
(Palatino) cf >" may be used to avoid duplicating the left-hand side:<eop>
<eop>
<(Courier) cf ><tab>id : <(Palatino) cf ><ON it>stuff1<(Courier) cf ><IT type_r
ev> ;<eop>
<tab>id : <(Palatino) cf ><ON it>stuff2<
(Courier) cf ><IT type_rev> ;<eop>
<(Palatino) cf ><eop>
is equivalent to<eop>
<eop>
<(Courier) cf ><tab>id : <(Palatino) cf ><ON it>stuff1<(Courier) cf ><IT type_r
ev> | <
(Palatino) cf ><ON it>stuff2<(Courier) cf ><IT type_rev> ;<(Palatino) cf ><eop>
X
<eop>
The <ON it>stuff<IT type_rev> on the right-hand side can get kind of interestin
g.  Like, yacc, this is basically a list of terminals or non-terminals that are
X expected in sequence.<eop>
<eop>
<
(Courier) cf ><tab>parenexpr : LPAREN expr RPAREN ;<eop>
<(Palatino) cf ><eop>
<eop>
<(AvantGarde) cf ><ON bo><14 cs>Terminals<(Palatino) cf ><BO type_rev><SIZE typ
e_rev><eop>
<eop>
Terminals can be described in several different ways.<eop>
<eop>
Simple character tokens are straight-forward.  Their token value is always that
X of the character they represent.  The null character '\0' may not be used as a
X token - its value used for other things internally.<eop>
<eop>
<(Courier) cf ><tab>parenexpr : '(' expr ')' ;<eop>
<
(Palatino) cf ><eop>
For more complicated strings, just use the strings themselves!<eop>
<eop>
<(Courier) cf ><tab>parenexpr : "<la><la>" expr "<ra><ra>" ;<eop>
<(Palatino) cf ><eop>
The same string may be used in other rules to refer to that very same token.<eo
p>
<eop>
Also, any identifier name may be used to define a terminal.  If that id does no
t appear on the left side of a colon "<(Courier) cf >:<(Palatino) cf >", then i
t is assumed to be a terminal symbol in the grammar.<eop>
<eop>
Token codes for terminals are automatically assigned and stored in the <ON it>t
okens.h<IT type_rev> header file.  The token value of a string is pretty much i
naccessible.  A character constant will always be its own token.  Any other ter
minal name like <ON it>LPAREN<IT type_rev> above will be in the header file as 
an enum with the same name, allowing it to be used symbolically.<eop>
<eop>
<eop>
<
(AvantGarde) cf ><ON bo><14 cs>Actions<(Palatino) cf ><BO type_rev><SIZE type_r
ev><eop>
<eop>
Actions (that is, C++ code) are imbedded anywhere on the right-hand side within
X pairs of curly-braces <(Courier) cf >{}<(Palatino) cf >.<eop>
<eop>
<(Courier) cf ><tab>parenexpr: '(' expr { $$ = $expr; } ')'<eop>
<
(Palatino) cf ><eop>
This introduces some other features that wacco has which yacc doesn't. First th
ough, the value that the non-terminal returns is always "<(Courier) cf >$$<(Pal
atino) cf >".<eop>
<eop>
The values of the right-hand side are referred to directly via their symbolic n
ames.  Thus we use "<(Courier) cf >$expr<(Palatino) cf >" instead of "<
(Courier) cf >$2<(Palatino) cf >" in yacc!  Also, <ON it>expr<IT type_rev>s mus
t return <(Courier) cf >int<(Palatino) cf >s or the C++ compiler will complain!
<eop>
<eop>
Wacco generates an appropriate temporary variable if and only if it is used by 
referring to a "<(Courier) cf >$$<
(Palatino) cf >" inside some code for that rule.  Thus <ON it>parenexpr<IT type
_rev> above will have an in/out argument defined for it.  If there were no code
X in <(Courier) cf >{}<(Palatino) cf >, then <ON it>parenexpr<IT type_rev> would
n't be passed anything at all.<eop>
<eop>
<eop>
<(AvantGarde) cf ><14 cs><ON bo>Types<(Palatino) cf ><SIZE type_rev><BO type_re
v><eop>
<eop>
Actually, The <ON it>TYPE<IT type_rev> specifier of a non-terminal may actually
X be a lot more complicated than just a simple type:<eop>
<eop>
<
(Courier) cf ><tab>example <la>double d; int i, j<ra> : ... { $$.d = 0.0; $$.i 
= 34; }<eop>
<(Palatino) cf ><eop>
In this case, wacco creates a struct for this non-terminal instead of a simple 
variable.  The contents of the  <
(Courier) cf >"<la><ra><(Palatino) cf >" are put into this struct.  This allows
X passing more information in and out of a non-terminal without having to create
X a dummy <(Courier) cf >struct<(Palatino) cf > by hand.  It is also passed to t
he non-terminal function by reference rather than copying, and thus is very eff
icient.<eop>
<eop>
<(Courier) cf ><tab>expr <la>int left, right<ra> :  ...  ;<eop>
<tab>example : expr ';' { $$ = $expr.left + $expr.right } ;<eop>
<
(Palatino) cf ><eop>
Note that all exported non-terminals <ON bo>must<BO type_rev> have simple types
, to avoid bogus structure naming conventions.  If you must have a complicated 
type returned from a start-symbol, you should create a specially named <(Courie
r) cf >struct<(Palatino) cf > or <(Courier) cf >class<(Palatino) cf > and use i
t instead. (more about start symbols later)<eop>
<eop>
Also, simple types must not be named.  The following is illegal as well as redu
ndant, and kind of silly anyway:<eop>
<eop>
<
(Courier) cf ><tab>expr <la>int var<ra> : ... ;<tab>// ILLEGAL!!!<eop>
<(Palatino) cf ><eop>
<eop>
<(AvantGarde) cf ><14 cs><ON bo>Aliases<(Palatino) cf ><SIZE type_rev><BO type_
rev><eop>
<eop>
If we have 2 <ON it>expr<IT type_rev>s on the right, things get a little messie
r:<eop>
<eop>
<(Courier) cf ><tab>example: '(' expr ',' expr ')'<eop>
<tab><tab>{ $$ = $expr1 + $expr2; };<eop>
<
(Palatino) cf >or<eop>
<(Courier) cf ><tab>example: '(' expr=front ',' expr=back ')'<eop>
<tab><tab>{ $$ = $front + $back; };<eop>
SHAR_EOF
true || echo 'restore of wacco.doc.iw failed'
fi
echo 'End of  part 3'
echo 'File wacco.doc.iw is continued in part 4'
echo 4 > _shar_seq_.tmp
exit 0
exit 0 # Just in case...
-- 
Kent Landfield                   INTERNET: kent@sparky.IMD.Sterling.COM
Sterling Software, IMD           UUCP:     uunet!sparky!kent
Phone:    (402) 291-8300         FAX:      (402) 291-4362
Please send comp.sources.misc-related mail to kent@uunet.uu.net.