[comp.os.minix] bawk part 1 of 2

ast@cs.vu.nl (Andy Tanenbaum) (09/14/87)

: This is a shar archive.  Extract with sh, not csh.
: This archive ends with exit, so do not worry about trailing junk.
: --------------------------- cut here --------------------------
# Use only the standard system directories while unpacking.
PATH=/bin:/usr/bin
echo Extracting \b\a\w\k\.\c
# Strip the leading X guard character from each payload line and write the
# result to bawk.c; the payload ends at the '+ END-OF-FILE bawk.c' line.
sed 's/^X//' > \b\a\w\k\.\c << '+ END-OF-FILE '\b\a\w\k\.\c
X/*
X * Bawk main program
X */
X#define MAIN 1
X#include <stdio.h>
X#include "bawk.h"
X
X/*
X * Main program.
X *
X * The first file argument names the rules file, which is compiled; every
X * later file argument is a text file that the compiled rules are applied
X * to.  A lone "-" in either position stands for standard input.  If no
X * text file is named, standard input is processed.  The BEGIN action (if
X * any) runs once before the first text file and the END action (if any)
X * once after the last one.
X */
Xmain(argc, argv)
X        int argc;
X        char **argv;
X{
X        char gotrules, didfile, getstdin;
X
X        getstdin =
X                didfile =
X                gotrules = 0;
X
X        /*
X         * Initialize global variables:
X         */
X        Beginact = (char *) 0;
X        Endact = (char *) 0;
X        Rules = (RULE *) 0;
X        Rulep = (RULE *) 0;
X#ifdef DEBUG
X        Debug = 0;
X#endif
X        Filename = (char *) 0;
X        Linecount = 0;
X        Saw_break = 0;
X
X        Stackptr = Stackbtm - 1;
X        Stacktop = Stackbtm + MAXSTACKSZ;
X        Nextvar = Vartab;
X
X        strcpy(Fieldsep, " \t");
X        strcpy(Recordsep, "\n");
X
X        /*
X         * Parse command line
X         */
X        while (--argc) {
X                if (**(++argv) == '-') {
X                        /*
X                         * Process dash options.  Note that ++(*argv)
X                         * permanently advances the stored argument pointer
X                         * past the dash.
X                         */
X                        switch (tolower(*(++(*argv)))) {
X#ifdef DEBUG
X                        case 'd':
X                                ++Debug;
X                                break;
X#endif
X                        case 0:
X                                /*
X                                 * A lone "-" means standard input.  Handle
X                                 * it in place as a file argument.  argv must
X                                 * not be backed up here: doing so made the
X                                 * next iteration revisit this (by now empty)
X                                 * argument as a file name and left the last
X                                 * argument unprocessed.
X                                 */
X                                ++getstdin;
X                                goto dosomething;
X                        default:
X                                usage();
X                        }
X                }
X                else {
X        dosomething:
X                        if (gotrules) {
X                                /*
X                                 * Already read rules file - assume this is
X                                 * a text file for processing.  The BEGIN
X                                 * action runs before the first one only.
X                                 */
X                                if (++didfile == 1 && Beginact)
X                                        doaction(Beginact);
X                                if (getstdin) {
X                                        --getstdin;
X                                        newfile(0);
X                                }
X                                else
X                                        newfile(*argv);
X                                process();
X                        }
X                        else {
X                                /*
X                                 * First file name argument on command line
X                                 * is assumed to be a rules file - attempt to
X                                 * compile it.
X                                 */
X                                if (getstdin) {
X                                        --getstdin;
X                                        newfile(0);
X                                }
X                                else
X                                        newfile(*argv);
X                                compile();
X                                gotrules = 1;
X                        }
X                }
X        }
X        if (!gotrules)
X                usage();
X
X        if (!didfile) {
X                /*
X                 * Didn't process any files yet - process stdin.
X                 */
X                newfile(0);
X                if (Beginact)
X                        doaction(Beginact);
X                process();
X        }
X        if (Endact)
X                doaction(Endact);
X        exit(0);
X}
X
X/*
X * Regular expression/action file compilation routines.
X */
Xcompile()
X{
X        /*
X         * Compile regular expressions and C actions into the Rules chain,
X         * reading characters from the current input file "Fileptr" until
X         * getcharacter() returns -1, then close the file with endfile().
X         *
X         * The first non-blank character of each item selects its kind:
X         *   '#'   comment, skipped up to the end of the line
X         *   '{'   action, tokenized by act_compile()
X         *   ','   stop pattern of a two-part (range) pattern
X         *   other start pattern, tokenized by pat_compile(); the tokens
X         *         T_BEGIN and T_END introduce the special BEGIN/END actions
X         *
X         * Every compiled item is built in the global Workbuf and then
X         * copied into freshly allocated memory of exactly "len" bytes.
X         * "Rulep" always points at the most recently created rule.
X         */
X        int c, len;
X
X#ifdef DEBUG
X        if (Debug)
X                error("compiling...", 0);
X#endif
X
X        while ((c = getcharacter()) != -1) {
X                if (c == ' ' || c == '\t' || c == '\n')
X                        /* swallow whitespace */
X                        ;
X                else if (c == '#') {
X                        /*
X                         * Swallow comments up to and including the newline
X                         * (or end of file).
X                         */
X                        while ((c = getcharacter()) != -1 && c != '\n');
X                }
X                else if (c == '{') {
X#ifdef DEBUG
X                        if (Debug)
X                                error("action", 0);
X#endif
X                        /*
X                         * Compile (tokenize) the action string into our
X                         * global work buffer, then allocate some memory for
X                         * it and copy it over.  The brace is pushed back so
X                         * that act_compile() sees the complete action.
X                         */
X                        ungetcharacter('{');
X                        len = act_compile(Workbuf);
X
X                        /*
X                         * The current rule already has an action, so this
X                         * action has no pattern of its own: link in a new,
X                         * zero-filled rule to hold it.
X                         */
X                        if (Rulep && Rulep->action) {
X                                Rulep->nextrule =
X                                (struct rule *)getmem(sizeof(*Rulep));
X                                Rulep = Rulep->nextrule;
X                                fillmem(Rulep, sizeof(*Rulep), 0);
X                        }
X                        if (!Rulep) {
X                                /*
X                                 * This is the first action encountered.
X                                 * Allocate the first Rules structure and
X                                 * initialize it
X                                 */
X                                Rules = Rulep =
X                                        (RULE *) getmem(sizeof(*Rulep));
X                                fillmem(Rulep, sizeof(*Rulep), 0);
X                        }
X                        Rulep->action = getmem(len);
X                        movemem(Workbuf, Rulep->action, len);
X                }
X                else if (c == ',') {
X#ifdef DEBUG
X                        if (Debug)
X                                error("stop pattern", 0);
X#endif
X                        /*
X                         * It's (hopefully) the second part of a two-part
X                         * pattern string.  The comma has already been
X                         * consumed; compile the stop pattern that follows
X                         * and attach it to the current rule.
X                         * NOTE(review): the checks below rely on error()
X                         * not returning for RE_ERROR (defined outside this
X                         * chunk) - otherwise Rulep could be dereferenced
X                         * while null.
X                         */
X                        if (!Rulep || !Rulep->pattern.start)
X                                error("stop pattern without a start",
X                                      RE_ERROR);
X                        if (Rulep->pattern.stop)
X                                error("already have a stop pattern",
X                                      RE_ERROR);
X                        len = pat_compile(Workbuf);
X                        Rulep->pattern.stop = getmem(len);
X                        movemem(Workbuf, Rulep->pattern.stop, len);
X                }
X                else {
X                        /*
X                         * Assume it's a regular expression pattern.  Push
X                         * the character back so that pat_compile() sees the
X                         * whole pattern.
X                         */
X#ifdef DEBUG
X                        if (Debug)
X                                error("start pattern", 0);
X#endif
X
X                        ungetcharacter(c);
X                        len = pat_compile(Workbuf);
X
X                        if (*Workbuf == T_BEGIN) {
X                                /*
X                                 * Saw a "BEGIN" keyword - compile following
X                                 * action into special "Beginact" buffer.
X                                 */
X                                len = act_compile(Workbuf);
X                                Beginact = getmem(len);
X                                movemem(Workbuf, Beginact, len);
X                                continue;
X                        }
X                        if (*Workbuf == T_END) {
X                                /*
X                                 * Saw an "END" keyword - compile following
X                                 * action into special "Endact" buffer.
X                                 */
X                                len = act_compile(Workbuf);
X                                Endact = getmem(len);
X                                movemem(Workbuf, Endact, len);
X                                continue;
X                        }
X                        if (Rulep) {
X                                /*
X                                 * Already saw a pattern/action - link in
X                                 * another Rules structure.
X                                 */
X                                Rulep->nextrule =
X                                        (struct rule *) getmem(sizeof(*Rulep));
X                                Rulep = Rulep->nextrule;
X                                fillmem(Rulep, sizeof(*Rulep), 0);
X                        }
X                        if (!Rulep) {
X                                /*
X                                 * This is the first pattern encountered.
X                                 * Allocate the first Rules structure and
X                                 * initialize it
X                                 */
X                                Rules = Rulep =
X                                        (RULE *) getmem(sizeof(*Rulep));
X                                fillmem(Rulep, sizeof(*Rulep), 0);
X                        }
X                        /*
X                         * NOTE(review): Rulep was zero-filled just above on
X                         * either path, so this check looks unreachable.
X                         */
X                        if (Rulep->pattern.start)
X                                error("already have a start pattern",
X                                      RE_ERROR);
X
X                        Rulep->pattern.start = getmem(len);
X                        movemem(Workbuf, Rulep->pattern.start, len);
X                }
X        }
X        endfile();
X}
X
X/*
X * Text file main processing loop.
X */
Xprocess()
X{
X        /*
X         * Read a record at a time from the current input file "Fileptr"
X         * until getline() reports end of file, split it into fields and
X         * apply each rule in the Rules chain to it:
X         *   - a rule without a start pattern runs its action on every line;
X         *   - a rule whose start pattern was seen earlier runs its action
X         *     and then tests the stop pattern to close the range;
X         *   - otherwise the action runs when the start pattern matches, and
X         *     a range is opened if the rule also has a stop pattern.
X         * Resets Recordcount, and leaves Rulep null after each line.
X         */
X#ifdef DEBUG
X        if (Debug)
X                error("processing...", 0);
X#endif
X
X        Recordcount = 0;
X
X        while (getline()) {
X                /*
X                 * Parse the input line.
X                 */
X                Fieldcount = parse(Linebuf, Fields, Fieldsep);
X#ifdef DEBUG
X                if (Debug > 1) {
X                        int i;
X
X                        printf("parsed %d words:\n", Fieldcount);
X                        for (i = 0; i < Fieldcount; ++i)
X                                printf("<%s>\n", Fields[i]);
X                }
X#endif
X
X                /*
X                 * Walk the rule chain.  The chain is empty (Rules is null)
X                 * when the rules file held only BEGIN/END actions, so the
X                 * pointer must be tested before it is first dereferenced.
X                 */
X                for (Rulep = Rules; Rulep; Rulep = Rulep->nextrule) {
X                        if (!Rulep->pattern.start) {
X                                /*
X                                 * No pattern given - perform action on every
X                                 * input line.
X                                 */
X                                doaction(Rulep->action);
X                        }
X                        else if (Rulep->pattern.startseen) {
X                                /*
X                                 * Start pattern already found - perform
X                                 * action then check if line matches stop
X                                 * pattern.
X                                 */
X                                doaction(Rulep->action);
X                                if (dopattern(Rulep->pattern.stop))
X                                        Rulep->pattern.startseen = 0;
X                        }
X                        else if (dopattern(Rulep->pattern.start)) {
X                                /*
X                                 * Matched start pattern - perform action. If
X                                 * a stop pattern was given, set "start
X                                 * pattern seen" flag and process every input
X                                 * line until stop pattern found.
X                                 */
X                                doaction(Rulep->action);
X                                if (Rulep->pattern.stop)
X                                        Rulep->pattern.startseen = 1;
X                        }
X                }
X
X                /*
X                 * Release memory allocated by parse().
X                 */
X                while (Fieldcount)
X                        free(Fields[--Fieldcount]);
X        }
X}
X
X/*
X * Miscellaneous functions
X */
Xparse(str, wrdlst, delim)
X        char *str;
X        char *wrdlst[];
X        char *delim;
X{
X        /*
X         * Parse the string of words in "str" into the word list at "wrdlst".
X         * A "word" is a sequence of characters delimited by one or more of
X         * the characters found in the string "delim". Returns the number of
X         * words parsed. CAUTION: the memory for the words in "wrdlst" is
X         * allocated by getmem() and should eventually be returned by
X         * free()...
X         *
X         * Every word gets a MAXLINELEN sized buffer, just in case the user
X         * wants to copy a larger string into a field.  A word that would
X         * not fit in that buffer is truncated to MAXLINELEN - 1 characters
X         * instead of overrunning it.  The capacity of "wrdlst" is not known
X         * here and is not checked; the caller must supply enough slots.
X         */
X        int wrdcnt, wrdlen;
X        char *word, c;
X
X        wrdcnt = 0;
X        while (*str) {
X                /* skip the delimiters in front of the next word */
X                while (instr(*str, delim))
X                        ++str;
X                if (!*str)
X                        break;
X
X                word = getmem(MAXLINELEN);
X                wrdlen = 0;
X                while ((c = *str) && !instr(c, delim)) {
X                        /* keep one byte free for the terminator */
X                        if (wrdlen < MAXLINELEN - 1)
X                                word[wrdlen++] = c;
X                        ++str;
X                }
X                word[wrdlen] = 0;
X                wrdlst[wrdcnt++] = word;
X        }
X
X        return wrdcnt;
X}
X
Xunparse(wrdlst, wrdcnt, str, delim)
X        char *wrdlst[];
X        int wrdcnt;
X        char *str;
X        char *delim;
X{
X        /*
X         * Replace all the words in "str" with the words in "wrdlst",
X         * maintaining the same word separation distance as found in the
X         * string. A "word" is a sequence of characters delimited by one or
X         * more of the characters found in the string "delim".  Words in
X         * "str" beyond the first "wrdcnt" are dropped.
X         *
X         * The result is assembled in a local MAXLINELEN byte buffer and
X         * copied back over "str".  Replacement words may be longer than the
X         * originals, so the result is truncated to MAXLINELEN - 1 characters
X         * rather than being allowed to run past the local buffer.
X         * NOTE(review): "str" is assumed to have room for MAXLINELEN bytes
X         * - confirm against the callers.
X         */
X        int wc;
X        char strbuf[MAXLINELEN], *sp, *wp, *start, *limit;
X
X        wc = 0;                 /* next word in "wrdlst" */
X        sp = strbuf;            /* points to our local string */
X        limit = strbuf + MAXLINELEN - 1;        /* slot kept for the NUL */
X        start = str;            /* save start address of "str" for later... */
X        while (*str) {
X                /*
X                 * Copy the field delimiters from the original string to our
X                 * local version.
X                 */
X                while (instr(*str, delim)) {
X                        if (sp < limit)
X                                *sp++ = *str;
X                        ++str;
X                }
X                if (!*str)
X                        break;
X                /*
X                 * Skip over the field in the original string and...
X                 */
X                while (*str && !instr(*str, delim))
X                        ++str;
X
X                if (wc < wrdcnt) {
X                        /*
X                         * ...copy in the field in the wordlist instead.
X                         */
X                        for (wp = wrdlst[wc++]; *wp && sp < limit; ++wp)
X                                *sp++ = *wp;
X                }
X        }
X        /*
X         * Tie off the local string, then copy it back to caller's string.
X         */
X        *sp = 0;
X        strcpy(start, strbuf);
X}
X
Xinstr(c, s)
X        char c, *s;
X{
X        /*
X         * Return 1 if the character "c" occurs in the string "s", else 0.
X         * The terminating NUL of "s" is never matched.
X         */
X        for (; *s; ++s) {
X                if (*s == c)
X                        return 1;
X        }
X        return 0;
X}
X
Xchar *
Xgetmem(len)
X        unsigned len;
X{
X        /*
X         * Allocate "len" bytes with malloc() and return the new block.  On
X         * failure "out of memory" is reported through error() with severity
X         * MEM_ERROR; should error() return, a null pointer is handed back
X         * instead of falling off the end of the function.
X         */
X        char *cp, *malloc();
X
X        cp = malloc(len);
X        if (!cp) {
X                error("out of memory", MEM_ERROR);
X                return (char *) 0;
X        }
X        return cp;
X}
X
X/* char * */
Xnewfile(s)
X        char *s;
X{
X        /*
X         * Make "s" the current input file: reset the line counter, record
X         * the name in Filename and open the file for reading on Fileptr.
X         * A null "s" selects standard input.  A file that cannot be opened
X         * is reported through error() with severity FILE_ERROR.
X         */
X        Linecount = 0;
X        Filename = s;
X
X        if (!s) {
X                /*
X                 * No file name given - process standard input.
X                 */
X                Fileptr = stdin;
X                Filename = "standard input";
X        }
X        else {
X#ifdef BDS_C
X                if (fopen(s, Fileptr = Curfbuf) == -1)
X#else
X                if (!(Fileptr = fopen(s, "r")))
X#endif
X                        error("file not found", FILE_ERROR);
X        }
X}
X
Xgetline()
X{
X        /*
X         * Read a record of text from the current input file into Linebuf,
X         * stripping the trailing record separator (initially newline).
X         * Returns 1 when a record was read.  At end of file the input is
X         * closed with endfile() and 0 is returned; characters read after
X         * the last separator are left in Linebuf but not reported.
X         *
X         * At most MAXLINELEN - 1 characters are stored so that the
X         * terminating NUL always lands inside Linebuf; a longer record is
X         * cut there and its remainder is returned by the next call.
X         */
X        int rtn, len;
X
X        rtn = 0;
X        for (len = 0; len < MAXLINELEN - 1; ++len) {
X                if ((rtn = getcharacter()) == *Recordsep || rtn == -1)
X                        break;
X                Linebuf[len] = rtn;
X        }
X        Linebuf[len] = 0;
X
X        if (rtn == -1) {
X                endfile();
X                return 0;
X        }
X        return 1;
X}
X
Xgetcharacter()
X{
X        /*
X         * Read a character from the current input file and keep the record
X         * and line counters up to date. WARNING: your getc() must convert
X         * lines that end with CR+LF to LF and CP/M's EOF character (^Z) to
X         * a -1. Also, getc() must return a -1 when attempting to read from
X         * an unopened file.
X         */
X        int ch;
X
X        ch = getc(Fileptr);
X#ifdef BDS_C
X        /*
X         * BDS C doesn't do CR+LF to LF and ^Z to -1 conversions, so do
X         * them here.
X         */
X        if (ch == '\r') {
X                ch = getc(Fileptr);
X                if (ch != '\n') {
X                        /* lone CR: put the next character back */
X                        ungetc(ch);
X                        ch = '\r';
X                }
X        }
X        else if (ch == 26)      /* ^Z */
X                ch = -1;
X#endif
X
X        if (ch == '\n')
X                ++Linecount;
X        if (ch == *Recordsep)
X                ++Recordcount;
X
X        return ch;
X}
X
Xungetcharacter(c)
X{
X        /*
X         * Push the character "c" back onto the current input file and
X         * return what ungetc() returns. The line counter is wound back for
X         * a newline and the record counter for a record separator, undoing
X         * the counting done when the character was read.
X         */
X        if (c == '\n')
X                --Linecount;
X        if (c == *Recordsep)
X                --Recordcount;
X
X        return ungetc(c, Fileptr);
X}
X
Xendfile()
X{
X        /*
X         * Finish with the current input file: close it, clear the file name
X         * and reset the line counter.  Standard input is deliberately left
X         * open, because main() may select it again (a later "-" argument,
X         * or the default when no text file is named) and reading from a
X         * closed stream is not safe.
X         */
X        if (Fileptr != stdin)
X                fclose(Fileptr);
X        Filename = "";
X        Linecount = 0;
X}
X
Xerror(s, severe)
X        char *s;
X        int severe;
X{
X        /*
X         * Print the message "s" on stderr, prefixed with the current file
X         * name (when Filename is set) and the current line number (when
X         * Linecount is non-zero).  A non-zero "severe" ends the program
X         * with exit status 1; otherwise control returns to the caller.
X         */
X        if (Filename)
X                fprintf(stderr, "%s:", Filename);
X
X        if (Linecount)
X                fprintf(stderr, " line %d:", Linecount);
X
X        fprintf(stderr, " %s\n", s);
X        if (severe)
X                exit(1);
X}
X
Xusage()
X{
X        /*
X         * Print the command synopsis through error().  USAGE_ERROR is passed
X         * as the severity; its value is defined outside this chunk, so
X         * whether this call returns cannot be told from here.
X         */
X        error("Usage: bawk <actfile> [<file> ...]\n", USAGE_ERROR);
X}
X
Xmovemem(from, to, count)
X        char *from, *to;
X        int count;
X{
X        /*
X         * Copy "count" bytes from "from" to "to", lowest address first.
X         * Nothing is copied when "count" is zero or negative.
X         */
X        int i;
X
X        for (i = 0; i < count; ++i)
X                to[i] = from[i];
X}
X
X
Xstrncmp(s, t, n)
X        char *s, *t;
X        int n;
X{
X        /*
X         * Compare the strings "s" and "t" over at most "n" characters.
X         * Advance over at most n - 1 matching characters, then return the
X         * difference of the characters reached: zero when they are equal
X         * (including when both strings have ended), non-zero otherwise.
X         */
X        for (;;) {
X                if (--n <= 0)
X                        break;
X                if (!*s || !*t || *s != *t)
X                        break;
X                ++s;
X                ++t;
X        }
X        return *s - *t;
X}
X
Xnum(c)
X        char c;
X{
X        /* Return 1 if "c" is a decimal digit, else 0. */
X        return c >= '0' && c <= '9';
X}
X
Xalpha(c)
X        char c;
X{
X        /* Return 1 if "c" is a letter or an underscore, else 0. */
X        if (c == '_')
X                return 1;
X        if (c >= 'a' && c <= 'z')
X                return 1;
X        if (c >= 'A' && c <= 'Z')
X                return 1;
X        return 0;
X}
X
Xalphanum(c)
X        char c;
X{
X        /* Return 1 if "c" is a digit, a letter or an underscore, else 0. */
X        if (num(c))
X                return 1;
X        return alpha(c) ? 1 : 0;
X}
X
Xfillmem(array, count, value)
X        char *array, value;
X        int count;
X{
X        /*
X         * Store "value" in each of the first "count" bytes of "array".
X         * Nothing is written when "count" is zero or negative.
X         */
X        int i;
X
X        for (i = 0; i < count; ++i)
X                array[i] = value;
X}
X
+ END-OF-FILE bawk.c
# Owner may read and write the extracted file; group and others may read it.
chmod 'u=rw,g=r,o=r' \b\a\w\k\.\c
# Compare the first field printed by sum(1) with the expected value 30221.
# A mismatch only prints a warning on stderr; extraction continues.
# NOTE: any change to the bawk.c payload above changes its checksum.
set `sum \b\a\w\k\.\c`
sum=$1
case $sum in
30221)	:;;
*)	echo 'Bad sum in '\b\a\w\k\.\c >&2
esac
echo Extracting \b\a\w\k\.\d\o\c
# Strip the leading X guard character from each payload line and write the
# result to bawk.doc; the payload ends at the '+ END-OF-FILE bawk.doc' line.
sed 's/^X//' > \b\a\w\k\.\d\o\c << '+ END-OF-FILE '\b\a\w\k\.\d\o\c
XNAME
X
X	bawk - text processor
X
XSYNOPSIS
X
X	bawk rules [file] ...
X
XDESCRIPTION
X
X	Bawk is a text processing program that searches files for
X	specific patterns and performs "actions" for every occurrence
X	of these patterns.  The patterns can be "regular expressions"
X	as used in the UNIX "ex" editor.  The actions are expressed
X	using a subset of the "C" language.
X
X	The patterns and actions are usually placed in a "rules" file
X	whose name must be the first argument in the command line.
X	All other arguments are taken to be the names of text files on
X	which the rules are to be applied.
X	The special file name "-" may also be used anywhere on the
X	command line to take input from the standard input device.
X
X	The command:
X
X		bawk - prog.c - prog.h
X
X	would read the patterns and actions rules from the standard
X	input, then apply them to the files "prog.c", the standard
X	input and "prog.h" in that order.
X
X	The general format of a rules file is:
X
X		<pattern> { <action> }
X		<pattern> { <action> }
X		...
X
X	There may be any number of these <pattern> { <action> }
X	sequences in the rules file.  Bawk reads a line of input from
X	the current input file and applies every <pattern> { <action> }
X	in sequence to the line.
X	
X	If the <pattern> corresponding to any { <action> } is missing,
X	the action is applied to every line of input.  The default
X	{ <action> } is to print the matched input line.
X
XPATTERNS
X
X	The <pattern>'s may consist of any valid C expression.  If the
X	<pattern> consists of two expressions separated by a comma, it
X	is taken to be a range and the <action> is performed on all
X	lines of input that match the range.  <pattern>'s may contain
X	"regular expressions" delimited by an '@' symbol.  Regular
X	expressions can be thought of as a generalized "wildcard"
X	string matching mechanism, similar to that used by many
X	operating systems to specify file names.  Regular expressions
X	may contain any of the following characters:
X
X		x	An ordinary character (not mentioned below)
X			matches that character.
X		'\'	The backslash quotes any character.
X			"\$" matches a dollar-sign.
X		'^'	A circumflex at the beginning of an expression
X			matches the beginning of a line.
X		'$'	A dollar-sign at the end of an expression
X			matches the end of a line.
X		'.'	A period matches any single character except
X			newline.
X		':x'	A colon matches a class of characters described
X			by the character following it:
X		':a'	":a" matches any alphabetic;
X		':d'	":d" matches digits;
X		':n'	":n" matches alphanumerics;
X		': '	": " matches spaces, tabs, and other control
X			characters, such as newline.
X		'*'	An expression followed by an asterisk matches
X			zero or more occurrences of that expression:
X			"fo*" matches "f", "fo", "foo", "fooo", etc.
X		'+'	An expression followed by a plus sign matches
X			one or more occurrences of that expression:
X			"fo+" matches "fo", "foo", "fooo", etc.
X		'-'	An expression followed by a minus sign
X			optionally matches the expression.
X		'[]'	A string enclosed in square brackets matches
X			any single character in that string, but no
X			others.  If the first character in the string
X			is a circumflex, the expression matches any
X			character except newline and the characters in
X			the string.  For example, "[xyz]" matches "xx"
X			and "zyx", while "[^xyz]" matches "abc" but not
X			"axb".  A range of characters may be specified
X			by two characters separated by "-".  Note that,
X			[a-z] matches alphabetics, while [z-a] never
X			matches.
X
X	For example, the following rules file would print every line
X	that contained a valid C identifier:
X
X		@[a-zA-Z][a-zA-Z0-9]@
X
X	And this rules file would print all lines between and including
X	the ones that contained the word "START" and "END":
X
X		@START@, @END@
X
XACTIONS
X
X	Actions are expressed as a subset of the C language.  All
X	variables are global and default to int's if not formally
X	declared.  Variable declarations may appear anywhere within
X	an action.  Only char's and int's and pointers and arrays of
X	char and int are allowed.  Bawk allows only decimal integer
X	constants to be used - no hex (0xnn) or octal (0nn). String
X	and character constants may contain all of the special C
X	escapes (\n, \r, etc.).
X
X	Bawk supports the "if", "else", "while" and "break" flow of
X	control constructs, which behave exactly as in C.
X
X	Also supported are the following unary and binary operators,
X	listed in order from highest to lowest precedence:
X
X		operator           type    associativity
X		() []              unary   left to right
X		! ~ ++ -- - * &    unary   right to left
X		* / %              binary  left to right
X		+ -                binary  left to right
X		<< >>              binary  left to right
X		< <= > >=          binary  left to right
X		== !=              binary  left to right
X		&                  binary  left to right
X		^                  binary  left to right
X		|                  binary  left to right
X		&&                 binary  left to right
X		||                 binary  left to right
X		=                  binary  right to left
X
X	Comments are introduced by a '#' symbol and are terminated by
X	the first newline character.  The standard "/*" and "*/"
X	comment delimiters are not supported and will result in a
X	syntax error.
X
XFIELDS
X
X	When bawk reads a line from the current input file, the
X	record is automatically separated into "fields".  A field is
X	simply a string of consecutive characters delimited by either
X	the beginning or end of line, or a "field separator" character.
X	Initially, the field separators are the space and tab characters.
X	The special unary operator '$' is used to reference one of the
X	fields in the current input record (line).  The fields are
X	numbered sequentially starting at 1.  The expression "$0"
X	references the entire input line.
X
X	Similarly, the "record separator" is used to determine the end
X	of an input "line", initially the newline character.
X	The field and record separators may be changed programmatically
X	by one of the actions and will remain in effect until changed
X	again.
X
X	Fields behave exactly like strings; and can be used in the same
X	context as a character array.  These "arrays" can be considered
X	to have been declared as:
X
X		char ($n)[ 128 ];
X
X	In other words, they are 128 bytes long.  Notice that the
X	parentheses are necessary because the operators [] and $
X	associate from right to left; without them, the statement
X	would have parsed as:
X
X		char $(1[ 128 ]);
X
X	which is obviously ridiculous.
X
X	If the contents of one of these field arrays is altered, the
X	"$0" field will reflect this change.  For example, this
X	expression:
X
X		*$4 = 'A';
X
X	will change the first character of the fourth field to an upper-
X	case letter 'A'.  Then, when the following input line:
X
X		120 PRINT "Name         address        Zip"
X
X	is processed, it would be printed as:
X
X		120 PRINT "Name         Address        Zip"
X
X	Fields may also be modified with the strcpy() function (see
X	below).  For example, the expression:
X
X		strcpy( $4, "Addr." );
X
X	applied to the same line above would yield:
X
X		120 PRINT "Name         Addr.        Zip"
X
XPREDEFINED VARIABLES
X
X	The following variables are pre-defined:
X
X		FS		Field separator (see below).
X		RS		Record separator (see below also).
X		NF		Number of fields in current input
X				record (line).
X		NR		Number of records processed thus far.
X		FILENAME	Name of current input file.
X		BEGIN		A special <pattern> that matches the
X				beginning of input text, before the
X				first record is read.
X		END		A special <pattern> that matches the
X				end of input text, after the last
X				record has been read.
X
X	Bawk also provides some useful builtin functions for string
X	manipulation and printing:
X
X		printf(arg..)	Exactly the printf() function from C.
X		getline()	Reads the next record from the current
X				input file and returns 0 on end of file.
X		nextfile()	Closes out the current input file and
X				begins processing the next file in the
X				list (if any).
X		strlen(s)	Returns the length of its string argument.
X		strcpy(s,t)	Copies the string "t" to the string "s".
X		strcmp(s,t)	Compares the "s" to "t" and returns 0 if
X				they match.
X		toupper(c)	Returns its character argument converted
X				to upper-case.
X		tolower(c)	Returns its character argument converted
X				to lower-case.
X		match(s,@re@)	Compares the string "s" to the regular
X				expression "re" and returns the number
X				of matches found (zero if none).
X
XEXAMPLES
X
X	The following rules file will scan a C program, counting the
X	number of mismatched parentheses, brackets, and braces.
X
X		@[()\[\]{}]@
X		{
X			parens = parens + match( $0, @(@ );
X			parens = parens - match( $0, @)@ );
X			bracks = bracks + match( $0, @[@ );
X			bracks = bracks - match( $0, @]@ );
X			braces = braces + match( $0, @{@ );
X			braces = braces - match( $0, @}@ );
X		}
X		END { printf("parens=%d, brackets=%d, braces=%d\n",
X				parens, bracks, braces );
X		}
X
X	This program will capitalize the first word in every sentence of
X	a document:
X
X		BEGIN
X		{
X			RS = '.';  # set record separator to a period
X		}
X		{
X			if ( match( $1, @^[a-z]@ ) )
X				*$1 = toupper( *$1 );
X			printf( "%s\n", $0 );
X		}
X
XLIMITATIONS
X
X	Bawk was originally written in BDS C, but every attempt was made
X	to keep the code as portable as possible.  The program should
X	be compilable with any "standard" C compiler.  On CP/M systems
X	compiled with BDS C, bawk takes up about 24K.
X
X	An input record may be no longer than 128 characters. If longer
X	records are encountered, they terminate prematurely and the
X	next record starts where the previous one was hacked off.
X
X	A single pattern or action statement may be no longer than about
X	4K characters, excluding comments and whitespace.  Since the
X	program is semi-compiled the tokenized version will probably
X	wind up being smaller than the source code, so the 4K figure is
X	only approximate.
X
XAUTHOR
X
X	Bob Brodt
X	486 Linden Ave.
X	Bogota, NJ 07603
X
XACKNOWLEDGEMENTS
X
X	The concept for bawk (and 3/4 of the name!) was taken from
X	the program "awk" written by Alfred V. Aho, Brian W. Kernighan
X	and Peter J. Weinberger.  My apologies for any irreverences.
X
X	The regular expression compiler/parser was borrowed from a
X	program called "grep" and has been highly modified.  Grep is
X	distributed by the DEC Users Society (DECUS) and is Copyright
X	(C) 1980 by DECUS.  The author acknowledges DECUS with a nod of
X	thanks for giving their general permission and okey-dokey to
X	copy or modify the grep program.
X
X	UNIX is a trademark of AT&T Bell Labs.
+ END-OF-FILE bawk.doc
# Owner may read and write the extracted file; group and others may read it.
chmod 'u=rw,g=r,o=r' \b\a\w\k\.\d\o\c
# Compare the first field printed by sum(1) with the expected value 36437.
# A mismatch only prints a warning on stderr; extraction continues.
# NOTE: any change to the bawk.doc payload above changes its checksum.
set `sum \b\a\w\k\.\d\o\c`
sum=$1
case $sum in
36437)	:;;
*)	echo 'Bad sum in '\b\a\w\k\.\d\o\c >&2
esac
echo Extracting \b\a\w\k\.\h
# Strip the leading X guard character from each payload line and write the
# result to bawk.h; the payload ends at the '+ END-OF-FILE bawk.h' line.
sed 's/^X//' > \b\a\w\k\.\h << '+ END-OF-FILE '\b\a\w\k\.\h
X#include <ctype.h>
X/*
X * Bawk constants and variable declarations.
X */
X
X/*
X * EXTERN is placed in front of every global declared in this header.
X * It expands to nothing in the one source file that defines MAIN before
X * including this header (bawk.c), so the variables are defined there,
X * and to "extern" in every other file, so they are only declared.
X * Under BDS C it always expands to nothing.
X */
X#ifdef BDS_C
X#define EXTERN
X#else
X
X#ifdef MAIN
X#define EXTERN
X#else
X#define EXTERN extern
X#endif
X
X#endif
X
X
X#ifdef DEBUG
XEXTERN char Debug;              /* debug print flag */
X#endif
X
X/*
X * Table and buffer sizes
X */
X/* size in bytes of Linebuf and of the buffer allocated for each field */
X#define MAXLINELEN      128
X/* number of slots in the Fields pointer array */
X#define MAXWORDS        (MAXLINELEN/2)
X/* size in bytes of Workbuf, the pattern/action compilation work area */
X#define MAXWORKBUFLEN   4096
X/* presumably the capacity of the variable table - not visible here */
X#define MAXVARTABSZ     50
X/* presumably the longest variable name - not visible here */
X#define MAXVARLEN       10
X/* distance from Stackbtm to Stacktop, as set up in main() */
X#define MAXSTACKSZ      40
X
X
X/**********************************************************
X * Current Input File variables                           *
X **********************************************************/
X/*
X * Current Input File pointer:
X */
X#ifdef BDS_C
XEXTERN char *Fileptr, Curfbuf[BUFSIZ];
X#else
XEXTERN FILE *Fileptr;
X#endif
XEXTERN char *Filename;          /* current input file name */
XEXTERN int Linecount;           /* current input line number */
XEXTERN int Recordcount;         /* record count */
X/*
X * Working buffers.
X */
XEXTERN char Linebuf[MAXLINELEN];/* current input line buffer */
XEXTERN char *Fields[MAXWORDS];  /* pointers to the words in Linebuf */
XEXTERN int Fieldcount;          /* and the # of words */
XEXTERN char Workbuf[MAXWORKBUFLEN];     /* work area for C action and */
X /* regular expression parsers */
X
X/**********************************************************
X * Regular Expression Parser variables                    *
X **********************************************************/
X/*
X * Tokens:
X */
X#define CHAR    1
X#define BOL     2
X#define EOL     3
X#define ANY     4
X#define CLASS   5
X#define NCLASS  6
X#define STAR    7
X#define PLUS    8
X#define MINUS   9
X#define ALPHA   10
X#define DIGIT   11
X#define NALPHA  12
X#define PUNCT   13
X#define RANGE   14
X#define ENDPAT  15
X
X
X/**********************************************************
X * C Actions Interpreter variables                        *
X **********************************************************/
X/*
X * Tokens:
X */
X#define T_STRING        'S'
X#define T_DOLLAR        '$'
X#define T_REGEXP        'r'
X#define T_CONSTANT      'C'
X#define T_VARIABLE      'V'
X#define T_FUNCTION      'F'
X#define T_SEMICOLON     ';'
X#define T_EOF           'Z'
X#define T_LBRACE        '{'
X#define T_RBRACE        '}'
X#define T_LPAREN        '('
X#define T_RPAREN        ')'
X#define T_LBRACKET      '['
X#define T_RBRACKET      ']'
X#define T_COMMA         ','
X#define T_ASSIGN        '='
X#define T_MUL           '*'
X#define T_DIV           '/'
X#define T_MOD           '%'
X#define T_ADD           '+'
X#define T_SUB           '-'
X#define T_SHL           'L'
X#define T_SHR           'R'
X#define T_LT            '<'
X#define T_LE            'l'
X#define T_GT            '>'
X#define T_GE            'g'
X#define T_EQ            'q'
X#define T_NE            'n'
X#define T_NOT           '~'
X#define T_AND           '&'
X#define T_XOR           '^'
X#define T_IOR           '|'
X#define T_LNOT          '!'
X#define T_LAND          'a'
X#define T_LIOR          'o'
X#define T_INCR          'p'
X#define T_DECR          'm'
X#define T_IF            'i'
X#define T_ELSE          'e'
X#define T_WHILE         'w'
X#define T_BREAK         'b'
X#define T_CHAR          'c'
X#define T_INT           't'
X#define T_BEGIN         'B'
X#define T_END           'E'
X#define T_NF            'f'
X#define T_NR            '#'
X#define T_FS            ' '
X#define T_RS            '\n'
X#define T_FILENAME      'z'
X
X#define PATTERN 'P'
X#define ACTION  'A'
X
X/*
X * Symbol Table values
X */
X#define ACTUAL          0
X#define LVALUE          1
X#define BYTE            1
X#define WORD            2
X/*
X * Symbol table
X */
X/* One symbol table entry. */
Xstruct variable {
X        char vname[MAXVARLEN];  /* symbol name */
X        char vclass;            /* NOTE(review): meaning not visible in this
X                                 * header - confirm against the interpreter */
X        char vsize;             /* NOTE(review): presumably BYTE or WORD -
X                                 * confirm */
X        int vlen;               /* NOTE(review): meaning not visible in this
X                                 * header - confirm */
X        char *vptr;             /* NOTE(review): presumably the symbol's
X                                 * storage - confirm */
X};
X#define VARIABLE struct variable
X/* The symbol table itself; main() starts Nextvar at Vartab. */
XEXTERN VARIABLE Vartab[MAXVARTABSZ], *Nextvar;
X/*
X * Value stack
X */
X/* A value: an int, a character pointer, or a pointer to such a pointer. */
Xunion datum {
X        int ival;
X        char *dptr;
X        char **ptrptr;
X};
X#define DATUM union datum
X/* One value stack entry: a DATUM plus three one-byte descriptors. */
Xstruct item {
X        char class;             /* NOTE(review): meaning not visible in this
X                                 * header - confirm against the interpreter */
X        char lvalue;            /* NOTE(review): presumably nonzero when the
X                                 * entry is assignable - confirm */
X        char size;              /* NOTE(review): presumably BYTE or WORD -
X                                 * confirm */
X        DATUM value;
X};
X#define ITEM struct item
X/*
X * The stack storage and its pointers.  main() starts Stackptr one entry
X * below Stackbtm and sets Stacktop to Stackbtm + MAXSTACKSZ.
X */
XEXTERN ITEM Stackbtm[MAXSTACKSZ], *Stackptr, *Stacktop;
X/*
X * Miscellaneous
X */
XEXTERN char *Actptr;            /* pointer into Workbuf during compilation */
XEXTERN char Token;              /* current input token */
XEXTERN DATUM Value;             /* and its value */
XEXTERN char Saw_break;          /* set when break stmt seen */
XEXTERN char Where;              /* indicates whether C stmt is a PATTERN or
X                                 * ACTION */
XEXTERN char Fieldsep[3];        /* field separator; main() sets it to
X                                 * space and tab */
XEXTERN char Recordsep[3];       /* record separator; main() sets it to
X                                 * newline */
XEXTERN char *Beginact;          /* BEGINning of input actions */
XEXTERN char *Endact;            /* END of input actions */
X
X/**********************************************************
X * Rules structure                                        *
X **********************************************************/
Xstruct rule {
X        struct {
X                char *start;    /* C statements that match pattern start */
X                char *stop;     /* C statements that match pattern end */
X                char startseen; /* set if both a start and stop pattern */
X                /* given and if an input line matched the */
X                /* start pattern */
X        } pattern;
X        char *action;           /* contains quasi-C statements of actions */
X        struct rule *nextrule;  /* pointer to next rule */
X};
X#define RULE struct rule
XEXTERN RULE *Rules,             /* rule structures linked list head */
X*Rulep;                         /* working pointer */
X
X
X/**********************************************************
X * Miscellaneous                                          *
X **********************************************************/
X/*
X * Error exit values (returned to command shell)
X */
X#define USAGE_ERROR     1
X#define FILE_ERROR      2
X#define RE_ERROR        3
X#define ACT_ERROR       4
X#define MEM_ERROR       5
X/*
X * Functions that return something special:
X */
Xchar *
Xstr_compile(), *getmem(), *cclass(), *pmatch(), *fetchptr();
Xchar *storeptr();
XVARIABLE *
Xfindvar(), *addvar(), *decl();
+ END-OF-FILE bawk.h
# Give the file just extracted the mode rw-r--r--.
chmod 'u=rw,g=r,o=r' \b\a\w\k\.\h
# Checksum check: the output of sum becomes the positional parameters,
# the first one is compared with the expected value, and a warning is
# written to stderr when they differ.
set `sum \b\a\w\k\.\h`
sum=$1
case $sum in
06605)	:;;
*)	echo 'Bad sum in '\b\a\w\k\.\h >&2
esac
# Unpack the next member: sed strips the leading X from every line of
# the here-document that follows.
echo Extracting \b\a\w\k\a\c\t\.\c
sed 's/^X//' > \b\a\w\k\a\c\t\.\c << '+ END-OF-FILE '\b\a\w\k\a\c\t\.\c
X/*
X * Bawk C actions compiler
X */
X#include <stdio.h>
X#include "bawk.h"
X
Xact_compile(actbuf)
X	char *actbuf;		/* where tokenized actions are compiled into */
X{
X	Where = ACTION;
X	return stmt_compile(actbuf);
X}
X
Xpat_compile(actbuf)
X	char *actbuf;		/* where tokenized actions are compiled into */
X{
X	Where = PATTERN;
X	return stmt_compile(actbuf);
X}
X
Xstmt_compile(actbuf)
X	char *actbuf;		/* where tokenized actions are compiled into */
X{
X	/*
X	 * Read and tokenize C actions from current input file into the
X	 * action buffer.  Strip out comments and whitespace in the process. 
X	 */
X	char *actptr,		/* actbuf pointer */
X	*cp,			/* work pointer */
X	 buf[MAXLINELEN];	/* string buffer */
X	int braces,		/* counts '{}' pairs - return when 0 */
X	 parens,		/* counts '()' pairs */
X	 i,			/* temp */
X	 c;			/* current input character */
X
X	braces = parens = 0;
X	actptr = actbuf;
X	while ((c = getcharacter()) != -1) {
X		/*
X		 * Skip over spaces, tabs and newlines 
X		 */
X		if (c == ' ' || c == '\t' || c == '\n')
X			continue;
X		if (c == '#') {
X			/*
X			 * Skip comments.  Comments start with a '#' and end
X			 * at the next newline. 
X			 */
X			while ((c = getcharacter()) != -1 && c != '\n');
X			continue;
X		}
X
X		if (c == '{') {
X			if (Where == PATTERN) {
X				/*
X				 * We're compiling a pattern. The '{' marks
X				 * the beginning of an action statement. Push
X				 * the character back and return. 
X				 */
X				ungetcharacter('{');
X				break;
X			}
X			else {
X				/*
X				 * We must be compiling an action statement.
X				 * '{'s mark beginning of action or compound
X				 * statements. 
X				 */
X				++braces;
X				*actptr++ = T_LBRACE;
X			}
X		}
X		else if (c == '}') {
X			*actptr++ = T_RBRACE;
X			if (!--braces)
X				/*
X				 * Found the end of the action string 
X				 */
X				break;
X		}
X		else if (c == '(') {
X			++parens;
X			*actptr++ = T_LPAREN;
X		}
X		else if (c == ')') {
X			if (--parens < 0)
X				error("mismatched '()'", ACT_ERROR);
X			*actptr++ = T_RPAREN;
X		}
X		else if (c == ',' && !braces && !parens && Where == PATTERN) {
X			/*
X			 * found a comma outside of any braces or parens-
X			 * this must be a regular expression seperator. 
X			 */
X			ungetcharacter(',');
X			break;
X		}
X
X		/*
X		 * Check if it's a regular expression: 
X		 */
X		else if (c == '/') {
X			/*
X			 * A '/' inside a pattern string starts a regular
X			 * expression.  Inside action strings, a '/' is the
X			 * division operator. 
X			 */
X			if (Where == PATTERN)
X				goto dopattern;
X			else
X				*actptr++ = T_DIV;
X		}
X		else if (c == '@') {
X	dopattern:
X			/*
X			 * Within action strings, only the '@' may be used to
X			 * delimit regular expressions 
X			 */
X			*actptr++ = T_REGEXP;
X			ungetcharacter(c);
X			actptr += re_compile(actptr);
X		}
X
X		/*
X		 * symbol, string or constant: 
X		 */
X		else if (alpha(c)) {
X			/*
X			 * It's a symbol reference. Copy the symbol into
X			 * string buffer. 
X			 */
X			cp = buf;
X			do
X				*cp++ = c;
X			while ((c = getcharacter()) != -1 && alphanum(c));
X			ungetcharacter(c);
X			*cp = 0;
X			/*
X			 * Check if a keyword, builtin function or variable. 
X			 */
X			if (c = iskeyword(buf))
X				*actptr++ = c;
X			else if (i = isfunction(buf)) {
X				*actptr++ = T_FUNCTION;
X				storeint(actptr, i);
X				actptr += sizeof(i);
X			}
X			else {
X				/*
X				 * It's a symbol name. 
X				 */
X				*actptr++ = T_VARIABLE;
X			if (!(cp = (char *) findvar(buf)))
X					cp = (char *) addvar(buf);
X				storeptr(actptr, cp);
X				actptr += sizeof(cp);
X			}
X		}
X
X		else if (c == '"') {
X			/*
X			 * It's a string constant 
X			 */
X			*actptr++ = T_STRING;
X			actptr = str_compile(actptr, '"');
X		}
X		else if (c == '\'') {
X			/*
X			 * It's a character constant 
X			 */
X			*actptr++ = T_CONSTANT;
X			str_compile(buf, '\'');
X			storeint(actptr, *buf);
X			actptr += sizeof(i);
X		}
X
X		else if (num(c)) {
X			/*
X			 * It's a numeric constant 
X			 */
X			*actptr++ = T_CONSTANT;
X			cp = buf;
X			do
X				*cp++ = c;
X			while ((c = getcharacter()) != -1 && num(c));
X			ungetcharacter(c);
X			*cp = 0;
X			storeint(actptr, atoi(buf));
X			actptr += sizeof(i);
X		}
X
X		/*
X		 * unary operator: 
X		 */
X		else if (c == '$')
X			*actptr++ = T_DOLLAR;
X
X		/*
X		 * or binary operator: 
X		 */
X		else if (c == '=') {
X			if ((c = getcharacter()) == '=')
X				*actptr++ = T_EQ;
X			else {
X				ungetcharacter(c);
X				*actptr++ = T_ASSIGN;
X			}
X		}
X
X		else if (c == '!') {
X			if ((c = getcharacter()) == '=')
X				*actptr++ = T_NE;
X			else {
X				ungetcharacter(c);
X				*actptr++ = T_LNOT;
X			}
X		}
X
X		else if (c == '<') {
X			if ((c = getcharacter()) == '<')
X				*actptr++ = T_SHL;
X			else if (c == '=')
X				*actptr++ = T_LE;
X			else {
X				ungetcharacter(c);
X				*actptr++ = T_LT;
X			}
X		}
X
X		else if (c == '>') {
X			if ((c = getcharacter()) == '>')
X				*actptr++ = T_SHR;
X			else if (c == '=')
X				*actptr++ = T_GE;
X			else {
X				ungetcharacter(c);
X				*actptr++ = T_GT;
X			}
X		}
X
X		else if (c == '&') {
X			if ((c = getcharacter()) == '&')
X				*actptr++ = T_LAND;
X			else {
X				ungetcharacter(c);
X				*actptr++ = T_AND;
X			}
X		}
X
X		else if (c == '|') {
X			if ((c = getcharacter()) == '|')
X				*actptr++ = T_LIOR;
X			else {
X				ungetcharacter(c);
X				*actptr++ = T_IOR;
X			}
X		}
X		else if (c == '+') {
X			if ((c = getcharacter()) == '+')
X				*actptr++ = T_INCR;
X			else {
X				ungetcharacter(c);
X				*actptr++ = T_ADD;
X			}
X		}
X
X		else if (c == '-') {
X			if ((c = getcharacter()) == '-')
X				*actptr++ = T_DECR;
X			else {
X				ungetcharacter(c);
X				*actptr++ = T_SUB;
X			}
X		}
X
X		/*
X		 * punctuation 
X		 */
X		else if (instr(c, "[](),;*/%+-^~"))
X			*actptr++ = c;
X
X		else {
X			/*
X			 * Bad character in input line 
X			 */
X			error("lexical error", ACT_ERROR);
X		}
X
X		if (actptr >= Workbuf + MAXWORKBUFLEN)
X			error("action too long", MEM_ERROR);
X	}
X	if (braces || parens)
X		error("mismatched '{}' or '()'", ACT_ERROR);
X
X	*actptr++ = T_EOF;
X
X	return actptr - actbuf;
X}
X
Xchar *
Xstr_compile(str, delim)
X	char *str, delim;
X{
X	/*
X	 * Compile a string from current input file into the given string
X	 * buffer.  Stop when input character is the delimiter in "delim".
X	 * Returns a pointer to the first character after the string. 
X	 */
X	int c;
X	char buf[MAXLINELEN];
X
X	while ((c = getcharacter()) != -1 && c != delim) {
X		if (c == '\\') {
X			switch (c = getcharacter()) {
X			case -1:
X				goto err;
X			case 'b':
X				c = '\b';
X				break;
X			case 'n':
X				c = '\n';
X				break;
X			case 't':
X				c = '\t';
X				break;
X			case 'f':
X				c = '\f';
X				break;
X			case 'r':
X				c = '\r';
X				break;
X			case '0':
X			case '1':
X			case '2':
X			case '3':
X				*buf = c;
X				for (c = 1; c < 3; ++c) {
X					if ((buf[c] = getcharacter()) == -1)
X						goto err;
X				}
X				buf[c] = 0;
X				sscanf(buf, "%o", &c);
X				break;
X			case '\n':
X				if (getcharacter() == -1)
X					goto err;
X			default:
X				if ((c = getcharacter()) == -1)
X					goto err;
X			}
X		}
X		*str++ = c;
X	}
X	*str++ = 0;
X
X	return (str);
Xerr:
X	sprintf(buf, "missing %c delimiter", delim);
X	error(buf, 4);
X}
X
Xstoreint(ip, i)
X	int *ip, i;
X{
X	return *ip = i;
X}
X
Xchar *
Xstoreptr(pp, p)
X	char **pp, *p;
X{
X	return (*pp = p);
X}
X
Xfetchint(ip)
X	int *ip;
X{
X	return *ip;
X}
X
Xchar *
Xfetchptr(pp)
X	char **pp;
X{
X	return *pp;
X}
X
Xgetoken()
X{
X	char *cp;
X	int i;
X
X	switch (Token = *Actptr++) {
X	case T_STRING:
X	case T_REGEXP:
X		Value.dptr = Actptr;
X		Actptr += strlen(Actptr) + 1;
X		break;
X	case T_VARIABLE:
X		Value.dptr = fetchptr(Actptr);
X		Actptr += sizeof(cp);
X		break;
X	case T_FUNCTION:
X	case T_CONSTANT:
X		Value.ival = fetchint(Actptr);
X		Actptr += sizeof(i);
X		break;
X	case T_EOF:
X		--Actptr;
X	default:
X		Value.dptr = 0;
X	}
X
X#ifdef DEBUG
X	if (Debug > 1)
X		printf("Token='%c' (0x%x), Value=%d\n",
X		       Token, Token, Value.ival);
X#endif
X
X	return Token;
X}
+ END-OF-FILE bawkact.c
# Give the file just extracted the mode rw-r--r--.
chmod 'u=rw,g=r,o=r' \b\a\w\k\a\c\t\.\c
# Checksum check: the output of sum becomes the positional parameters,
# the first one is compared with the expected value, and a warning is
# written to stderr when they differ.
set `sum \b\a\w\k\a\c\t\.\c`
sum=$1
case $sum in
28797)	:;;
*)	echo 'Bad sum in '\b\a\w\k\a\c\t\.\c >&2
esac
# Stop here so that anything following the archive is never read as
# shell input.
exit 0