brennan@ssc-vax.UUCP (Mike Brennan) (05/11/91)
------------------cut here---------------- # This is a shell archive. Remove anything before this line, # then unpack it by saving it in a file and typing "sh file". # # Wrapped by ssc-bee!brennan on Fri May 10 18:11:41 PDT 1991 # Contents: mawk0.97/ mawk0.97/rexp/ mawk0.97/test/ mawk0.97/examples/ # mawk0.97/msdos/ mawk0.97/packing.list mawk0.97/README # mawk0.97/LIMITATIONS mawk0.97/Makefile mawk0.97/mawk.manual # mawk0.97/array.c mawk0.97/bi_funct.c mawk0.97/bi_funct.h # mawk0.97/bi_vars.c mawk0.97/bi_vars.h mawk0.97/cast.c mawk0.97/code.c # mawk0.97/code.h mawk0.97/da.c mawk0.97/error.c mawk0.97/execute.c # mawk0.97/fcall.c mawk0.97/field.c mawk0.97/field.h mawk0.97/files.c # mawk0.97/files.h mawk0.97/fin.c mawk0.97/fin.h mawk0.97/hash.c # mawk0.97/init.c mawk0.97/init.h mawk0.97/jmp.c mawk0.97/jmp.h # mawk0.97/kw.c mawk0.97/machine.h mawk0.97/main.c mawk0.97/makescan.c # mawk0.97/matherr.c mawk0.97/mawk.h mawk0.97/memory.c mawk0.97/memory.h # mawk0.97/parse.y mawk0.97/print.c mawk0.97/re_cmpl.c mawk0.97/regexp.h # mawk0.97/repl.h mawk0.97/scan.c mawk0.97/scan.h mawk0.97/scancode.c # mawk0.97/sizes.h mawk0.97/split.c mawk0.97/symtype.h mawk0.97/types.h # mawk0.97/zmalloc.c mawk0.97/zmalloc.h mawk0.97/rexp/Makefile # mawk0.97/rexp/rexp.c mawk0.97/rexp/rexp.h mawk0.97/rexp/rexp0.c # mawk0.97/rexp/rexp1.c mawk0.97/rexp/rexp2.c mawk0.97/rexp/rexp3.c # mawk0.97/rexp/rexpdb.c mawk0.97/test/README mawk0.97/test/benchmarks # mawk0.97/test/cat.awk mawk0.97/test/concat.awk mawk0.97/test/fields.awk # mawk0.97/test/loops.awk mawk0.97/test/newton.awk # mawk0.97/test/primes.awk mawk0.97/test/qsort.awk mawk0.97/test/reg0.awk # mawk0.97/test/reg1.awk mawk0.97/test/reg2.awk mawk0.97/test/sample # mawk0.97/test/squeeze.awk mawk0.97/test/test.sh mawk0.97/test/wc.awk # mawk0.97/test/wfrq.awk mawk0.97/test/wfrq0.awk mawk0.97/test/words.awk # mawk0.97/test/words0.awk mawk0.97/examples/decl.awk # mawk0.97/examples/deps.awk mawk0.97/examples/gdecl.awk # mawk0.97/examples/nocomment.awk 
mawk0.97/msdos/INSTALL # mawk0.97/msdos/makefile mawk0.97/msdos/mklib.bat # mawk0.97/msdos/rand48.asm mawk0.97/msdos/rand48.h # mawk0.97/msdos/rand48_0.c mawk0.97/msdos/reargv.c echo mkdir - mawk0.97 mkdir mawk0.97 chmod u=rwx,g=rx,o=rx mawk0.97 echo x - mawk0.97/packing.list sed 's/^@//' > "mawk0.97/packing.list" <<'@//E*O*F mawk0.97/packing.list//' ################################################ # These files form the mawk distribution # # Mawk is an implementation of the AWK Programming Language as # defined and described in Aho, Kernighan and Weinberger, The # Awk Programming Language, Addison-Wesley, 1988. # ################################################ # Source code written by Michael D. Brennan # Copyright (C) 1991 , Michael D. Brennan ################################################ packing.list this file README how to get started LIMITATIONS restrictions on use Makefile mawk makefile mawk.manual mock manual ###################### array.c source files bi_funct.c bi_funct.h bi_vars.c bi_vars.h cast.c code.c code.h da.c error.c execute.c fcall.c field.c field.h files.c files.h fin.c fin.h hash.c init.c init.h jmp.c jmp.h kw.c machine.h main.c makescan.c matherr.c mawk.h memory.c memory.h parse.y print.c re_cmpl.c regexp.h repl.h scan.c scan.h scancode.c sizes.h split.c symtype.h types.h zmalloc.c zmalloc.h ######################## # directory: rexp rexp/Makefile makefile for regexp.a rexp/rexp.c source for regular matching library rexp/rexp.h rexp/rexp0.c rexp/rexp1.c rexp/rexp2.c rexp/rexp3.c rexp/rexpdb.c ####################### # directory: test benchmarking directory test/README test/benchmarks test/cat.awk test/concat.awk test/fields.awk test/loops.awk test/newton.awk test/primes.awk test/qsort.awk test/reg0.awk test/reg1.awk test/reg2.awk test/sample sample input file for test.sh test/squeeze.awk test/test.sh test/wc.awk test/wfrq.awk test/wfrq0.awk test/words.awk test/words0.awk ###################### # directory: examples useful awk programs 
examples/decl.awk examples/deps.awk examples/gdecl.awk examples/nocomment.awk ###################### # directory msdos msdos/INSTALL msdos/makefile msdos/mklib.bat msdos/rand48.asm msdos/rand48.h msdos/rand48_0.c msdos/reargv.c @//E*O*F mawk0.97/packing.list// chmod u=rw,g=r,o=r mawk0.97/packing.list echo x - mawk0.97/README sed 's/^@//' > "mawk0.97/README" <<'@//E*O*F mawk0.97/README//' to build mawk: make sure there is an appropriate description of your system in machine.h set CFLAGS in the Makefile to pick the appropriate blob in machine.h run make PS: I expected to have bcopy() <-> memcpy() hassles on 4.3BSD, but didn't Is this right? or did someone add memcpy(), strchr() etc to that machine? If 4.3BSD in machine.h is wrong, let me know at brennan@bcsaic.boeing.com @//E*O*F mawk0.97/README// chmod u=r,g=r,o=r mawk0.97/README echo x - mawk0.97/LIMITATIONS sed 's/^@//' > "mawk0.97/LIMITATIONS" <<'@//E*O*F mawk0.97/LIMITATIONS//' Mawk is an implementation of the AWK Programming Language as defined in Aho, Kernighan and Weinberger, The AWK Programming Language, Addison-Wesley, 1988. The source code is original work, in the sense that its development relied only on the specification of the AWK language in the book above. Most of the algorithms and data structures used in this code are not original -- but based on knowledge acquired from numerous sources. Originality is claimed only for the aggregate work. Any ideas or techniques in this code can be freely copied and used in other work. The source code may be modified provided the copyright notices remain intact, and modifications are unambiguously distinct from the original. I want to retain credit for my work and do not want credit for yours. Redistribution in any form is permitted provided the built-in variable VERSION is retained, and its initial value only modified by appending extra lines. For example, if you modify a mawk with VERSION mawk x.xx Mon Year, Copyright (C) Michael D. 
Brennan then add an extra line to VERSION without modifying the first line. mawk x.xx Mon Year, Copyright (C) Michael D. Brennan mod y.yy Mon Year, your name Michael D. Brennan 16 Apr 1991 @//E*O*F mawk0.97/LIMITATIONS// chmod u=r,g=r,o=r mawk0.97/LIMITATIONS echo x - mawk0.97/Makefile sed 's/^@//' > "mawk0.97/Makefile" <<'@//E*O*F mawk0.97/Makefile//' # ################################################### # This is a makefile for mawk, # an implementation of The AWK Programming Language, 1988. # # SHELL=/bin/sh #################################### # CFLAGS needs to match a define in machine.h # unless machine.h uses a built-in compiler flag # CFLAGS = -O -DULTRIX #CFLAGS = -O -DBSD43 YACC=yacc -dv #YACC=bison -dvy ####################################### O=parse.o scan.o memory.o main.o hash.o execute.o code.o\ da.o error.o init.o bi_vars.o cast.o print.o bi_funct.o\ kw.o jmp.o array.o field.o split.o re_cmpl.o zmalloc.o\ fin.o files.o scancode.o matherr.o fcall.o REXP_C=rexp/rexp.c rexp/rexp0.c rexp/rexp1.c rexp/rexp2.c\ rexp/rexp3.c rexp/rexpdb.c mawk : $(O) rexp/regexp.a cc $(CFLAGS) -o mawk $(O) -lm rexp/regexp.a rexp/regexp.a : $(REXP_C) cd rexp ; make parse.c : parse.y @echo expect 3 shift/reduce conflicts $(YACC) parse.y mv y.tab.c parse.c -if cmp -s y.tab.h parse.h ;\ then rm y.tab.h ;\ else mv y.tab.h parse.h ; fi scancode.c : makescan.c scan.h cc -o makescan.exe makescan.c makescan.exe > scancode.c rm makescan.exe array.o : bi_vars.h sizes.h zmalloc.h memory.h types.h machine.h mawk.h symtype.h bi_funct.o : fin.h bi_vars.h sizes.h memory.h zmalloc.h regexp.h types.h machine.h field.h repl.h files.h bi_funct.h mawk.h symtype.h init.h bi_vars.o : bi_vars.h sizes.h memory.h zmalloc.h types.h machine.h field.h mawk.h symtype.h init.h cast.o : parse.h sizes.h memory.h zmalloc.h types.h machine.h field.h scan.h repl.h mawk.h symtype.h code.o : sizes.h memory.h zmalloc.h types.h machine.h code.h mawk.h init.h da.o : sizes.h memory.h zmalloc.h types.h machine.h
field.h repl.h code.h bi_funct.h mawk.h symtype.h error.o : parse.h bi_vars.h sizes.h types.h machine.h scan.h mawk.h symtype.h execute.o : sizes.h memory.h zmalloc.h regexp.h types.h machine.h field.h code.h repl.h bi_funct.h mawk.h symtype.h fcall.o : sizes.h memory.h zmalloc.h types.h machine.h code.h mawk.h symtype.h field.o : parse.h bi_vars.h sizes.h memory.h zmalloc.h regexp.h types.h machine.h field.h scan.h repl.h mawk.h symtype.h init.h files.o : fin.h sizes.h memory.h zmalloc.h types.h machine.h files.h mawk.h fin.o : parse.h fin.h bi_vars.h sizes.h memory.h zmalloc.h types.h machine.h field.h scan.h mawk.h symtype.h hash.o : sizes.h memory.h zmalloc.h types.h machine.h mawk.h symtype.h init.o : bi_vars.h sizes.h memory.h zmalloc.h types.h machine.h field.h code.h mawk.h symtype.h init.h jmp.o : sizes.h memory.h zmalloc.h types.h machine.h code.h jmp.h mawk.h init.h kw.o : parse.h sizes.h types.h machine.h mawk.h symtype.h init.h main.o : fin.h bi_vars.h sizes.h memory.h zmalloc.h types.h machine.h field.h code.h files.h mawk.h init.h makescan.o : parse.h scan.h symtype.h matherr.o : sizes.h types.h machine.h mawk.h memory.o : sizes.h memory.h zmalloc.h types.h machine.h mawk.h parse.o : bi_vars.h sizes.h memory.h zmalloc.h types.h machine.h field.h code.h files.h bi_funct.h mawk.h jmp.h symtype.h print.o : bi_vars.h parse.h sizes.h memory.h zmalloc.h types.h machine.h field.h scan.h files.h bi_funct.h mawk.h symtype.h re_cmpl.o : parse.h sizes.h memory.h zmalloc.h regexp.h types.h machine.h scan.h repl.h mawk.h symtype.h scan.o : parse.h fin.h sizes.h memory.h zmalloc.h types.h machine.h field.h scan.h repl.h files.h mawk.h symtype.h init.h split.o : bi_vars.h parse.h sizes.h memory.h zmalloc.h regexp.h types.h machine.h field.h scan.h bi_funct.h mawk.h symtype.h zmalloc.o : sizes.h zmalloc.h types.h machine.h mawk.h @//E*O*F mawk0.97/Makefile// chmod u=r,g=r,o=r mawk0.97/Makefile echo x - mawk0.97/mawk.manual sed 's/^@//' > "mawk0.97/mawk.manual" 
<<'@//E*O*F mawk0.97/mawk.manual//' Mawk Manual Mawk implements the awk language as defined in Aho, Kernighan and Weinberger, The AWK Programming Language, Addison-Wesley, 1988, ISBN 0-201-07981-X, hereafter called the AWK book. Chapter 2 serves as a reference to the language and the rest (8 total chapters) provides a wide range of examples and applications. This book is must reading to understand the versatility of the language. The 1988 version of the language is sometimes called new awk as opposed to the 1977 version (awk or old awk.) Virtually every Unix system has old awk; somewhere in the documentation will be an (old) awk tutorial (probably in support tools). If you use (old) awk, the transition to new awk is easy. The language has been extended and ambiguous points clarified, but old awk programs still run under new awk. This manual assumes you know (old) awk, and hence concentrates on the new features of awk. Feature xxx is new means xxx was added to the 1988 version. Experienced new awk users should read sections 9 and 12, and skim sections 7 and 8. 1. Command line mawk [-Fs] 'program' optional_list_of_files mawk [-Fs] -f program_file optional_list_of_files 2. Program blocks Program blocks are of the form: pattern { action } pattern can be: regular_expression expression ( pattern ) ! pattern pattern || pattern pattern && pattern pattern , pattern (range pattern) BEGIN END Range, BEGIN and END patterns cannot be combined to form new patterns. BEGIN and END patterns require an action; otherwise, if action is omitted it is implicitly { print }. NR==2 { print } # prints line number 2 NR==2 # also prints line number 2 If pattern is omitted then action is always applied. { print $NF } prints the last field of every record. 3. Statement format and loops Statements are terminated by newlines, semi-colons or both. Groups of statements are blocked via { ... } as in C. The last statement in a block doesn't need a terminator.
Blank lines have no meaning; an empty statement is terminated with a semi-colon. Long statements can be continued with a backslash, \. A statement can be broken without a backslash after a comma, left brace, &&, ||, do, else, the right parenthesis of an if, while or for statement, and the right parenthesis of a function definition. Loops are for(){}, while(){} and do{}while() as in C. 4. Expression syntax The expression syntax and grouping of the language is similar to C. Primary expressions are numeric constants, string constants, variables, arrays and functions. Complex expressions are composed with the following operators in order of increasing precedence. assignment: = += -= *= /= ^= conditional: ? : logical or: || logical and: && array membership : in matching : ~ !~ relational : < > <= >= == != concatenation: (no explicit operator) add ops: + - mul ops: * / % unary : + - logical not : ! exponentiation: ^ inc and dec: ++ -- (both post and pre) field: $ 5. Builtin variables. The following variables are built-in and initialized before program execution. ARGC number of command line arguments ARGV array of command line arguments, 0..ARGC-1 FILENAME name of the current input file FNR current record number in the current input file FS splits records into fields as a regular expression NF number of fields in the current record, i.e., $0 NR current record number in the total input stream OFMT format for printing numbers; initially = "%.6g" OFS inserted between fields on output, initially = " " ORS terminates each record on output, initially = "\n" RLENGTH length of the last call to the built-in function, match() RS input record separator, initially = "\n" RSTART index of the last call to match() SUBSEP used to build multiple array subscripts, initially = "\034" VERSION Mawk version, unique to mawk. ARGC, ARGV, FNR, RLENGTH, RSTART and SUBSEP are new. The current input record is stored in the field, $0.
The fields of $0 determined by splitting with FS are stored in $1, $2, ..., $NF. 6. Built-in Functions String functions index(s,t) length(s), length split(s, A, r), split(s, A) substr(s,i,n) , substr(s,i) sprintf(format, expr_list) match(s,r) returns the index where string s matches regular expression r or 0 if no match. As a side effect, sets RSTART and RLENGTH. gsub(r, s, t) Global substitution, every match of regular expression r in variable t is replaced by s. The number of matches/replacements is returned. sub(r, s, t) Like gsub(), except at most one replacement. Match(), gsub() and sub() are new. If r is an expr it is coerced to string and then treated as a regular expression. In sub and gsub, t can be a variable, field or array element, i.e., it must have storage to hold the modification. Sub(r,s) and gsub(r,s) are the same as sub(r,s,$0) and gsub(r,s,$0). In the replacement string s, an & is replaced by the matched piece and a literal & is obtained with \&. E.g., y = x = "abbc" sub(/b+/, "B&B" , x) sub(/b+/, "B\&B" , y) print x, y outputs: aBbbBc aB&Bc Arithmetic functions atan2(y,x) arctan of y/x between -pi and pi. cos(x) exp(x) int(x) x.dddd -> x.0 log(x) rand() returns random number , 0 <= r < 1. sin(x) sqrt(x) srand(x) , srand() seeds random number generator, uses clock if x is omitted. Output functions print writes $0 ORS to stdout. print expr1 , expr2 , ... , exprn writes expr1 OFS expr2 OFS ... OFS exprn ORS to stdout. printf format, expr_list Acts like the C library function, writing to stdout. Supported conversions are %c, %d, %e, %f, %g, %o, %s and %x. - , width and .prec are supported. Dynamic widths can be built using string operations Output can be redirected print[f] > file >> file | command File and command are awk expressions that are interpreted as a filename or a shell command. Input functions getline read $0, update NF, NR and FNR. getline < file read $0 from file, update NF. getline var read var from input stream, update NR, FNR.
getline var < file read var from next record of file command | getline read $0 from piped command, update NF. command | getline var read var from next record of piped command. (Old) awk had getline, the redirection facilities are new. Files or commands are closed with close(expr) where expr is command or file as a string. Close returns 0 if expr was in fact an open file or command else -1. Close is needed if you want to reread a file, rerun a command, have a large number of output files without mawk running out of resources or wait for an output command to finish. Here is an example of the last case: { .... do some processing on each input line # send the processed line to sort print | "sort > temp_file" } END { # reread the sorted input close( "sort > temp_file") # makes sure sort is finished cnt=1 while ( getline line[cnt++] < "temp_file" > 0 ) ; system( "rm temp_file" ) # cleanup ... process line[1], line[2] ... line[cnt-1] } The system() function executes a command and returns the command's exit status. Mawk uses the shell in the environment variable SHELL to execute system or command pipelines; defaulting to "/bin/sh" if SHELL is not set. 7. String constants String constants are written as in C. "This is a string with a newline at the end.\n" Strings can be continued across a line by escaping (\) the newline. The following escape sequences are recognized. \\ \ \" " \' ' \a alert, ascii 7 \b backspace, ascii 8 \t tab, ascii 9 \n newline, ascii 10 \v vertical tab, ascii 11 \f formfeed, ascii 12 \r carriage return, ascii 13 \ddd 1, 2 or 3 octal digits for ascii ddd \xhh 1 or 2 hex digits for ascii hh If you escape any other character \c, you get \c, i.e. the escape is ignored. Mawk is different than most awks here; the AWK book says \c is c. The reason mawk chooses to be different is for easier conversion of strings to regular expressions. 8. Regular expressions Awk notation for regular expressions is in the style of egrep(1). 
In awk, regular expressions are enclosed in / ... /. A regular expression /r/, is a set of strings. s ~ /r/ is an awk expression that evaluates to 1 if an element of /r/ is a substring of s and evaluates to 0 otherwise. ~ is called the match operator and the expression is read "s matches r". s ~ /^r/ is 1 if some element of r is a prefix of s. s ~ /r$/ is 1 if some element of r is a suffix of s. s ~ /^r$/ is 1 if s is an element of r. Replacing ~ by !~ , the not match operator, reverses the meanings. In patterns, /r/ and !/r/ are shorthand for $0 ~ /r/ and $0 !~ /r/. Regular expressions are combined by the following rules. // stands for the one element set "" (not the empty set). /c/ for a character c is the one element set "c". /rs/ is all elements of /r/ concatenated with all elements of /s/. /r|s/ is the set union of /r/ and /s/. /r*/ called the closure of r is // union /rr/ union /rrr/ ... In words, r repeated zero or more times. The above operations are sufficient to describe all regular expressions, but for ease of notation awk defines additional operations and notation. /r?/ // union /r/. In words r 0 or 1 time. /r+/ Positive closure of r. R 1 or more times. (r) Same as r -- allows grouping. . Stands for any character (for mawk this means ascii 1 through ascii 255) [c1c2..cn] A character class same as (c1|c2|...|cn) where ci's are single characters. [^c1c2..cn] Complement of the class [c1c2..cn]. For mawk complement in the ascii character set 1 to 255. Ranges c1-cn are allowed in character classes. For example, /[_a-zA-Z][_a-zA-Z0-9]*/ expresses the set of possible identifiers in awk. The operators have increasing precedence: | implicit concatenation + * ? So /a|b+/ means a or (1 or more b's), and /(a|b)+/ means (a or b) one or more times. The so called regular expression metacharacters are \ ^ $ . [ ] | ( ) * + ? . To stand for themselves as characters they have to be escaped. 
(They don't have to be escaped in classes, inside classes the meta-meaning is off). The same escape sequences that are recognized in strings (see above) are recognized in regular expressions. For mawk, the escape rule for \c changes to c. For example, /[ \t]*/ is optional space /^[-+]?([0-9]+\.?|\.[0-9])[0-9]*([eE][-+]?[0-9]+)?$/ is numbers in the Awk language. Note, . must be escaped to have its meaning as decimal point. For building regular expressions, you can think of ^ and $ as phantom characters at the front and back of every string. So /(^a|b$|^A.*B$)/ is the set of strings that start with a or end with b or (start with A and end with B). Dynamic regular expressions are new. You can write x ~ expr and expr is interpreted as a regular expression. The result of x ~ y can vary with the variable y; so x ~ /a\+b/ and x ~ "a\+b" are the same, or are they? In mawk, they are; in some other awk's they are not. In the second expression, "a\+b" is scanned twice: once as a string constant and then again as a regular expression. In mawk the first scan gives the four character string 'a' '\' '+' 'b' because mawk treats \+ as \+; the second scan gives a regular expression matched by the three character string 'a' '+' 'b' because on the second scan \+ becomes +. If \c becomes c in strings, you need to double escape metacharacters, i.e., write x ~ "a\\+b". Exercise: what happens if you double escape in mawk? In strings if you only escape characters with defined escape sequences such as \t or \n or meta-characters when you expect to use a string as a regular expression, then mawk's rules are intuitive and simple. See examples/cdecl.awk and examples/gdecl.awk for the same program with single and double escapes, the first is clearer. 9. How Mawk splits lines, records and files. Mawk uses essentially the same algorithm to split lines into pieces with split(), records into fields on FS, and files into records on RS.
Split( s, A, sep ) splits string s into array A with separator sep as follows: Sep is interpreted as a regular expression. If s = "", there are no pieces and split returns 0. Otherwise s is split into pieces by the matches with sep of positive length treated as a separator between pieces, so the number of pieces is the number of matches + 1. Matches of the null string do not split. So sep = "b+" and sep = "b*" split the same although the latter executes more slowly. Split(s, A) is the same as split(s, A, FS). With mawk you can write sep as a regular expression, i.e., split(s, A, "b+") and split(s, A, /b+/) are the same. Sep = " " (a single space) is special. Before the algorithm is applied, white-space is trimmed from the front and back of s. Mawk defines white-space as SPACE, TAB, FORMFEED, VERTICAL TAB or NEWLINE, i.e [ \t\f\v\n]. Usually this means SPACE or TAB because NEWLINE usually separates records, and the other characters are rare. The above algorithm is then applied with sep = "[ \t\f\v\n]+". If length(sep) = 1, then regular expression metacharacters do not have to be escaped, i.e. split(s, A, "+") is the same as split(s, A, /\+/). Splitting records into fields works exactly the same except the pieces are loaded into $1, $2 ... $NF. Records are also the same, RS is treated as a regular expression. But there is a slight difference, RS is really a record terminator (ORS is really a terminator also). E.g., if FS = ":" and $0 = "a:b:" , then NF = 3 and $1 = "a", $2 = "b" and $3 = "", but if "a:b:" is the contents of an input file and RS = ":", then there are two records "a" and "b". RS = " " does not have special meaning as with FS. Not all versions of (new) awk support RS as a regular expression. This feature of mawk is useful and improves performance. BEGIN { RS = "[^a-zA-Z]+" getline if ( $0 == "" ) NR = 0 else word[1] = $0 } { word[NR] = $0 } END { ... 
do something with word[1]...word[NR] } isolates words in a document over twice as fast as reading one line at a time and then examining each field with FS = "[^a-zA-Z]+". To remove comments from C code: BEGIN { RS = "/\*([^*]|\*[^/])*\*/" # comment is RS ORS = " " } { print } END { printf "\n" } 10. Multi-line records Since mawk interprets RS as a regular expression, multi-line records are easy. Setting RS = "\n\n+", makes one or more blank lines separate records. If FS = " " (the default), then single newlines, by the rules for space above, become space. For example, if a file is "a b\nc\n\n", RS = "\n\n+" and FS = " ", then there is one record "a b\nc" with three fields "a", "b" and "c". Changing FS = "\n", gives two fields "a b" and "c"; changing FS = "", gives one field identical to the record. For compatibility with (old) awk, setting RS = "" has the same effect on determining records as RS = "\n([ \t]*\n)+". Most of the time when you change RS for multi-line records, you will also want to change ORS to "\n\n". 11. User functions. User defined functions are new. They can be passed expressions by value or arrays by reference. Function calls can be nested and support recursion. The syntax is function funcname( args ) { .... body } Newlines are ignored after the ')' so the '{' can start on a different line. Inside the body, you can use a return statement return expr return As in C, there is no distinction between functions and procedures. A function does not need an explicit return. Extra arguments act as local variables. For example, csplit(s, A) puts each character of s in array A. function csplit(s, A, i) { for(i=1; i <= length(s) ; i++) A[i] = substr(s, i, 1) } Putting lots of space between the passed arguments and the local variables is a convention that can be ignored if you don't like it. Dynamic regular expressions allow regular expressions to be passed to user defined functions.
The following function gobble() is the lexical scanner for a recursive descent parser, the whole program is in examples/cdecl.awk. function gobble( r, x) # eat regular expression # r off the front of global variable line { if ( match( line, "^(" r ")") ) { x = substr(line, 1, RLENGTH) line = substr(line, RLENGTH+1) } else x = "" return x } You can call a function before it is defined, but the function name and the '(' must not be separated by white space to avoid confusion with concatenation. 12. Other differences in mawk The main differences between mawk and other awks have been discussed, RS as a regular expression and regular expression metacharacters don't have to be double escaped. Here are some others: VERSION -- built-in variable holding version number of mawk. mawk 'BEGIN{print VERSION}' shows it. -D -- command line flag causes mawk to dump to stderr a mawk assembler listing of the current program. The program is executed by a stack machine internal to mawk. The op codes are in code.h, the machine in execute.c. srand() -- During initialization, mawk seeds the random number generator by silently calling srand(), so calling srand() yourself is unnecessary. The main use of srand is to use srand(x) to get a repeatable stream of random numbers. Srand(x) returns x and srand() returns the value of the system clock in some form of ticks. 13. MsDOS For a number of reasons, entering a mawk program on the command line using command.com as your shell is an exercise in futility, so under MsDOS the command syntax is mawk [-Fs] optional_list_of_files You'll get a prompt, and then type in the program. The -f option works as before. If you use a DOS shell that gives you a Unix style command line, to use it you'll need to provide a C function reargv() that retrieves argc and argv[] from your shell. The details are in msdos/INSTALL. Some features are missing from the DOS version of mawk: No system(), and no input or output pipes.
To provide a hook to stderr, I've added errmsg( "string" ) which prints "string\n" to stderr which will be the console and only the console under command.com. A better solution would be to associate a file with handle 2, so print and printf would be available. Consider the errmsg() feature as temporary. For compatibility with Unix, CR are silently stripped from input and LF silently become CRLF on output. WARNING: If you write an infinite loop that does not print to the screen, then you will have to reboot. For example x = 1 while( x < 10 ) A[x] = x x++ By mistake the x++ is outside the loop. What you need to do is type control break and the keyboard hardware will generate an interrupt and the operating system will service that interrupt and terminate your program, but unfortunately MsDOS does not have such a feature. 14. Bugs Currently mawk cannot handle \0 (NUL) characters in input files otherwise mawk is 8 bit clean. Also "a\0b", doesn't work right -- you get "a". You can't use \0 in regular expressions either. printf "A string%c more string\n" , 0 does work, but more by luck than design since it doesn't work with sprintf(). 15. Releases This release is version 0.97. After a reasonable period of time, any bugs that appear will be fixed, and this release will become version 1.0. Evidently features have been added to awk by Aho, Kernighan and Weinberger since the 1988 release of the AWK book. Version 1.1 will add whatever features are necessary to remain compatible with the language as defined by its designers. After that ... ? 16. Correspondence Send bug reports or other correspondence to Mike Brennan brennan@bcsaic.boeing.com If you have some interesting awk programs, contributions to the examples directory would be appreciated. 
@//E*O*F mawk0.97/mawk.manual// chmod u=rw,g=r,o=r mawk0.97/mawk.manual echo x - mawk0.97/array.c sed 's/^@//' > "mawk0.97/array.c" <<'@//E*O*F mawk0.97/array.c//' /******************************************** array.c copyright 1991, Michael D. Brennan This is a source file for mawk, an implementation of the Awk programming language as defined in Aho, Kernighan and Weinberger, The AWK Programming Language, Addison-Wesley, 1988. See the accompanying file, LIMITATIONS, for restrictions regarding modification and redistribution of this program in source or binary form. ********************************************/ /* $Log: array.c,v $ * Revision 2.1 91/04/08 08:22:15 brennan * VERSION 0.97 * */ #include "mawk.h" #include "symtype.h" #include "memory.h" #include "bi_vars.h" extern int returning ; /* flag -- on if returning from function call */ extern unsigned hash() ; /* An array A is a pointer to a hash table of size A_HASH_PRIME holding linked lists of ANODEs. When an index is deleted via delete A[i], the ANODE is not removed from the hash chain. A[i].cp and A[i].sval are both freed and sval is set NULL. This method of deletion simplifies for( i in A ) loops. */ /* array_test: is sval in A ? Returns 1 if the hash chain selected by sval->str holds a live node (one whose sval is not NULL) with an equal index string, else 0. Never creates an element. */ int array_test( A, sval) ARRAY A ; STRING *sval ; { char *s = sval->str ; register ANODE *p = A[ hash(s) % A_HASH_PRIME ] ; while ( p ) { /* deleted nodes have sval == NULL and are skipped */ if ( p->sval && strcmp(s, p->sval->str) == 0 ) return 1 ; p = p->link ; } /* not there */ return 0 ; } /* array_find: find x in array a. If flag is ON x is a char* else a STRING*. Computes a[x] as a CELL*; when x is not yet an index a new element of type C_NOINIT is created and returned. */ CELL *array_find( a, x, flag) ARRAY a ; PTR x ; int flag ; { register ANODE *p ; /* search with p */ ANODE *q ; /* pts at a deleted node */ unsigned h ; char *s ; s = flag ? 
(char *) x : ( (STRING *) x) -> str ; p = a[ h = hash(s) % A_HASH_PRIME ] ; q = (ANODE *) 0 ; /* walk the chain, remembering the first deleted node in q */ while ( p ) { if ( p->sval ) { if ( strcmp(s,p->sval->str) == 0 ) /* found */ return p->cp ; } else /* a deleted node */ if ( !q ) q = p ; p = p->link ; } /* not there make one */ if ( q ) p = q ; /* reuse the node */ else { p = (ANODE *) zmalloc( sizeof(ANODE) ) ; /* a new node goes on the front of its chain */ p->link = a[h] ; a[h] = p ; } /* the node owns a reference to its index string : either a fresh STRING built from the char* , or one more reference on the caller's STRING */ if ( flag ) p->sval = new_STRING(s) ; else { p->sval = (STRING *) x ; p->sval->ref_cnt++ ; } p->cp = new_CELL() ; p->cp->type = C_NOINIT ; return p->cp ; } /* array_delete: delete a[sval]. The element's cell is destroyed and freed, its index string is released and sval is set to NULL ; the ANODE itself stays on the hash chain as a deleted node. Does nothing if sval is not an index of a. */ void array_delete( a, sval) ARRAY a ; STRING *sval ; { char *s = sval->str ; register ANODE *p = a[ hash(s) % A_HASH_PRIME ] ; while ( p ) { if ( p->sval && strcmp(s, p->sval->str)== 0 ) /* found */ { cell_destroy(p->cp) ; free_CELL(p->cp) ; free_STRING(p->sval) ; p->sval = (STRING *) 0 ; break ; } p = p->link ; } } /* for ( i in A ) , loop over elements of an array sp[0].ptr : a pointer to A ( the hash table of A) sp[-1] : a pointer to i ( a cell ptr) cdp[0] : a stop op to catch breaks cdp[1] : offset from cdp of the code after the loop (n+2) cdp[2] : start of body of the loop cdp[3..n] : the rest of the body cdp[n+1] : a stop op to delimit the body and catch continues Returns the address of the code after the loop. */ INST *array_loop( cdp, sp, fp) /* passed code, stack and frame ptrs */ INST *cdp ; CELL *sp, *fp ; { int i ; register ANODE *p ; /* pop A and then the address of the loop variable */ ARRAY A = (ARRAY) sp-- -> ptr ; register CELL *cp = (CELL *) sp-- -> ptr ; for ( i = 0 ; i < A_HASH_PRIME ; i++ ) for ( p = A[i] ; p ; p = p->link ) { if ( ! 
p->sval /* it's deleted */ ) continue ; /* store the index string in the loop variable ; the cell takes its own reference */ cell_destroy(cp) ; cp->type = C_STRING ; cp->ptr = (PTR) p->sval ; p->sval->ref_cnt++ ; /* execute the body of the loop */ if ( execute(cdp+2, sp, fp) == cdp /* exec'ed a break statement */ || returning /* function return in body of loop */ ) goto break2 /* break both for loops */ ; } break2 : /* cdp[1] holds the offset of the code after the loop */ return cdp + cdp[1].op ; } /* array_cat: cat together cnt elements on the eval stack to form an array index using SUBSEP. The cnt cells ending at sp are replaced by a single C_STRING cell and the address of that cell (the new top of stack) is returned. */ CELL *array_cat( sp, cnt) register CELL *sp ; int cnt ; { register CELL *p ; /* walks the stack */ CELL subsep ; /* a copy of bi_vars[SUBSEP] */ unsigned subsep_len ; char *subsep_str ; unsigned total_len ; /* length of cat'ed expression */ CELL *top ; /* sp at entry */ char *t ; /* target ptr when catting */ STRING *sval ; /* build new STRING here */ /* get a copy of subsep, we may need to cast */ (void) cellcpy(&subsep, bi_vars + SUBSEP) ; if ( subsep.type < C_STRING ) cast1_to_s(&subsep) ; subsep_len = string(&subsep)->len ; subsep_str = string(&subsep)->str ; /* after the decrement cnt is the number of separators , one fewer than the number of pieces */ total_len = --cnt * subsep_len ; top = sp ; sp -= cnt ; /* first pass : make every piece a string and add up the lengths */ for( p = sp ; p <= top ; p++ ) { if ( p->type < C_STRING ) cast1_to_s(p) ; total_len += string(p)->len ; } sval = new_STRING((char *)0, total_len) ; t = sval->str ; /* put the pieces together */ for( p = sp ; p < top ; p++ ) { (void) memcpy(t, string(p)->str, string(p)->len) ; (void) memcpy( t += string(p)->len, subsep_str, subsep_len) ; t += subsep_len ; } /* p == top */ (void) memcpy(t, string(p)->str, string(p)->len) ; /* done, now cleanup */ free_STRING(string(&subsep)) ; while ( p >= sp ) { free_STRING(string(p)) ; p-- ; } sp->type = C_STRING ; sp->ptr = (PTR) sval ; return sp ; } /* array_free: free all memory used by an array (every node, the live elements' cells and index strings, and the hash table itself), only used for arrays local to a function call */ void array_free(A) ARRAY A ; { register ANODE *p ; register int i ; ANODE *q ; for( i = 0 ; i < A_HASH_PRIME ; i++ ) { p = A[i] ; while ( p ) { /* check it's not a deleted node */ if ( p->sval ) { free_STRING(p->sval) ; cell_destroy(p->cp) ; 
free_CELL(p->cp) ; } /* step to the next node before the current one is released */ q = p ; p = p->link ; zfree( q, sizeof(ANODE)) ; } } zfree(A, sizeof(ANODE *) * A_HASH_PRIME ) ; } @//E*O*F mawk0.97/array.c// chmod u=rw,g=r,o=r mawk0.97/array.c echo x - mawk0.97/bi_funct.c sed 's/^@//' > "mawk0.97/bi_funct.c" <<'@//E*O*F mawk0.97/bi_funct.c//' /******************************************** bi_funct.c copyright 1991, Michael D. Brennan This is a source file for mawk, an implementation of the Awk programming language as defined in Aho, Kernighan and Weinberger, The AWK Programming Language, Addison-Wesley, 1988. See the accompanying file, LIMITATIONS, for restrictions regarding modification and redistribution of this program in source or binary form. ********************************************/ /* $Log: bi_funct.c,v $ * Revision 2.3 91/04/17 06:34:00 brennan * index("","") should be 1 not 0 for consistency with match("",//) * * Revision 2.2 91/04/09 12:38:42 brennan * added static to funct decls to satisfy STARDENT compiler * * Revision 2.1 91/04/08 08:22:17 brennan * VERSION 0.97 * */ #include "mawk.h" #include "bi_funct.h" #include "bi_vars.h" #include "memory.h" #include "init.h" #include "files.h" #include "fin.h" #include "field.h" #include "regexp.h" #include "repl.h" #include <math.h> #ifndef BSD43 void PROTO( srand48, (long) ) ; double PROTO( drand48, (void) ) ; #endif /* statics */ static STRING *PROTO(gsub, (PTR, CELL *, char *, int) ) ; static void PROTO( fplib_err, (char *, double, char *) ) ; /* global for the disassembler */ /* Each entry is : name , implementing function , and two counts -- presumably the minimum and maximum number of arguments (TODO confirm against BI_REC in symtype.h). The table ends with an entry whose name is 0. */ BI_REC bi_funct[] = { /* info to load builtins */ "index" , bi_index , 2, 2 , "substr" , bi_substr, 2, 3, "sprintf" , bi_sprintf, 1, 255, "sin", bi_sin , 1, 1 , "cos", bi_cos , 1, 1 , "atan2", bi_atan2, 2,2, "exp", bi_exp, 1, 1, "log", bi_log , 1, 1 , "int", bi_int, 1, 1, "sqrt", bi_sqrt, 1, 1, "rand" , bi_rand, 0, 0, "srand", bi_srand, 0, 1, "close", bi_close, 1, 1, "system", bi_system, 1, 1, #if DOS /* this might go away, when pipes and system are added for DOS */ "errmsg", bi_errmsg, 1, 1, 
#endif (char *) 0, (PF_CP) 0, 0, 0 } ; void bi_funct_init() { register BI_REC *p = bi_funct ; register SYMTAB *stp ; while ( p->name ) { stp = insert( p->name ) ; stp->type = ST_BUILTIN ; stp->stval.bip = p++ ; } /* seed rand() off the clock */ { CELL c ; c.type = 0 ; (void) bi_srand(&c) ; } stp = insert( "length") ; stp->type = ST_LENGTH ; } /************************************************** string builtins (except split (in split.c) and [g]sub (at end)) **************************************************/ CELL *bi_length(sp) register CELL *sp ; { unsigned len ; if ( sp->type < C_STRING ) cast1_to_s(sp) ; len = string(sp)->len ; free_STRING( string(sp) ) ; sp->type = C_DOUBLE ; sp->dval = (double) len ; return sp ; } char *str_str(target, key , key_len) register char *target, *key ; unsigned key_len ; { switch( key_len ) { case 0 : return (char *) 0 ; case 1 : return strchr( target, *key) ; case 2 : while ( target = strchr(target, *key) ) if ( target[1] == key[1] ) return target ; else target++ ; /*failed*/ return (char *) 0 ; } key_len-- ; while ( target = strchr(target, *key) ) if ( memcmp(target+1, key+1, key_len) == 0 ) return target ; else target++ ; /*failed*/ return (char *) 0 ; } CELL *bi_index(sp) register CELL *sp ; { register int idx ; unsigned len ; char *p ; sp-- ; if ( TEST2(sp) != TWO_STRINGS ) cast2_to_s(sp) ; if ( len = string(sp+1)->len ) idx = (p = str_str(string(sp)->str,string(sp+1)->str,len)) ? 
p - string(sp)->str + 1 : 0 ; else /* index of the empty string */ idx = 1 ; free_STRING( string(sp) ) ; free_STRING( string(sp+1) ) ; sp->type = C_DOUBLE ; sp->dval = (double) idx ; return sp ; } /* substr(s, i, n) if l = length(s) then get the characters from max(1,i) to min(l,n-i-1) inclusive */ CELL *bi_substr(sp) CELL *sp ; { int n_args, len ; register int i, n ; char *s ; /* substr(s, i, n) */ STRING *sval ; n_args = sp->type ; sp -= n_args ; if ( sp->type < C_STRING ) cast1_to_s(sp) ; s = (sval = string(sp)) -> str ; if ( n_args == 2 ) { n = 0x7fff ; /* essentially infinity */ if ( sp[1].type != C_DOUBLE ) cast1_to_d(sp+1) ; } else { if ( sp[1].type + sp[2].type != TWO_STRINGS ) cast2_to_d(sp+1) ; n = (int) sp[2].dval ; } i = (int) sp[1].dval - 1 ; /* i now indexes into string */ if ( (len = strlen(s)) == 0 ) return sp ; /* get to here is s is not the null string */ if ( i < 0 ) { n += i ; i = 0 ; } if ( n > len - i ) n = len - i ; if ( n <= 0 ) /* the null string */ { free_STRING( sval ) ; sp->ptr = (PTR) &null_str ; null_str.ref_cnt++ ; } else /* got something */ { sp->ptr = (PTR) new_STRING((char *)0, n) ; (void) memcpy(string(sp)->str, s+i, n) ; string(sp)->str[n] = 0 ; } return sp ; } /* match(s,r) sp[0] holds s, sp[-1] holds r */ CELL *bi_match(sp) register CELL *sp ; { double d ; char *p ; unsigned length ; if ( sp->type != C_RE ) cast_to_RE(sp) ; if ( (--sp)->type < C_STRING ) cast1_to_s(sp) ; if ( p = REmatch(string(sp)->str, (sp+1)->ptr, &length) ) d = (double) ( p - string(sp)->str + 1 ) ; else d = 0.0 ; cell_destroy( & bi_vars[RSTART] ) ; cell_destroy( & bi_vars[RLENGTH] ) ; bi_vars[RSTART].type = C_DOUBLE ; bi_vars[RSTART].dval = d ; bi_vars[RLENGTH].type = C_DOUBLE ; bi_vars[RLENGTH].dval = (double) length ; free_STRING(string(sp)) ; sp->type = C_DOUBLE ; sp->dval = d ; return sp ; } /************************************************ arithemetic builtins ************************************************/ static void fplib_err( fname, val, error) 
char *fname ; double val ; char *error ; { /* raise a run-time error of the form "fname(val) : error" */ rt_error("%s(%g) : %s" , fname, val, error) ; } /* The one-argument math builtins below all have the same shape : cast the cell at sp to double in place , apply the C library function and return sp. When STDC_MATHERR is on , errno is cleared first and a library failure is reported through fplib_err() together with the original argument. */ CELL *bi_sin(sp) register CELL *sp ; { #if ! STDC_MATHERR if ( sp->type != C_DOUBLE ) cast1_to_d(sp) ; sp->dval = sin( sp->dval ) ; return sp ; #else double x ; errno = 0 ; if ( sp->type != C_DOUBLE ) cast1_to_d(sp) ; x = sp->dval ; sp->dval = sin( sp->dval ) ; if ( errno ) fplib_err("sin", x, "loss of precision") ; return sp ; #endif } CELL *bi_cos(sp) register CELL *sp ; { #if ! STDC_MATHERR if ( sp->type != C_DOUBLE ) cast1_to_d(sp) ; sp->dval = cos( sp->dval ) ; return sp ; #else double x ; errno = 0 ; if ( sp->type != C_DOUBLE ) cast1_to_d(sp) ; x = sp->dval ; sp->dval = cos( sp->dval ) ; if ( errno ) fplib_err("cos", x, "loss of precision") ; return sp ; #endif } /* atan2(y,x) : y is at sp[-1] and x at sp[0] on entry ; the result replaces y */ CELL *bi_atan2(sp) register CELL *sp ; { #if ! STDC_MATHERR sp-- ; if ( TEST2(sp) != TWO_DOUBLES ) cast2_to_d(sp) ; sp->dval = atan2(sp->dval, (sp+1)->dval) ; return sp ; #else errno = 0 ; sp-- ; if ( TEST2(sp) != TWO_DOUBLES ) cast2_to_d(sp) ; sp->dval = atan2(sp->dval, (sp+1)->dval) ; if ( errno ) rt_error("atan2(0,0) : domain error") ; return sp ; #endif } CELL *bi_log(sp) register CELL *sp ; { #if ! STDC_MATHERR if ( sp->type != C_DOUBLE ) cast1_to_d(sp) ; sp->dval = log( sp->dval ) ; return sp ; #else double x ; errno = 0 ; if ( sp->type != C_DOUBLE ) cast1_to_d(sp) ; x = sp->dval ; sp->dval = log( sp->dval ) ; if ( errno ) fplib_err("log", x, "domain error") ; return sp ; #endif } CELL *bi_exp(sp) register CELL *sp ; { #if ! STDC_MATHERR if ( sp->type != C_DOUBLE ) cast1_to_d(sp) ; sp->dval = exp(sp->dval) ; return sp ; #else double x ; errno = 0 ; if ( sp->type != C_DOUBLE ) cast1_to_d(sp) ; x = sp->dval ; sp->dval = exp(sp->dval) ; if ( errno && sp->dval) fplib_err("exp", x, "overflow") ; /* on underflow sp->dval==0, ignore */ return sp ; #endif } /* int(x) : drop the fractional part , truncating toward zero */ CELL *bi_int(sp) register CELL *sp ; { if ( sp->type != C_DOUBLE ) cast1_to_d(sp) ; sp->dval = sp->dval >= 0.0 ? 
floor( sp->dval ) : ceil(sp->dval) ; /* floor for x >= 0 , ceil for x < 0 */ return sp ; } CELL *bi_sqrt(sp) register CELL *sp ; { #if ! STDC_MATHERR if ( sp->type != C_DOUBLE ) cast1_to_d(sp) ; sp->dval = sqrt( sp->dval ) ; return sp ; #else double x ; errno = 0 ; if ( sp->type != C_DOUBLE ) cast1_to_d(sp) ; x = sp->dval ; sp->dval = sqrt( sp->dval ) ; if ( errno ) fplib_err("sqrt", x, "domain error") ; return sp ; #endif } #ifdef __TURBOC__ long biostime(int, long) ; #define time(x) (biostime(0,0L)<<4) #else #include <sys/types.h> #if 0 #ifndef STARDENT #include <sys/timeb.h> #endif #endif #endif CELL *bi_srand(sp) register CELL *sp ; { /* sp[0].type is non-zero when the caller supplied a seed , which is then at sp[-1] and is also the value returned ; otherwise the seed is read from the clock and left in the cell at sp[0] */ register long l ; void srand48() ; if ( sp-- -> type ) /* user seed */ { if ( sp->type != C_DOUBLE ) cast1_to_d(sp) ; l = (long) sp->dval ; } else { l = (long) time( (time_t *) 0 ) ; (++sp)->type = C_DOUBLE ; sp->dval = (double) l ; } srand48(l) ; return sp ; } /* rand() : push a new C_DOUBLE cell holding drand48() */ CELL *bi_rand(sp) register CELL *sp ; { (++sp)->type = C_DOUBLE ; sp->dval = drand48() ; return sp ; } /************************************************* miscellaneous builtins close, system and getline *************************************************/ /* close(name) : the cell at sp is replaced by the value returned by file_close() , as a double */ CELL *bi_close(sp) register CELL *sp ; { int x ; if ( sp->type < C_STRING ) cast1_to_s(sp) ; x = file_close( (STRING *) sp->ptr) ; free_STRING( string(sp) ) ; sp->type = C_DOUBLE ; sp->dval = (double) x ; return sp ; } #if ! DOS CELL *bi_system(sp) CELL *sp ; { int pid ; unsigned ret_val ; /* the shell is looked up in $SHELL the first time through , defaulting to /bin/sh */ if ( !shell ) shell = (shell = getenv("SHELL")) ? 
shell : "/bin/sh" ; if ( sp->type < C_STRING ) cast1_to_s(sp) ; switch( pid = fork() ) { case -1 : /* fork failed */ errmsg(errno, "could not create a new process") ; ret_val = 128 ; break ; case 0 : /* the child */ (void) execl(shell, shell, "-c", string(sp)->str, (char *) 0) ; /* if get here, execl() failed */ errmsg(errno, "execute of %s failed", shell) ; fflush(stderr) ; _exit(128) ; default : /* wait for the child */ ret_val = wait_for(pid) ; /* presumably wait_for() returns a wait(2)-style status (TODO confirm) : a non-zero low byte is reported as 128 , otherwise the result is the high byte */ if ( ret_val & 0xff ) ret_val = 128 ; else ret_val = (ret_val & 0xff00) >> 8 ; break ; } cell_destroy(sp) ; sp->type = C_DOUBLE ; sp->dval = (double) ret_val ; return sp ; } #else /* DOS */ CELL *bi_system( sp ) register CELL *sp ; { rt_error("no system call in MsDos --yet") ; return sp ; } /* prints errmsgs for DOS : writes the string value of the argument and a newline to stderr ; the result is 0 */ CELL *bi_errmsg(sp) register CELL *sp ; { /* NOTE(review): the other builtins test sp->type < C_STRING before calling cast1_to_s() -- confirm the unconditional call is safe for a cell that already holds a string */ cast1_to_s(sp) ; fprintf(stderr, "%s\n", string(sp)->str) ; free_STRING(string(sp)) ; sp->type = C_DOUBLE ; sp->dval = 0.0 ; return sp ; } #endif /* getline() */ /* the stack contents below are listed from the top of the stack down if type == 0 : stack is 0 , target address if type == F_IN : stack is F_IN, expr(filename), target address if type == PIPE_IN : stack is PIPE_IN, target address, expr(pipename) the result left on the stack is 1 if a line was read, 0 at end of file and -1 if the input could not be opened */ CELL *bi_getline(sp) register CELL *sp ; { CELL tc , *cp ; char *p ; unsigned len ; FIN *fin_p ; switch( sp->type ) { case 0 : sp-- ; if ( main_fin == (FIN *) -1 && ! open_main() ) goto open_failure ; if ( ! main_fin || !(p = FINgets(main_fin, &len)) ) goto eof ; cp = (CELL *) sp->ptr ; /* NR and FNR are adjacent in bi_vars[] , so one cast2_to_d covers both */ if ( TEST2(bi_vars+NR) != TWO_DOUBLES ) cast2_to_d(bi_vars+NR) ; bi_vars[NR].dval += 1.0 ; bi_vars[FNR].dval += 1.0 ; break ; case F_IN : sp-- ; if ( sp->type < C_STRING ) cast1_to_s(sp) ; fin_p = (FIN *) file_find(sp->ptr, F_IN) ; free_STRING(string(sp) ) ; sp-- ; if ( ! fin_p ) goto open_failure ; if ( ! (p = FINgets(fin_p, &len)) ) goto eof ; cp = (CELL *) sp->ptr ; break ; case PIPE_IN : sp -= 2 ; if ( sp->type < C_STRING ) cast1_to_s(sp) ; fin_p = (FIN *) file_find(sp->ptr, PIPE_IN) ; free_STRING(string(sp)) ; if ( ! 
fin_p ) goto open_failure ; if ( ! (p = FINgets(fin_p, &len)) ) goto eof ; cp = (CELL *) (sp+1)->ptr ; break ; default : bozo("type in bi_getline") ; } /* we've read a line , store it */ if ( len == 0 ) { tc.type = C_STRING ; tc.ptr = (PTR) &null_str ; null_str.ref_cnt++ ; } else { tc.type = C_MBSTRN ; tc.ptr = (PTR) new_STRING((char *) 0, len) ; (void) memcpy( string(&tc)->str, p, len) ; } /* a target inside field[] is stored with field_assign() rather than a plain cell copy */ if ( cp >= field && cp < field+NUM_FIELDS ) field_assign(cp-field, &tc) ; else { cell_destroy(cp) ; (void) cellcpy(cp, &tc) ; } cell_destroy(&tc) ; /* result : 1 a line was read , -1 open failure , 0 end of file */ sp->dval = 1.0 ; goto done ; open_failure : sp->dval = -1.0 ; goto done ; eof : sp->dval = 0.0 ; /* fall thru to done */ done : sp->type = C_DOUBLE ; return sp ; } /********************************************** sub() and gsub() **********************************************/ /* entry: sp[0] = address of CELL to sub on sp[-1] = substitution CELL sp[-2] = regular expression to match exit: the three cells are replaced by one C_DOUBLE cell holding 1 if a substitution was made, else 0 */ CELL *bi_sub( sp ) register CELL *sp ; { CELL *cp ; /* pointer to the replacement target */ CELL tc ; /* build the new string here */ CELL sc ; /* copy of the target CELL */ char *front, *middle, *back ; /* pieces */ unsigned front_len, middle_len, back_len ; sp -= 2 ; if ( sp->type != C_RE ) cast_to_RE(sp) ; if ( sp[1].type != C_REPL && sp[1].type != C_REPLV ) cast_to_REPL(sp+1) ; cp = (CELL *) (sp+2)->ptr ; /* make a copy of the target, because we won't change anything including type unless the match works */ (void) cellcpy(&sc, cp) ; if ( sc.type < C_STRING ) cast1_to_s(&sc) ; front = string(&sc)->str ; if ( middle = REmatch(front, sp->ptr, &middle_len) ) { /* split the target into front , the matched middle , and back */ front_len = middle - front ; back = middle + middle_len ; back_len = string(&sc)->len - front_len - middle_len ; /* a C_REPLV replacement is first converted to a C_REPL using a copy of the matched text */ if ( (sp+1)->type == C_REPLV ) { STRING *sval = new_STRING((char *) 0, middle_len) ; (void) memcpy(sval->str, middle, middle_len) ; (void) replv_to_repl(sp+1, sval) ; free_STRING(sval) ; } tc.type = C_STRING ; tc.ptr = (PTR) new_STRING((char *) 0, front_len + string(sp+1)->len + 
back_len ) ; /* assemble front + replacement + back in tc */ { char *p = string(&tc)->str ; if ( front_len ) { (void) memcpy(p, front, front_len) ; p += front_len ; } if ( string(sp+1)->len ) { (void) memcpy(p, string(sp+1)->str, string(sp+1)->len) ; p += string(sp+1)->len ; } if ( back_len ) (void) memcpy(p, back, back_len) ; } /* a target inside field[] is stored with field_assign() rather than a plain cell copy */ if ( cp >= field && cp < field+NUM_FIELDS ) field_assign(cp-field, &tc) ; else { cell_destroy(cp) ; (void) cellcpy(cp, &tc) ; } free_STRING(string(&tc)) ; } free_STRING(string(&sc)) ; repl_destroy(sp+1) ; sp->type = C_DOUBLE ; /* middle is 0 when REmatch() found nothing */ sp->dval = middle != (char *) 0 ? 1.0 : 0.0 ; return sp ; } static unsigned repl_cnt ; /* number of global replacements */ /* recursive global substitution dealing with empty matches makes this mildly painful. Returns a STRING holding target with the replacements made ; repl_cnt is incremented once for every match that counts. */ static STRING *gsub( re, repl, target, flag) PTR re ; CELL *repl ; /* always of type REPL or REPLV */ char *target ; int flag ; /* if on, match of empty string at front is OK */ { char *front, *middle ; STRING *back ; unsigned front_len, middle_len ; STRING *ret_val ; CELL xrepl ; /* a copy of repl so we can change repl */ if ( ! 
(middle = REmatch(target, re, &middle_len)) ) return new_STRING(target) ; /* no match */ (void) cellcpy(&xrepl, repl) ; if ( !flag && middle_len == 0 && middle == target ) { /* match at front that's not allowed */ if ( *target == 0 ) /* target is empty string */ { null_str.ref_cnt++ ; return & null_str ; } else { char xbuff[2] ; front_len = 0 ; /* make new repl with target[0] */ repl_destroy(repl) ; xbuff[0] = *target++ ; xbuff[1] = 0 ; repl->type = C_REPL ; repl->ptr = (PTR) new_STRING( xbuff ) ; back = gsub(re, &xrepl, target, 1) ; } } else /* a match that counts */ { repl_cnt++ ; front = target ; front_len = middle - target ; if ( *middle == 0 ) /* matched back of target */ { back = &null_str ; null_str.ref_cnt++ ; } else back = gsub(re, &xrepl, middle + middle_len, 0) ; /* patch the &'s if needed */ if ( repl->type == C_REPLV ) { STRING *sval = new_STRING((char *) 0, middle_len) ; (void) memcpy(sval->str, middle, middle_len) ; (void) replv_to_repl(repl, sval) ; free_STRING(sval) ; } } /* put the three pieces together */ ret_val = new_STRING((char *)0, front_len + string(repl)->len + back->len); { char *p = ret_val->str ; if ( front_len ) { (void) memcpy(p, front, front_len) ; p += front_len ; } if ( string(repl)->len ) { (void) memcpy(p, string(repl)->str, string(repl)->len) ; p += string(repl)->len ; } if ( back->len ) (void) memcpy(p, back->str, back->len) ; } /* cleanup, repl is freed by the caller */ repl_destroy(&xrepl) ; free_STRING(back) ; return ret_val ; } /* set up for call to gsub() */ CELL *bi_gsub( sp ) register CELL *sp ; { CELL *cp ; /* pts at the replacement target */ CELL sc ; /* copy of replacement target */ CELL tc ; /* build the result here */ sp -= 2 ; if ( sp->type != C_RE ) cast_to_RE(sp) ; if ( (sp+1)->type != C_REPL && (sp+1)->type != C_REPLV ) cast_to_REPL(sp+1) ; (void) cellcpy(&sc, cp = (CELL *)(sp+2)->ptr) ; if ( sc.type < C_STRING ) cast1_to_s(&sc) ; repl_cnt = 0 ; tc.ptr = (PTR) gsub(sp->ptr, sp+1, string(&sc)->str, 1) ; if ( 
repl_cnt ) { /* at least one replacement was made : store the new string in the target ; a target inside field[] is stored with field_assign() */ tc.type = C_STRING ; if ( cp >= field && cp < field + NUM_FIELDS ) field_assign(cp-field, &tc) ; else { cell_destroy(cp) ; (void) cellcpy(cp, &tc) ; } } /* cleanup */ free_STRING(string(&sc)) ; free_STRING(string(&tc)) ; repl_destroy(sp+1) ; /* the result is the number of replacements */ sp->type = C_DOUBLE ; sp->dval = (double) repl_cnt ; return sp ; } @//E*O*F mawk0.97/bi_funct.c// chmod u=rw,g=r,o=r mawk0.97/bi_funct.c echo x - mawk0.97/bi_funct.h sed 's/^@//' > "mawk0.97/bi_funct.h" <<'@//E*O*F mawk0.97/bi_funct.h//' /******************************************** bi_funct.h copyright 1991, Michael D. Brennan This is a source file for mawk, an implementation of the Awk programming language as defined in Aho, Kernighan and Weinberger, The AWK Programming Language, Addison-Wesley, 1988. See the accompanying file, LIMITATIONS, for restrictions regarding modification and redistribution of this program in source or binary form. ********************************************/ /* $Log: bi_funct.h,v $ * Revision 2.2 91/04/22 08:00:13 brennan * prototype for bi_errmsg() under DOS * * Revision 2.1 91/04/08 08:22:20 brennan * VERSION 0.97 * */ #ifndef BI_FUNCT_H #define BI_FUNCT_H 1 #include "symtype.h" extern BI_REC bi_funct[] ; /* NOTE(review): bi_funct.c defines bi_funct_init() , not bi_init() -- confirm which name the callers use */ void PROTO(bi_init, (void) ) ; /* every builtin below takes the evaluation stack pointer and returns the stack pointer that addresses its result */ /* builtin string functions */ CELL *PROTO( bi_print, (CELL *) ) ; CELL *PROTO( bi_printf, (CELL *) ) ; CELL *PROTO( bi_length, (CELL *) ) ; CELL *PROTO( bi_index, (CELL *) ) ; CELL *PROTO( bi_substr, (CELL *) ) ; CELL *PROTO( bi_sprintf, (CELL *) ) ; CELL *PROTO( bi_split, (CELL *) ) ; CELL *PROTO( bi_match, (CELL *) ) ; CELL *PROTO( bi_getline, (CELL *) ) ; CELL *PROTO( bi_sub, (CELL *) ) ; CELL *PROTO( bi_gsub, (CELL *) ) ; /* builtin arith functions */ CELL *PROTO( bi_sin, (CELL *) ) ; CELL *PROTO( bi_cos, (CELL *) ) ; CELL *PROTO( bi_atan2, (CELL *) ) ; CELL *PROTO( bi_log, (CELL *) ) ; CELL *PROTO( bi_exp, (CELL *) ) ; CELL *PROTO( bi_int, (CELL *) ) ; CELL *PROTO( bi_sqrt, (CELL *) ) ; CELL *PROTO( bi_srand, (CELL *) ) ; CELL *PROTO( bi_rand, 
(CELL *) ) ; /* other builtins */ CELL *PROTO( bi_close, (CELL *) ) ; CELL *PROTO( bi_system, (CELL *) ) ; #if DOS CELL *PROTO(bi_errmsg, (CELL *) ) ; #endif #endif /* BI_FUNCT_H */ @//E*O*F mawk0.97/bi_funct.h// chmod u=rw,g=r,o=r mawk0.97/bi_funct.h echo x - mawk0.97/bi_vars.c sed 's/^@//' > "mawk0.97/bi_vars.c" <<'@//E*O*F mawk0.97/bi_vars.c//' /******************************************** bi_vars.c copyright 1991, Michael D. Brennan This is a source file for mawk, an implementation of the Awk programming language as defined in Aho, Kernighan and Weinberger, The AWK Programming Language, Addison-Wesley, 1988. See the accompanying file, LIMITATIONS, for restrictions regarding modification and redistribution of this program in source or binary form. ********************************************/ /* $Log: bi_vars.c,v $ * Revision 2.1 91/04/08 08:22:22 brennan * VERSION 0.97 * */ /* bi_vars.c */ #include "mawk.h" #include "symtype.h" #include "bi_vars.h" #include "field.h" #include "init.h" #include "memory.h" /* the builtin variables */ CELL bi_vars[NUM_BI_VAR] ; /* the order here must match the order in bi_vars.h */ static char *bi_var_names[NUM_BI_VAR] = { "ARGC" , "FILENAME" , "NR" , "FNR" , "OFS" , "ORS" , "RLENGTH" , "RSTART" , "SUBSEP", "VERSION" } ; /* insert the builtin vars in the hash table and set their defaults : FILENAME, OFS, ORS, SUBSEP and VERSION get string values, NR and FNR become doubles ; ARGC, RLENGTH and RSTART are not given a value here. Also sets up cell_zero and cell_one. */ void bi_vars_init() { register int i ; register SYMTAB *s ; for ( i = 0 ; i < NUM_BI_VAR ; i++ ) { s = insert( bi_var_names[i] ) ; s->type = ST_VAR ; s->stval.cp = bi_vars + i ; /* bi_vars[i].type = 0 which is C_NOINIT */ } /* set defaults */ bi_vars[FILENAME].type = C_STRING ; bi_vars[FILENAME].ptr = (PTR) new_STRING( "" ) ; bi_vars[ OFS ].type = C_STRING ; bi_vars[OFS].ptr = (PTR) new_STRING( " " ) ; bi_vars[ ORS ].type = C_STRING ; bi_vars[ORS].ptr = (PTR) new_STRING( "\n" ) ; bi_vars[ SUBSEP ].type = C_STRING ; bi_vars[SUBSEP].ptr = (PTR) new_STRING( "\034" ) ; bi_vars[VERSION].type = C_STRING ; bi_vars[VERSION].ptr = (PTR) new_STRING( VERSION_STRING ) ; 
bi_vars[NR].type = bi_vars[FNR].type = C_DOUBLE ; /* dval is already 0.0 */ cell_zero.type = C_DOUBLE ; cell_one.type = C_DOUBLE ; cell_one.dval = 1.0 ; } /* cells holding the doubles 0.0 and 1.0 , given their values in bi_vars_init() */ CELL cell_zero ; CELL cell_one ; @//E*O*F mawk0.97/bi_vars.c// chmod u=rw,g=r,o=r mawk0.97/bi_vars.c echo x - mawk0.97/bi_vars.h sed 's/^@//' > "mawk0.97/bi_vars.h" <<'@//E*O*F mawk0.97/bi_vars.h//' /******************************************** bi_vars.h copyright 1991, Michael D. Brennan This is a source file for mawk, an implementation of the Awk programming language as defined in Aho, Kernighan and Weinberger, The AWK Programming Language, Addison-Wesley, 1988. See the accompanying file, LIMITATIONS, for restrictions regarding modification and redistribution of this program in source or binary form. ********************************************/ /* $Log: bi_vars.h,v $ * Revision 2.1 91/04/08 08:26:30 brennan * VERSION 0.97 * */ /* bi_vars.h */ #ifndef BI_VARS_H #define BI_VARS_H 1 #define VERSION_STRING \ "mawk 0.97 Mar 1991, Copyright (C) Michael D. Brennan" /* If a different command line syntax is used for DOS, mark that in VERSION */ #if DOS && ! HAVE_REARGV #undef VERSION_STRING #define VERSION_STRING \ "mawk 0.97DOS Mar 1991, Copyright (C) Michael D. Brennan" #endif /* builtin variables NF, RS, FS, OFMT are stored internally in field[], so side effects of assignment can be handled */ /* the values below are indices into bi_vars[] ; their order must match bi_var_names[] in bi_vars.c */ #define ARGC 0 #define FILENAME 1 #define NR 2 /* NR must be exactly one in front of FNR */ #define FNR 3 #define OFS 4 #define ORS 5 #define RLENGTH 6 #define RSTART 7 #define SUBSEP 8 #define VERSION 9 #define NUM_BI_VAR 10 extern CELL bi_vars[NUM_BI_VAR] ; #endif @//E*O*F mawk0.97/bi_vars.h// chmod u=rw,g=r,o=r mawk0.97/bi_vars.h echo x - mawk0.97/cast.c sed 's/^@//' > "mawk0.97/cast.c" <<'@//E*O*F mawk0.97/cast.c//' /******************************************** cast.c copyright 1991, Michael D. 
Brennan This is a source file for mawk, an implementation of the Awk programming language as defined in Aho, Kernighan and Weinberger, The AWK Programming Language, Addison-Wesley, 1988. See the accompaning file, LIMITATIONS, for restrictions regarding modification and redistribution of this program in source or binary form. ********************************************/ /* $Log: cast.c,v $ * Revision 2.1 91/04/08 08:22:44 brennan * VERSION 0.97 * */ /* cast.c */ #include "mawk.h" #include "field.h" #include "memory.h" #include "scan.h" #include "repl.h" #include <string.h> int pow2[NUM_CELL_TYPES] = {1,2,4,8,16,32,64,128,256,512} ; void cast1_to_d( cp ) register CELL *cp ; { switch( cp->type ) { case C_NOINIT : cp->dval = 0.0 ; break ; case C_DOUBLE : return ; case C_MBSTRN : case C_STRING : { register STRING *s = (STRING *) cp->ptr ; #if FPE_TRAPS /* look for overflow error */ errno = 0 ; cp->dval = strtod(s->str,(char **)0) ; if ( errno && cp->dval != 0.0 ) /* ignore underflow */ rt_error("overflow converting %s to double", s) ; #else cp->dval = strtod(s->str,(char **)0) ; #endif free_STRING(s) ; } break ; case C_STRNUM : /* don't need to convert, but do need to free the STRING part */ free_STRING( string(cp) ) ; break ; default : bozo("cast on bad type") ; } cp->type = C_DOUBLE ; } void cast2_to_d( cp ) register CELL *cp ; { register STRING *s ; switch( cp->type )