djones@decwrl.dec.com (Dave Jones) (12/20/90)
> I want to hash the one hundred + opcodes for the Intel 386 processor > into the smallest possible table with the fewest collisions. Rather than using a hash-table, why not use a nested, string-matching switch-statement? (Well, if you are strapped for memory, that's one reason why not.) Anyway, that's what I did for the Pascal compiler I just completed. (Because it is to be used as part of an interactive debugger, the overriding consideration was speed of compilation.) Needless to say, I did not write the switch by hand (shudder). I wrote a little program to write the switch. I am enclosing it here. I am not posting it to a sources-group because it needs a little work to make it more presentable. Specifically, the way it "knows" the keyword/token-value mapping is a kluge: The pairs are compiled right into the program by means of an include-file. (The ones included here are for Pascal). They should instead be read from a file, so the program does not need to be recompiled on every change to the keyword map. Indeed, the way it is set up, you may want to hack the source to get exactly what your want. I've put in some comments to guide. If you want to polish this up a bit and post it to a sources-group, please feel free. One more thing: There are some pretty good lex-substitutes floating around, or so I've heard. You might find that if you give one of those programs only a list of keywords, you may get back a super-efficient switch much like the one this program generates. Okay, yet one more thing: If you have a lot of keywords, and you compile this on an old, brain-damaged BSD compiler, remember to use the -J flag, which means, "Do not generate pseudo-random jumps." :-( [I'd be interested to hear some speed comparisons with a flex lexer would be interesting. A quick look at the code for both doesn't make it obvious which would be faster. -John] snip ^^^ snip ^^^ snip ^^^ snip ^^^ snip ^^^ snip ^^^ snip ^^^ snip ^^^ #! /bin/sh # This is a shell archive. Remove anything before this line, then unpack # it by saving it into a file and typing "sh file". To overwrite existing # files, type "sh file -c". You can also feed this as standard input via # unshar, or by typing "sh <file", e.g.. If this archive is complete, you # will see the following message at the end: # "End of shell archive." # Contents: keyfinder.c keywords.h # Wrapped by djones@goofy on Wed Dec 19 16:14:36 1990 PATH=/bin:/usr/bin:/usr/ucb ; export PATH if test -f keyfinder.c -a "${1}" != "-c" ; then echo shar: Will not over-write existing file \"keyfinder.c\" else echo shar: Extracting \"keyfinder.c\" \(3780 characters\) sed "s/^X//" >keyfinder.c <<'END_OF_keyfinder.c' X#include <stdio.h> X#define print (void)printf X Xchar* Emalloc(); X Xtypedef struct { X char *string; /* keyword string to be matched */ X char *value; /* value to return on match */ X}Keyword_def ; X X/* X * Kluge to set up keyword/value pairs: X * compile them into the program. It would be X * better, probably, if we read them from a file. X */ XKeyword_def init_keys[] = X{ X#include "keywords.h" X , 0, 0 X}; X Xmain() X{ X X /* X * In the resulting switch, the macro EOSTR(chr,len) X * calculates whether or not the scan of [len] X * chars, ending looking at [chr] has inspected the whole string. X * Thus if the string is null-terminated, you use this (default): X * X * #define EOSTR(chr,len) ((chr) == 0) X * X * Using counted strings, you can substitute a macro that compares [len] X * against the known length of the string. X * X * #define EOSTR(chr,len) ((len) == known_len) X * X * Another possibility would be ... X * X * #define EOSTR(chr,len) (!isalnum(chr)) X * X */ X X print("#ifndef EOSTR\n# define EOSTR(chr,length) ((chr)==0)\n#endif\n"); X X /* We generate a function called, "keyis()". X */ X print("keyis(str)\n char* str;\n{\n"); X X gen_switch(""); X X /* In the resulting function, if no match is found, return -1. X */ X print(" return -1;\n}\n"); X X if(ferror(stdout)) { X perror("stdout"); X exit(-1); X } X} X X Xchar* Xextend(str, ch) X char* str; X char ch; X{ X int len = strlen(str); X char* retval = Emalloc(len+2); X strcpy(retval, str); X retval[len] = ch; X retval[len+1] = '\0'; X return retval; X} X Xchar* Xfind(str) X char* str; X{ X Keyword_def *key = init_keys; X X for(;key->string;key++) { X if(strcmp(str, key->string)==0) X return key->value; X } X X return 0; X X} X X/* X * Generate a switch under the assumption that X * the chars in [str] have already been matched. X */ Xgen_switch(str) X char* str; X{ X int index = strlen(str); X char done[128]; X X { int i; X for(i =0; i<128; i++) /* 128 == number of chars. (ASCII specific) */ X done[i] = 0; X } X X { Keyword_def *key = init_keys; X char* value; X int do_switch; X X /* If there is only one possible case, no need to generate X * a switch. X */ X X do_switch = cases(str); X X if(do_switch) { X X blanks(3*index); X print("switch(*str++) /* %s */\n", str); X blanks(3*index); X print("{ default: "); X X if(value = find(str)) X print("if(EOSTR(*str,%d)) return %s; ", index, value); X X print("break;\n"); X X for(;key->string;key++) { X char ch; X X if(index<strlen(key->string)) X ch = key->string[index]; X else X ch = 0; X X if(ch) { X if(!match(key->string, str) != 0) X continue; X X if(done[ch]) X continue; X else X done[ch] = 1; X X blanks(3*index + 2); X print("case '%c': /* %s%c */\n", ch,str,ch); X gen_switch(extend(str), ch); X blanks(3*index + 2); X print("break;\n"); X } X } X X blanks(3*index); X print("}\n"); X } X else { X /* One case only. */ X blanks(3*index); X if(value = find(str)) X print("if(EOSTR(*str,%d)) return %s;\n", index, value); X } X } X} X Xcases(str) X char* str; X{ X int index = strlen(str); X Keyword_def *key = init_keys; X X for(;key->string;key++) { X char ch; X X if(index<strlen(key->string)) X ch = key->string[index]; X else X ch = 0; X X if(ch) { X if(!match(key->string, str) != 0) X continue; X X return 1; X } X } X return 0; X} X Xblanks(i) X{ X print(" "); X while(i--) X (void)putchar(' '); X} X Xextern char* malloc(); X Xchar* Emalloc(size) X{ X char *retval = malloc((unsigned)size); X X if(retval == 0) { X perror("malloc()"); X exit(-1); X } X return retval; X} X Xmatch(key, str) X char* key; X char* str; X{ X return strncmp(key,str, strlen(str)) == 0; X} END_OF_keyfinder.c if test 3780 -ne `wc -c <keyfinder.c`; then echo shar: \"keyfinder.c\" unpacked with wrong size! fi # end of overwriting check fi if test -f keywords.h -a "${1}" != "-c" ; then echo shar: Will not over-write existing file \"keywords.h\" else echo shar: Extracting \"keywords.h\" \(743 characters\) sed "s/^X//" >keywords.h <<'END_OF_keywords.h' X X "and", "AND" X, "array", "ARRAY" X, "begin", "BEGIN" X, "case", "CASE" X, "const", "CONST" X, "div", "DIV" X, "do", "DO" X, "downto", "DOWNTO" X, "else", "ELSE" X, "end", "END" X, "external", "EXTERNAL" X, "file", "FILEX" X, "for", "FOR" X, "forward", "FORWARD" X, "function", "FUNCTION" X, "goto", "GOTO" X, "if", "IF" X, "in", "IN" X, "insert", "INSERT" X, "label", "LABEL" X, "mod", "MOD" X, "nil", "NIL" X, "not", "NOT" X, "of", "OF" X, "or", "OR" X, "others", "OTHERS" X, "packed", "PACKED" X, "procedure", "PROCEDURE" X, "program", "PROGRAM" X, "record", "RECORD" X, "repeat", "REPEAT" X, "set", "SET" X, "then", "THEN" X, "to", "TO" X, "type", "TYPE" X, "until", "UNTIL" X, "var", "VAR" X, "while", "WHILE" X, "with", "WITH" END_OF_keywords.h if test 743 -ne `wc -c <keywords.h`; then echo shar: \"keywords.h\" unpacked with wrong size! fi # end of overwriting check fi echo shar: End of shell archive. exit 0 -- Send compilers articles to compilers@iecc.cambridge.ma.us or {ima | spdcc | world}!iecc!compilers. Meta-mail to compilers-request.