[gnu.g++] Perfect hash function for g++ reserved words

schmidt%siam.ics.uci.edu@PARIS.ICS.UCI.EDU ("Douglas C. Schmidt") (09/20/88)

Howdy,

   After turning in too many bug reports I decided to be constructive
for a change!  The following file contains the diffs to install a
perfect hash function lookup routine into the G++ 1.27 reserved word
symbol table (found in lex.c).  A similar version of this concept has
already been installed in the GNU C compiler, starting with the 1.28
distribution.

   As always, the big advantage of a perfect hash function is the O(1)
(constant) time to determine whether a given token is a G++ reserved
word.  The perfect hash function used here is very simple and fast,
the basic idea is described in the following code.  My timing tests
show a decrease in the parse time after installing the code.  Your
mileage may vary, since the actual speed up depends on the
characteristics of each source file.  However, with the perfect hash
scheme there are never any *bad* input sequences, i.e., there is no
``worst-case'' performance, since all operations are essentially
identical for any input token.

   Users of libg++ can look forward to a perfect hash function
generator class at some point in the future.  This would enable the
generation of perfect hash functions ``on-the-fly.''  In the mean
time, ENJOY!

   Doug Schmidt

p.s. Suggestions for improvement of the basic approach are always welcome.

------------------------------( Cut Here )------------------------------ 
*** lex.c.old	Mon Sep 19 22:29:36 1988
--- lex.c	Mon Sep 19 22:29:21 1988
***************
*** 467,540 ****
  char *token_buffer;		/* Pointer to token buffer.
  				 Actual allocated length is maxtoken + 2.  */
  
- #define MAXRESERVED 9
  
! /* frw[I] is index in `reswords' of the first word whose length is I;
!    frw[I+1] is one plus the index of the last word whose length is I.
!    The length of frw must be MAXRESERVED + 2 so there is an element
!    at MAXRESERVED+1 for the case I == MAXRESERVED.  */
  
! static char frw[MAXRESERVED+2] =
! /*  { 0, 0, 0, 2, 5, 13, 19, 29, 31, 35 }; non-extended */
!   { 0, 0, 0, 2, 6, 15, 22, 35, 40, 46, 48, };
  
  /* Table of reserved words.  */
  
! struct resword { char *name; short token; enum rid rid;};
  
  #define NORID RID_UNUSED
  
- /* If this is not in lexicographical order, you will lose.  */
  static struct resword reswords[]
!   = {{"do", DO, NORID},
!      {"if", IF, NORID},
!      {"asm", ASM, NORID},
!      {"for", FOR, NORID},
!      {"int", TYPESPEC, RID_INT},
!      {"new", NEW, NORID},
!      {"auto", SCSPEC, RID_AUTO},
!      {"case", CASE, NORID},
!      {"char", TYPESPEC, RID_CHAR},
!      {"else", ELSE, NORID},
!      {"enum", ENUM, NORID},
!      {"goto", GOTO, NORID},
!      {"long", TYPESPEC, RID_LONG},
!      {"this", THIS, NORID},
!      {"void", TYPESPEC, RID_VOID},
!      {"break", BREAK, NORID},
!      {"class", AGGR, RID_CLASS},
!      {"const", TYPE_QUAL, RID_CONST},
!      {"float", TYPESPEC, RID_FLOAT},
!      {"short", TYPESPEC, RID_SHORT},
!      {"union", AGGR, RID_UNION},
!      {"while", WHILE, NORID},
!      {"delete", DELETE, NORID},
!      {"double", TYPESPEC, RID_DOUBLE},
!      {"extern", SCSPEC, RID_EXTERN},
!      {"friend", TYPE_QUAL, RID_FRIEND},
!      {"inline", SCSPEC, RID_INLINE},
!      {"public", PUBLIC, NORID},
!      {"return", RETURN, NORID},
!      {"signed", TYPESPEC, RID_SIGNED},
!      {"sizeof", SIZEOF, NORID},
!      {"static", SCSPEC, RID_STATIC},
!      {"struct", AGGR, RID_RECORD},
!      {"switch", SWITCH, NORID},
!      {"typeof", TYPEOF, NORID},
!      {"default", DEFAULT, NORID},
!      {"dynamic", DYNAMIC, NORID},
!      {"private", PRIVATE, NORID},
!      {"typedef", SCSPEC, RID_TYPEDEF},
!      {"virtual", SCSPEC, RID_VIRTUAL},
!      {"continue", CONTINUE, NORID},
!      {"operator", OPERATOR, NORID},
!      {"overload", OVERLOAD, NORID},
!      {"register", SCSPEC, RID_REGISTER},
!      {"unsigned", TYPESPEC, RID_UNSIGNED},
!      {"volatile", TYPE_QUAL, RID_VOLATILE},
!      {"__alignof", ALIGNOF, NORID},
!      {"protected", PROTECTED, NORID}};
  
  /* The elements of `ridpointers' are identifier nodes
     for the reserved type names and storage classes.
     It is indexed by a RID_... value.  */
--- 467,718 ----
  char *token_buffer;		/* Pointer to token buffer.
  				 Actual allocated length is maxtoken + 2.  */
  
  
! #define MIN_WORD_SIZE       2      /* minimum size for C++ reserved words */
! #define MAX_WORD_SIZE       9      /* maximum size for C++ reserved words */
! #define MIN_KEY_SIZE        2      /* range of the hash keys for the */
! #define MAX_KEY_SIZE        67     /* minimum perfect hash generator */
  
! /* the following defines make subsequent code more comprehensible */
  
+ #define Do 0
+ #define For 2
+ #define Goto 5
+ #define New 9
+ #define Struct 12
+ #define Break 18
+ #define If 23
+ #define Int 25
+ #define Default 28
+ #define Const 35
+ #define Return 40
+ #define Register 46
+ #define Continue 54
+ #define Long 62
+ #define Extern 66
+ #define Asm 72
+ #define Auto 75
+ #define Double 79
+ #define Case 85
+ #define Float 89
+ #define Delete 94
+ #define Private 100
+ #define Volatile 107
+ #define Else 115
+ #define __Alignof 119
+ #define Enum 128
+ #define Inline 132
+ #define Sizeof 138
+ #define Operator 144
+ #define Dynamic 152
+ #define Static 159
+ #define Union 165
+ #define Protected 170
+ #define Short 179
+ #define Overload 184
+ #define Public 192
+ #define Typeof 198
+ #define Typedef 204
+ #define Class 211
+ #define Unsigned 216
+ #define Virtual 224
+ #define Char 231
+ #define Void 235
+ #define Friend 239
+ #define While 245
+ #define Signed 250
+ #define Switch 256
+ #define This 262
+ #define Dummy 266
+ 
  /* Table of reserved words.  */
  
! struct resword { short offset; short token; enum rid rid;};
  
  #define NORID RID_UNUSED
  
  static struct resword reswords[]
!   = {{0,0,NORID},   /* these locations are not used. */
!      {0,0,NORID},   /* they simplify the hashing.    */
!      {Do, DO, NORID},
!      {For, FOR, NORID},
!      {Goto, GOTO, NORID},
!      {New, NEW, NORID},
!      {Struct, AGGR, RID_RECORD},
!      {Break, BREAK, NORID},
!      {If, IF, NORID},
!      {Int, TYPESPEC, RID_INT},
!      {Default, DEFAULT, NORID},
!      {Const, TYPE_QUAL, RID_CONST},
!      {Return, RETURN, NORID},
!      {Register, SCSPEC, RID_REGISTER},
!      {Continue, CONTINUE, NORID},
!      {Long, TYPESPEC, RID_LONG},
!      {Extern, SCSPEC, RID_EXTERN},
!      {Asm, ASM, NORID},  /* duplicates like this are needed since */
!      {Asm, ASM, NORID},  /* the hash function is not minimum! */
!      {Auto, SCSPEC, RID_AUTO},     
!      {Auto, SCSPEC, RID_AUTO},     
!      {Auto, SCSPEC, RID_AUTO},     
!      {Auto, SCSPEC, RID_AUTO},
!      {Double, TYPESPEC, RID_DOUBLE},
!      {Double, TYPESPEC, RID_DOUBLE},
!      {Case, CASE, NORID},
!      {Float, TYPESPEC, RID_FLOAT},
!      {Float, TYPESPEC, RID_FLOAT},
!      {Delete, DELETE, NORID},
!      {Private, PRIVATE, NORID},
!      {Volatile, TYPE_QUAL, RID_VOLATILE},
!      {Else, ELSE, NORID},
!      {__Alignof, ALIGNOF, NORID},
!      {Enum, ENUM, NORID},
!      {Inline, SCSPEC, RID_INLINE},
!      {Sizeof, SIZEOF, NORID},
!      {Operator, OPERATOR, NORID},
!      {Dynamic, DYNAMIC, NORID},
!      {Static, SCSPEC, RID_STATIC},
!      {Union, AGGR, RID_UNION},
!      {Protected, PROTECTED, NORID},
!      {Short, TYPESPEC, RID_SHORT},
!      {Overload, OVERLOAD, NORID},
!      {Public, PUBLIC, NORID},
!      {Typeof, TYPEOF, NORID},
!      {Typeof, TYPEOF, NORID},
!      {Typedef, SCSPEC, RID_TYPEDEF},
!      {Class, AGGR, RID_CLASS},
!      {Class, AGGR, RID_CLASS},
!      {Unsigned, TYPESPEC, RID_UNSIGNED},
!      {Unsigned, TYPESPEC, RID_UNSIGNED},
!      {Virtual, SCSPEC, RID_VIRTUAL},
!      {Char, TYPESPEC, RID_CHAR},
!      {Char, TYPESPEC, RID_CHAR},
!      {Char, TYPESPEC, RID_CHAR},
!      {Char, TYPESPEC, RID_CHAR},
!      {Char, TYPESPEC, RID_CHAR},
!      {Void, TYPESPEC, RID_VOID},
!      {Friend, TYPE_QUAL, RID_FRIEND},
!      {Friend, TYPE_QUAL, RID_FRIEND},
!      {While, WHILE, NORID},
!      {While, WHILE, NORID},
!      {While, WHILE, NORID},
!      {While, WHILE, NORID},
!      {Signed, TYPESPEC, RID_SIGNED},
!      {Switch, SWITCH, NORID},
!      {This, THIS, NORID},
!      {This, THIS, NORID},
!      {Dummy, 0, NORID}};
  
+ /* Contains the offset into the word_table for each of
+    the reserved words.  This table is indexed by the
+    hash function value.  Some entries are dummies, needed
+    since the function is perfect, but not minimum. */
+ 
+ static short access_table[] = {
+             0,  0,  0,  2,  5,  9, 12, 18, 23, 25,
+            28, 35, 40, 46, 54, 62, 66, 72, 72, 75,
+            75, 75, 75, 79, 79, 85, 89, 89, 94,100,
+           107,115,119,128,132,138,144,152,159,165,
+           170,179,184,192,198,198,204,211,211,216,
+           216,224,231,231,231,231,231,235,239,239,
+           245,245,245,245,250,256,262,262,266 };
+ 
+ /* The word_table simply packs all the characters together
+    end-to-end. */
+ 
+ static char *word_table = 
+ "doforgotonewstructbreakifintdefaultconstreturnregister\
+ continuelongexternasmautodoublecasefloatdeleteprivatevolatileelse\
+ __alignofenuminlinesizeofoperatordynamicstaticunionprotectedshort\
+ overloadpublictypeoftypedefclassunsignedvirtualcharvoidfriendwhile\
+ signedswitchthis";
+ 
+ /* This function performs the perfect hash mapping from input
+    string to reswords table index.  It only looks at the second, third and 
+    last characters in the string, thus assuring the O(1) lookup time 
+    (this keeps our constant down to an insignificant amount!).  Compiling
+    the following 2 functions as inline removes all overhead of the
+    function calls. */
+ 
+ #ifdef __GNUC__
+ inline static int hash (char *str,int len)
+ #else
+ static int hash (str,len)
+ register char	*str;             
+ register int	len;
+ #endif
+ {
+   /* This table is used to build the hash table index that recognizes
+      reserved words in 0(1) steps.  It is larger than strictly necessary, 
+      but I'm trading off the space for the time-saving luxury of avoiding 
+      subtraction of an offset. */
+ 
+ 	static int   cval[]
+    = {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+ 		  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+ 		  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+ 		  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+ 		  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+ 		  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+ 		  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+ 		  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+ 		  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+ 		  0,  0,  0,  0,  0,  4,  0, 16,  3, 16,
+ 		 31,  0,  3,  5, 36, 22,  0,  2, 22,  5,
+ 		  6,  0, 28,  0,  0,  5,  0, 18,  3,  1,
+ 		  4,  8,  4,  0,  0,  0,  0,  0, 
+    	};
+ 	register int	hval;
+ 
+   /* The hash function couldn't be simpler: add the length of the string
+      to the Hash_Table value of its 2nd and 3rd character (if the string
+      is not that long then we simply ignore these values). */
+ 
+ 	switch (hval = len) {
+ 		default:
+ 			hval += cval[ (int)str[2]];
+ 		case 2:
+ 			hval += cval[ (int)str[1]] + cval[ (int)str[len - 1]];
+ 	}
+    return (hval);
+ }
+ 
+ /* This routine attempts to match the string found in the Word_Table with
+    the one from the input stream.  If all the relevant details match an
+    actual character-by-character comparison is performed.  */
+ 
+ #ifdef __GNUC__
+ inline struct resword *is_reserved_word (char *str,int len)
+ #else
+ struct resword *is_reserved_word (str,len)
+ register char *str;
+ register int len;
+ #endif
+ {
+   if (len <= MAX_WORD_SIZE && len >= MIN_WORD_SIZE)
+     {
+       register int key = hash (str,len);
+ 
+  		if (key >= MIN_KEY_SIZE && key <= MAX_KEY_SIZE) 
+         {	register int str_start = reswords[key].offset;   
+             register int str_end   = reswords[key+1].offset;
+  
+             while (str_start < str_end) 
+               {
+                 if (*str++ != word_table[str_start++]) 
+                   {
+                     return (NULL); /* bail if no match */
+                   }
+               }
+  
+             if (*str == '\0') 
+               {
+                 return (&reswords[key]); /* got it! a reserved word */
+               }
+           }
+     }
+   return (NULL);                 /* no luck */
+ }
+ 
  /* The elements of `ridpointers' are identifier nodes
     for the reserved type names and storage classes.
     It is indexed by a RID_... value.  */
***************
*** 1356,1411 ****
  	value = IDENTIFIER;
  	yylval.itype = 0;
  
! 	/* Try to recognize a keyword.  */
  
! 	if (p - token_buffer <= MAXRESERVED)
! 	  {
! 	    /* Use binary search to find match.  */
! 	    register int lo = frw [p - token_buffer];
! 	    register int hi = frw [p - token_buffer + 1] - 1;
! 	    register int mid = (lo + hi) >> 1;
! 	    register char cmp, ch;
! 
! 	    p = token_buffer;
! 	    ch = *p++;
! 	    while (lo <= hi)
! 	      {
! 		register struct resword *r = &reswords[mid];
! 		if ((cmp = (r->name[0] - ch))
! 		    || (cmp = strcmp (r->name+1, p)))
! 		  {
! 		    if (cmp < 0)
! 		      lo = mid + 1;
! 		    else
! 		      hi = mid - 1;
! 		    mid = (lo + hi) >> 1;
! 		  }
! 		else
! 		  {
! 		    if (r->rid)
! 		      yylval.ttype = ridpointers[(int) r->rid];
! 		    if ((! flag_no_asm
! 			 || ((int) r->token != ASM
! 			     && (int) r->token != TYPEOF
! 			 && r->rid != RID_INLINE))
! 			/* -ftraditional means don't recognize
! 			   typeof, const, volatile, dynamic, signed or inline.  */
! 			&& (! flag_traditional
! 			    || ((int) r->token != TYPE_QUAL
! 				&& (int) r->token != TYPEOF
! 				&& r->rid != RID_SIGNED
! 				&& r->rid != RID_INLINE
! 				&& strcmp (r->name, "dynamic")))
  #ifdef SOS
! 			&& (flag_all_virtual == 2
! 			    || strcmp (r->name, "dynamic"))
  #endif
! 			)
! 		      value = (int) r->token;
! 		    break;
! 		  }
! 	      }
! 	  }
  
  	/* If we did not find a keyword, look for an identifier
  	   (or a typename).  */
--- 1534,1568 ----
  	value = IDENTIFIER;
  	yylval.itype = 0;
  
! 	/* Try to recognize a keyword.  Use perfect hash to find match.  */
!    { 
!      register struct resword *ptr;
  
!      if (ptr = is_reserved_word (token_buffer,p - token_buffer))
!        {
!          if (ptr->rid)
!            yylval.ttype = ridpointers[(int) ptr->rid];
!          if ((! flag_no_asm
!               || ((int) ptr->token != ASM
!                   && (int) ptr->token != TYPEOF
!                   && ptr->rid == RID_INLINE))
!              /* -ftraditional means don't recognize
!                 typeof, const, volatile, dynamic, signed or inline.  */
!              && (! flag_traditional
!                  || ((int) ptr->token != TYPE_QUAL
!                      && (int) ptr->token != TYPEOF
!                      && ptr->rid   != RID_SIGNED
!                      && ptr->rid   != RID_INLINE
!                      && ptr->token != DYNAMIC))
!                      /* not sure what to do with this last one... */
  #ifdef SOS
!              && (flag_all_virtual == 2
!                  || ptr->token != DYNAMIC)
  #endif
!              )
!            value = (int) ptr->token;
!        }
!    }      
  
  	/* If we did not find a keyword, look for an identifier
  	   (or a typename).  */

dsb@Rational.COM (David S. Bakin) (09/20/88)

I don't want to make a big fuss, especially as it is someone else who is
being so good about writing code and posting it, but ...

A perfect hash function isn't the best for the purpose ... if any effort
is made for it to be a minimal perfect hash function.  I notice that the
g++ hash function just posted isn't minimal but at the same time some
table entries are duplicated instead of being left NULL.

Since 1) a test against NULL is faster than a string comparison
  and 2) most (statistics anyone?) calls to the hash function produce
         the result MISMATCH because most calls are for identifiers not
         for reserved words (especially in C, maybe not as much for Ada)
  and 3) memory is cheap (at least, you can't convince me that gcc/g++
         is designed otherwise)
then it is a good idea to have a perfect hash function that hashes randomly
into a large table -- say one with at least 50% or even 75% null entries,
so that only 1-in-2 or 1-in-4 probes results in a string comparison while
the rest terminate immediately.

Again -- I appreciate the effort of other people to write the code and post
it!

-- Dave 
-- 
----------------------------------------------------------
Dave Bakin				    (408) 496-3600
c/o Rational; 3320 Scott Blvd.; Santa Clara, CA 95054-3197
Internet:  dsb@rational.com	 Uucp:  ...!uunet!igor!dsb