rro@csu-cs.UUCP (10/06/83)
This is the program I promised in a recently posted article in net.lang. hplabs!csu-cs!rro program lhashd( input, output ); (* minimal perfect hash table generation program input: a list of words which are to be organized into a dense list so that the hash function f(word) = length(word) + value(firstletter(word)) + value(lastletter(word)) will find the word in one probe, or the word is not in the list. output: the values associated with each letter the minimal perfect hash table containing the words. reference: cook, curtis r. and oldehoeft, r. r., "more on minimal perfect hash tables," colorado state university technical report, april 1981. *) const maxwordsize = 16; (* up to 16-char words *) maxlistsize = 110; (* up to 110 input words *) maxhashsize = 400; (* hash table max index = 400 *) cha = 'a'; chz = 'z'; type listsize = 1..maxlistsize; wordsize = 1..maxwordsize; letters = cha..chz; hashsize = 1..maxhashsize; wordinfo = record row,col : char; index,length : integer end; freqinfo = record letter : char; num : integer end; parray = packed array[wordsize] of char; var graph : array[listsize] of wordinfo; (* word triples *) order : array[1..26] of char; (* decreasing freq letters *) pnames : array[listsize] of parray; (* input words *) name : array[wordsize] of char; lorder : array[letters] of integer; (* index of ordered letters *) values : array[letters] of integer; (* letter values *) freq : array[1..26] of freqinfo; (* letter frequencies *) slot : array[1..30] of integer; hashtable: array[hashsize] of integer; (* perfect hash table *) diagonal : array[letters] of boolean; (* occur twice in a word *) twordrec : wordinfo; temprec : freqinfo; (* miscellaneous constants *) i, j, k, size, pos, len, lastletter, numslots, point, val, entime, sttime, lowest, highest: integer; ch, ch1, ch2: char; (**************** procedure initialize ************************) procedure initialize; begin for i := 1 to maxlistsize do graph[i].length := 0; for ch := cha to chz do begin values [ch] := 0; diagonal [ch] := false end; for i := 1 to maxhashsize do hashtable [i] := 0; lowest := 100; highest := 0; k := ord('a') - 1; for i := 1 to 26 do begin k := k +1; freq [i].num := 0; freq [i].letter := chr(k) end end; (* initialize *) (******************** procedure readnames ****************************** readnames inputs words separated by commas or blanks. finds first and last letter and length of each word. computes frequency of each letter (number of times it occurs as first or last letter in a word). pnames[i]............ i th word graph[i].row ........ first letter of i th word graph[i].col ........ last letter of i th word graph[i].length...... length of i th word diagonal[ch] ........ true if character ch is first and last letter of same word; otherwise it is false. freq[i]......num .... frequency of i th letter ( a = 1, b = 2,...) size ................ number of words input *) procedure readnames; var current : integer; begin current := 0; k := ord('a')-1; read(ch); while (ch = ' ') or (ch = ',') do read(ch); while not eof do begin current := current + 1; len := 0; repeat len := len + 1; name[len] := ch; read(ch); until (ch = ' ') or (ch = ','); (* blank fill rest of name and place name in pnames *) for i := len + 1 to maxwordsize do name[i] := ' '; pack(name,1,pnames[current]); ch1 := name[1]; ch2 := name[len]; if ch1 = ch2 then diagonal [ch1] := true; i := ord(ch1) - k; freq[i].num := freq[i].num + 1; i := ord(ch2) - k; freq[i].num := freq[i].num + 1; graph[current].row := ch1; graph[current].col := ch2; graph[current].length := len; graph[current].index := current; while ((ch = ' ') or (ch = ',')) and (not eof) do read(ch); end; size := current end; (* readnames *) (****************** procedure findorder ************************ findorder computes an ordering of the letters based on following algorithm: 1. if letter appears as first and last letter of same word add 100 to its frequency. 2. multiply frequency of each letter by 100. 3. for each letter and each word it appears in, add the original fre- quency of the other letter in the word to the frequency of the letter. 4. sort letters in order of decreasing frequency. order[i] ............. letter in i th position in ordering lorder[ch] ........... position of letter ch in ordering *) procedure findorder; label 99; var max,j,j1,j2 : integer; tarray : array [1..26] of integer; begin (* order by frequency *) k := ord('a') -1; for ch := cha to chz do begin j := ord(ch) -k; tarray[j] := freq[j].num; if diagonal[ch] then freq[j].num := freq[j].num + 100; freq[j].num := freq[j].num * 100 end; for i := 1 to size do begin j1 := ord(graph[i].row) - k; j2 := ord(graph[i].col) - k; freq[j1].num := freq[j1].num + tarray[j2]; freq[j2].num := freq[j2].num + tarray[j1]; end; for i := 1 to 25 do begin max := i; for j := i + 1 to 26 do if freq [j].num > freq [max].num then max := j; if freq [max].num = 0 then goto 99; if max <> i then begin temprec := freq[ i ]; freq [ i ] := freq [max]; freq [max] := temprec end; end; 99: i := 1; while i <= 26 do begin if freq[i].num > 0 then begin ch := freq [i].letter; order [i] := ch; lorder [ch] := i; lastletter := i; i := i + 1 end else i := 27 end end; (* findorder *) (*************************** procedure ordergraph ********************** each word is represented by a triple (first letter,last letter,length) or (graph[i].row, graph[i].col, graph[i].length). for each triple, inter- change first letter and last letter if last letter precedes first letter in letter ordering. then sort triples according to the letter ordering using graph[i].col as key. *) procedure ordergraph; begin (* order row col pairs *) for i := 1 to size do if lorder [ graph [i].col ] < lorder[graph[i].row] then begin ch := graph[i].col; graph[i].col := graph[i].row; graph[i].row := ch end; (* order points of graph *) pos := 0; for j := 1 to lastletter do begin ch := order[j]; for i := pos + 1 to size do if graph[i].col = ch then begin pos := pos + 1; twordrec := graph[i]; graph[i] := graph[pos]; graph[pos] := twordrec end; end; i := 1; while i <= size do begin ch := graph[i].col ; if diagonal[ch] then begin j := i + 1; while graph[j].col = ch do j := j + 1; k := i; i := j; j := j -1; while k < j do begin if graph[k].row = ch then begin twordrec := graph[j]; graph[j] := graph[k]; graph[k] := twordrec; j := j -1 end; k := k + 1 end end else i := size + 1 end end; (* ordergraph *) procedure backup (var ch:char; var point : integer); forward; (************************ function checkdiag **************************** function checkdiag(ch) returns the value false if ch is the first and last letter of the same word and the value assignment for ch hashes one of these words to a non-empty slot. otherwise it returns true. *) function checkdiag(ch:char;val,point:integer): boolean; var check : boolean; begin check := true; if diagonal[ch] then begin len := graph[point].length; while graph[point].col = ch do begin check := check and (hashtable[2*val + len] = 0); point := point + 1; end end; checkdiag := check; end; (* checkdiag *) (************************* procedure undo ******************************** undo(ch) deletes the value assignment for letter ch and all associated hash table entries for triples with middle component ch. *) procedure undo(ch:char); label 99; var j : integer; begin (* undo *) point := point -1; val := values[ch]; while graph[point].col = ch do begin j := graph[point].length + values[graph[point].row] + val; hashtable[j] := 0; if j = lowest then if lowest = highest then begin lowest := 100; highest := 0; goto 99 end else while hashtable[lowest] = 0 do lowest := lowest + 1 else if j = highest then while hashtable[highest] = 0 do highest := highest - 1; point := point - 1 end; point := point + 1; 99:end; (* undo *) (************************* procedure findchval *************************** for all triples with middle component ch, findchval attempts to find a value assignment for ch so that all the triples hash to empty hash table slots. findchval begins the search with the starting value start. result is returned via val. findchval is successful if the value re- turned via is not -1000. if val = -1000 then no value assignment is possible because two of the triples have the same other letter value and length sum. *) procedure findchval (ch:char;start,point:integer;var val:integer); label 99; var empty : boolean; i,j,min,temp : integer; begin numslots := 0; while (graph[point].col = ch) and (graph[point].row <> ch) and (point <= size) do begin numslots := numslots + 1; slot[numslots] := graph[point].length + values[graph[point].row]; point := point + 1 end; if numslots > 0 then begin if numslots > 1 then begin for i := 1 to numslots do begin min := i; for j := i + 1 to numslots do if slot[j] < slot[min] then min := j; if min <> i then begin temp := slot[min]; slot[min] := slot[i]; slot[i] := temp end end; for i := 1 to numslots - 1 do if slot[i] = slot[i+1] then begin point := point - 1; while graph[point].length + values[graph[point].row] <> slot[i] do point := point - 1; ch1 := graph[point].row; point := point - 1; while graph[point].length + values[graph[point].row] <> slot[i] do point := point - 1; ch2 := graph[point].row; val := -1000; goto 99 end end; (* handle special case of letter with frequency one *) j := lorder[ch]; if freq[j].num < 200 then start := -slot[1] + 1 + lowest; empty := false; val := start -1; while not empty do begin val := val + 1; empty := true; for k := 1 to numslots do empty := empty and (hashtable[val+slot[k]] = 0); empty := empty and checkdiag(ch,val,point); end; (* while *) end (* then *) else begin val := start; while not checkdiag(ch,val,point) do val := val + 1 end; 99:end; (* findchval *) (*********************** procedure createtable **************************** createtable finds letter value assignments and makes hash table entries for a minimal perfect hash. it does the letter value assignments by considering all groups of triples with the same middle component begin- ning with the first letter in the letter ordering. *) procedure createtable; label 99 , 98 ; begin point := 1; i := 1; while i <= lastletter do begin ch := order[i]; findchval(ch,0,point,val); 98: if val = -1000 then begin repeat i := i -1; ch := order[i]; undo(ch); until (ch = ch1) or (ch = ch2); val := values[ch] + 1; findchval(ch,val,point,val); goto 98 end; values[ch] := val; while graph[point].col = ch do begin j := graph[point].length + values[graph[point].row] + val; if ( j <= highest - size) then begin undo(ch); findchval(ch,val+1,point,val); goto 98 end; if (j >= lowest + size) then begin backup(ch,point); goto 98 end else if hashtable[j] <> 0 then begin ch := graph[point].col; undo(ch); val := values[ch]; findchval(ch,val+1,point,val); goto 98 end else begin hashtable[j] := graph[point].index; if j < lowest then lowest := j; if j > highest then highest := j; if point >= size then goto 99 else point := point + 1 end end; i := i + 1 end; 99:end; (* createtable *) (************************* procedure backup ******************************** letter value assignment returned by findchval does not yield minimal table. backup undoes value assignment and any associated hash table entries for the group of triples with the current letter and the group of triples for the previous letter. *) procedure backup; begin if graph[point-1].col = ch then undo(ch); repeat i := i-1; ch := order[i]; until graph[point -1].col = ch; undo(ch); findchval(ch,val+1,point,val); values[ch] := val end; (* backup *) begin (* main program *) initialize; readnames; findorder; ordergraph; sttime := clock; createtable; entime := clock; writeln(' execution time = ', (entime-sttime)); writeln(' letter value '); for ch := cha to chz do writeln(' ',ch,values[ch] :8); for i := lowest to highest do begin j := hashtable[i]; writeln(' hash table[',i:3,'] ',pnames[j]) end end. (* main program *)