[net.sources] Perfect hash function finder - run this script in empty directory

charlie@genrad.UUCP (Charlie Havener) (10/07/83)
PATH=:/bin:/usr/bin:/usr/ucb
export PATH
all=FALSE
if [ $1x = -ax ]; then
	all=TRUE
fi
/bin/echo 'Extracting perf.h'
sed 's/^X//' <<'//go.sysin dd *' >perf.h
X/*	perf.h - a subset of a "std.h"
*/
X/* the pseudo storage classes
 */
#define AFAST	register
#define FAST	register
#define GLOBAL	extern
#define IMPORT	extern
#define INTERN	static
#define LOCAL	static

X/* the pseudo types
 */
typedef char TEXT, TINY;
typedef double DOUBLE;
typedef int ARGINT, BOOL, VOID;
typedef long LONG;
typedef short COUNT, FILE;
typedef unsigned short BITS;
typedef unsigned BYTES;
typedef unsigned char UTINY;
typedef unsigned long ULONG;
typedef unsigned short UCOUNT;

X/* system parameters
 */
#define STDIN	0
#define STDOUT	1
#define STDERR	2
#define YES		1
#define TRUE	1
#define NO		0
#define FALSE	0
#define NULL	0
#define FOREVER	for (;;)
#define BUFSIZE	512
#define BWRITE	-1
#define READ	0
#define WRITE	1
#define UPDATE	2
#define EOF		-1

X/*	macros
 */
#define isalpha(c)	(islower(c) || isupper(c))
#define isdigit(c)	('0' <= (c) && (c) <= '9')
#define islower(c)	('a' <= (c) && (c) <= 'z')
#define isupper(c)	('A' <= (c) && (c) <= 'Z')
#define iswhite(c)	((c) <= ' ' || 0177 <= (c)) /* ASCII ONLY */
#define tolower(c)	(isupper(c) ? ((c) + ('a' - 'A')) : (c))
#define toupper(c)	(islower(c) ? ((c) - ('a' - 'A')) : (c))
//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 perf.h
	/bin/echo -n '	'; /bin/ls -ld perf.h
fi
/bin/echo 'Extracting perfect.c'
sed 's/^X//' <<'//go.sysin dd *' >perfect.c
X/*
	Program to search for Minimal Perfect Hash Functions for
	use in lexical analyzers. C.D. Havener Jan 26 1982
	GenRad Inc. 37 Great Road, Bolton Mass. 01740
	Based on paper "Minimal Perfect Hash Functions Made Simple"
	by Richard J. Cichelli - Comm. of ACM Jan 1980 pp 17-19

	Synopsis: The hash function is h = assoc value of 1st
	letter + length of keyword + assoc value of last letter

	This program finds the associated values of the letters
	given a list of keywords, 1 per line. It works most of
	the time for up to about 40 keywords but certain
	pathalogical cases exist. A semi-perfect hash is usually
	found by the program. The user can then tighten the
	default limits for max associated char value or the
	table limit using the -v and -t options. Sometimes the
	presort heuristics actually make the search process
	much more difficult. The user can try his luck at manual
	sorting using the -n option. Since the hash function
	produces such a limited range of numbers it can only work
	for up to about 40 keywords. If a language needs say 80
	keywords just split them up into two tables and let the
	lexical analyzer look in first one then the other, this
	will still be much faster than any other keyword lookup.

	This program has run sucessfully on a VAX 11/780 under
	4.1BSD. 
*/

#include "perf.h"	/* misc standard defines used at GenRad */
#define UNIX 1
#ifdef UNIX
#include <signal.h>
#undef EOF
#include <stdio.h> /* needed for the stream I/O */
#endif
#define STATIC static
#define MAXKEYS 100
#define MAXCHARS 0377
#ifdef UNIX
#define BPW 4		/* the number of bytes per word on this computer */
#else
#define BPW 2
#endif
#define UNDEF -1	/* the undefined value */


	/* no attempt to be efficient on a VAX with mucho memory */
TINY cval[MAXCHARS];	/* a place for every char known to Ascii */
COUNT cused[MAXCHARS];	/* keep count of how many times char used */
COUNT order[MAXKEYS];	/* current ordering of key words by subscript */
COUNT neworder[MAXKEYS];	/* the new supposedly improved ordering */
COUNT hashval[MAXKEYS];	/* the current hashvalue of the key word */
COUNT value[MAXCHARS];	/* the associated value of the character */
BOOL mapped[MAXKEYS];	/* keeps track of which table entries are in use */
TEXT name[50];		/* much bigger than any keyword should be ! */
TEXT *keywds[MAXKEYS];
#ifdef UNIX
VOID status();
LONG time();
TEXT *ctime();
#endif

COUNT debug = 0;
COUNT nkeys = sizeof(keywds)/BPW; 
COUNT tablesize = sizeof(keywds)/BPW;	
COUNT trys = 0;
COUNT nletters = 0;
COUNT kilotrys = 0;
COUNT atime = 600;	/* default alarm status time 10 min. */
TEXT *malloc();
COUNT hash();
BOOL aredefined();
TEXT *klimit = 0;
TEXT *textp =0;
LONG bigcount = 0;
COUNT depth = 0;
COUNT k_now = 0;	/* the present value of k for status reports */
BOOL sort = TRUE;

TEXT lettrs[37];	/* string of defined chars */
COUNT ptr = 0;

X/*-----------------------------------------------------------*/

main(argc,argv)
	int argc;
	char *argv[];
{
	COUNT i,j,k,m = 0;
	STATIC COUNT vlimit = 0;
	STATIC COUNT keylimit = 0;
	STATIC COUNT tlimit = 0;
	LONG start,stop;		/* the start and finish times */
	char *u;

	while ( --argc > 0 && (*++argv)[0] == '-')
		for ( u = argv[0]+1 ; *u != '\0' ; u++ )
			switch(*u)
			{
			/* things like vlimit,tlimit don't work as stack */
			/* variables! sscanf doesn't like them on stack */
			case 'v': /* assoc value limit */
				textp = *++argv;
				sscanf(textp,"%d",&vlimit);
				break;
			case 't': /* tablesize limit */
				textp = *++argv;
				sscanf(textp,"%d",&tlimit);
				break;
			case 'a': /* alarm time for status */
				textp = *++argv;
				sscanf(textp,"%d",&atime);
				break;
			case 'k': /* keyword limit */
				klimit = *++argv;
				sscanf(klimit,"%d",&keylimit);
				break;
			case 'n': /* no heuristic sort wanted */
				sort = FALSE;
				break;
			case '1':	/* debug printout level */
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				debug = *u - '0';
				break;
			default:
				printf("\nt32: illegal option %c\n", *u);
				printf("\nUsage: -n -t # -k # -v # -# -a #"); 

			}
#ifdef UNIX
	start = time(0);
	signal(SIGALRM,status);
	signal(SIGTERM,status);	/* on kill -TERM pid , give status */
	alarm(atime);	/* print status every N secs */
#endif

	i = 0;
	while ( (scanf("%s",name)) != EOF )
		{
		keywds[i++] = u = malloc(strlen(name)+1);
		if ( u == 0 )
			{
			printf("\n malloc() failure ");
			exit();
			}
		strcpy(u,name);
		}
	nkeys = ( keylimit == 0 ? i : keylimit );

	for ( i=0 ; i<MAXKEYS ; i++)
		{
		hashval[i] = UNDEF;
		order[i] = i;
		mapped[i] = FALSE;
		}

	for ( i=0 ; i<MAXCHARS ; i++ )
		{
		cval[i] = 0;
		value[i] = UNDEF;
		}

	if ( precheck() == FALSE )
		{
		printf("\nPerfect hash search terminated \n");
		exit();
		}


	for ( i=0 ; i<nkeys ; i++ )
		{
		j = *keywds[i];
		k = *(keywds[i] + strlen(keywds[i]) -1  );
		cval[j] = j;	/* put char val in array at chars subscript value */
		cval[k] = k;
		if ( cused[j] == 0 )
			++nletters;
		++cused[j];	/* keep count of how often each letter is used */
		if ( cused[k] == 0 )
			++nletters;	/* also count distinct letters */
		++cused[k];
		}
	tablesize = (tlimit == 0 ? MAXKEYS : tlimit );
	printf("\nPERFECT HASH FUNCTION FINDER , CDH Ver. 2.9\n");
	printf("\nStart time = %s",ctime(&start) );
	printf("Number of keywords = %d",nkeys);
	printf("\nNumber of distinct letters = %d",nletters);
	nletters = ( vlimit > 0 ? vlimit : nletters );
	printf("\nThe associated char value limit is %d",nletters );
	printf("\nThe table size limit is %d",tablesize);
	/* usually make it at least nkeys + 1 since first hash */
	/* value is usually 1 or 2 even if both assoc char values */
	/* are zero since the keyword length is included !  */


	if ( sort == FALSE )
		{
		printf("\n ----> Presorting of keywords is turned off");
		goto begin;
		}

	/* first order by sum of frequencies of occurrences of each */
	/* keys 1st and last letter */

	for ( i=0 ; i < nkeys ; i++ )
		{
		order[i] = cused[*keywds[i]] + cused[*(keywds[i]
						+ strlen(keywds[i]) -1)];
		}

	for ( m=0 ; m<nkeys ; m++ )
		{
		for ( k = -1 , i=0 ; i<nkeys ; i++ )
			{
			if ( order[i] > k )
				{
				k = order[i];
				j = i;			/* remember the keywd subscript */
				}
			}
		order[j] = 0;
		neworder[m] = j;
		}

	for ( i=0 ; i<nkeys; i++ )
		order[i] = neworder[i];

	if ( debug > 2 )
		{
		printf("\nAfter first ordering\n");
		prntorder();
		}

X/* the second ordering follows */
X/* keywds whose values are defined by keywds earlier in the */
X/* order are placed immediately after they are defined. */
X/* This causes hash value conflicts to occur as early during */
X/* the search as possible */


	for ( i=0 ; i<sizeof(lettrs) ; i++ )
		lettrs[i] = '\0';
	ptr = 0;
	merge(order[0]);	/* prime the pump */
	neworder[0] = order[0];
	order[0] = UNDEF;
	for ( i=1 ; i<nkeys ; )
		{
		BOOL newvalues = TRUE;
		while ( newvalues )
			{
			newvalues = FALSE;
			for ( k=0 ; k<nkeys ; k++ )
				{
				if ( order[k] == UNDEF )
					continue;
				if ( aredefined(order[k]) )
					{
					neworder[i++] = order[k];
					order[k] = UNDEF;
					continue;
					}
				}
			for ( k=0; k<nkeys ; k++ )
				if ( order[k] != UNDEF )
					{
					neworder[i++] = order[k];
					merge(order[k]);
					order[k] = UNDEF;
					newvalues = TRUE;
					break;
					}
			}
		}

	for ( i=0 ; i<nkeys ; i++ )
		order[i] = neworder[i];

	if ( precheck() == FALSE)
		{
		printf("\nOOPS - call a Guru, the presort botched it");
		prntorder();
		exit();
		}

X/* - - - - - - - BEGIN SEARCHING - - - - - - - - - - - - - - - */

begin:	printf("\n The search ordering is \n");
	prntorder();
	fflush(stdout); /* needed to force initial text out */

	if ( search(0) )
		{
		printf("\nSUCCESS ! Associated Char Values Follow: \n");
		prntvals();
		prnthash();
		}
	else
		{
		printf("\nFAILED to find char values for hash function");
		}
	printf("\nTotal search() invocations = %ld ",bigcount);
	stop = time(0);
	printf("\nStarted %s",ctime(&start));
	printf("Finished %s",ctime(&stop));
	printf("\n");
}

X/*-----------------------------------------------------------*/
X/* merge - adds keywd letters to the string of those defined */
VOID merge(n)
	ARGINT n;
{
	BOOL install_a,install_b = TRUE;
	TEXT a,b;
	COUNT i;

	if ( debug > 2 )
		printf("\nmerging in %s",keywds[n]);
	a = *(keywds[n]);
	b = *(keywds[n] + strlen(keywds[n]) - 1 );
	for ( i=0 ; i <= ptr ; i++ )
		{
		if ( a == lettrs[i] )
			install_a = FALSE;
		if ( b == lettrs[i] )
			install_b = FALSE;
		}
	if ( install_a )
		lettrs[ptr++] = a;
	if ( install_b && ( a != b ) )
		lettrs[ptr++] = b;
}

X/*----------------------------------------------------------*/
X/* aredefined - see if 1st & last char of keywd are defined */

BOOL aredefined(n)
	ARGINT n;
{
	TEXT a,b;
	COUNT i,k;

	a = *keywds[n];
	b = *(keywds[n] + strlen(keywds[n]) - 1 );
	for ( i=0, k=0 ; i <= ptr ; i++ )
		{
		if ( (a == lettrs[i]) || (b == lettrs[i]) )
			k++;
		}
	return ( k==2 ? TRUE : FALSE );
}

X/*---------------------------------------------------------*/
X/* precheck - all keywds length,1st and last char disjoint */

BOOL precheck()
{
	BOOL pretest = TRUE;
	COUNT i,j,a,b,c,d,leni,lenj;
	COUNT m,k;

	for ( m = 0 ; m < nkeys ; m++ )
		{
		i = order[m];
		for ( k = m+1; k < nkeys-1 ; k++ )
			{
			j = order[k];
			if ( (leni=strlen(keywds[i]) ) ==
				( lenj=strlen(keywds[j]) ) )
				{
				a = *keywds[i];
				b = *(keywds[i] + leni - 1);
				c = *keywds[j];
				d = *(keywds[j] + lenj -1 );
				if ( (a==c && b==d) || ( a==d && b==c) )
					{
					pretest = FALSE;
					printf("\nPrecheck fails on %s and %s",
									keywds[i],keywds[j]);

					}
				}
			}
		}
	return pretest;
}
X/*--------------------------------------------------------*/
X/* prntorder - printout the current order of the keywords */

VOID prntorder()
{
	COUNT i,j;
	for( i=0, j=0 ; i<nkeys ; i++, j++ )
		{
		if ( j >= 8 )
			{
			printf("\n");
			j = 0;
			}
		printf("%s ",keywds[order[i]]);
		}
	printf("\n");
}

X/*----------------------------------------------------*/
X/* prntvals - prints out the letter associated values */

prntvals()
{

		COUNT i,j;
		for( i=0, j=0 ; i<MAXCHARS ; j++ ,i++ )
			if( cval[i] )
				{
				if( j > 10 )
					{
					j = 0;
					printf("\n");
					}
				printf("%c  %d,",cval[i],value[i]);
				}
	printf("\n");
}

X/*------------------------------------------------------*/
X/* prnthash - prints out the hash values for the keywds */

VOID prnthash()
{

		COUNT i,j;
		BOOL swap = TRUE;
		COUNT hmin = MAXKEYS;
		COUNT hmax = 0;
		COUNT spread = 0;

		for ( i = 0 ; i < nkeys ; i++ )
			{
			j = hashval[i] = hash( keywds[i] );
			hmin = ( hmin < j ? hmin : j );
			hmax = ( hmax > j ? hmax : j );
			order[i] = i;
			}
		while ( swap )		/* plain vanilla bubble sort */
			{
			swap = FALSE;
			for ( i = 0 ; i < nkeys-1 ; i++ )
					{
					if ( hashval[order[i+1]] < hashval[order[i]] )
						{	
						swap = TRUE;
						j = order[i];
						order[i] = order[i+1];
						order[i+1] = j ;
						}
					}
			}
		printf("\nHash min =%d, max =%d, spread = %d\n",
					hmin,hmax,hmax-hmin+1);

		for ( i=0, j=0 ; i<nkeys ; i++, j++)
				{
				if ( j >= 8 )
					{
					printf("\n");
					j = 0;
					}
				printf("%s %d,",keywds[order[i]],
							hash(keywds[order[i]]));
				}
		printf("\n");
}

X/*--------------------------------------------------------*/

X/* hash - passed ptr to string. Returns hash value 	*/
COUNT hash(p)
	TEXT *p;
{
	return( value[*p] + strlen(p) + value[ *(p + strlen(p) -1) ] );
}

X/*-------------------------------------------------------*/
X/* search - calls itself recursively to find char values */

BOOL search(k)
	COUNT k;
{
	TEXT *p;
	COUNT v1,v2,num,m;
	COUNT sub1,sub2,subn;
	BOOL thesame = FALSE;

	bigcount++;
	k_now = k;			/* for status reporting */
	if ( k >= nkeys )	/* hey - we may be all done */
		return TRUE;

	if ( k > depth )
		depth = k;		/* keep track of how deep we searched */

	m = order[k];
	p = keywds[m];
	sub1 = *p;
	sub2 = *(p + strlen(p) -1);
	if ( sub1 == sub2 )
		thesame = TRUE;
	v1 = value[sub1];
	v2 = value[sub2];

	if ( defined(v1)  && defined(v2)  )
		{
		num = hash(p);
		if ( used(num) )
			return FALSE;	/* bad hash value, already in use */
		else
			{
			hashval[m] = num;	/* install it */
			mapped[num] = TRUE;
			if ( search(k+1) )
				return TRUE;
			else
				{
				hashval[m] = UNDEF;
				mapped[num] = FALSE;
				return FALSE;
				}
			}
		}
	else if ( defined(v1) )
			{
			COUNT j;
			for ( j=0 ; j<=nletters ; j++)
				{
				v2 = j;
				num = v1 + strlen(p) + v2;
				if ( !used(num) )
					{
					hashval[m] = num;
					mapped[num] = TRUE;
					value[sub2] = v2;
					subn = sub2;
					if ( search(k+1) )
						return TRUE;
					else
						remove(m,sub2);
					}
				}
			return FALSE;
			}
	else if ( defined(v2) )
			{
			COUNT j;
			for ( j=0 ; j<=nletters ; j++ )
				{
				v1 = j;
				num = v1 + strlen(p) + v2;
				if ( !used(num) )
					{
					hashval[m] = num;
					mapped[num] =TRUE;
					value[sub1] = v1;
					subn = sub1;
					if ( search(k+1) )
						return TRUE;
					else
						remove(m,sub1);
					}
				}
			return FALSE;
			}

	else		/* neither defined */
		{
		COUNT j;
		for ( j=0 ; j <= nletters ; j++ )
			{
			if ( thesame )
				{
				v1 = v2 = j;
				num = v1 + strlen(p) + v2;
				if ( !used(num) )
					{
					hashval[m] = num;
					mapped[num] = TRUE;
					value[sub1] = v1;	/* same place as value[sub2] */
					subn = sub1;
					if ( search(k+1) )
						return TRUE;
					else
						remove(m,subn);
					}
				}
			else
				{
				value[sub1] = j;
				if ( search(k) )	/* if never TRUE 
								thru for loop, then FALSE */
					return TRUE;
				else
					value[sub1] = UNDEF;
				}
			}
		return FALSE;
		}
}

X/*-------------------------------------------------------*/
X/* remove - backup by deleting keywds hash value etc */

VOID remove(m,subn)
	FAST COUNT m;
	FAST COUNT subn;
{
	if ( debug > 6)
		printf("\nremoving %s, subn = %d",keywds[m],subn);
	mapped[ hashval[m] ] = FALSE;
	hashval[m] = UNDEF;
	value[subn] = UNDEF;
}

X/*--------------------------------------------------------*/
X/* used - tests if the hash value is in use or is illegal */

BOOL used(num)
	FAST COUNT num;
{
	return( num > tablesize || num < 0 ? TRUE : mapped[num] );
}

X/*-----------------------------------------------------*/
X/* defined - checks if the character value is defined yet */

BOOL defined(value)
	FAST COUNT value;
{
	return( value != UNDEF ? TRUE : FALSE );
}
X/*--------------------------------------------------------*/
X/* status - on signal this reports the current statistics */

#ifdef UNIX
VOID status()
{
	fprintf(stderr,
		"\nSTATUS: nkeys=%d depth=%d k_now=%d, bigcount=%ld\n"
					,nkeys,depth,k_now,bigcount);
	fflush();

	signal(SIGTERM,status);
	signal(SIGALRM,status);
	alarm(atime);
}
#endif

//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 perfect.c
	/bin/echo -n '	'; /bin/ls -ld perfect.c
fi
/bin/echo 'Extracting status.c'
sed 's/^X//' <<'//go.sysin dd *' >status.c
X/* interprocess communication program for use with the perfect hash */
X/* finding program. C.D. Havener Feb 19 1982. GenRad CTD Boloton Mass. */

#include <stdio.h>
#include <signal.h>

int wakeup();
int pid;
int interval;

main()
{

	printf("\nEnter PID: ");
	scanf("%d",&pid);
	printf("\nEnter reporting interval in seconds: ");
	scanf("%d",&interval);
	printf("\nWill send SIGTERM to process %d every %d seconds\n",pid,interval);
	signal(SIGALRM,wakeup);
	alarm(3);
	for(;;)
	{
		wait();	/* can't sleep() here it turns off alarm !! */
	}
}

wakeup()
{
	kill(pid,SIGTERM);
	signal(SIGALRM,wakeup);
	alarm(interval);
}
//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 status.c
	/bin/echo -n '	'; /bin/ls -ld status.c
fi
/bin/echo 'Extracting Makefile'
sed 's/^X//' <<'//go.sysin dd *' >Makefile
#
# Makefile for perfect hash program
#
X.SUFFIXES:
X.SUFFIXES: .o .c
#
#
# Specify -g in both the .c.o line and the perf: line to provide output
# for sdb.
#
X.c.o:
	cc -c  $*.c
#
OBJECTS= perfect.o 
#
HEADERS= perf.h
#
#
perf: $(OBJECTS) Makefile
	cc -o  perf $(OBJECTS)
	cc -o status status.c
#
$(OBJECTS) : $(HEADERS)
//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 Makefile
	/bin/echo -n '	'; /bin/ls -ld Makefile
fi
/bin/echo 'Extracting 2230.txt'
sed 's/^X//' <<'//go.sysin dd *' >2230.txt
EE
REG
LABEL
IF
RANGE
RP
GP
CP
CS
INTEGRATE
TO
RDC
Q
CONTACT
D
SENSE
VDC
FREQ
GUARD
ESR
RATIO
GDC
LOW
BIAS
IDC
LS
MEMBER
EXCLUDE
AND
OR
NOT
DISPLAY
RETURN
PASS
FAIL
CALL
PAUSE
POP
DELAY
PERFORM
HIGH

//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 2230.txt
	/bin/echo -n '	'; /bin/ls -ld 2230.txt
fi
/bin/echo 'Extracting ckeywds.txt'
sed 's/^X//' <<'//go.sysin dd *' >ckeywds.txt
int
char
float
double
struct
union
long
short
unsigned
auto
extern
register
typedef
static
goto
return
sizeof
break
continue
if
else
for
do
while
switch
case
default
entry

//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 ckeywds.txt
	/bin/echo -n '	'; /bin/ls -ld ckeywds.txt
fi
/bin/echo 'Extracting kwds.txt'
sed 's/^X//' <<'//go.sysin dd *' >kwds.txt
GET
TEXT
RESET
OUTPUT
MAXINT
INPUT
TRUE
INTEGER
EOF
REWRITE
FALSE
CHR
CHAR
TRUNC
REAL
SQR
SQRT
WRITE
PUT
ORD
READ
ROUND
READLN
EXP
PAGE
EOLN
COS
SUCC
DISPOSE
NEW
ABS
LN
BOOLEAN
WRITELN
SIN
PACK
UNPACK
ARCTAN
PRED

//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 kwds.txt
	/bin/echo -n '	'; /bin/ls -ld kwds.txt
fi
/bin/echo 'Extracting perf.doc'
sed 's/^X//' <<'//go.sysin dd *' >perf.doc






          PERFECT HASHING - A PROGRAM TO FIND THE HASH FUNCTION

          Charlie Havener GenRad Inc. Component Test Division Bolton Mass.



                        This technical note describes an implementation  of
          a  pragmatic  algorithm  for finding perfect or semi-perfect hash
          functions for lists of keywords. The resulting hash function  can
          be  used  to  speed  up lexical analyzers used in translators and
          compilers. The algorithm was described by R.J.  Cichelli  in  The
          January  1980  issue of Communications of the ACM under the title
          "Minimal Perfect Hash Functions made Simple." The article did not
          include a computer language description of the algorithm and some
          important implementation details were unclear. It is assumed that
          the  user will know what to do with the output of this program. A
          marginally commented example of part of  a  lexical  analyzer  is
          included  in  Appendix B.  Another reference for those wishing to
          pursue the topic further is "More On Minimal Perfect Hash tables"
          by  Curtis  Cook  and  R.  Oldehoeft, a Colorado State University
          Technical Report.


                        The program takes a list of keywords and sorts them
          in  such  a  way that the search time for a hash function will be
          reasonable.  Once sorted a recursive trial  and  error  procedure
          hunts  for  a  hash  function  satisfying user supplied bounds of
          table size  and  associated  character  value  limits.  The  hash
          function is

          hash = assoc value of first character + keyword length
                                           + assoc value pf last character

          It is critically important to  select  a  good  ordering  of  the
          keywords  before searching begins. I ran up 100 hours of VAX time
          searching for a hash function with an unordered  list,  and  gave
          up.  Once  the  sort  heuristics  were debugged it found the hash
          function in minutes. Typically  it  will  find  the  function  in
          minutes or you may as well give up. A status reporting feature is
          built into the program on the UNIX system that  lets  you  follow
          the  progress  of  the  search depth.  If it has trouble, you can
          tell just which word it can't  get  past,  and  take  appropriate
          action. If it has trouble you can attempt to alter it's choice of
          pre-ordering by moving troublesome words  to  the  front  of  the
          list.  In          the sort heuristics makes the search longer.  There  is  also  no
          guarentee that a hash function can be found. The program does the
          obvious precheck that no two keywords have  the  same  first  and
          last  letter  and length in common ( e.g. BAK KAB ). Nonetheless,
          as pointed out by Jaeschke  and  Osterberg  in  an  overly  harsh
          criticism there are many pathalogical sets of keywords that fail.
          ( On Cichelli's minimal perfect hash functions method. Comm.  ACM
          Dec  1980  pp  728-729  ) The algorithm also only works for up to
          about 40 keywords due to the limited range of numbers the formula


                                        - 1 -









          can  generate.  If  you have, say 80 keywords, just make two hash
          tables and look in each  one.  Here  are  some  examples  of  the
          programs output :


          PERFECT HASH FUNCTION FINDER , CDH Ver. 2.9

          Start time = Thu Mar 18 15:50:22 1982
          Number of keywords = 41
          Number of distinct letters = 21
          The associated char value limit is 21
          The table size limit is 42
           The search ordering is
          EE RANGE ESR EXCLUDE RP PAUSE RDC CP
          POP SENSE CS PASS CALL LS REG GP
          GDC LABEL INTEGRATE IDC D GUARD RATIO OR
          CONTACT TO MEMBER PERFORM RETURN NOT VDC FAIL
          IF DISPLAY DELAY LOW AND Q FREQ BIAS
          HIGH

          SUCCESS ! Associated Char Values Follow:

          A  18,B  19,C  7,D  17,E  0,F  20,G  19,H  0,I  18,
          L  8,M  20,N  2,O  18,P  4,Q  18,R  0,S  7,T  19,V  4,
          W  7,Y  12,

          Hash min =2, max =42, spread = 41
          EE 2,ESR 3,HIGH 4,RANGE 5,RP 6,EXCLUDE 7,RETURN 8,PAUSE 9,
          RDC 10,POP 11,SENSE 12,CP 13,VDC 14,PASS 15,CS 16,LS 17,
          LOW 18,CALL 19,OR 20,LABEL 21,REG 22,RATIO 23,NOT 24,GP 25,
          MEMBER 26,INTEGRATE 27,IDC 28,GDC 29,BIAS 30,PERFORM 31,FAIL 32,CONTACT 33,
          DELAY 34,D 35,DISPLAY 36,Q 37,AND 38,TO 39,IF 40,GUARD 41,
          FREQ 42,

          Total search() invocations = 6984
          Started Thu Mar 18 15:50:22 1982
          Finished Thu Mar 18 15:50:48 1982


          PERFECT HASH FUNCTION FINDER , CDH Ver. 2.9

          Start time = Fri Feb 26 21:55:35 1982
          Number of keywords = 39
          Number of distinct letters = 19
          The associated char value limit is 19
          The table size limit is 41
           The search ordering is
          TEXT RESET TRUE REWRITE READLN EOLN SQRT SQR
          SIN TRUNC CHR CHAR COS SUCC PUT EXP
          PAGE READ ROUND DISPOSE PRED OUTPUT ORD INPUT
          INTEGER GET MAXINT REAL LN WRITE NEW WRITELN
          EOF FALSE ARCTAN ABS BOOLEAN PACK UNPACK

          SUCCESS ! Associated Char Values Follow:


                                        - 2 -










          A  15,B  9,C  11,D  19,E  5,F  3,G  0,I  3,K  16,
          L  13,M  1,N  19,O  0,P  18,R  0,S  15,T  0,U  17,
          W  10,

          Hash min =3, max =41, spread = 39
          GET 3,TEXT 4,RESET 5,OUTPUT 6,MAXINT 7,INPUT 8,TRUE 9,INTEGER 10,
          EOF 11,REWRITE 12,FALSE 13,CHR 14,CHAR 15,TRUNC 16,REAL 17,SQR 18,
          SQRT 19,WRITE 20,PUT 21,ORD 22,READ 23,ROUND 24,READLN 25,EXP 26,
          PAGE 27,EOLN 28,COS 29,SUCC 30,DISPOSE 31,NEW 32,ABS 33,LN 34,
          BOOLEAN 35,WRITELN 36,SIN 37,PACK 38,UNPACK 39,ARCTAN 40,PRED 41,

          Total search() invocations = 218463
          Started Fri Feb 26 21:55:35 1982
          Finished Fri Feb 26 21:57:39 1982


          Usage: %perf < input

          Options: -n             no presort of the input keyword list
                   -t number      limits the max table size ( 100 default )
                   -v number      limits the max associated char value
                   -k number      takes only the first number keywords in
                                  the input list
                   -a number      specifies alarm time in seconds for status report.
                                  Default is every 10 minutes to stderr.
                   -number        1 thru 9 causes debug printouts

          %perf -n -t 42 -v 15 < in.txt > perf.out

          Usually the first time, just let everything default, second time
          just use the t option to limit the table size to the first hash
          value plus the number of keywords. The program will accept SIGTERM
          signals for an update status report since it may take quite a while
          to find the hash function values. The program status.c is a convenient
          way to send signals to perfect.c every so many seconds.
          Examples included are 2230.txt, keywords for a GenRad test system,
          kwds.txt, something to do with PASCAL ( I think that is a computer
          language used by students ), and lastly ckeywds.txt which is the
          set of key words for C.


                   An excerpt from a lexical analyzer program that uses the
          perfect  hash function is shown in appendix A. It shows the way a
          structure can be used as the perfect hash table.











                                        - 3 -










          Appendix B       Lexical Analyzer Data  Structure  using  Perfect
          Hash Function


          /* the following character associated values and the hash table
          ordering were computed by the program perfect.c on the UNIX vax.
          written by C.D. Havener
          Note that COUNT TEXT and FAST are #defined in a standard header
          somewhere to be int char and register respectively.
          This is just a program fragment! The user must supply the
          match function etc.
          */
          COUNT assocval[] = { 18, 19, 7, 17, 0, 20, 19, 0, 18,
                                   0, 0, 8, 20, 2, 18, 4, 18, 0, 7, 19, 0, 4,
                                   7, 0, 12, 0 };

          COUNT *a = assocval - 'A';       /* speeds up hash calculation */
          typedef struct keystuff {
                   TEXT *keytxt;
                   COUNT ktype;
                   COUNT lexv;
                   }KEYS,*KEYPTR;

          KEYS table[] = {
                           "EE",EE,64,
                           "ESR",SIMPLREG,18,
                           "HIGH",SIMPLREG,0,
                           "RANGE",SIMPLREG,3,
                           "RP",SIMPLREG,15,
                           "EXCLUDE",EXCLUDE,27,
                           "RETURN",SIMPLOP,35,
                           "PAUSE",SIMPLOP,46,
                           "RDC",SIMPLREG,9,
                           "POP",POP,47,
                           "SENSE",ARRAYREG,13,
                           "CP",SIMPLREG,5,
                           "VDC",SIMPLREG,14,
                           "PASS",SIMPLOP,40,
                           "CS",SIMPLREG,6,
                           "LS",SIMPLREG,24,
                           "LOW",SIMPLREG,21,
                           "CALL",TRANSFEROP,37,
                           "OR",OR,30,
                           "LABEL",LABELKEY,39,
                           "REG",ARRAYREG,67,
                           "RATIO",SIMPLREG,19,
                           "NOT",NOT,31,
                           "GP",SIMPLREG,4,
                           "MEMBER",MEMBER,26,
                           "INTEGRATE",SIMPLREG,7,
                           "IDC",SIMPLREG,23,
                           "GDC",SIMPLREG,20,
                           "BIAS",SIMPLREG,22,


                                        - 4 -









                           "PERFORM",TRANSFEROP,49,
                           "FAIL",SIMPLOP,43,
                           "CONTACT",SIMPLREG,11,
                           "DELAY",SIMPLREG,48,
                           "D",SIMPLREG,12,
                           "DISPLAY",SIMPLREG,34,
                           "Q",SIMPLREG,10,
                           "AND",AND,29,
                           "TO",TO,59,
                           "IF",IF,32,
                           "GUARD",SIMPLREG,17,
                           "FREQ",SIMPLREG,16,
                           };

          /*---------------------------------------------------------*/
          /* iskeywd - determines if next token is a keywd or not */
          iskeywd(p)
          FAST char *p;
          {
                   COUNT hash;
                   FAST char *t;
                   COUNT len = -2; /* since hash values start at 2 and table at 0 */

                   t = p;
                   while ( *t <= 'Z' && 'A' <= *t )
                           {
                           t++;
                           len++;
                           }
          /*       hash = assocval[*p - 'A' ] + len + assocval[*(t-1) - 'A' ]; */
                   hash = *(a + *p ) + len + *(a + *(t-1) );
                   if ( 0 <= hash && hash <= 41 && match(p,table[hash].keytxt) )
                           {
                           type = table[hash].ktype;
                           yylval = table[hash].lexv;
                           return TRUE;
                           }
                   return(FALSE);
          }
          /*---------------------------------------------------------*/
















                                        - 5 -



//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 perf.doc
	/bin/echo -n '	'; /bin/ls -ld perf.doc
fi