[comp.sources.misc] v06i042: dynamic hashing version of hsearch

ejp@ausmelb.oz.AU (Esmond Pitt) (03/08/89)

Posting-number: Volume 6, Issue 42
Submitted-By: Esmond Pitt <ejp@ausmelb.oz.AU>
Archive-name: dynamic-hash


/*
** Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
** Coded into C, with minor code improvements, and with hsearch(3) interface,
** by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
** also, hcreate/hdestroy routines added to simulate hsearch(3).
**
** These routines simulate hsearch(3) and family, with the important
** difference that the hash table is dynamic - can grow indefinitely
** beyond its original size (as supplied to hcreate()).
**
** Performance appears to be comparable to that of hsearch(3).
** The 'source-code' options referred to in hsearch(3)'s 'man' page
** are not implemented; otherwise functionality is identical.
**
** Compilation controls:
** DEBUG controls some informative traces, mainly for debugging.
** HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
** when combined with DEBUG, these are displayed by hdestroy().
**
** Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
** concatenation property, in probably unnecessary code 'optimisation'.
*/

# include	<stdio.h>
# include	<search.h>
# include	<assert.h>
# include	<string.h>

/*
** Constants
*/

# define SegmentSize		256
# define SegmentSizeShift	8	/* log2(SegmentSize)	*/
# define DirectorySize		256
# define DirectorySizeShift	8	/* log2(DirectorySize)	*/
# define Prime1			37
# define Prime2			1048583
# define DefaultMaxLoadFactor	5

/*
** Fast arithmetic, relying on powers of 2,
** and on pre-processor concatenation property
*/

# define MUL(x,y)		((x) << (y/**/Shift))
# define DIV(x,y)		((x) >> (y/**/Shift))
# define MOD(x,y)		((x) & ((y)-1))

/*
** local data templates
*/

typedef struct element
    {
    /*
    ** The user only sees the first two fields,
    ** as we pretend to pass back only a pointer to ENTRY.
    ** {S}he doesn't know what else is in here.
    */
    char		*Key;
    char		*Data;
    struct element	*Next;	/* secret from user	*/
    } Element,*Segment;

typedef struct
    {
    short	p;		/* Next bucket to be split	*/
    short	maxp;		/* upper bound on p during expansion	*/
    long	KeyCount;	/* current # keys	*/
    short	SegmentCount;	/* current # segments	*/
    short	MinLoadFactor;
    short	MaxLoadFactor;
    Segment	*Directory[DirectorySize];
    } HashTable;

typedef unsigned long	Address;

/*
** external routines
*/

extern char	*calloc();
extern int	free();

/*
** Entry points
*/

extern int	hcreate();
extern void	hdestroy();
extern ENTRY	*hsearch();

/*
** Internal routines
*/

static Address	Hash();
static void	ExpandTable();

/*
** Local data
*/

static HashTable	*Table = NULL;
# if HASH_STATISTICS
static long		HashAccesses, HashCollisions;
# endif

/*
** Code
*/

int
hcreate(Count)
unsigned Count;
{
    int		i;

    /*
    ** Adjust Count to be nearest higher power of 2,
    ** minimum SegmentSize, then convert into segments.
    */
    i = SegmentSize;
    while (i < Count)
	i <<= 1;
    Count = DIV(i,SegmentSize);

    Table = (HashTable*)calloc((sizeof(HashTable),1));
    if (Table == NULL)
	return(0);
    /*
    ** resets are redundant - done by calloc(3)
    **
    Table->SegmentCount = Table->p = Table->KeyCount = 0;
    */
    /*
    ** Allocate initial 'i' segments of buckets
    */
    for (i = 0; i < Count; i++)
    {
	Table->Directory[i] = (Segment*)calloc(sizeof(Segment),SegmentSize);
	if (Table->Directory[i] == NULL)
	{
	    hdestroy();
	    return(0);
	}
	Table->SegmentCount++;
    }
    Table->maxp = MUL(Count,SegmentSize);
    Table->MinLoadFactor = 1;
    Table->MaxLoadFactor = DefaultMaxLoadFactor;
# if DEBUG
    fprintf(
	    stderr,
	    "[hcreate] Table %x Count %d maxp %d SegmentCount %d\n",
	    Table,
	    Count,
	    Table->maxp,
	    Table->SegmentCount
	    );
# endif
# if HASH_STATISTICS
	HashAccesses = HashCollisions = 0;
# endif
    return(1);
}

void
hdestroy()
{
    int		i,j;
    Segment	*s;
    Element	*p,*q;

    if (Table != NULL)
    {
	for (i = 0; i < Table->SegmentCount; i++)
	{
	    /* test probably unnecessary	*/
	    if ((s = Table->Directory[i]) != NULL)
	    {
		for (j = 0; j < SegmentSize; j++)
		{
		    p = s[j];
		    while (p != NULL)
		    {
			q = p->Next;
			free((char*)p);
			p = q;
		    }
		free(Table->Directory[i]);
		}
	    }
	}
	free(Table);
	Table = NULL;
# if HASH_STATISTICS && DEBUG
	fprintf(
		stderr,
		"[hdestroy] Accesses %ld Collisions %ld\n",
		HashAccesses,
		HashCollisions
		);
# endif
    }
}

ENTRY *
hsearch(item,action)
ENTRY   item;
ACTION       action;	/* FIND/ENTER	*/
{
    Address	h;
    Segment	*CurrentSegment;
    int		SegmentIndex;
    int		SegmentDir;
    Segment	*p,q;

    assert(Table != NULL);	/* Kinder really than return(NULL);	*/
# if HASH_STATISTICS
    HashAccesses++;
# endif
    h = Hash(item.key);
    SegmentDir = DIV(h,SegmentSize);
    SegmentIndex = MOD(h,SegmentSize);
    /*
    ** valid segment ensured by Hash()
    */
    CurrentSegment = Table->Directory[SegmentDir];
    assert(CurrentSegment != NULL);	/* bad failure if tripped	*/
    p = &CurrentSegment[SegmentIndex];
    q = *p;
    /*
    ** Follow collision chain
    */
    while (q != NULL && strcmp(q->Key,item.key))
    {
	p = &q->Next;
	q = *p;
# if HASH_STATISTICS
	HashCollisions++;
# endif
    }
    if (
	q != NULL	/* found	*/
	||
	action == FIND	/* not found, search only	*/
	||
	(q = (Element*)calloc(sizeof(Element),1))
	==
	NULL		/* not found, no room	*/
	)
	return((ENTRY*)q);
    *p = q;			/* link into chain	*/
    /*
    ** Initialize new element
    */
    q->Key = item.key;
    q->Data = item.data;
    /*
    ** cleared by calloc(3)
    q->Next = NULL;
    */
    /*
    ** Table over-full?
    */
    if (++Table->KeyCount / MUL(Table->SegmentCount,SegmentSize) > Table->MaxLoadFactor)
	ExpandTable();	/* doesn't affect q	*/
    return((ENTRY*)q);
}

/*
** Internal routines
*/

static Address
Hash(Key)
char *Key;
{
    Address		h,address;
    register unsigned char	*k = (unsigned char*)Key;

    h = 0;
    /*
    ** Convert string to integer
    */
    while (*k)
	h = h*Prime1 ^ (*k++ - ' ');
    h %= Prime2;
    address = MOD(h,Table->maxp);
    if (address < Table->p)
	address = MOD(h,(Table->maxp << 1));	/* h % (2*Table->maxp)	*/
    return(address);
}

void
ExpandTable()
{
    Address	NewAddress;
    int		OldSegmentIndex,NewSegmentIndex;
    int		OldSegmentDir,NewSegmentDir;
    Segment	*OldSegment,*NewSegment;
    Element	*Current,**Previous,**LastOfNew;

    if (Table->maxp + Table->p < MUL(DirectorySize,SegmentSize))
    {
	/*
	** Locate the bucket to be split
	*/
	OldSegmentDir = DIV(Table->p,SegmentSize);
	OldSegment = Table->Directory[OldSegmentDir];
	OldSegmentIndex = MOD(Table->p,SegmentSize);
	/*
	** Expand address space; if necessary create a new segment
	*/
	NewAddress = Table->maxp + Table->p;
	NewSegmentDir = DIV(NewAddress,SegmentSize);
	NewSegmentIndex = MOD(NewAddress,SegmentSize);
	if (NewSegmentIndex == 0)
	    Table->Directory[NewSegmentDir] = (Segment*)calloc(sizeof(Segment),SegmentSize);
	NewSegment = Table->Directory[NewSegmentDir];
	/*
	** Adjust state variables
	*/
	Table->p++;
	if (Table->p == Table->maxp)
	{
	    Table->maxp <<= 1;	/* Table->maxp *= 2	*/
	    Table->p = 0;
	}
	Table->SegmentCount++;
	/*
	** Relocate records to the new bucket
	*/
	Previous = &OldSegment[OldSegmentIndex];
	Current = *Previous;
	LastOfNew = &NewSegment[NewSegmentIndex];
	*LastOfNew = NULL;
	while (Current != NULL)
	{
	    if (Hash(Current->Key) == NewAddress)
	    {
		/*
		** Attach it to the end of the new chain
		*/
		*LastOfNew = Current;
		/*
		** Remove it from old chain
		*/
		*Previous = Current->Next;
		LastOfNew = &Current->Next;
		Current = Current->Next;
		*LastOfNew = NULL;
	    }
	    else
	    {
		/*
		** leave it on the old chain
		*/
		Previous = &Current->Next;
		Current = Current->Next;
	    }
	}
    }
}
-- 
Esmond Pitt, Austec (Asia/Pacific) Ltd
...!uunet.UU.NET!munnari!ausmelb!ejp,ejp@ausmelb.oz