[alt.sources] [comp.lang.c] Re: hash function for text in C

chris@mimsy.umd.edu (Chris Torek) (10/19/90)

Archive-name: hash/18-Oct-90
Original-posting-by: chris@mimsy.umd.edu (Chris Torek)
Original-subject: Re: hash function for text in C
Reposted-by: emv@math.lsa.umich.edu (Edward Vielmetti)

[Reposted from comp.lang.c.
Comments on this service to emv@math.lsa.umich.edu (Edward Vielmetti).]

The following implements expanding chained hash tables for strings and
numbers.  This is unmodified from a special-purpose program (having to
do with reading a large number of files with name=id pairs and doing
various operations on them).  Chain lengths are not monitored; instead,
the table is invariably expanded whenever it becomes 2/3 full.  Entries
may be neither removed nor modified.

An essentially-unrelated function called `string' maintains a string
pool such that string(x) == string(y) iff strcmp(x,y)==0 (the program
typically stores each name several times, and often needs to test for
name equality, so this helps there).

: Run this shell script with "sh" not "csh"
PATH=/bin:/usr/bin:/usr/ucb:/etc:$PATH
export PATH
all=false
if [ x$1 = x-a ]; then
	all=true
fi
echo Extracting hash.h
sed 's/^X//' <<'//go.sysin dd *' >hash.h
X/*
X * $Id: hash.h,v 1.1 90/09/24 23:58:38 chris Exp $
X *
X * Hash table entries keep track of name (or id) = value pairs.
X * Values are simply pointers.  Hash tables themselves are named
X * (for debugging).
X */
X
Xstruct	hashtab;
X
X/*
X * The `ins' functions return nonzero if the old value existed,
X * without changing the value.
X * The iterate functions calls a given function with name,value
X * or id,value pairs.
X */
Xstruct	hashtab *ht_new();	/* given name, create new hash table */
Xchar	*ht_nget();		/* given table and name, get value */
Xchar	*ht_iget();		/* given table and ID, get value */
Xint	ht_nins();		/* given table and name, insert new value */
Xint	ht_iins();		/* given table and id, insert new value */
Xvoid	ht_niterate();		/* given table and function, iterate */
Xvoid	ht_iiterate();		/* given table and function, iterate */
X
X/*
X * Some things that ought not to be here, but are anyway.
X * goodnumber() takes a number of objects and a size and returns
X * a new number of objects, such that malloc(goodnumber(n,size)*size)
X * calls malloc with a `good' size value (resulting in less wasted
X * memory).  emalloc is malloc with program-abort on out-of-memory.
X * string() makes a `read-only' copy of a string, reusing the previous
X * copy if any.
X */
Xint	goodnumber();		/* given n & sizeof, return new n */
Xchar	*emalloc();		/* malloc, exit on error */
Xchar	*string();		/* make an `ideal' copy of a string */
//go.sysin dd *
if [ `wc -c < hash.h` != 1415 ]; then
	made=false
	echo error transmitting hash.h --
	echo length should be 1415, not `wc -c < hash.h`
else
	made=true
fi
if $all; then
	echo '	Changing owner to bin'
	chown bin hash.h
else
	echo '	Original owner was bin'
fi
if $made; then
	chmod 444 hash.h
	echo -n '	'; ls -ld hash.h
fi
echo Extracting hash.c
sed 's/^X//' <<'//go.sysin dd *' >hash.c
X#ifndef lint
Xstatic char rcsid[] = "$Id: hash.c,v 1.2 90/10/02 13:32:23 chris Exp $";
X#endif
X
X/*
X * Hash table routines.
X */
X
X#include <stdio.h>
X#include <stdlib.h>
X#include <string.h>
X#include "hash.h"
X
X/*
X * Hash table entries keep track of name=value pairs.
X * The names may be numeric IDs instead (by having a null name).
X */
Xstruct hashent {
X	struct	hashent *h_next;/* next in chain */
X	int	h_hash;		/* hash value or ID */
X	char	*h_name;	/* name (null if from numeric ID) */
X	char	*h_value;	/* value */
X};
X
Xstruct hashtab {
X	int	ht_size;	/* size (power of 2) */
X	int	ht_mask;	/* == ht_size - 1 */
X	int	ht_used;	/* number of entries used */
X	int	ht_lim;		/* when to expand */
X	struct	hashent **ht_tab;/* base of table */
X	char	ht_name[1];	/* table name; actually larger */
X};
X
Xextern char *progname;
X
Xchar *
Xemalloc(n)
X	size_t n;
X{
X	register char *p = malloc(n);
X
X	if (p == NULL) {
X		(void) fprintf(stderr, "%s: out of memory\n", progname);
X		exit(1);
X	}
X	return (p);
X}
X
X/* round up to next multiple of y, where y is a power of 2 */
X#define	ROUND(x, y) (((x) + (y) - 1) & ~((y) - 1))
X
X/* compute a `good' number of objects to allocate via malloc */
Xint
Xgoodnumber(n, s)
X	int n;
X	size_t s;
X{
X
X	/* 16384 is a guess at a good page size for malloc */
X	/* 32 is a guess at malloc's overhead */
X	return ((int)((ROUND(n * s + 32, 16384) - 32) / s));
X}
X
X/*
X * Make a new hash table.
X */
Xstruct hashtab *
Xht_new(name)
X	char *name;
X{
X	register struct hashtab *ht;
X	register struct hashent **h;
X	register int n;
X
X	ht = (struct hashtab *)emalloc(sizeof *ht + strlen(name));
X	ht->ht_tab = h = (struct hashent **)emalloc(128 * sizeof *h);
X	ht->ht_size = 128;
X	ht->ht_mask = 127;
X	for (n = 128; --n >= 0;)
X		*h++ = NULL;
X	ht->ht_used = 0;
X	ht->ht_lim = (128 * 2) / 3;
X	(void) strcpy(ht->ht_name, name);
X	return (ht);
X}
X
X/*
X * Expand an existing hash table.
X */
Xstatic void
Xht_expand(ht)
X	register struct hashtab *ht;
X{
X	register int n = ht->ht_size * 2, i;
X	register struct hashent *p, **h, **oldh, *q;
X
X	h = (struct hashent **)emalloc(n * sizeof *h);
X	for (i = n; --i >= 0;)
X		*h++ = NULL;
X	h -= n;
X	oldh = ht->ht_tab;
X	n--;
X	for (i = ht->ht_size; --i >= 0;) {
X		for (p = *oldh++; p != NULL; p = q) {
X			q = p->h_next;
X			p->h_next = h[p->h_hash & n];
X			h[p->h_hash & n] = p;
X		}
X	}
X	free((char *)ht->ht_tab);
X	ht->ht_tab = h;
X	ht->ht_mask = n;
X	ht->ht_size = ++n;
X	ht->ht_lim = (n * 2) / 3;
X}
X	
X/*
X * Make a new hash entry.  Its h_next will be NULL.
X */
Xstatic struct hashent *
Xnewhashent(hash, name, value)
X	int hash;
X	char *name, *value;
X{
X	static struct hashent *hfree;
X	register struct hashent *h;
X	register int n, nalloc;
X
X	if ((h = hfree) != NULL)
X		hfree = h->h_next;
X	else {
X		nalloc = goodnumber(2, sizeof *h);	/* need at least 2 */
X		hfree = h = (struct hashent *)emalloc(nalloc * sizeof *h) + 1;
X		for (n = nalloc - 2; --n >= 0; h++)
X			h->h_next = h + 1;
X		h->h_next = NULL;
X		h -= nalloc - 1;
X	}
X	h->h_next = NULL;
X	h->h_hash = hash;
X	h->h_name = name;
X	h->h_value = value;
X	return (h);
X}
X
X#define	HASH(str, h, p) \
X	for (p = str, h = 0; *p;) h = (h << 5) - h + *p++
X
X/*
X * Look up a name=value.
X */
Xchar *
Xht_nget(ht, name)
X	register struct hashtab *ht;
X	char *name;
X{
X	register char *p;
X	register int hash;
X	register struct hashent *h;
X
X	HASH(name, hash, p);
X	p = name;
X	for (h = ht->ht_tab[hash & ht->ht_mask]; h != NULL; h = h->h_next)
X		if (h->h_hash == hash && h->h_name != NULL &&
X		    strcmp(h->h_name, p) == 0)
X			return (h->h_value);
X	return (NULL);
X}
X
X/*
X * Look up an ID=value.
X */
Xchar *
Xht_iget(ht, id)
X	register struct hashtab *ht;
X	register int id;
X{
X	register struct hashent *h;
X
X	for (h = ht->ht_tab[id & ht->ht_mask]; h != NULL; h = h->h_next)
X		if (h->h_hash == id && h->h_name == NULL)
X			return (h->h_value);
X	return (NULL);
X}
X
X/*
X * Insert (do not clobber) a name=value.
X * Return zero on success.
X */
Xint
Xht_nins(ht, name, value)
X	register struct hashtab *ht;
X	char *name, *value;
X{
X	register char *p;
X	register int hash;
X	register struct hashent *h, **hp;
X
X	HASH(name, hash, p);
X	p = name;
X	for (hp = &ht->ht_tab[hash & ht->ht_mask]; (h = *hp) != NULL;
X	     hp = &h->h_next)
X		if (h->h_hash == hash && h->h_name != NULL &&
X		    strcmp(h->h_name, p) == 0)
X			return (-1);
X	*hp = newhashent(hash, name, value);
X	if (++ht->ht_used > ht->ht_lim)
X		ht_expand(ht);
X	return (0);
X}
X
X/*
X * Insert (do not clobber) an ID=value.
X * Return zero on success.
X */
Xint
Xht_iins(ht, id, value)
X	register struct hashtab *ht;
X	register int id;
X	char *value;
X{
X	register struct hashent *h, **hp;
X
X	for (hp = &ht->ht_tab[id & ht->ht_mask]; (h = *hp) != NULL;
X	     hp = &h->h_next)
X		if (h->h_hash == id && h->h_name == NULL)
X			return (-1);
X	*hp = newhashent(id, (char *)NULL, value);
X	if (++ht->ht_used > ht->ht_lim)
X		ht_expand(ht);
X	return (0);
X}
X
X/*
X * Stash a copy of a string away; it will never be freed.
X */
Xstatic char *
Xpoolstr(s)
X	char *s;
X{
X	register char *p;
X	register size_t l = strlen(s) + 1;
X	static char *poolp;
X	static size_t nleft;
X
X	if (nleft < l)
X		poolp = emalloc(nleft = goodnumber(l, 1));
X	bcopy(s, p = poolp, l);
X	poolp += l;
X	return (p);
X}
X
X/*
X * Generate a single unique copy of the given string.
X */
Xchar *
Xstring(s)
X	char *s;
X{
X	register char *p;
X	register int hash;
X	register struct hashent *h, **hp;
X	static struct hashtab *ht;
X
X	if (ht == NULL)
X		ht = ht_new("strings");
X	HASH(s, hash, p);
X	p = s;
X	for (hp = &ht->ht_tab[hash & ht->ht_mask]; (h = *hp) != NULL;
X	     hp = &h->h_next)
X		if (h->h_hash == hash && strcmp(h->h_name, p) == 0)
X			return (h->h_name);
X	*hp = h = newhashent(hash, poolstr(s), (char *)NULL);
X	if (++ht->ht_used > ht->ht_lim)
X		ht_expand(ht);
X	return (h->h_name);
X}
X
X/*
X * Call fn on all the name=value pairs.
X */
Xvoid
Xht_niterate(ht, fn)
X	struct hashtab *ht;
X	register void (*fn)();
X{
X	register struct hashent *h, **hp;
X	register int n;
X
X	for (n = ht->ht_size, hp = ht->ht_tab; --n >= 0;)
X		for (h = *hp++; h != NULL; h = h->h_next)
X			if (h->h_name != NULL)
X				(*fn)(h->h_name, h->h_value);
X}
X
X/*
X * Call fn on all the id=value pairs.
X */
Xvoid
Xht_iiterate(ht, fn)
X	struct hashtab *ht;
X	register void (*fn)();
X{
X	register struct hashent *h, **hp;
X	register int n;
X
X	for (n = ht->ht_size, hp = ht->ht_tab; --n >= 0;)
X		for (h = *hp++; h != NULL; h = h->h_next)
X			if (h->h_name == NULL)
X				(*fn)(h->h_hash, h->h_value);
X}
//go.sysin dd *
if [ `wc -c < hash.c` != 6291 ]; then
	made=false
	echo error transmitting hash.c --
	echo length should be 6291, not `wc -c < hash.c`
else
	made=true
fi
if $all; then
	echo '	Changing owner to bin'
	chown bin hash.c
else
	echo '	Original owner was bin'
fi
if $made; then
	chmod 444 hash.c
	echo -n '	'; ls -ld hash.c
fi
-- 
In-Real-Life: Chris Torek, Univ of MD Comp Sci Dept (+1 301 405 2750)
Domain:	chris@cs.umd.edu	Path:	uunet!mimsy!chris