[comp.lang.c] count of bits in a long

peter@neccan.oz (Peter Miller) (10/05/90)

/*
 * A while back I posted a solution to the "count of bits set in a
 * long" question, suggesting that HACKMEM 169 was a very fast way to do it.
 * This program demonstrates the technique.
 * 
 * The HACKMEM 169 method uses a modulo to sum the digits.
 * A number of people have asked me "how can a division be faster than a loop?"
 * This program demonstrates how.  The modulo is a special case,
 * and can be unwound into a tight loop if your machine does not
 * have a fast divide opcode (although many modern machines have a 1 cycle
 * divide opcode, except for we-really-believe-our-press-hype risc
 * manufacturers).
 * 
 * This program was run on a 386/20 under unix with the following results:
 *	Func	sec/iter	scaled
 *	nbits	0.00018310547	 1.000	hackmem 169
 *	nbits1	0.0006980896	 3.812	hackmem 169 no % operator
 *	nbits2	0.0016822815	 9.188	count lsb's loop
 *	nbits3	0.0037384033	20.417
 *	nbits4	0.0034370422	18.771
 *	nbits5	0.0037765503	20.625
 *	nbits6	0.0035247803	19.250
 */

#include <stdio.h>
#include <sys/types.h>
#include <sys/times.h>

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))


/*
 * This function is used to calibrate the timing mechanism.
 * This way we can subtract the loop and call overheads.
 */
int
calibrate(n)
	register unsigned long n;
{
	return 0;
}


/*
 * This function counts the number of bits in a long.
 * It is limited to 63 bit longs, but a minor mod can cope with 511 bits.
 *
 * It is so magic, an explanation is required:
 * Consider a 3 bit number as being
 *	4a+2b+c
 * if we shift it right 1 bit, we have
 *	2a+b
 * subtracting this from the original gives
 *	2a+b+c
 * if we shift the original 2 bits right we get
 *	a
 * and so with another subtraction we have
 *	a+b+c
 * which is the number of bits in the original number.
 * Suitable masking allows the sums of the octal digits in a
 * 32 bit bumber to appear in each octal digit.  This isn't much help
 * unless we can get all of them summed together.
 * This can be done by modulo arithmetic (sum the digits in a number by molulo
 * the base of the number minus one) the old "cating out nines" trick they
 * taught in school before calculators were invented.
 * Now, using mod 7 wont help us, because our number will very likely have
 * more than 7 bits set.  So add the octal digits together to get base64
 * digits, and use modulo 63.
 * (Those of you with 64 bit machines need to add 3 octal digits together to
 * get base512 digits, and use mod 511.)
 *
 * This is HACKMEM 169, as used in X11 sources.
 */

int
nbits(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}


/*
 * This is the same as the above, but does not use the % operator.
 * Most modern machines have clockless division, and so the modulo is as
 * expensive as, say, an addition.
 */
int
nbits1(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	while (tmp > 63)
		tmp = (tmp & 63) + (tmp >> 6);
	return tmp;
}


/*
 * This function counts the bits in a long.
 *
 * It removes the lsb and counting the number of times round the loop.
 * The expression (n & -n) yields the lsb of a number,
 * but it only works on 2's compliment machines.
 */
int
nbits2(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n -= (n & -n))
		count++;
	return count;
}


/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number left and testing the top bit.
 * On many machines shift is expensive, so it uses a cheap addition instead.
 */
int
nbits3(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n += n)
		if (n & ~(~(unsigned long)0 >> 1))
			count++;
	return count;
}


/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number down and testing the bottom bit.
 */
int
nbits4(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n >>= 1)
		if (n & 1)
			count++;
	return count;
}


/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the second most intuitively obvious method,
 * and is independent of the number of bits in the long.
 */
int
nbits5(n)
	register unsigned long n;
{
	register int count;
	register unsigned long mask;

	count = 0;
	for (mask = 1; mask; mask += mask)
		if (n & mask)
			count++;
	return count;
}


/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the most intuitively obvious method,
 * but how do you a priori know how many bits in the long?
 * (except for ''sizeof(long) * CHAR_BITS'' expression)
 */
int
nbits6(n)
	register unsigned long n;
{
	register int count;
	register int bit;

	count = 0;
	for (bit = 0; bit < 32; ++bit)
		if (n & ((unsigned long)1 << bit))
			count++;
	return count;
}

/*
 * build a long random number.
 * The standard rand() returns a 15bit number.
 */
unsigned long
bigrand()
{
	return
	(
		((unsigned long)(rand() & 0xFFF) << 24)
	|
		((unsigned long)(rand() & 0xFFF) << 12)
	|
		(unsigned long)(rand() & 0xFFF)
	);
}

/*
 * Run the test many times.
 * You will need a running time in the 10s of seconds
 * for an accurate result.
 *
 * To give a fair comparison, the random number generator
 * is seeded the same for each measurement.
 *
 * Return value is seconds per iteration.
 */
#define REPEAT (1L<<18)

double
measure(func)
	int	(*func)();
{
	long	j;
	struct tms	start;
	struct tms	finish;

	srand(1);
	times(&start);
	for (j = 0; j < REPEAT; ++j)
		func(bigrand());
	times(&finish);
	return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);
}

struct table
{
	char	*name;
	int	(*func)();
	double	measurement;
	int	trial;
};

struct table	table[] =
{
	{ "nbits", 	nbits },
	{ "nbits1", 	nbits1 },
	{ "nbits2", 	nbits2 },
	{ "nbits3", 	nbits3 },
	{ "nbits4", 	nbits4 },
	{ "nbits5", 	nbits3 },
	{ "nbits6", 	nbits4 },
};

main()
{
	double	calibration;
	double	best;
	int	j, k, m;

	/*
	 * run a few tests to make sure they all agree
	 */
	srand(getpid());
	for (j = 0; j < 10; ++j)
	{
		unsigned long n;
		int	bad;

		n = bigrand();
		for (k = 0; k < SIZEOF(table); ++k)
			table[k].trial = table[k].func(n);
		bad = 0;
		for (k = 0; k < SIZEOF(table); ++k)
			for (m = 0; m < SIZEOF(table); ++m)
				if (table[k].trial != table[m].trial)
					bad = 1;
		if (bad)
		{
			printf("wrong:\t%08lX", n);
			for (k = 0; k < SIZEOF(table); ++k)
				printf("\t%d", table[k].trial);
			printf("\n");
		}
	}

	/*
	 * calibrate the timing mechanism
	 */
	calibration = measure(calibrate);

	/*
	 * time them all, keeping track of the best (smallest)
	 */
	for (j = 0; j < SIZEOF(table); ++j)
	{
		table[j].measurement = measure(table[j].func) - calibration;
		if (!j || table[j].measurement < best)
			best = table[j].measurement;
	}
	for (j = 0; j < SIZEOF(table); ++j)
	{
		printf
		(
			"%s\t%10.8g\t%6.3f\n",
			table[j].name,
			table[j].measurement,
			table[j].measurement / best
		);
	}
	exit(0);
}

Regards
Peter Miller	UUCP	uunet!munnari!pdact.pd.necisa.oz!pmiller
		ARPA	pmiller%pdact.pd.necisa.oz@australia
/\/\*		CSNET	pmiller%pdact.pd.necisa.oz@au
		ACSnet	pmiller@pdact.pd.necisa.oz
Disclaimer:  The views expressed here are personal and do not necessarily
	reflect the view of my employer or the views or my colleagues.
D

chris@mimsy.umd.edu (Chris Torek) (10/07/90)

In article <826@neccan.oz> peter@neccan.oz (Peter Miller) sends out a
program to time various bit-counting functions.  I decided to `improve'
the program a bit :-) and add a number of additional functions.  I have
run the resulting program (which I will append below) on a number of
architectures.  The `mod' version is never the fastest of the hackmem
variants.  The fastest function is always table lookup, except on the
Sun-4.

Here are his results, for an i386 box:
> *	Func	sec/iter	scaled
> *	nbits	0.00018310547	 1.000	hackmem 169
> *	nbits1	0.0006980896	 3.812	hackmem 169 no % operator
> *	nbits2	0.0016822815	 9.188	count lsb's loop
> *	nbits3	0.0037384033	20.417
> *	nbits4	0.0034370422	18.771
> *	nbits5	0.0037765503	20.625
> *	nbits6	0.0035247803	19.250

(I find the last two results particularly interesting since his code
as posted uses functions nbits3 and nbits4 to calculate the times
printed for nbits5 and nbits6, never evaluating 5 and 6 at all.)

Here is a short key to names:
  my name (Peter's name)	  technique used
  ----------------------	  --------------
hackmemmod (nbits)		HACKMEM 169, using % operator
hackmemloop (nbits1)		HACKMEM 169, using loop to implement %
hackmemunrolled (-)		HACKMEM 169, with 5-step % (loop unrolled)
rmlsbsub (nbits2)		remove lsb with `n -= (n & -n)'
rmlsbmask (-)			remove lsb with `n &= n - 1'
testlsb (nbits4)		test n&1, then n>>=1
testmsb (nbits3)		test n&MSB, then n+=n (rather than n<<=1)
testsignandshift (-)		test n<0, then n<<=1
testeachbit (nbits5, untimed)	test n&mask, then mask+=mask
testeachbit1shl (nbits6, ")	test n&(1<<bit) for bit in [0..32)
tableshift			nbits[n>>24] + nbits[(n>>16)&255] ...
tableuchar			set p=&n, nbits[p[0]] + nbits[p[1]] ...

The results:
--------------------------------------------------
  DECstation 3100 (MIPS R2000, slow memory system)
function                   time          ratio
hackmemmod               3.1029682e-06   3.179
hackmemloop              2.4697094e-06   2.531
hackmemunrolled          1.7693996e-06   1.813
rmlsbsub                 4.8127670e-06   4.931
rmlsbmask                3.8405285e-06   3.935
testlsb                  1.0221542e-05  10.473
testmsb                  1.0899502e-05  11.168
testsignandshift         1.0780300e-05  11.046
testeachbit              1.0843626e-05  11.111
testeachbit1shl          1.6695683e-05  17.107
tableshift               1.0467396e-06   1.073
tableuchar               9.7596359e-07   1.000

--------------------------------------------------
  Encore Multimax (NS32532, proprietary bus)
function                   time          ratio
hackmemmod               6.3739853e-06   3.618
hackmemloop              3.8458633e-06   2.183
hackmemunrolled          6.4167595e-06   3.642
rmlsbsub                 9.3918610e-06   5.330
rmlsbmask                9.4905777e-06   5.386
testlsb                  2.4036179e-05  13.642
testmsb                  2.3262550e-05  13.203
testsignandshift         2.1452625e-05  12.176
testeachbit              2.5112068e-05  14.253
testeachbit1shl          2.8207516e-05  16.010
tableshift               2.2915077e-06   1.301
tableuchar               1.7619209e-06   1.000

--------------------------------------------------
  Sun-3/something (68020, ?vmebus memory?)

function                   time          ratio
hackmemmod               0.00077819824   1.821
hackmemloop              0.00059890747   1.402
hackmemunrolled          0.00050354004   1.179
rmlsbsub                 0.00069808960   1.634
rmlsbmask                0.00055313110   1.295
testlsb                  0.00107955930   2.527
testmsb                  0.00251007080   5.875
testsignandshift         0.00242614750   5.679
testeachbit              0.00254821780   5.964
testeachbit1shl          0.00362014770   8.473
tableshift               0.00083160400   1.946
tableuchar               0.00042724609   1.000

--------------------------------------------------
  SparcStation 1+ (SPARC, private memory bus)
	These times look a bit odd, and suggest that the compiler
	could use some work.
function                   time          ratio
hackmemmod               1.4972687e-06  12.077
hackmemloop              7.3432922e-07   5.923
hackmemunrolled          1.1825562e-06   9.538
rmlsbsub                 1.3351440e-07   1.077
rmlsbmask                1.4305115e-07   1.154
testlsb                  1.4305115e-07   1.154
testmsb                  1.3351440e-07   1.077
testsignandshift         1.2397766e-07   1.000
testeachbit              6.6280365e-06  53.462
testeachbit1shl          9.2029572e-06  74.231
tableshift               7.2479248e-07   5.846
tableuchar               1.1539459e-06   9.308

--------------------------------------------------
  VAX 8600 with gcc (ECL gate array, private memory bus)
function                   time          ratio
hackmemmod               1.1291504e-05   4.290
hackmemloop              1.0757446e-05   4.087
hackmemunrolled          8.8500977e-06   3.362
rmlsbsub                 1.7890930e-05   6.797
rmlsbmask                1.4801025e-05   5.623
testlsb                  5.2032471e-05  19.768
testmsb                  3.5400391e-05  13.449
testsignandshift         2.7008057e-05  10.261
testeachbit              3.0670166e-05  11.652
testeachbit1shl          4.7760010e-05  18.145
tableshift               5.3405762e-06   2.029
tableuchar               2.6321411e-06   1.000

--------------------------------------------------
  VAX 8600 with Berkeley cc
function                   time          ratio
hackmemmod               8.1634521e-06   3.292
hackmemloop              6.9808960e-06   2.815
hackmemunrolled          1.3885498e-05   5.600
rmlsbsub                 6.2942505e-06   2.538
rmlsbmask                5.9890747e-06   2.415
testlsb                  1.5754700e-05   6.354
testmsb                  3.9138794e-05  15.785
testsignandshift         3.1127930e-05  12.554
testeachbit              4.1275024e-05  16.646
testeachbit1shl          6.0615540e-05  24.446
tableshift               5.9890747e-06   2.415
tableuchar               2.4795532e-06   1.000


--------------------------------------------------

There are some noteworthy points above, particularly the example
with the 8600, where the choice of compiler makes a great deal of
difference.  The `testlsb' loop runs much faster under gcc, which
eliminated one `tstl' instruction, yet some of the other loops
run more slowly.

So that others can run their own tests, here is the program.  I
added several functions, renamed them all, changed the timing
mechanism on BSD machines, and reformatted things a bit.  I changed
the random number selection mechanism to be relatively machine
independent (assuming they all have the same `rand' function) and
to use values that are `more random'.  I also changed the output
format (among other things, it uses `%#g'; the Sun SparcStation
printf does not handle this correctly, and I added trailing zeroes
manually above).  Oh yes, I also had to make it run longer on the
MIPS and Sparc machines to get reliable times.

(You can tell which functions I added, as they are generally
uncommented. :-) )

--------------------------------------------------
#include <stdio.h>
#include <sys/types.h>

#ifdef FD_SETSIZE
# define USE_GETRUSAGE
# include <sys/time.h>
# include <sys/resource.h>
#else
# include <sys/times.h>
#endif

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))


/*
 * This function is used to calibrate the timing mechanism.
 * This way we can subtract the loop and call overheads.
 */
int
calibrate(n)
	register unsigned long n;
{
	return 0;
}


/*
 * This function counts the number of bits in a long.
 * It is limited to 63 bit longs, but a minor mod can cope with 511 bits.
 *
 * It is so magic, an explanation is required:
 * Consider a 3 bit number as being
 *	4a+2b+c
 * if we shift it right 1 bit, we have
 *	2a+b
 * subtracting this from the original gives
 *	2a+b+c
 * if we shift the original 2 bits right we get
 *	a
 * and so with another subtraction we have
 *	a+b+c
 * which is the number of bits in the original number.
 * Suitable masking allows the sums of the octal digits in a
 * 32 bit bumber to appear in each octal digit.  This isn't much help
 * unless we can get all of them summed together.
 * This can be done by modulo arithmetic (sum the digits in a number by molulo
 * the base of the number minus one) the old "cating out nines" trick they
 * taught in school before calculators were invented.
 * Now, using mod 7 wont help us, because our number will very likely have
 * more than 7 bits set.  So add the octal digits together to get base64
 * digits, and use modulo 63.
 * (Those of you with 64 bit machines need to add 3 octal digits together to
 * get base512 digits, and use mod 511.)
 *
 * This is HACKMEM 169, as used in X11 sources.
 */
int
t0_hackmemmod(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}


/*
 * This is the same as the above, but does not use the % operator.
 * Most modern machines have clockless division, and so the modulo is as
 * expensive as, say, an addition.
 */
int
t1_hackmemloop(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	while (tmp > 63)
		tmp = (tmp & 63) + (tmp >> 6);
	return tmp;
}

/*
 * Above, without using while loop.  It takes at most 5 iterations of the
 * loop, so we just do all 5 in-line.  The final result is never 63
 * (this is assumed above as well).
 */
int
t2_hackmemunrolled(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	return (tmp);
}


/*
 * This function counts the bits in a long.
 *
 * It removes the lsb and counting the number of times round the loop.
 * The expression (n & -n) yields the lsb of a number,
 * but it only works on 2's compliment machines.
 */
int
t3_rmlsbsub(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n -= (n & -n))
		count++;
	return count;
}

int
t4_rmlsbmask(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; count++)
		n &= n - 1;	/* take away lsb */
	return (count);
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number down and testing the bottom bit.
 */
int
t5_testlsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n >>= 1)
		if (n & 1)
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number left and testing the top bit.
 * On many machines shift is expensive, so it uses a cheap addition instead.
 */
int
t6_testmsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n += n)
		if (n & ~(~(unsigned long)0 >> 1))
			count++;
	return count;
}

int
t7_testsignandshift(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n <<= 1)
		if ((long)n < 0)
			count++;
	return (count);
}


/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the second most intuitively obvious method,
 * and is independent of the number of bits in the long.
 */
int
t8_testeachbit(n)
	register unsigned long n;
{
	register int count;
	register unsigned long mask;

	count = 0;
	for (mask = 1; mask; mask += mask)
		if (n & mask)
			count++;
	return count;
}


/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the most intuitively obvious method,
 * but how do you a priori know how many bits in the long?
 * (except for ''sizeof(long) * CHAR_BITS'' expression)
 */
int
t9_testeachbit1shl(n)
	register unsigned long n;
{
	register int count;
	register int bit;

	count = 0;
	for (bit = 0; bit < 32; ++bit)
		if (n & ((unsigned long)1 << bit))
			count++;
	return count;
}

static char nbits[256] = {
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

int
tA_tableshift(n)
	register unsigned long n;
{

	return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] +
		nbits[(n >> 16) & 0xff] + nbits[n >> 24]);
}

int
tB_tableuchar(n)
	unsigned long n;
{
	register unsigned char *p = (unsigned char *)&n;

	return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]);
}

/*
 * build a long random number.
 * The standard rand() returns at least a 15 bit number.
 * We use the top 9 of 15 (since the lower N bits of the usual rand()
 * repeat with a period of 2^N).
 */
unsigned long
bigrand()
{
#define randbits() ((unsigned long)((rand() >> 6) & 0777))
	register int r;

	r = randbits() << 27;
	r |= randbits() << 18;
	r |= randbits() << 9;
	r |= randbits();
}

/*
 * Run the test many times.
 * You will need a running time in the 10s of seconds
 * for an accurate result.
 *
 * To give a fair comparison, the random number generator
 * is seeded the same for each measurement.
 *
 * Return value is seconds per iteration.
 */
#if defined(mips) || defined(sparc)
#define REPEAT (1L<<20)
#else
#define REPEAT (1L<<18)
#endif

double
measure(func)
	register int	(*func)();
{
#ifdef USE_GETRUSAGE
	struct rusage ru0, ru1;
	register long j;

	srand(1);
	(void) getrusage(RUSAGE_SELF, &ru0);
	for (j = 0; j < REPEAT; ++j)
		func(bigrand());
	(void) getrusage(RUSAGE_SELF, &ru1);
	ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec;
	if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) {
		ru1.ru_utime.tv_usec += 1000000;
		ru1.ru_utime.tv_sec--;
	}
	return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) /
		(double)REPEAT);
#else
	register long	j;
	struct tms	start;
	struct tms	finish;

	srand(1);
	times(&start);
	for (j = 0; j < REPEAT; ++j)
		func(bigrand());
	times(&finish);
	return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);
#endif
}

struct table {
	char	*name;
	int	(*func)();
	double	measurement;
	int	trial;
};

struct table	table[] =
{
	{ "hackmemmod",		t0_hackmemmod },
	{ "hackmemloop", 	t1_hackmemloop },
	{ "hackmemunrolled",	t2_hackmemunrolled },
	{ "rmlsbsub", 		t3_rmlsbsub },
	{ "rmlsbmask",	 	t4_rmlsbmask },
	{ "testlsb",		t5_testlsb },
	{ "testmsb", 		t6_testmsb },
	{ "testsignandshift", 	t7_testsignandshift },
	{ "testeachbit", 	t8_testeachbit },
	{ "testeachbit1shl", 	t9_testeachbit1shl },
	{ "tableshift", 	tA_tableshift },
	{ "tableuchar", 	tB_tableuchar },
};

main(argc, argv)
	int argc;
	char **argv;
{
	double calibration, v, best;
	int j, k, m, verbose;

	verbose = argc > 1;
	/*
	 * run a few tests to make sure they all agree
	 */
	srand(getpid());
	for (j = 0; j < 10; ++j) {
		unsigned long n;
		int bad;

		n = bigrand();
		for (k = 0; k < SIZEOF(table); ++k)
			table[k].trial = table[k].func(n);
		bad = 0;
		for (k = 0; k < SIZEOF(table); ++k)
			for (m = 0; m < SIZEOF(table); ++m)
				if (table[k].trial != table[m].trial)
					bad = 1;
		if (bad) {
			printf("wrong: %08lX", n);
			for (k = 0; k < SIZEOF(table); ++k)
				printf(" %3d", table[k].trial);
			printf("\n");
		}
	}

	/*
	 * calibrate the timing mechanism
	 */
	calibration = measure(calibrate);
	if (verbose)
		printf("calibration: %g\n", calibration);

	/*
	 * time them all, keeping track of the best (smallest)
	 */
	for (j = 0; j < SIZEOF(table); ++j) {
		v = measure(table[j].func) - calibration;
		if (verbose)
			printf("%s: %g\n", table[j].name, v);
		table[j].measurement = v;
		if (!j || v < best)
			best = v;
	}
	(void) printf("%-24s   %-14sratio\n", "function", "time");
	for (j = 0; j < SIZEOF(table); ++j) {
		(void) printf("%-24s %#10.8g  %6.3f\n",
		    table[j].name,
		    table[j].measurement,
		    table[j].measurement / best);
	}
	exit(0);
}
-- 
In-Real-Life: Chris Torek, Univ of MD Comp Sci Dept (+1 301 405 2750)
Domain:	chris@cs.umd.edu	Path:	uunet!mimsy!chris

gene@zeno.mn.org (Gene H. Olson) (10/12/90)

peter@neccan.oz (Peter Miller) writes:
>/*
> * A while back I posted a solution to the "count of bits set in a
> * long" question, suggesting that HACKMEM 169 was a very fast way to do it.
> * This program demonstrates the technique.
> * 
> * The HACKMEM 169 method uses a modulo to sum the digits.
> * A number of people have asked me "how can a division be faster than a loop?"
> * This program demonstrates how.  The modulo is a special case,
> * and can be unwound into a tight loop if your machine does not
> * have a fast divide opcode (although many modern machines have a 1 cycle
> * divide opcode, except for we-really-believe-our-press-hype risc
> * manufacturers).
> * 
> * This program was run on a 386/20 under unix with the following results:
> *	Func	sec/iter	scaled
> *	nbits	0.00018310547	 1.000	hackmem 169
> *	nbits1	0.0006980896	 3.812	hackmem 169 no % operator
> *	nbits2	0.0016822815	 9.188	count lsb's loop
> *	nbits3	0.0037384033	20.417
> *	nbits4	0.0034370422	18.771
> *	nbits5	0.0037765503	20.625
> *	nbits6	0.0035247803	19.250
> */
> .....  A program follows this.

To this program I have added a bug fix (the function table
was wrong) and another function (nbit7) which is MY FAVORITE
WAY of counting bits.   The patches are at the end of this
posting....

Anyway, I got much better results on both the 386 (which
has a multiply) and on a SPARC (which doesn't) than any of
the functions in the original program.

The results were:

            386                               SPARC
------------------------------      ------------------------------
nbits   0.00020980835    1.774      nbits   0.00096893311   15.875
nbits1  0.00038909912    3.290      nbits1  0.00012969971    2.125
nbits2  0.00075149536    6.355      nbits2  0.00031280518    5.125
nbits3  0.0015411377    13.032      nbits3  0.00067138672   11.000
nbits4  0.0016479492    13.935      nbits4  0.00065231323   10.688
nbits5  0.0015525818    13.129      nbits5  0.00067520142   11.062
nbits6  0.0021286011    18.000      nbits6  0.00086212158   14.125
nbits7  0.00011825562    1.000      nbits7  6.1035156e-05    1.000

The nbit7 technique is a simple routine which shifts and sums
bits in place until all bits have summed in the lower 8 bits
of the word.  The implementation here requires 32 bit longs,
but it is easily extensible to 64 bits with a mask change and
the additional shift shown.

Enjoy,

------------- Remainder of Posting is a Cdiff Patchfile ----------

*** orig.c	Fri Oct 12 08:56:22 1990
--- gene.c	Fri Oct 12 09:23:51 1990
***************
*** 215,220 ****
--- 215,236 ----
  	return count;
  }
  
+ int
+ nbits7(n)
+ 	register unsigned long n ;
+ {
+ 	n = ((n >> 1) & 0x55555555) + (n & 0x55555555) ;
+ 	n = ((n >> 2) & 0x33333333) + (n & 0x33333333) ;
+ 	n = ((n >> 4) + n) & 0x0F0F0F0F ;
+ 	n = ((n >> 8) + n) ;
+ #if LONG64
+ 	n = (n >> 16) + n ;
+ 	return ((n >> 32) + n) & 0xFF ;
+ #else
+ 	return ((n >> 16) + n) & 0xFF ;
+ #endif
+ }
+ 
  /*
   * build a long random number.
   * The standard rand() returns a 15bit number.
***************
*** 275,282 ****
  	{ "nbits2", 	nbits2 },
  	{ "nbits3", 	nbits3 },
  	{ "nbits4", 	nbits4 },
! 	{ "nbits5", 	nbits3 },
! 	{ "nbits6", 	nbits4 },
  };
  
  main()
--- 291,299 ----
  	{ "nbits2", 	nbits2 },
  	{ "nbits3", 	nbits3 },
  	{ "nbits4", 	nbits4 },
! 	{ "nbits5", 	nbits5 },
! 	{ "nbits6", 	nbits6 },
! 	{ "nbits7", 	nbits7 },
  };
  
  main()
_________________________________________________________________________
   __ 
  /  )                 Gene H. Olson             uunet!digibd!zeno!gene
 / __  _ __  _         Smartware Consulting
(__/ _(/_//_(/_        Minneapolis, MN           (612) 824-9108

ts@cup.portal.com (Tim W Smith) (10/13/90)

These times were given in a recent post for some
68020 based Sun:

	function                   time          ratio
	hackmemmod               0.00077819824   1.821
	hackmemloop              0.00059890747   1.402
	hackmemunrolled          0.00050354004   1.179
	rmlsbsub                 0.00069808960   1.634
	rmlsbmask                0.00055313110   1.295
	testlsb                  0.00107955930   2.527
	testmsb                  0.00251007080   5.875
	testsignandshift         0.00242614750   5.679
	testeachbit              0.00254821780   5.964
	testeachbit1shl          0.00362014770   8.473
	tableshift               0.00083160400   1.946
	tableuchar               0.00042724609   1.000

Something is seriously wrong with either these numbers or that
computer.  I've tried a couple of these on my Mac II, using
Think C.  Note that 1) Apple is not noted for squeezing the
last drop of performance out of the 68020, and 2) Think C
has not stressed optimal code generation.

I get 36 usec for rmlsub on longs with all 32 bits set, 20
usec if 16 bits are set, and 5 usec of no bits are set.
Note that the Sun is getting, in the average case for rmlsub,
20 times worse performance than I am getting for the worst
case.

						Tim Smith

chris@mimsy.umd.edu (Chris Torek) (10/13/90)

In article <26881@mimsy.umd.edu> I posted a modified version of
Peter Miller's bit-count timing program.  Unfortunately, my version
had a bug that effectively makes the results worthless (depending
on compiler vagaries; it `just happens' to compile to working code
on some systems).

Remarkably few people noted the bug (or rather, sent me mail about it).
Two people sent me additional functions, however.  Arthur Olson added
four (all variants on table lookup); Mike Weaver added one.  Gene Olson
has also posted a different modified version of Peter Miller's original
program; this contains a better version of the latter.

So, here are new results for the new set of (now 17) functions on the
same set of machines, along with the new program.  Noteworthy facts:

  - hackmem invariably loses to some other technique, always including
    Gene Olson's, though sometimes not by much;
  - the fastest is usually (but not always) one of the table lookups;
  - tiny differences in ratios (less than .1) are usually insignificant.

  name			  technique used
  ----			  --------------
hackmemmod		HACKMEM 169, using % operator
hackmemloop		HACKMEM 169, using loop to implement %
hackmemunrolled		HACKMEM 169, with 5-step % (loop unrolled)
rmlsbsub		remove lsb with `n -= (n & -n)'
rmlsbmask		remove lsb with `n &= n - 1'
testlsb			test n&1, then n>>=1
testmsb			test n&MSB, then n+=n (rather than n<<=1)
testsignandshift	test n<0, then n<<=1
testeachbit		test n&mask, then mask+=mask
testeachbit1shl		test n&(1<<bit) for bit in [0..32)
tableshift		nbits[n>>24] + nbits[(n>>16)&255] ...
tableuchar		set p=&n, nbits[p[0]] + nbits[p[1]] ...
tableshiftcast		nbits[n>>24] + nbits[(u_char)(n>>16)] ...
itableshift		same as tableshift, but table datatype is int
itableuchar		ditto
itableshiftcast		ditto (note, nbits table is `char')
sumbits			Gene Olson's function (see code for comments)

------------------------------------------------------------
Results:

::::::::::::::
time.ds3100
::::::::::::::
function                   time          ratio
hackmemmod               3.0992432e-06   3.250
hackmemloop              2.5330353e-06   2.656
hackmemunrolled          1.7619495e-06   1.848
rmlsbsub                 4.9207935e-06   5.160
rmlsbmask                3.9522800e-06   4.145
testlsb                  1.0545622e-05  11.059
testmsb                  1.0608948e-05  11.125
testsignandshift         1.0497196e-05  11.008
testeachbit              1.0839901e-05  11.367
testeachbit1shl          1.6718033e-05  17.531
tableshift               1.0243893e-06   1.074
tableuchar               9.5361328e-07   1.000
tableshiftcast           1.1063404e-06   1.160
itableshift              1.2814178e-06   1.344
itableuchar              1.2031918e-06   1.262
itableshiftcast          1.3223934e-06   1.387
sumbits                  1.2106419e-06   1.270
::::::::::::::
time.encore
::::::::::::::
function                   time          ratio
hackmemmod               8.0387573e-06   4.522
hackmemloop              7.2727394e-06   4.091
hackmemunrolled          4.3494453e-06   2.447
rmlsbsub                 1.0656597e-05   5.994
rmlsbmask                1.1298252e-05   6.355
testlsb                  3.1122246e-05  17.506
testmsb                  2.2745319e-05  12.794
testsignandshift         1.9783443e-05  11.128
testeachbit              2.3917282e-05  13.454
testeachbit1shl          2.9880619e-05  16.808
tableshift               3.9419632e-06   2.217
tableuchar               2.6934662e-06   1.515
tableshiftcast           2.3953590e-06   1.347
itableshift              3.0450325e-06   1.713
itableuchar              1.7777634e-06   1.000
itableshiftcast          2.1755104e-06   1.224
sumbits                  1.9636650e-06   1.105
::::::::::::::
time.sun3
::::::::::::::
function                   time          ratio
hackmemmod               1.0604858e-05   2.106
hackmemloop              1.2664795e-05   2.515
hackmemunrolled          1.0833740e-05   2.152
rmlsbsub                 2.2430420e-05   4.455
rmlsbmask                1.7318726e-05   3.439
testlsb                  4.3182373e-05   8.576
testmsb                  3.8909912e-05   7.727
testsignandshift         4.0664673e-05   8.076
testeachbit              4.4479370e-05   8.833
testeachbit1shl          5.9432983e-05  11.803
tableshift               9.9945068e-06   1.985
tableuchar               1.0910034e-05   2.167
tableshiftcast           1.4877319e-05   2.955
itableshift              7.9345703e-06   1.576
itableuchar              1.1672974e-05   2.318
itableshiftcast          1.1520386e-05   2.288
sumbits                  5.0354004e-06   1.000
::::::::::::::
time.sun4
::::::::::::::
function                   time          ratio
hackmemmod               1.3427734e-05  19.288
hackmemloop              1.6689301e-06   2.397
hackmemunrolled          1.0585785e-06   1.521
rmlsbsub                 3.9768219e-06   5.712
rmlsbmask                3.4236908e-06   4.918
testlsb                  8.4209442e-06  12.096
testmsb                  8.4686279e-06  12.164
testsignandshift         8.4018707e-06  12.068
testeachbit              8.5353851e-06  12.260
testeachbit1shl          1.1539459e-05  16.575
tableshift               6.9618225e-07   1.000
tableuchar               1.1348724e-06   1.630
tableshiftcast           7.8201294e-07   1.123
itableshift              9.7274780e-07   1.397
itableuchar              1.2016296e-06   1.726
itableshiftcast          8.8691711e-07   1.274
sumbits                  8.3923340e-07   1.205
::::::::::::::
time.vax.gcc
::::::::::::::
function                   time          ratio
hackmemmod               1.5449524e-05   7.364
hackmemloop              1.3847351e-05   6.600
hackmemunrolled          8.6975098e-06   4.145
rmlsbsub                 1.8386841e-05   8.764
rmlsbmask                1.4266968e-05   6.800
testlsb                  5.3215027e-05  25.364
testmsb                  3.4332275e-05  16.364
testsignandshift         2.6550293e-05  12.655
testeachbit              2.9983521e-05  14.291
testeachbit1shl          4.7454834e-05  22.618
tableshift               5.3405762e-06   2.545
tableuchar               2.4414062e-06   1.164
tableshiftcast           6.0272217e-06   2.873
itableshift              4.3106079e-06   2.055
itableuchar              2.0980835e-06   1.000
itableshiftcast          5.6076050e-06   2.673
sumbits                  5.7983398e-06   2.764
::::::::::::::
time.vax.ucbcc
::::::::::::::
function                   time          ratio
hackmemmod               7.6293945e-06   3.774
hackmemloop              1.6746521e-05   8.283
hackmemunrolled          1.3504028e-05   6.679
rmlsbsub                 2.0332336e-05  10.057
rmlsbmask                1.8882751e-05   9.340
testlsb                  5.6457520e-05  27.925
testmsb                  3.7384033e-05  18.491
testsignandshift         2.9602051e-05  14.642
testeachbit              3.8681030e-05  19.132
testeachbit1shl          5.8860779e-05  29.113
tableshift               5.6838989e-06   2.811
tableuchar               2.2888184e-06   1.132
tableshiftcast           5.1879883e-06   2.566
itableshift              4.3487549e-06   2.151
itableuchar              2.0217896e-06   1.000
itableshiftcast          4.5394897e-06   2.245
sumbits                  6.1798096e-06   3.057

------------------------------------------------------------
The program (and I used lint this time...):

#ifndef lint
static char rcsid[] = "$Id: bct.c,v 1.5 90/10/13 08:44:12 chris Exp $";
#endif

/*
 * bct - bitcount timing
 *
 * Runs a bunch of different functions-to-count-bits-in-a-longword
 * (many assume 32 bits per long) with a timer around each loop, and
 * tries to calcuate the time used.
 */
#include <stdio.h>
#include <sys/types.h>

#ifdef FD_SETSIZE
# define USE_GETRUSAGE
# include <sys/time.h>
# include <sys/resource.h>
#else
# include <sys/times.h>
#endif

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))


/*
 * This function is used to calibrate the timing mechanism.
 * This way we can subtract the loop and call overheads.
 */
int
calibrate(n)
	register unsigned long n;
{
	return 0;
}


/*
 * This function counts the number of bits in a long.
 * It is limited to 63 bit longs, but a minor mod can cope with 511 bits.
 *
 * It is so magic, an explanation is required:
 * Consider a 3 bit number as being
 *	4a+2b+c
 * if we shift it right 1 bit, we have
 *	2a+b
 * subtracting this from the original gives
 *	2a+b+c
 * if we shift the original 2 bits right we get
 *	a
 * and so with another subtraction we have
 *	a+b+c
 * which is the number of bits in the original number.
 * Suitable masking allows the sums of the octal digits in a
 * 32 bit number to appear in each octal digit.  This isn't much help
 * unless we can get all of them summed together.
 * This can be done by modulo arithmetic (sum the digits in a number by molulo
 * the base of the number minus one) the old "casting out nines" trick they
 * taught in school before calculators were invented.
 * Now, using mod 7 wont help us, because our number will very likely have
 * more than 7 bits set.  So add the octal digits together to get base64
 * digits, and use modulo 63.
 * (Those of you with 64 bit machines need to add 3 octal digits together to
 * get base512 digits, and use mod 511.)
 *
 * This is HACKMEM 169, as used in X11 sources.
 */
int
t0_hackmemmod(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}


/*
 * This is the same as the above, but does not use the % operator.
 * Most modern machines have clockless division, and so the modulo is as
 * expensive as, say, an addition.
 */
int
t1_hackmemloop(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	while (tmp > 63)
		tmp = (tmp & 63) + (tmp >> 6);
	return tmp;
}

/*
 * Above, without using while loop.  It takes at most 5 iterations of the
 * loop, so we just do all 5 in-line.  The final result is never 63
 * (this is assumed above as well).
 */
int
t2_hackmemunrolled(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	return (tmp);
}


/*
 * This function counts the bits in a long.
 *
 * It removes the lsb and counting the number of times round the loop.
 * The expression (n & -n) yields the lsb of a number,
 * but it only works on 2's compliment machines.
 */
int
t3_rmlsbsub(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n -= (n & -n))
		count++;
	return count;
}

int
t4_rmlsbmask(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; count++)
		n &= n - 1;	/* take away lsb */
	return (count);
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number down and testing the bottom bit.
 */
int
t5_testlsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n >>= 1)
		if (n & 1)
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number left and testing the top bit.
 * On many machines shift is expensive, so it uses a cheap addition instead.
 */
int
t6_testmsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n += n)
		if (n & ~(~(unsigned long)0 >> 1))
			count++;
	return count;
}

int
t7_testsignandshift(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n <<= 1)
		if ((long)n < 0)
			count++;
	return (count);
}


/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the second most intuitively obvious method,
 * and is independent of the number of bits in the long.
 */
int
t8_testeachbit(n)
	register unsigned long n;
{
	register int count;
	register unsigned long mask;

	count = 0;
	for (mask = 1; mask; mask += mask)
		if (n & mask)
			count++;
	return count;
}


/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the most intuitively obvious method,
 * but how do you a priori know how many bits in the long?
 * (except for ''sizeof(long) * CHAR_BITS'' expression)
 */
int
t9_testeachbit1shl(n)
	register unsigned long n;
{
	register int count;
	register int bit;

	count = 0;
	for (bit = 0; bit < 32; ++bit)
		if (n & ((unsigned long)1 << bit))
			count++;
	return count;
}

static char nbits[256] = {
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

static int inbits[256] = {
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

int
tA_tableshift(n)
	register unsigned long n;
{

	return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] +
		nbits[(n >> 16) & 0xff] + nbits[n >> 24]);
}

int
tB_tableuchar(n)
	unsigned long n;
{
	register unsigned char *p = (unsigned char *)&n;

	return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]);
}

int
tC_tableshiftcast(n)
	register unsigned long n;
{

	return nbits[(unsigned char) n] +
		nbits[(unsigned char) (n >> 8)] +
		nbits[(unsigned char) (n >> 16)] +
		nbits[(unsigned char) (n >> 24)];
}

int
tD_itableshift(n)
	register unsigned long n;
{

	return (inbits[n & 0xff] + inbits[(n >> 8) & 0xff] +
		inbits[(n >> 16) & 0xff] + inbits[n >> 24]);
}

int
tE_itableuchar(n)
	unsigned long n;
{
	register unsigned char *p = (unsigned char *)&n;

	return (inbits[p[0]] + inbits[p[1]] + inbits[p[2]] + inbits[p[3]]);
}

int
tF_itableshiftcast(n)
	register unsigned long n;
{

	return inbits[(unsigned char) n] +
		inbits[(unsigned char) (n >> 8)] +
		inbits[(unsigned char) (n >> 16)] +
		inbits[(unsigned char) (n >> 24)];
}

/*
 * Explanation:
 * First we add 32 1-bit fields to get 16 2-bit fields.
 * Each 2-bit field is one of 00, 01, or 10 (binary).
 * We then add all the two-bit fields to get 8 4-bit fields.
 * These are all one of 0000, 0001, 0010, 0011, or 0100.
 *
 * Now we can do something different, becuase for the first
 * time the value in each k-bit field (k now being 4) is small
 * enough that adding two k-bit fields results in a value that
 * still fits in the k-bit field.  The result is four 4-bit
 * fields containing one of {0000,0001,...,0111,1000} and four
 * more 4-bit fields containing junk (sums that are uninteresting).
 * Pictorially:
 *	    n = 0aaa0bbb0ccc0ddd0eee0fff0ggg0hhh
 *	 n>>4 = 00000aaa0bbb0ccc0ddd0eee0fff0ggg
 *	  sum = 0aaaWWWWiiiiXXXXjjjjYYYYkkkkZZZZ
 * where W, X, Y, and Z are the interesting sums (each at most 1000,
 * or 8 decimal).  Masking with 0x0f0f0f0f extracts these.
 *
 * Now we can change tactics yet again, because now we have:
 *	    n = 0000WWWW0000XXXX0000YYYY0000ZZZZ
 *	 n>>8 = 000000000000WWWW0000XXXX0000YYYY
 * so	  sum = 0000WWWW000ppppp000qqqqq000rrrrr
 * where p and r are the interesting sums (and each is at most
 * 10000, or 16 decimal).  The sum `q' is junk, like i, j, and
 * k above; but it is not necessarry to discard it this time.
 * One more fold, this time by sixteen bits, gives
 *	    n = 0000WWWW000ppppp000qqqqq000rrrrr
 *	n>>16 = 00000000000000000000WWWW000ppppp
 * so	  sum = 0000WWWW000ppppp000sssss00tttttt
 * where s is at most 11000 and t is it most 100000 (32 decimal).
 *
 * Now we have t = r+p = (Z+Y)+(X+W) = ((h+g)+(f+e))+((d+c)+(b+a)),
 * or in other words, t is the number of bits set in the original
 * 32-bit longword.  So all we have to do is return the low byte
 * (or low 6 bits, but `low byte' is typically just as easy if not
 * easier).
 *
 * This technique is also applicable to 64 and 128 bit words, but
 * 256 bit or larger word sizes require at least one more masking
 * step.
 */
int
tG_sumbits(n)
	register unsigned long n;
{

	n = (n & 0x55555555) + ((n >> 1) & 0x55555555);
	n = (n & 0x33333333) + ((n >> 2) & 0x33333333);
	n = (n + (n >> 4)) & 0x0f0f0f0f;
	n += n >> 8;
	n += n >> 16;
	return (n & 0xff);
}

/*
 * build a long random number.
 * The standard rand() returns at least a 15 bit number.
 * We use the top 9 of 15 (since the lower N bits of the usual rand()
 * repeat with a period of 2^N).
 */
unsigned long
bigrand()
{
#define randbits() ((unsigned long)((rand() >> 6) & 0777))
	register int r;

	r = randbits() << 27;
	r |= randbits() << 18;
	r |= randbits() << 9;
	r |= randbits();
	return (r);
}

/*
 * Run the test many times.
 * You will need a running time in the 10s of seconds
 * for an accurate result.
 *
 * To give a fair comparison, the random number generator
 * is seeded the same for each measurement.
 *
 * Return value is seconds per iteration.
 */
#ifndef REPEAT
#if defined(mips) || defined(sparc)
#define REPEAT (1L<<20)
#else
#define REPEAT (1L<<18)
#endif
#endif

double
measure(func)
	register int	(*func)();
{
#ifdef USE_GETRUSAGE
	struct rusage ru0, ru1;
	register long j;

	srand(1);
	(void) getrusage(RUSAGE_SELF, &ru0);
	for (j = 0; j < REPEAT; ++j)
		func(bigrand());
	(void) getrusage(RUSAGE_SELF, &ru1);
	ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec;
	if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) {
		ru1.ru_utime.tv_usec += 1000000;
		ru1.ru_utime.tv_sec--;
	}
	return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) /
		(double)REPEAT);
#else
	register long	j;
	struct tms	start;
	struct tms	finish;

	srand(1);
	times(&start);
	for (j = 0; j < REPEAT; ++j)
		func(bigrand());
	times(&finish);
	return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);
#endif
}

struct table {
	char	*name;
	int	(*func)();
	double	measurement;
	int	trial;
};

struct table	table[] =
{
	{ "hackmemmod",		t0_hackmemmod },
	{ "hackmemloop", 	t1_hackmemloop },
	{ "hackmemunrolled",	t2_hackmemunrolled },
	{ "rmlsbsub", 		t3_rmlsbsub },
	{ "rmlsbmask",	 	t4_rmlsbmask },
	{ "testlsb",		t5_testlsb },
	{ "testmsb", 		t6_testmsb },
	{ "testsignandshift", 	t7_testsignandshift },
	{ "testeachbit", 	t8_testeachbit },
	{ "testeachbit1shl", 	t9_testeachbit1shl },
	{ "tableshift", 	tA_tableshift },
	{ "tableuchar", 	tB_tableuchar },
	{ "tableshiftcast",	tC_tableshiftcast },
	{ "itableshift", 	tD_itableshift },
	{ "itableuchar", 	tE_itableuchar },
	{ "itableshiftcast",	tF_itableshiftcast },
	{ "sumbits",		tG_sumbits },
};

main(argc, argv)
	int argc;
	char **argv;
{
	double calibration, v, best;
	int j, k, m, verbose;

	verbose = argc > 1;
	/*
	 * run a few tests to make sure they all agree
	 */
	srand(getpid());
	for (j = 0; j < 10; ++j) {
		unsigned long n;
		int bad;

		n = bigrand();
		for (k = 0; k < SIZEOF(table); ++k)
			table[k].trial = table[k].func(n);
		bad = 0;
		for (k = 0; k < SIZEOF(table); ++k)
			for (m = 0; m < SIZEOF(table); ++m)
				if (table[k].trial != table[m].trial)
					bad = 1;
		if (bad) {
			printf("wrong: %08lX", n);
			for (k = 0; k < SIZEOF(table); ++k)
				printf(" %3d", table[k].trial);
			printf("\n");
		}
	}

	/*
	 * calibrate the timing mechanism
	 */
	calibration = measure(calibrate);
	if (verbose)
		printf("calibration: %g\n", calibration);

	/*
	 * time them all, keeping track of the best (smallest)
	 */
	for (j = 0; j < SIZEOF(table); ++j) {
		v = measure(table[j].func) - calibration;
		if (verbose)
			printf("%s: %g\n", table[j].name, v);
		table[j].measurement = v;
		if (!j || v < best)
			best = v;
	}
	(void) printf("%-24s   %-14sratio\n", "function", "time");
	for (j = 0; j < SIZEOF(table); ++j) {
		(void) printf("%-24s %#10.8g  %6.3f\n",
		    table[j].name,
		    table[j].measurement,
		    table[j].measurement / best);
	}
	exit(0);
}
-- 
In-Real-Life: Chris Torek, Univ of MD Comp Sci Dept (+1 301 405 2750)
Domain:	chris@cs.umd.edu	Path:	uunet!mimsy!chris

gene@zeno.mn.org (Gene H. Olson) (10/15/90)

chris@mimsy.umd.edu (Chris Torek) writes:
>In article <826@neccan.oz> peter@neccan.oz (Peter Miller) sends out a
>program to time various bit-counting functions.  I decided to `improve'
>the program a bit :-) and add a number of additional functions.  I have
>run the resulting program (which I will append below) on a number of
>architectures.  The `mod' version is never the fastest of the hackmem
>variants.  The fastest function is always table lookup, except on the
>Sun-4.

I also posted an "improved" version of this program that got
horribly erratic and odd results.

Looking at all this closely, it seems the erratic results on this
"test" are due two 3 problems:

1)	The original program had a bug in the procedure table
	which tested the wrong procedures.  Both Chris and I
	fixed this.

2)	The original program neglected to return a value in
	procedure bigrand().  On the Sun4, this means all the
	"random" data was zero.  This kinda skewed the results.

	I just fixed this by putting in the return().

3)	The bigrand() procedure was called every invokation of
	the test procedure.  Since for the fast routines it takes
	*MUCH* longer to generate these numbers than to count the
	bits, the noise level is extreme.

	I just fixed this by creating an array once, and then
	referencing it.  The test runs about 6 times faster now,
	and it is much more consistent.

So the bottom line is that most of the results posted so far are
largely nonsense.  So, in the spirit of throwing good net bandwidth
after bad,  I am posting the blasted thing again.   I hope this
discussion is useful to *someone*.

Earlier I had added a test now called "suminplace" to the list
which I believed was faster than any other way of doing it.
(Except maybe a 16-bit lookup table).  It turns out, that
an 8-bit lookup table beats it.

Blast the torpedos, I still like my method best.  Maybe
it will run faster on another machine or another compiler.
It certainly is smaller.

>>>>> SparcStation results:

function                   time          ratio
hackmemmod               0.00093555450  16.082
hackmemloop              0.00013732910   2.361
hackmemunrolled          9.0599060e-05   1.557
rmlsbsub                 0.00032043457   5.508
rmlsbmask                0.00026893616   4.623
testlsb                  0.00064945221  11.164
testmsb                  0.00065517426  11.262
testsignandshift         0.00065326691  11.230
testeachbit              0.00067424774  11.590
testeachbit1shl          0.00086975098  14.951
tableshift               5.8174133e-05   1.000
tableuchar               8.2015991e-05   1.410
suminplace               6.6757202e-05   1.148

>>>>> 386 results:

function                   time          ratio
hackmemmod               0.00022506714   1.553
hackmemloop              0.00039672852   2.737
hackmemunrolled          0.00025939941   1.789
rmlsbsub                 0.00074005127   5.105
rmlsbmask                0.00067138672   4.632
testlsb                  0.0016098022   11.105
testmsb                  0.0015449524   10.658
testsignandshift         0.0015525818   10.711
testeachbit              0.0015563965   10.737
testeachbit1shl          0.0020790100   14.342
tableshift               0.00014877319   1.026
tableuchar               0.00014495850   1.000
suminplace               0.00014877319   1.026


Perhaps Chris will find it worth his time to run the thing
on his list of machines.

-------------------- Yet another posting ----------------------

#include <stdio.h>
#include <sys/types.h>

#ifdef FD_SETSIZE
# define USE_GETRUSAGE
# include <sys/time.h>
# include <sys/resource.h>
#else
# include <sys/times.h>
#endif

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))

#define NUMRAND	16384
unsigned long rr[NUMRAND] ;


/*
 * This function is used to calibrate the timing mechanism.
 * This way we can subtract the loop and call overheads.
 */
int
calibrate(n)
	register unsigned long n;
{
	return 0;
}


/*
 * This function counts the number of bits in a long.
 * It is limited to 63 bit longs, but a minor mod can cope with 511 bits.
 *
 * It is so magic, an explanation is required:
 * Consider a 3 bit number as being
 *	4a+2b+c
 * if we shift it right 1 bit, we have
 *	2a+b
 * subtracting this from the original gives
 *	2a+b+c
 * if we shift the original 2 bits right we get
 *	a
 * and so with another subtraction we have
 *	a+b+c
 * which is the number of bits in the original number.
 * Suitable masking allows the sums of the octal digits in a
 * 32 bit bumber to appear in each octal digit.  This isn't much help
 * unless we can get all of them summed together.
 * This can be done by modulo arithmetic (sum the digits in a number by molulo
 * the base of the number minus one) the old "cating out nines" trick they
 * taught in school before calculators were invented.
 * Now, using mod 7 wont help us, because our number will very likely have
 * more than 7 bits set.  So add the octal digits together to get base64
 * digits, and use modulo 63.
 * (Those of you with 64 bit machines need to add 3 octal digits together to
 * get base512 digits, and use mod 511.)
 *
 * This is HACKMEM 169, as used in X11 sources.
 */
int
t0_hackmemmod(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}


/*
 * This is the same as the above, but does not use the % operator.
 * Most modern machines have clockless division, and so the modulo is as
 * expensive as, say, an addition.
 */
int
t1_hackmemloop(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	while (tmp > 63)
		tmp = (tmp & 63) + (tmp >> 6);
	return tmp;
}

/*
 * Above, without using while loop.  It takes at most 5 iterations of the
 * loop, so we just do all 5 in-line.  The final result is never 63
 * (this is assumed above as well).
 */
int
t2_hackmemunrolled(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	return (tmp);
}


/*
 * This function counts the bits in a long.
 *
 * It removes the lsb and counting the number of times round the loop.
 * The expression (n & -n) yields the lsb of a number,
 * but it only works on 2's compliment machines.
 */
int
t3_rmlsbsub(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n -= (n & -n))
		count++;
	return count;
}

int
t4_rmlsbmask(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; count++)
		n &= n - 1;	/* take away lsb */
	return (count);
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number down and testing the bottom bit.
 */
int
t5_testlsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n >>= 1)
		if (n & 1)
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number left and testing the top bit.
 * On many machines shift is expensive, so it uses a cheap addition instead.
 */
int
t6_testmsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n += n)
		if (n & ~(~(unsigned long)0 >> 1))
			count++;
	return count;
}

int
t7_testsignandshift(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n <<= 1)
		if ((long)n < 0)
			count++;
	return (count);
}


/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the second most intuitively obvious method,
 * and is independent of the number of bits in the long.
 */
int
t8_testeachbit(n)
	register unsigned long n;
{
	register int count;
	register unsigned long mask;

	count = 0;
	for (mask = 1; mask; mask += mask)
		if (n & mask)
			count++;
	return count;
}


/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the most intuitively obvious method,
 * but how do you a priori know how many bits in the long?
 * (except for ''sizeof(long) * CHAR_BITS'' expression)
 */
int
t9_testeachbit1shl(n)
	register unsigned long n;
{
	register int count;
	register int bit;

	count = 0;
	for (bit = 0; bit < 32; ++bit)
		if (n & ((unsigned long)1 << bit))
			count++;
	return count;
}

static char nbits[256] = {
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

int
tA_tableshift(n)
	register unsigned long n;
{

	return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] +
		nbits[(n >> 16) & 0xff] + nbits[n >> 24]);
}

int
tB_tableuchar(n)
	unsigned long n;
{
	register unsigned char *p = (unsigned char *)&n;

	return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]);
}


int
tC_suminplace(n)
	register unsigned long n ;
{
	n = ((n >> 1) & 0x55555555) + (n & 0x55555555) ;
	n = ((n >> 2) & 0x33333333) + (n & 0x33333333) ;
	n = ((n >> 4) + n) & 0x0F0F0F0F ;
	n = ((n >> 8) + n) ;
	return ((n >> 16) + n) & 0xFF ;
}


/*
 * build a long random number.
 * The standard rand() returns at least a 15 bit number.
 * We use the top 9 of 15 (since the lower N bits of the usual rand()
 * repeat with a period of 2^N).
 */
unsigned long
bigrand()
{
#define randbits() ((unsigned long)((rand() >> 6) & 0777))
	register int r;

	r = randbits() << 27;
	r |= randbits() << 18;
	r |= randbits() << 9;
	r |= randbits();

	return(r) ;
}

/*
 * Run the test many times.
 * You will need a running time in the 10s of seconds
 * for an accurate result.
 *
 * To give a fair comparison, the random number generator
 * is seeded the same for each measurement.
 *
 * Return value is seconds per iteration.
 */
#if defined(mips) || defined(sparc)
#define REPEAT (1L<<20)
#else
#define REPEAT (1L<<18)
#endif

double
measure(func)
	register int	(*func)();
{
#ifdef USE_GETRUSAGE
	struct rusage ru0, ru1;
	register long j;

	(void) getrusage(RUSAGE_SELF, &ru0);
	for (j = 0; j < REPEAT; ++j)
		func(rr[j & (NUMRAND-1)]);
	(void) getrusage(RUSAGE_SELF, &ru1);
	ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec;
	if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) {
		ru1.ru_utime.tv_usec += 1000000;
		ru1.ru_utime.tv_sec--;
	}
	return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) /
		(double)REPEAT);
#else
	register long	j;
	struct tms	start;
	struct tms	finish;

	times(&start);
	for (j = 0; j < REPEAT; ++j)
		func(rr[j & (NUMRAND-1)]);
	times(&finish);
	return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);
#endif
}

struct table {
	char	*name;
	int	(*func)();
	double	measurement;
	int	trial;
};

struct table	table[] =
{
	{ "hackmemmod",		t0_hackmemmod },
	{ "hackmemloop", 	t1_hackmemloop },
	{ "hackmemunrolled",	t2_hackmemunrolled },
	{ "rmlsbsub", 		t3_rmlsbsub },
	{ "rmlsbmask",	 	t4_rmlsbmask },
	{ "testlsb",		t5_testlsb },
	{ "testmsb", 		t6_testmsb },
	{ "testsignandshift", 	t7_testsignandshift },
	{ "testeachbit", 	t8_testeachbit },
	{ "testeachbit1shl", 	t9_testeachbit1shl },
	{ "tableshift", 	tA_tableshift },
	{ "tableuchar", 	tB_tableuchar },
	{ "suminplace", 	tC_suminplace },
};

main(argc, argv)
	int argc;
	char **argv;
{
	double calibration, v, best;
	int j, k, m, verbose;

	verbose = argc > 1;
	/*
	 * run a few tests to make sure they all agree
	 */
	srand(getpid());
	for (j = 0; j < 10; ++j) {
		unsigned long n;
		int bad;

		n = bigrand();
		for (k = 0; k < SIZEOF(table); ++k)
			table[k].trial = table[k].func(n);
		bad = 0;
		for (k = 0; k < SIZEOF(table); ++k)
			for (m = 0; m < SIZEOF(table); ++m)
				if (table[k].trial != table[m].trial)
					bad = 1;
		if (bad) {
			printf("wrong: %08lX", n);
			for (k = 0; k < SIZEOF(table); ++k)
				printf(" %3d", table[k].trial);
			printf("\n");
		}
	}

	for (k = 0 ; k < NUMRAND ; k++) rr[k] = bigrand() ;

	/*
	 * calibrate the timing mechanism
	 */
	calibration = measure(calibrate);
	if (verbose)
		printf("calibration: %g\n", calibration);

	/*
	 * time them all, keeping track of the best (smallest)
	 */
	for (j = 0; j < SIZEOF(table); ++j) {
		v = measure(table[j].func) - calibration;
		if (verbose)
			printf("%s: %g\n", table[j].name, v);
		table[j].measurement = v;
		if (!j || v < best)
			best = v;
	}
	(void) printf("%-24s   %-14sratio\n", "function", "time");
	for (j = 0; j < SIZEOF(table); ++j) {
		(void) printf("%-24s %#10.8g  %6.3f\n",
		    table[j].name,
		    table[j].measurement,
		    table[j].measurement / best);
	}
	exit(0);
}
_________________________________________________________________________
   __ 
  /  )                 Gene H. Olson             uunet!digibd!zeno!gene
 / __  _ __  _         Smartware Consulting
(__/ _(/_//_(/_        Minneapolis, MN           (612) 824-9108

moews@math.berkeley.edu (David Moews) (10/17/90)

I took Gene Olson's test program and stuck Torek's routines into it, as well 
as adding routines to count bits by using a table with a 16-bit index, and 
a slightly sped-up version of HAKMEM 169 (`hackmemallfold').  The 16-bit 
table lookup appears to be fastest, at least on a Sun-3/50 and a 
Sparcstation 1.  `hackmemallfold' and `sumbits' both use a total of 16 
arithmetic, logical, and shift operations.  However, if gcc is used 
`hackmemallfold' is apparently faster (because gcc refuses to put constants 
in registers?)


  name			  technique used
  ----			  --------------
hackmemmod		HACKMEM 169, using % operator
hackmemloop		HACKMEM 169, using loop to implement %
hackmemunrolled		HACKMEM 169, with 5-step % (loop unrolled)
hackmemallfold		HACKMEM 169, no %, fold to the bitter end
rmlsbsub		remove lsb with `n -= (n & -n)'
rmlsbmask		remove lsb with `n &= n - 1'
testlsb			test n&1, then n>>=1
testmsb			test n&MSB, then n+=n (rather than n<<=1)
testsignandshift	test n<0, then n<<=1
testeachbit		test n&mask, then mask+=mask
testeachbit1shl		test n&(1<<bit) for bit in [0..32)
tableshift		nbits[n>>24] + nbits[(n>>16)&255] ...
tableuchar		set p=&n, nbits[p[0]] + nbits[p[1]] ...
tableshiftcast		nbits[n>>24] + nbits[(u_char)(n>>16)] ...
itableshift		same as tableshift, but table datatype is int
itableuchar		ditto
itableshiftcast		ditto (note, nbits table is `char')
16tableshift		nbits16[n>>16] + nbits16[n&65535]
16tableushort		set p=&n, nbits16[p[0]] + nbits16[p[1]]
16tableshiftcast	nbits16[n>>16] + nbits16[(u_short)(n)]
16itableshift		same as 16tableshift, but table datatype is int
16itableushort		ditto
16itableshiftcast	ditto
sumbits			Gene Olson's function (see code for comments)

Sun-3/50, SunOS 4.1, cc -O

function                   time          ratio
hackmemmod               1.0166168e-05   1.931
hackmemloop              1.2187958e-05   2.315
hackmemunrolled          1.3732910e-05   2.609
hackmemallfold           8.2588196e-06   1.569
rmlsbsub                 1.9512177e-05   3.707
rmlsbmask                1.9512177e-05   3.707
testlsb                  4.6901703e-05   8.909
testmsb                  4.8561096e-05   9.225
testsignandshift         4.0779114e-05   7.746
testeachbit              4.6749115e-05   8.880
testeachbit1shl          6.6032410e-05  12.543
tableshift               1.6460419e-05   3.127
tableuchar               1.3256073e-05   2.518
tableshiftcast           1.1730194e-05   2.228
itableshift              1.6918182e-05   3.214
itableuchar              9.5176697e-06   1.808
itableshiftcast          1.2359619e-05   2.348
16tableshift             1.0337830e-05   1.964
16tableushort            6.0081482e-06   1.141
16tableshiftcast         6.1416626e-06   1.167
16itableshift            5.2642822e-06   1.000
16itableushort           6.6184998e-06   1.257
16itableshiftcast        1.1196136e-05   2.127
sumbits                  7.1525574e-06   1.359

Sun-3/50, SunOS 4.1, gcc -O, ver. 1.37.1

function                   time          ratio
hackmemmod               1.0070801e-05   3.259
hackmemloop              1.1653900e-05   3.772
hackmemunrolled          1.0719299e-05   3.469
hackmemallfold           7.9727173e-06   2.580
rmlsbsub                 1.9207001e-05   6.216
rmlsbmask                1.9226074e-05   6.222
testlsb                  4.4364929e-05  14.358
testmsb                  3.7155151e-05  12.025
testsignandshift         4.1427612e-05  13.407
testeachbit              4.8522949e-05  15.704
testeachbit1shl          5.4588318e-05  17.667
tableshift               1.1863708e-05   3.840
tableuchar               8.7928772e-06   2.846
tableshiftcast           1.6403198e-05   5.309
itableshift              9.8037720e-06   3.173
itableuchar              6.3896179e-06   2.068
itableshiftcast          1.3904572e-05   4.500
16tableshift             5.7983398e-06   1.877
16tableushort            4.4250488e-06   1.432
16tableshiftcast         5.3215027e-06   1.722
16itableshift            4.9591064e-06   1.605
16itableushort           3.0899048e-06   1.000
16itableshiftcast        9.1934204e-06   2.975
sumbits                  9.6511841e-06   3.123

SparcStation 1, SunOS 4.0.3c, cc -O

function                   time          ratio
hackmemmod               1.4197826e-05  18.551
hackmemloop              2.2125244e-06   2.891
hackmemunrolled          1.4662743e-06   1.916
hackmemallfold           1.0657310e-06   1.393
rmlsbsub                 5.1355362e-06   6.710
rmlsbmask                4.2915344e-06   5.607
testlsb                  1.0557175e-05  13.794
testmsb                  1.0647774e-05  13.913
testsignandshift         1.0533333e-05  13.763
testeachbit              1.0833740e-05  14.156
testeachbit1shl          1.3933182e-05  18.206
tableshift               9.0837479e-07   1.187
tableuchar               1.3136864e-06   1.717
tableshiftcast           9.1314316e-07   1.193
itableshift              1.1110306e-06   1.452
itableuchar              1.5211105e-06   1.988
itableshiftcast          1.1157990e-06   1.458
16tableshift             8.2015991e-07   1.072
16tableushort            1.1849403e-06   1.548
16tableshiftcast         7.6532364e-07   1.000
16itableshift            1.6880035e-06   2.206
16itableushort           2.0956993e-06   2.738
16itableshiftcast        1.6403198e-06   2.143
sumbits                  1.0561943e-06   1.380

SparcStation 1, SunOS 4.0.3c, gcc -O, ver. 1.37

function                   time          ratio
hackmemmod               1.1713505e-05  14.241
hackmemloop              2.4437904e-06   2.971
hackmemunrolled          1.4662743e-06   1.783
hackmemallfold           1.0633469e-06   1.293
rmlsbsub                 5.1999092e-06   6.322
rmlsbmask                4.3869019e-06   5.333
testlsb                  1.2927055e-05  15.716
testmsb                  1.6040802e-05  19.501
testsignandshift         1.2907982e-05  15.693
testeachbit              1.1508465e-05  13.991
testeachbit1shl          1.4796257e-05  17.988
tableshift               1.0609627e-06   1.290
tableuchar               1.5187263e-06   1.846
tableshiftcast           1.0633469e-06   1.293
itableshift              1.2660027e-06   1.539
itableuchar              1.7237663e-06   2.096
itableshiftcast          1.2612343e-06   1.533
16tableshift             8.7499619e-07   1.064
16tableushort            1.1301041e-06   1.374
16tableshiftcast         8.2254410e-07   1.000
16itableshift            1.7356873e-06   2.110
16itableushort           1.9979477e-06   2.429
16itableshiftcast        1.6951561e-06   2.061
sumbits                  1.3136864e-06   1.597
--
David Moews                      moews@math.berkeley.edu
-----------------------------cut here for program-------------------------------
#include <stdio.h>
#include <sys/types.h>

#ifdef FD_SETSIZE
# define USE_GETRUSAGE
# include <sys/time.h>
# include <sys/resource.h>
#else
# include <sys/times.h>
#endif

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))

#define NUMRAND	16384
unsigned long rr[NUMRAND];

/*
 * This function is used to calibrate the timing mechanism.
 * This way we can subtract the loop and call overheads.
 */
int
calibrate(n)
	register unsigned long n;
{
	return 0;
}

/*
 * This function counts the number of bits in a long.
 * It is limited to 63 bit longs, but a minor mod can cope with 511 bits.
 *
 * It is so magic, an explanation is required:
 * Consider a 3 bit number as being
 *	4a+2b+c
 * if we shift it right 1 bit, we have
 *	2a+b
 * subtracting this from the original gives
 *	2a+b+c
 * if we shift the original 2 bits right we get
 *	a
 * and so with another subtraction we have
 *	a+b+c
 * which is the number of bits in the original number.
 * Suitable masking allows the sums of the octal digits in a
 * 32 bit number to appear in each octal digit.  This isn't much help
 * unless we can get all of them summed together.
 * This can be done by modulo arithmetic (sum the digits in a number by molulo
 * the base of the number minus one) the old "casting out nines" trick they
 * taught in school before calculators were invented.
 * Now, using mod 7 wont help us, because our number will very likely have
 * more than 7 bits set.  So add the octal digits together to get base64
 * digits, and use modulo 63.
 * (Those of you with 64 bit machines need to add 3 octal digits together to
 * get base512 digits, and use mod 511.)
 *
 * This is HACKMEM 169, as used in X11 sources.
 */
int
t0_hackmemmod(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}


/*
 * This is the same as the above, but does not use the % operator.
 * Most modern machines have clockless division, and so the modulo is as
 * expensive as, say, [sic] an addition.
 */
int
t1_hackmemloop(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	while (tmp > 63)
		tmp = (tmp & 63) + (tmp >> 6);
	return tmp;
}

/*
 * Above, without using while loop.  It takes at most 5 iterations of the
 * loop, so we just do all 5 in-line.  The final result is never 63
 * (this is assumed above as well).
 */
int
t2_hackmemunrolled(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	return (tmp);
}

/*
 * As above, but no modulus nor simulation thereof.  Instead,
 * we continue shifting and masking to the bitter end.
 */
int
t3_hackmemallfold(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	tmp += tmp >> 6;
	return ((tmp + (tmp >> 12) + (tmp >> 24)) & 077);
}

/*
 * This function counts the bits in a long.
 *
 * It removes the lsb and counting the number of times round the loop.
 * The expression (n & -n) yields the lsb of a number,
 * but it only works on 2's compliment machines.
 */
int
t4_rmlsbsub(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n -= (n & -n))
		count++;
	return count;
}

int
t5_rmlsbmask(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; count++)
		n &= n - 1;	/* take away lsb */
	return (count);
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number down and testing the bottom bit.
 */
int
t6_testlsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n >>= 1)
		if (n & 1)
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number left and testing the top bit.
 * On many machines shift is expensive, so it uses a cheap addition instead.
 */
int
t7_testmsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n += n)
		if (n & ~(~(unsigned long)0 >> 1))
			count++;
	return count;
}

int
t8_testsignandshift(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n <<= 1)
		if ((long)n < 0)
			count++;
	return (count);
}


/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the second most intuitively obvious method,
 * and is independent of the number of bits in the long.
 */
int
t9_testeachbit(n)
	register unsigned long n;
{
	register int count;
	register unsigned long mask;

	count = 0;
	for (mask = 1; mask; mask += mask)
		if (n & mask)
			count++;
	return count;
}


/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the most intuitively obvious method,
 * but how do you a priori know how many bits in the long?
 * (except for ''sizeof(long) * CHAR_BITS'' expression)
 */
int
tA_testeachbit1shl(n)
	register unsigned long n;
{
	register int count;
	register int bit;

	count = 0;
	for (bit = 0; bit < 32; ++bit)
		if (n & ((unsigned long)1 << bit))
			count++;
	return count;
}

static char nbits[256] = {
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

static int inbits[256] = {
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

int
tB_tableshift(n)
	register unsigned long n;
{

	return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] +
		nbits[(n >> 16) & 0xff] + nbits[n >> 24]);
}

int
tC_tableuchar(n)
	unsigned long n;
{
	register unsigned char *p = (unsigned char *)&n;

	return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]);
}

int
tD_tableshiftcast(n)
	register unsigned long n;
{

	return nbits[(unsigned char) n] +
		nbits[(unsigned char) (n >> 8)] +
		nbits[(unsigned char) (n >> 16)] +
		nbits[n >> 24];
}

int
tE_itableshift(n)
	register unsigned long n;
{

	return (inbits[n & 0xff] + inbits[(n >> 8) & 0xff] +
		inbits[(n >> 16) & 0xff] + inbits[n >> 24]);
}

int
tF_itableuchar(n)
	unsigned long n;
{
	register unsigned char *p = (unsigned char *)&n;

	return (inbits[p[0]] + inbits[p[1]] + inbits[p[2]] + inbits[p[3]]);
}

int
tG_itableshiftcast(n)
	register unsigned long n;
{

	return inbits[(unsigned char) n] +
		inbits[(unsigned char) (n >> 8)] +
		inbits[(unsigned char) (n >> 16)] +
		inbits[n >> 24];
}

static char nbits16[65536];
static int inbits16[65536];

int
tH_16tableshift(n)
	register unsigned long n;
{

	return (nbits16[n & 0xffff] + nbits16[n >> 16]);
}

int
tI_16tableushort(n)
	unsigned long n;
{
	register unsigned short *p = (unsigned short *)&n;

	return (nbits16[p[0]] + nbits16[p[1]]);
}

int
tJ_16tableshiftcast(n)
	register unsigned long n;
{

	return nbits16[(unsigned short) n] +
		nbits16[n >> 16];
}

int
tK_16itableshift(n)
	register unsigned long n;
{

	return (inbits16[n & 0xffff] + inbits16[n >> 16]);
}

int
tL_16itableushort(n)
	unsigned long n;
{
	register unsigned short *p = (unsigned short *)&n;

	return (inbits16[p[0]] + inbits16[p[1]]);
}

int
tM_16itableshiftcast(n)
	register unsigned long n;
{

	return inbits16[(unsigned short) n] +
		inbits16[n >> 16];
}

/*
 * Explanation:
 * First we add 32 1-bit fields to get 16 2-bit fields.
 * Each 2-bit field is one of 00, 01, or 10 (binary).
 * We then add all the two-bit fields to get 8 4-bit fields.
 * These are all one of 0000, 0001, 0010, 0011, or 0100.
 *
 * Now we can do something different, becuase for the first
 * time the value in each k-bit field (k now being 4) is small
 * enough that adding two k-bit fields results in a value that
 * still fits in the k-bit field.  The result is four 4-bit
 * fields containing one of {0000,0001,...,0111,1000} and four
 * more 4-bit fields containing junk (sums that are uninteresting).
 * Pictorially:
 *	    n = 0aaa0bbb0ccc0ddd0eee0fff0ggg0hhh
 *	 n>>4 = 00000aaa0bbb0ccc0ddd0eee0fff0ggg
 *	  sum = 0aaaWWWWiiiiXXXXjjjjYYYYkkkkZZZZ
 * where W, X, Y, and Z are the interesting sums (each at most 1000,
 * or 8 decimal).  Masking with 0x0f0f0f0f extracts these.
 *
 * Now we can change tactics yet again, because now we have:
 *	    n = 0000WWWW0000XXXX0000YYYY0000ZZZZ
 *	 n>>8 = 000000000000WWWW0000XXXX0000YYYY
 * so	  sum = 0000WWWW000ppppp000qqqqq000rrrrr
 * where p and r are the interesting sums (and each is at most
 * 10000, or 16 decimal).  The sum `q' is junk, like i, j, and
 * k above; but it is not necessarry to discard it this time.
 * One more fold, this time by sixteen bits, gives
 *	    n = 0000WWWW000ppppp000qqqqq000rrrrr
 *	n>>16 = 00000000000000000000WWWW000ppppp
 * so	  sum = 0000WWWW000ppppp000sssss00tttttt
 * where s is at most 11000 and t is it most 100000 (32 decimal).
 *
 * Now we have t = r+p = (Z+Y)+(X+W) = ((h+g)+(f+e))+((d+c)+(b+a)),
 * or in other words, t is the number of bits set in the original
 * 32-bit longword.  So all we have to do is return the low byte
 * (or low 6 bits, but `low byte' is typically just as easy if not
 * easier).
 *
 * This technique is also applicable to 64 and 128 bit words, but
 * 256 bit or larger word sizes require at least one more masking
 * step.
 */
int
tN_sumbits(n)
	register unsigned long n;
{

	n = (n & 0x55555555) + ((n >> 1) & 0x55555555);
	n = (n & 0x33333333) + ((n >> 2) & 0x33333333);
	n = (n + (n >> 4)) & 0x0f0f0f0f;
	n += n >> 8;
	n += n >> 16;
	return (n & 0xff);
}



/*
 * build a long random number.
 * The standard rand() returns at least a 15 bit number.
 * We use the top 9 of 15 (since the lower N bits of the usual rand()
 * repeat with a period of 2^N).
 */
unsigned long
bigrand()
{
#define randbits() ((unsigned long)((rand() >> 6) & 0777))
	register int r;

	r = randbits() << 27;
	r |= randbits() << 18;
	r |= randbits() << 9;
	r |= randbits();

	return(r) ;
}

/*
 * Run the test many times.
 * You will need a running time in the 10s of seconds
 * for an accurate result.
 *
 * To give a fair comparison, the random number generator
 * is seeded the same for each measurement.
 *
 * Return value is seconds per iteration.
 */
#if defined(mips) || defined(sparc)
#define REPEAT (1L<<22)
#else
#define REPEAT (1L<<20)
#endif

double
measure(func)
	register int	(*func)();
{
#ifdef USE_GETRUSAGE
	struct rusage ru0, ru1;
	register long j;

	(void) getrusage(RUSAGE_SELF, &ru0);
	for (j = 0; j < REPEAT; ++j)
		func(rr[j & (NUMRAND-1)]);
	(void) getrusage(RUSAGE_SELF, &ru1);
	ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec;
	if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) {
		ru1.ru_utime.tv_usec += 1000000;
		ru1.ru_utime.tv_sec--;
	}
	return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) /
		(double)REPEAT);
#else
	register long	j;
	struct tms	start;
	struct tms	finish;

	times(&start);
	for (j = 0; j < REPEAT; ++j)
		func(rr[j & (NUMRAND-1)]);
	times(&finish);
	return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);
#endif
}

struct table {
	char	*name;
	int	(*func)();
	double	measurement;
	int	trial;
};

struct table	table[] =
{
	{ "hackmemmod",		t0_hackmemmod },
	{ "hackmemloop", 	t1_hackmemloop },
	{ "hackmemunrolled",	t2_hackmemunrolled },
	{ "hackmemallfold",	t3_hackmemallfold },
	{ "rmlsbsub", 		t4_rmlsbsub },
	{ "rmlsbmask",	 	t5_rmlsbmask },
	{ "testlsb",		t6_testlsb },
	{ "testmsb", 		t7_testmsb },
	{ "testsignandshift", 	t8_testsignandshift },
	{ "testeachbit", 	t9_testeachbit },
	{ "testeachbit1shl", 	tA_testeachbit1shl },
	{ "tableshift", 	tB_tableshift },
	{ "tableuchar", 	tC_tableuchar },
	{ "tableshiftcast",	tD_tableshiftcast },
	{ "itableshift", 	tE_itableshift },
	{ "itableuchar", 	tF_itableuchar },
	{ "itableshiftcast",	tG_itableshiftcast },
	{ "16tableshift", 	tH_16tableshift },
	{ "16tableushort", 	tI_16tableushort },
	{ "16tableshiftcast",	tJ_16tableshiftcast },
	{ "16itableshift", 	tK_16itableshift },
	{ "16itableushort", 	tL_16itableushort },
	{ "16itableshiftcast",	tM_16itableshiftcast },
	{ "sumbits",		tN_sumbits }
};

main(argc, argv)
	int argc;
	char **argv;
{
	double calibration, v, best;
	int j, k, m, verbose;

	/* Initialize the 16-bit lookup table. */
	for (j = 0; j < 65536; ++j) {
		nbits16[j] = inbits16[j] = tN_sumbits((unsigned long)j);
	}
	verbose = argc > 1;
	/*
	 * run a few tests to make sure they all agree
	 */
	srand(getpid());
	for (j = 0; j < 10; ++j) {
		unsigned long n;
		int bad;

		n = bigrand();
		for (k = 0; k < SIZEOF(table); ++k)
			table[k].trial = table[k].func(n);
		bad = 0;
		for (k = 0; k < SIZEOF(table); ++k)
			for (m = 0; m < SIZEOF(table); ++m)
				if (table[k].trial != table[m].trial)
					bad = 1;
		if (bad) {
			printf("wrong: %08lX", n);
			for (k = 0; k < SIZEOF(table); ++k)
				printf(" %3d", table[k].trial);
			printf("\n");
		}
	}

	for (k = 0 ; k < NUMRAND ; k++) rr[k] = bigrand() ;

	/*
	 * calibrate the timing mechanism
	 */
	calibration = measure(calibrate);
	if (verbose)
		printf("calibration: %g\n", calibration);

	/*
	 * time them all, keeping track of the best (smallest)
	 */
	for (j = 0; j < SIZEOF(table); ++j) {
		v = measure(table[j].func) - calibration;
		if (verbose)
			printf("%s: %g\n", table[j].name, v);
		table[j].measurement = v;
		if (!j || v < best)
			best = v;
	}
	(void) printf("%-24s   %-14sratio\n", "function", "time");
	for (j = 0; j < SIZEOF(table); ++j) {
		(void) printf("%-24s %#10.8g  %6.3f\n",
		    table[j].name,
		    table[j].measurement,
		    table[j].measurement / best);
	}
	exit(0);
}

kdq@demott.COM (Kevin D. Quitt) (10/18/90)

    Something is seriously wrong with this bit-counting benchmark.  I'm
running on a Motorola Delta 3600 (25MHz 68030), and my time are 2 orders
of magnitude slower than those reported for a Sun 3/50!!!!

function                   time          ratio
hackmemmod               0.00033760071   2.810
hackmemloop              0.00034141541   2.841
hackmemunrolled          0.00017452240   1.452
hackmemallfold           0.00026226044   2.183
rmlsbsub                 0.00062084198   5.167
rmlsbmask                0.00061321259   5.103
testlsb                  0.0015430450   12.841
testmsb                  0.0012369156   10.294
testsignandshift         0.0014562607   12.119
testeachbit              0.0016183853   13.468
testeachbit1shl          0.0018253326   15.190
tableshift               0.00029087067   2.421
tableuchar               0.00018882751   1.571
tableshiftcast           0.00039577484   3.294
itableshift              0.00022602081   1.881
itableuchar              0.00012016296   1.000
itableshiftcast          0.00032424927   2.698
16tableshift             0.00033283234   2.770
16tableushort            0.00027084351   2.254
16tableshiftcast         0.00029563904   2.460
16itableshift            0.00035190582   2.929
16itableushort           0.00033092499   2.754
16itableshiftcast        0.00042724609   3.556
sumbits                  0.00023365021   1.944


    In order to get this to compile on my SYSV machine, I had to add the
line:

#undef FD_SETSIZE

    because there's no resource.h on this machine.  Oh yes, it's gcc -O.

-- 
 _
Kevin D. Quitt         demott!kdq   kdq@demott.com
DeMott Electronics Co. 14707 Keswick St.   Van Nuys, CA 91405-1266
VOICE (818) 988-4975   FAX (818) 997-1190  MODEM (818) 997-4496 PEP last

                96.37% of all statistics are made up.

ts@cup.portal.com (Tim W Smith) (10/24/90)

When benchmarking something like this, a good thing to do
is to do a back-of-the-envelope estimate of what you
expect.  For example, the Mac II runs at 16 MHz.  Assume
a few wait states, and loop overhead, and stuff like that
will limit us to 1 "useful" instruction every 8 clocks,
so we get 2 "useful" MIPS.  The operation x &= x-1 will
take three operations.  At 2 MIPS, an operation is 1/2 uSec,
so we have 1.5 uSec per bit.  The loop runs an average of
16 times, so we expect that we can do it in 24 uSec.

The measured time was something like 20 uSec, so we
pass.

If you see numbers in the hundreds of microseconds, you
can be almost sure that something is wrong with the program.

					Tim Smith