peter@neccan.oz (Peter Miller) (10/05/90)
/* * A while back I posted a solution to the "count of bits set in a * long" question, suggesting that HACKMEM 169 was a very fast way to do it. * This program demonstrates the technique. * * The HACKMEM 169 method uses a modulo to sum the digits. * A number of people have asked me "how can a division be faster than a loop?" * This program demonstrates how. The modulo is a special case, * and can be unwound into a tight loop if your machine does not * have a fast divide opcode (although many modern machines have a 1 cycle * divide opcode, except for we-really-believe-our-press-hype risc * manufacturers). * * This program was run on a 386/20 under unix with the following results: * Func sec/iter scaled * nbits 0.00018310547 1.000 hackmem 169 * nbits1 0.0006980896 3.812 hackmem 169 no % operator * nbits2 0.0016822815 9.188 count lsb's loop * nbits3 0.0037384033 20.417 * nbits4 0.0034370422 18.771 * nbits5 0.0037765503 20.625 * nbits6 0.0035247803 19.250 */ #include <stdio.h> #include <sys/types.h> #include <sys/times.h> #define SIZEOF(a) (sizeof(a)/sizeof(a[0])) /* * This function is used to calibrate the timing mechanism. * This way we can subtract the loop and call overheads. */ int calibrate(n) register unsigned long n; { return 0; } /* * This function counts the number of bits in a long. * It is limited to 63 bit longs, but a minor mod can cope with 511 bits. * * It is so magic, an explanation is required: * Consider a 3 bit number as being * 4a+2b+c * if we shift it right 1 bit, we have * 2a+b * subtracting this from the original gives * 2a+b+c * if we shift the original 2 bits right we get * a * and so with another subtraction we have * a+b+c * which is the number of bits in the original number. * Suitable masking allows the sums of the octal digits in a * 32 bit bumber to appear in each octal digit. This isn't much help * unless we can get all of them summed together. * This can be done by modulo arithmetic (sum the digits in a number by molulo * the base of the number minus one) the old "cating out nines" trick they * taught in school before calculators were invented. * Now, using mod 7 wont help us, because our number will very likely have * more than 7 bits set. So add the octal digits together to get base64 * digits, and use modulo 63. * (Those of you with 64 bit machines need to add 3 octal digits together to * get base512 digits, and use mod 511.) * * This is HACKMEM 169, as used in X11 sources. */ int nbits(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); return ((tmp + (tmp >> 3)) & 030707070707) % 63; } /* * This is the same as the above, but does not use the % operator. * Most modern machines have clockless division, and so the modulo is as * expensive as, say, an addition. */ int nbits1(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); tmp = (tmp + (tmp >> 3)) & 030707070707; while (tmp > 63) tmp = (tmp & 63) + (tmp >> 6); return tmp; } /* * This function counts the bits in a long. * * It removes the lsb and counting the number of times round the loop. * The expression (n & -n) yields the lsb of a number, * but it only works on 2's compliment machines. */ int nbits2(n) register unsigned long n; { register int count; for (count = 0; n; n -= (n & -n)) count++; return count; } /* * This function counts the bits in a long. * * It works by shifting the number left and testing the top bit. * On many machines shift is expensive, so it uses a cheap addition instead. */ int nbits3(n) register unsigned long n; { register int count; for (count = 0; n; n += n) if (n & ~(~(unsigned long)0 >> 1)) count++; return count; } /* * This function counts the bits in a long. * * It works by shifting the number down and testing the bottom bit. */ int nbits4(n) register unsigned long n; { register int count; for (count = 0; n; n >>= 1) if (n & 1) count++; return count; } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the second most intuitively obvious method, * and is independent of the number of bits in the long. */ int nbits5(n) register unsigned long n; { register int count; register unsigned long mask; count = 0; for (mask = 1; mask; mask += mask) if (n & mask) count++; return count; } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the most intuitively obvious method, * but how do you a priori know how many bits in the long? * (except for ''sizeof(long) * CHAR_BITS'' expression) */ int nbits6(n) register unsigned long n; { register int count; register int bit; count = 0; for (bit = 0; bit < 32; ++bit) if (n & ((unsigned long)1 << bit)) count++; return count; } /* * build a long random number. * The standard rand() returns a 15bit number. */ unsigned long bigrand() { return ( ((unsigned long)(rand() & 0xFFF) << 24) | ((unsigned long)(rand() & 0xFFF) << 12) | (unsigned long)(rand() & 0xFFF) ); } /* * Run the test many times. * You will need a running time in the 10s of seconds * for an accurate result. * * To give a fair comparison, the random number generator * is seeded the same for each measurement. * * Return value is seconds per iteration. */ #define REPEAT (1L<<18) double measure(func) int (*func)(); { long j; struct tms start; struct tms finish; srand(1); times(&start); for (j = 0; j < REPEAT; ++j) func(bigrand()); times(&finish); return ((finish.tms_utime - start.tms_utime) / (double)REPEAT); } struct table { char *name; int (*func)(); double measurement; int trial; }; struct table table[] = { { "nbits", nbits }, { "nbits1", nbits1 }, { "nbits2", nbits2 }, { "nbits3", nbits3 }, { "nbits4", nbits4 }, { "nbits5", nbits3 }, { "nbits6", nbits4 }, }; main() { double calibration; double best; int j, k, m; /* * run a few tests to make sure they all agree */ srand(getpid()); for (j = 0; j < 10; ++j) { unsigned long n; int bad; n = bigrand(); for (k = 0; k < SIZEOF(table); ++k) table[k].trial = table[k].func(n); bad = 0; for (k = 0; k < SIZEOF(table); ++k) for (m = 0; m < SIZEOF(table); ++m) if (table[k].trial != table[m].trial) bad = 1; if (bad) { printf("wrong:\t%08lX", n); for (k = 0; k < SIZEOF(table); ++k) printf("\t%d", table[k].trial); printf("\n"); } } /* * calibrate the timing mechanism */ calibration = measure(calibrate); /* * time them all, keeping track of the best (smallest) */ for (j = 0; j < SIZEOF(table); ++j) { table[j].measurement = measure(table[j].func) - calibration; if (!j || table[j].measurement < best) best = table[j].measurement; } for (j = 0; j < SIZEOF(table); ++j) { printf ( "%s\t%10.8g\t%6.3f\n", table[j].name, table[j].measurement, table[j].measurement / best ); } exit(0); } Regards Peter Miller UUCP uunet!munnari!pdact.pd.necisa.oz!pmiller ARPA pmiller%pdact.pd.necisa.oz@australia /\/\* CSNET pmiller%pdact.pd.necisa.oz@au ACSnet pmiller@pdact.pd.necisa.oz Disclaimer: The views expressed here are personal and do not necessarily reflect the view of my employer or the views or my colleagues. D
chris@mimsy.umd.edu (Chris Torek) (10/07/90)
In article <826@neccan.oz> peter@neccan.oz (Peter Miller) sends out a program to time various bit-counting functions. I decided to `improve' the program a bit :-) and add a number of additional functions. I have run the resulting program (which I will append below) on a number of architectures. The `mod' version is never the fastest of the hackmem variants. The fastest function is always table lookup, except on the Sun-4. Here are his results, for an i386 box: > * Func sec/iter scaled > * nbits 0.00018310547 1.000 hackmem 169 > * nbits1 0.0006980896 3.812 hackmem 169 no % operator > * nbits2 0.0016822815 9.188 count lsb's loop > * nbits3 0.0037384033 20.417 > * nbits4 0.0034370422 18.771 > * nbits5 0.0037765503 20.625 > * nbits6 0.0035247803 19.250 (I find the last two results particularly interesting since his code as posted uses functions nbits3 and nbits4 to calculate the times printed for nbits5 and nbits6, never evaluating 5 and 6 at all.) Here is a short key to names: my name (Peter's name) technique used ---------------------- -------------- hackmemmod (nbits) HACKMEM 169, using % operator hackmemloop (nbits1) HACKMEM 169, using loop to implement % hackmemunrolled (-) HACKMEM 169, with 5-step % (loop unrolled) rmlsbsub (nbits2) remove lsb with `n -= (n & -n)' rmlsbmask (-) remove lsb with `n &= n - 1' testlsb (nbits4) test n&1, then n>>=1 testmsb (nbits3) test n&MSB, then n+=n (rather than n<<=1) testsignandshift (-) test n<0, then n<<=1 testeachbit (nbits5, untimed) test n&mask, then mask+=mask testeachbit1shl (nbits6, ") test n&(1<<bit) for bit in [0..32) tableshift nbits[n>>24] + nbits[(n>>16)&255] ... tableuchar set p=&n, nbits[p[0]] + nbits[p[1]] ... The results: -------------------------------------------------- DECstation 3100 (MIPS R2000, slow memory system) function time ratio hackmemmod 3.1029682e-06 3.179 hackmemloop 2.4697094e-06 2.531 hackmemunrolled 1.7693996e-06 1.813 rmlsbsub 4.8127670e-06 4.931 rmlsbmask 3.8405285e-06 3.935 testlsb 1.0221542e-05 10.473 testmsb 1.0899502e-05 11.168 testsignandshift 1.0780300e-05 11.046 testeachbit 1.0843626e-05 11.111 testeachbit1shl 1.6695683e-05 17.107 tableshift 1.0467396e-06 1.073 tableuchar 9.7596359e-07 1.000 -------------------------------------------------- Encore Multimax (NS32532, proprietary bus) function time ratio hackmemmod 6.3739853e-06 3.618 hackmemloop 3.8458633e-06 2.183 hackmemunrolled 6.4167595e-06 3.642 rmlsbsub 9.3918610e-06 5.330 rmlsbmask 9.4905777e-06 5.386 testlsb 2.4036179e-05 13.642 testmsb 2.3262550e-05 13.203 testsignandshift 2.1452625e-05 12.176 testeachbit 2.5112068e-05 14.253 testeachbit1shl 2.8207516e-05 16.010 tableshift 2.2915077e-06 1.301 tableuchar 1.7619209e-06 1.000 -------------------------------------------------- Sun-3/something (68020, ?vmebus memory?) function time ratio hackmemmod 0.00077819824 1.821 hackmemloop 0.00059890747 1.402 hackmemunrolled 0.00050354004 1.179 rmlsbsub 0.00069808960 1.634 rmlsbmask 0.00055313110 1.295 testlsb 0.00107955930 2.527 testmsb 0.00251007080 5.875 testsignandshift 0.00242614750 5.679 testeachbit 0.00254821780 5.964 testeachbit1shl 0.00362014770 8.473 tableshift 0.00083160400 1.946 tableuchar 0.00042724609 1.000 -------------------------------------------------- SparcStation 1+ (SPARC, private memory bus) These times look a bit odd, and suggest that the compiler could use some work. function time ratio hackmemmod 1.4972687e-06 12.077 hackmemloop 7.3432922e-07 5.923 hackmemunrolled 1.1825562e-06 9.538 rmlsbsub 1.3351440e-07 1.077 rmlsbmask 1.4305115e-07 1.154 testlsb 1.4305115e-07 1.154 testmsb 1.3351440e-07 1.077 testsignandshift 1.2397766e-07 1.000 testeachbit 6.6280365e-06 53.462 testeachbit1shl 9.2029572e-06 74.231 tableshift 7.2479248e-07 5.846 tableuchar 1.1539459e-06 9.308 -------------------------------------------------- VAX 8600 with gcc (ECL gate array, private memory bus) function time ratio hackmemmod 1.1291504e-05 4.290 hackmemloop 1.0757446e-05 4.087 hackmemunrolled 8.8500977e-06 3.362 rmlsbsub 1.7890930e-05 6.797 rmlsbmask 1.4801025e-05 5.623 testlsb 5.2032471e-05 19.768 testmsb 3.5400391e-05 13.449 testsignandshift 2.7008057e-05 10.261 testeachbit 3.0670166e-05 11.652 testeachbit1shl 4.7760010e-05 18.145 tableshift 5.3405762e-06 2.029 tableuchar 2.6321411e-06 1.000 -------------------------------------------------- VAX 8600 with Berkeley cc function time ratio hackmemmod 8.1634521e-06 3.292 hackmemloop 6.9808960e-06 2.815 hackmemunrolled 1.3885498e-05 5.600 rmlsbsub 6.2942505e-06 2.538 rmlsbmask 5.9890747e-06 2.415 testlsb 1.5754700e-05 6.354 testmsb 3.9138794e-05 15.785 testsignandshift 3.1127930e-05 12.554 testeachbit 4.1275024e-05 16.646 testeachbit1shl 6.0615540e-05 24.446 tableshift 5.9890747e-06 2.415 tableuchar 2.4795532e-06 1.000 -------------------------------------------------- There are some noteworthy points above, particularly the example with the 8600, where the choice of compiler makes a great deal of difference. The `testlsb' loop runs much faster under gcc, which eliminated one `tstl' instruction, yet some of the other loops run more slowly. So that others can run their own tests, here is the program. I added several functions, renamed them all, changed the timing mechanism on BSD machines, and reformatted things a bit. I changed the random number selection mechanism to be relatively machine independent (assuming they all have the same `rand' function) and to use values that are `more random'. I also changed the output format (among other things, it uses `%#g'; the Sun SparcStation printf does not handle this correctly, and I added trailing zeroes manually above). Oh yes, I also had to make it run longer on the MIPS and Sparc machines to get reliable times. (You can tell which functions I added, as they are generally uncommented. :-) ) -------------------------------------------------- #include <stdio.h> #include <sys/types.h> #ifdef FD_SETSIZE # define USE_GETRUSAGE # include <sys/time.h> # include <sys/resource.h> #else # include <sys/times.h> #endif #define SIZEOF(a) (sizeof(a)/sizeof(a[0])) /* * This function is used to calibrate the timing mechanism. * This way we can subtract the loop and call overheads. */ int calibrate(n) register unsigned long n; { return 0; } /* * This function counts the number of bits in a long. * It is limited to 63 bit longs, but a minor mod can cope with 511 bits. * * It is so magic, an explanation is required: * Consider a 3 bit number as being * 4a+2b+c * if we shift it right 1 bit, we have * 2a+b * subtracting this from the original gives * 2a+b+c * if we shift the original 2 bits right we get * a * and so with another subtraction we have * a+b+c * which is the number of bits in the original number. * Suitable masking allows the sums of the octal digits in a * 32 bit bumber to appear in each octal digit. This isn't much help * unless we can get all of them summed together. * This can be done by modulo arithmetic (sum the digits in a number by molulo * the base of the number minus one) the old "cating out nines" trick they * taught in school before calculators were invented. * Now, using mod 7 wont help us, because our number will very likely have * more than 7 bits set. So add the octal digits together to get base64 * digits, and use modulo 63. * (Those of you with 64 bit machines need to add 3 octal digits together to * get base512 digits, and use mod 511.) * * This is HACKMEM 169, as used in X11 sources. */ int t0_hackmemmod(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); return ((tmp + (tmp >> 3)) & 030707070707) % 63; } /* * This is the same as the above, but does not use the % operator. * Most modern machines have clockless division, and so the modulo is as * expensive as, say, an addition. */ int t1_hackmemloop(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); tmp = (tmp + (tmp >> 3)) & 030707070707; while (tmp > 63) tmp = (tmp & 63) + (tmp >> 6); return tmp; } /* * Above, without using while loop. It takes at most 5 iterations of the * loop, so we just do all 5 in-line. The final result is never 63 * (this is assumed above as well). */ int t2_hackmemunrolled(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); tmp = (tmp + (tmp >> 3)) & 030707070707; tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); return (tmp); } /* * This function counts the bits in a long. * * It removes the lsb and counting the number of times round the loop. * The expression (n & -n) yields the lsb of a number, * but it only works on 2's compliment machines. */ int t3_rmlsbsub(n) register unsigned long n; { register int count; for (count = 0; n; n -= (n & -n)) count++; return count; } int t4_rmlsbmask(n) register unsigned long n; { register int count; for (count = 0; n; count++) n &= n - 1; /* take away lsb */ return (count); } /* * This function counts the bits in a long. * * It works by shifting the number down and testing the bottom bit. */ int t5_testlsb(n) register unsigned long n; { register int count; for (count = 0; n; n >>= 1) if (n & 1) count++; return count; } /* * This function counts the bits in a long. * * It works by shifting the number left and testing the top bit. * On many machines shift is expensive, so it uses a cheap addition instead. */ int t6_testmsb(n) register unsigned long n; { register int count; for (count = 0; n; n += n) if (n & ~(~(unsigned long)0 >> 1)) count++; return count; } int t7_testsignandshift(n) register unsigned long n; { register int count; for (count = 0; n; n <<= 1) if ((long)n < 0) count++; return (count); } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the second most intuitively obvious method, * and is independent of the number of bits in the long. */ int t8_testeachbit(n) register unsigned long n; { register int count; register unsigned long mask; count = 0; for (mask = 1; mask; mask += mask) if (n & mask) count++; return count; } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the most intuitively obvious method, * but how do you a priori know how many bits in the long? * (except for ''sizeof(long) * CHAR_BITS'' expression) */ int t9_testeachbit1shl(n) register unsigned long n; { register int count; register int bit; count = 0; for (bit = 0; bit < 32; ++bit) if (n & ((unsigned long)1 << bit)) count++; return count; } static char nbits[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, }; int tA_tableshift(n) register unsigned long n; { return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] + nbits[(n >> 16) & 0xff] + nbits[n >> 24]); } int tB_tableuchar(n) unsigned long n; { register unsigned char *p = (unsigned char *)&n; return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]); } /* * build a long random number. * The standard rand() returns at least a 15 bit number. * We use the top 9 of 15 (since the lower N bits of the usual rand() * repeat with a period of 2^N). */ unsigned long bigrand() { #define randbits() ((unsigned long)((rand() >> 6) & 0777)) register int r; r = randbits() << 27; r |= randbits() << 18; r |= randbits() << 9; r |= randbits(); } /* * Run the test many times. * You will need a running time in the 10s of seconds * for an accurate result. * * To give a fair comparison, the random number generator * is seeded the same for each measurement. * * Return value is seconds per iteration. */ #if defined(mips) || defined(sparc) #define REPEAT (1L<<20) #else #define REPEAT (1L<<18) #endif double measure(func) register int (*func)(); { #ifdef USE_GETRUSAGE struct rusage ru0, ru1; register long j; srand(1); (void) getrusage(RUSAGE_SELF, &ru0); for (j = 0; j < REPEAT; ++j) func(bigrand()); (void) getrusage(RUSAGE_SELF, &ru1); ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec; if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) { ru1.ru_utime.tv_usec += 1000000; ru1.ru_utime.tv_sec--; } return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) / (double)REPEAT); #else register long j; struct tms start; struct tms finish; srand(1); times(&start); for (j = 0; j < REPEAT; ++j) func(bigrand()); times(&finish); return ((finish.tms_utime - start.tms_utime) / (double)REPEAT); #endif } struct table { char *name; int (*func)(); double measurement; int trial; }; struct table table[] = { { "hackmemmod", t0_hackmemmod }, { "hackmemloop", t1_hackmemloop }, { "hackmemunrolled", t2_hackmemunrolled }, { "rmlsbsub", t3_rmlsbsub }, { "rmlsbmask", t4_rmlsbmask }, { "testlsb", t5_testlsb }, { "testmsb", t6_testmsb }, { "testsignandshift", t7_testsignandshift }, { "testeachbit", t8_testeachbit }, { "testeachbit1shl", t9_testeachbit1shl }, { "tableshift", tA_tableshift }, { "tableuchar", tB_tableuchar }, }; main(argc, argv) int argc; char **argv; { double calibration, v, best; int j, k, m, verbose; verbose = argc > 1; /* * run a few tests to make sure they all agree */ srand(getpid()); for (j = 0; j < 10; ++j) { unsigned long n; int bad; n = bigrand(); for (k = 0; k < SIZEOF(table); ++k) table[k].trial = table[k].func(n); bad = 0; for (k = 0; k < SIZEOF(table); ++k) for (m = 0; m < SIZEOF(table); ++m) if (table[k].trial != table[m].trial) bad = 1; if (bad) { printf("wrong: %08lX", n); for (k = 0; k < SIZEOF(table); ++k) printf(" %3d", table[k].trial); printf("\n"); } } /* * calibrate the timing mechanism */ calibration = measure(calibrate); if (verbose) printf("calibration: %g\n", calibration); /* * time them all, keeping track of the best (smallest) */ for (j = 0; j < SIZEOF(table); ++j) { v = measure(table[j].func) - calibration; if (verbose) printf("%s: %g\n", table[j].name, v); table[j].measurement = v; if (!j || v < best) best = v; } (void) printf("%-24s %-14sratio\n", "function", "time"); for (j = 0; j < SIZEOF(table); ++j) { (void) printf("%-24s %#10.8g %6.3f\n", table[j].name, table[j].measurement, table[j].measurement / best); } exit(0); } -- In-Real-Life: Chris Torek, Univ of MD Comp Sci Dept (+1 301 405 2750) Domain: chris@cs.umd.edu Path: uunet!mimsy!chris
gene@zeno.mn.org (Gene H. Olson) (10/12/90)
peter@neccan.oz (Peter Miller) writes: >/* > * A while back I posted a solution to the "count of bits set in a > * long" question, suggesting that HACKMEM 169 was a very fast way to do it. > * This program demonstrates the technique. > * > * The HACKMEM 169 method uses a modulo to sum the digits. > * A number of people have asked me "how can a division be faster than a loop?" > * This program demonstrates how. The modulo is a special case, > * and can be unwound into a tight loop if your machine does not > * have a fast divide opcode (although many modern machines have a 1 cycle > * divide opcode, except for we-really-believe-our-press-hype risc > * manufacturers). > * > * This program was run on a 386/20 under unix with the following results: > * Func sec/iter scaled > * nbits 0.00018310547 1.000 hackmem 169 > * nbits1 0.0006980896 3.812 hackmem 169 no % operator > * nbits2 0.0016822815 9.188 count lsb's loop > * nbits3 0.0037384033 20.417 > * nbits4 0.0034370422 18.771 > * nbits5 0.0037765503 20.625 > * nbits6 0.0035247803 19.250 > */ > ..... A program follows this. To this program I have added a bug fix (the function table was wrong) and another function (nbit7) which is MY FAVORITE WAY of counting bits. The patches are at the end of this posting.... Anyway, I got much better results on both the 386 (which has a multiply) and on a SPARC (which doesn't) than any of the functions in the original program. The results were: 386 SPARC ------------------------------ ------------------------------ nbits 0.00020980835 1.774 nbits 0.00096893311 15.875 nbits1 0.00038909912 3.290 nbits1 0.00012969971 2.125 nbits2 0.00075149536 6.355 nbits2 0.00031280518 5.125 nbits3 0.0015411377 13.032 nbits3 0.00067138672 11.000 nbits4 0.0016479492 13.935 nbits4 0.00065231323 10.688 nbits5 0.0015525818 13.129 nbits5 0.00067520142 11.062 nbits6 0.0021286011 18.000 nbits6 0.00086212158 14.125 nbits7 0.00011825562 1.000 nbits7 6.1035156e-05 1.000 The nbit7 technique is a simple routine which shifts and sums bits in place until all bits have summed in the lower 8 bits of the word. The implementation here requires 32 bit longs, but it is easily extensible to 64 bits with a mask change and the additional shift shown. Enjoy, ------------- Remainder of Posting is a Cdiff Patchfile ---------- *** orig.c Fri Oct 12 08:56:22 1990 --- gene.c Fri Oct 12 09:23:51 1990 *************** *** 215,220 **** --- 215,236 ---- return count; } + int + nbits7(n) + register unsigned long n ; + { + n = ((n >> 1) & 0x55555555) + (n & 0x55555555) ; + n = ((n >> 2) & 0x33333333) + (n & 0x33333333) ; + n = ((n >> 4) + n) & 0x0F0F0F0F ; + n = ((n >> 8) + n) ; + #if LONG64 + n = (n >> 16) + n ; + return ((n >> 32) + n) & 0xFF ; + #else + return ((n >> 16) + n) & 0xFF ; + #endif + } + /* * build a long random number. * The standard rand() returns a 15bit number. *************** *** 275,282 **** { "nbits2", nbits2 }, { "nbits3", nbits3 }, { "nbits4", nbits4 }, ! { "nbits5", nbits3 }, ! { "nbits6", nbits4 }, }; main() --- 291,299 ---- { "nbits2", nbits2 }, { "nbits3", nbits3 }, { "nbits4", nbits4 }, ! { "nbits5", nbits5 }, ! { "nbits6", nbits6 }, ! { "nbits7", nbits7 }, }; main() _________________________________________________________________________ __ / ) Gene H. Olson uunet!digibd!zeno!gene / __ _ __ _ Smartware Consulting (__/ _(/_//_(/_ Minneapolis, MN (612) 824-9108
ts@cup.portal.com (Tim W Smith) (10/13/90)
These times were given in a recent post for some 68020 based Sun: function time ratio hackmemmod 0.00077819824 1.821 hackmemloop 0.00059890747 1.402 hackmemunrolled 0.00050354004 1.179 rmlsbsub 0.00069808960 1.634 rmlsbmask 0.00055313110 1.295 testlsb 0.00107955930 2.527 testmsb 0.00251007080 5.875 testsignandshift 0.00242614750 5.679 testeachbit 0.00254821780 5.964 testeachbit1shl 0.00362014770 8.473 tableshift 0.00083160400 1.946 tableuchar 0.00042724609 1.000 Something is seriously wrong with either these numbers or that computer. I've tried a couple of these on my Mac II, using Think C. Note that 1) Apple is not noted for squeezing the last drop of performance out of the 68020, and 2) Think C has not stressed optimal code generation. I get 36 usec for rmlsub on longs with all 32 bits set, 20 usec if 16 bits are set, and 5 usec of no bits are set. Note that the Sun is getting, in the average case for rmlsub, 20 times worse performance than I am getting for the worst case. Tim Smith
chris@mimsy.umd.edu (Chris Torek) (10/13/90)
In article <26881@mimsy.umd.edu> I posted a modified version of Peter Miller's bit-count timing program. Unfortunately, my version had a bug that effectively makes the results worthless (depending on compiler vagaries; it `just happens' to compile to working code on some systems). Remarkably few people noted the bug (or rather, sent me mail about it). Two people sent me additional functions, however. Arthur Olson added four (all variants on table lookup); Mike Weaver added one. Gene Olson has also posted a different modified version of Peter Miller's original program; this contains a better version of the latter. So, here are new results for the new set of (now 17) functions on the same set of machines, along with the new program. Noteworthy facts: - hackmem invariably loses to some other technique, always including Gene Olson's, though sometimes not by much; - the fastest is usually (but not always) one of the table lookups; - tiny differences in ratios (less than .1) are usually insignificant. name technique used ---- -------------- hackmemmod HACKMEM 169, using % operator hackmemloop HACKMEM 169, using loop to implement % hackmemunrolled HACKMEM 169, with 5-step % (loop unrolled) rmlsbsub remove lsb with `n -= (n & -n)' rmlsbmask remove lsb with `n &= n - 1' testlsb test n&1, then n>>=1 testmsb test n&MSB, then n+=n (rather than n<<=1) testsignandshift test n<0, then n<<=1 testeachbit test n&mask, then mask+=mask testeachbit1shl test n&(1<<bit) for bit in [0..32) tableshift nbits[n>>24] + nbits[(n>>16)&255] ... tableuchar set p=&n, nbits[p[0]] + nbits[p[1]] ... tableshiftcast nbits[n>>24] + nbits[(u_char)(n>>16)] ... itableshift same as tableshift, but table datatype is int itableuchar ditto itableshiftcast ditto (note, nbits table is `char') sumbits Gene Olson's function (see code for comments) ------------------------------------------------------------ Results: :::::::::::::: time.ds3100 :::::::::::::: function time ratio hackmemmod 3.0992432e-06 3.250 hackmemloop 2.5330353e-06 2.656 hackmemunrolled 1.7619495e-06 1.848 rmlsbsub 4.9207935e-06 5.160 rmlsbmask 3.9522800e-06 4.145 testlsb 1.0545622e-05 11.059 testmsb 1.0608948e-05 11.125 testsignandshift 1.0497196e-05 11.008 testeachbit 1.0839901e-05 11.367 testeachbit1shl 1.6718033e-05 17.531 tableshift 1.0243893e-06 1.074 tableuchar 9.5361328e-07 1.000 tableshiftcast 1.1063404e-06 1.160 itableshift 1.2814178e-06 1.344 itableuchar 1.2031918e-06 1.262 itableshiftcast 1.3223934e-06 1.387 sumbits 1.2106419e-06 1.270 :::::::::::::: time.encore :::::::::::::: function time ratio hackmemmod 8.0387573e-06 4.522 hackmemloop 7.2727394e-06 4.091 hackmemunrolled 4.3494453e-06 2.447 rmlsbsub 1.0656597e-05 5.994 rmlsbmask 1.1298252e-05 6.355 testlsb 3.1122246e-05 17.506 testmsb 2.2745319e-05 12.794 testsignandshift 1.9783443e-05 11.128 testeachbit 2.3917282e-05 13.454 testeachbit1shl 2.9880619e-05 16.808 tableshift 3.9419632e-06 2.217 tableuchar 2.6934662e-06 1.515 tableshiftcast 2.3953590e-06 1.347 itableshift 3.0450325e-06 1.713 itableuchar 1.7777634e-06 1.000 itableshiftcast 2.1755104e-06 1.224 sumbits 1.9636650e-06 1.105 :::::::::::::: time.sun3 :::::::::::::: function time ratio hackmemmod 1.0604858e-05 2.106 hackmemloop 1.2664795e-05 2.515 hackmemunrolled 1.0833740e-05 2.152 rmlsbsub 2.2430420e-05 4.455 rmlsbmask 1.7318726e-05 3.439 testlsb 4.3182373e-05 8.576 testmsb 3.8909912e-05 7.727 testsignandshift 4.0664673e-05 8.076 testeachbit 4.4479370e-05 8.833 testeachbit1shl 5.9432983e-05 11.803 tableshift 9.9945068e-06 1.985 tableuchar 1.0910034e-05 2.167 tableshiftcast 1.4877319e-05 2.955 itableshift 7.9345703e-06 1.576 itableuchar 1.1672974e-05 2.318 itableshiftcast 1.1520386e-05 2.288 sumbits 5.0354004e-06 1.000 :::::::::::::: time.sun4 :::::::::::::: function time ratio hackmemmod 1.3427734e-05 19.288 hackmemloop 1.6689301e-06 2.397 hackmemunrolled 1.0585785e-06 1.521 rmlsbsub 3.9768219e-06 5.712 rmlsbmask 3.4236908e-06 4.918 testlsb 8.4209442e-06 12.096 testmsb 8.4686279e-06 12.164 testsignandshift 8.4018707e-06 12.068 testeachbit 8.5353851e-06 12.260 testeachbit1shl 1.1539459e-05 16.575 tableshift 6.9618225e-07 1.000 tableuchar 1.1348724e-06 1.630 tableshiftcast 7.8201294e-07 1.123 itableshift 9.7274780e-07 1.397 itableuchar 1.2016296e-06 1.726 itableshiftcast 8.8691711e-07 1.274 sumbits 8.3923340e-07 1.205 :::::::::::::: time.vax.gcc :::::::::::::: function time ratio hackmemmod 1.5449524e-05 7.364 hackmemloop 1.3847351e-05 6.600 hackmemunrolled 8.6975098e-06 4.145 rmlsbsub 1.8386841e-05 8.764 rmlsbmask 1.4266968e-05 6.800 testlsb 5.3215027e-05 25.364 testmsb 3.4332275e-05 16.364 testsignandshift 2.6550293e-05 12.655 testeachbit 2.9983521e-05 14.291 testeachbit1shl 4.7454834e-05 22.618 tableshift 5.3405762e-06 2.545 tableuchar 2.4414062e-06 1.164 tableshiftcast 6.0272217e-06 2.873 itableshift 4.3106079e-06 2.055 itableuchar 2.0980835e-06 1.000 itableshiftcast 5.6076050e-06 2.673 sumbits 5.7983398e-06 2.764 :::::::::::::: time.vax.ucbcc :::::::::::::: function time ratio hackmemmod 7.6293945e-06 3.774 hackmemloop 1.6746521e-05 8.283 hackmemunrolled 1.3504028e-05 6.679 rmlsbsub 2.0332336e-05 10.057 rmlsbmask 1.8882751e-05 9.340 testlsb 5.6457520e-05 27.925 testmsb 3.7384033e-05 18.491 testsignandshift 2.9602051e-05 14.642 testeachbit 3.8681030e-05 19.132 testeachbit1shl 5.8860779e-05 29.113 tableshift 5.6838989e-06 2.811 tableuchar 2.2888184e-06 1.132 tableshiftcast 5.1879883e-06 2.566 itableshift 4.3487549e-06 2.151 itableuchar 2.0217896e-06 1.000 itableshiftcast 4.5394897e-06 2.245 sumbits 6.1798096e-06 3.057 ------------------------------------------------------------ The program (and I used lint this time...): #ifndef lint static char rcsid[] = "$Id: bct.c,v 1.5 90/10/13 08:44:12 chris Exp $"; #endif /* * bct - bitcount timing * * Runs a bunch of different functions-to-count-bits-in-a-longword * (many assume 32 bits per long) with a timer around each loop, and * tries to calcuate the time used. */ #include <stdio.h> #include <sys/types.h> #ifdef FD_SETSIZE # define USE_GETRUSAGE # include <sys/time.h> # include <sys/resource.h> #else # include <sys/times.h> #endif #define SIZEOF(a) (sizeof(a)/sizeof(a[0])) /* * This function is used to calibrate the timing mechanism. * This way we can subtract the loop and call overheads. */ int calibrate(n) register unsigned long n; { return 0; } /* * This function counts the number of bits in a long. * It is limited to 63 bit longs, but a minor mod can cope with 511 bits. * * It is so magic, an explanation is required: * Consider a 3 bit number as being * 4a+2b+c * if we shift it right 1 bit, we have * 2a+b * subtracting this from the original gives * 2a+b+c * if we shift the original 2 bits right we get * a * and so with another subtraction we have * a+b+c * which is the number of bits in the original number. * Suitable masking allows the sums of the octal digits in a * 32 bit number to appear in each octal digit. This isn't much help * unless we can get all of them summed together. * This can be done by modulo arithmetic (sum the digits in a number by molulo * the base of the number minus one) the old "casting out nines" trick they * taught in school before calculators were invented. * Now, using mod 7 wont help us, because our number will very likely have * more than 7 bits set. So add the octal digits together to get base64 * digits, and use modulo 63. * (Those of you with 64 bit machines need to add 3 octal digits together to * get base512 digits, and use mod 511.) * * This is HACKMEM 169, as used in X11 sources. */ int t0_hackmemmod(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); return ((tmp + (tmp >> 3)) & 030707070707) % 63; } /* * This is the same as the above, but does not use the % operator. * Most modern machines have clockless division, and so the modulo is as * expensive as, say, an addition. */ int t1_hackmemloop(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); tmp = (tmp + (tmp >> 3)) & 030707070707; while (tmp > 63) tmp = (tmp & 63) + (tmp >> 6); return tmp; } /* * Above, without using while loop. It takes at most 5 iterations of the * loop, so we just do all 5 in-line. The final result is never 63 * (this is assumed above as well). */ int t2_hackmemunrolled(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); tmp = (tmp + (tmp >> 3)) & 030707070707; tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); return (tmp); } /* * This function counts the bits in a long. * * It removes the lsb and counting the number of times round the loop. * The expression (n & -n) yields the lsb of a number, * but it only works on 2's compliment machines. */ int t3_rmlsbsub(n) register unsigned long n; { register int count; for (count = 0; n; n -= (n & -n)) count++; return count; } int t4_rmlsbmask(n) register unsigned long n; { register int count; for (count = 0; n; count++) n &= n - 1; /* take away lsb */ return (count); } /* * This function counts the bits in a long. * * It works by shifting the number down and testing the bottom bit. */ int t5_testlsb(n) register unsigned long n; { register int count; for (count = 0; n; n >>= 1) if (n & 1) count++; return count; } /* * This function counts the bits in a long. * * It works by shifting the number left and testing the top bit. * On many machines shift is expensive, so it uses a cheap addition instead. */ int t6_testmsb(n) register unsigned long n; { register int count; for (count = 0; n; n += n) if (n & ~(~(unsigned long)0 >> 1)) count++; return count; } int t7_testsignandshift(n) register unsigned long n; { register int count; for (count = 0; n; n <<= 1) if ((long)n < 0) count++; return (count); } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the second most intuitively obvious method, * and is independent of the number of bits in the long. */ int t8_testeachbit(n) register unsigned long n; { register int count; register unsigned long mask; count = 0; for (mask = 1; mask; mask += mask) if (n & mask) count++; return count; } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the most intuitively obvious method, * but how do you a priori know how many bits in the long? * (except for ''sizeof(long) * CHAR_BITS'' expression) */ int t9_testeachbit1shl(n) register unsigned long n; { register int count; register int bit; count = 0; for (bit = 0; bit < 32; ++bit) if (n & ((unsigned long)1 << bit)) count++; return count; } static char nbits[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, }; static int inbits[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, }; int tA_tableshift(n) register unsigned long n; { return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] + nbits[(n >> 16) & 0xff] + nbits[n >> 24]); } int tB_tableuchar(n) unsigned long n; { register unsigned char *p = (unsigned char *)&n; return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]); } int tC_tableshiftcast(n) register unsigned long n; { return nbits[(unsigned char) n] + nbits[(unsigned char) (n >> 8)] + nbits[(unsigned char) (n >> 16)] + nbits[(unsigned char) (n >> 24)]; } int tD_itableshift(n) register unsigned long n; { return (inbits[n & 0xff] + inbits[(n >> 8) & 0xff] + inbits[(n >> 16) & 0xff] + inbits[n >> 24]); } int tE_itableuchar(n) unsigned long n; { register unsigned char *p = (unsigned char *)&n; return (inbits[p[0]] + inbits[p[1]] + inbits[p[2]] + inbits[p[3]]); } int tF_itableshiftcast(n) register unsigned long n; { return inbits[(unsigned char) n] + inbits[(unsigned char) (n >> 8)] + inbits[(unsigned char) (n >> 16)] + inbits[(unsigned char) (n >> 24)]; } /* * Explanation: * First we add 32 1-bit fields to get 16 2-bit fields. * Each 2-bit field is one of 00, 01, or 10 (binary). * We then add all the two-bit fields to get 8 4-bit fields. * These are all one of 0000, 0001, 0010, 0011, or 0100. * * Now we can do something different, becuase for the first * time the value in each k-bit field (k now being 4) is small * enough that adding two k-bit fields results in a value that * still fits in the k-bit field. The result is four 4-bit * fields containing one of {0000,0001,...,0111,1000} and four * more 4-bit fields containing junk (sums that are uninteresting). * Pictorially: * n = 0aaa0bbb0ccc0ddd0eee0fff0ggg0hhh * n>>4 = 00000aaa0bbb0ccc0ddd0eee0fff0ggg * sum = 0aaaWWWWiiiiXXXXjjjjYYYYkkkkZZZZ * where W, X, Y, and Z are the interesting sums (each at most 1000, * or 8 decimal). Masking with 0x0f0f0f0f extracts these. * * Now we can change tactics yet again, because now we have: * n = 0000WWWW0000XXXX0000YYYY0000ZZZZ * n>>8 = 000000000000WWWW0000XXXX0000YYYY * so sum = 0000WWWW000ppppp000qqqqq000rrrrr * where p and r are the interesting sums (and each is at most * 10000, or 16 decimal). The sum `q' is junk, like i, j, and * k above; but it is not necessarry to discard it this time. * One more fold, this time by sixteen bits, gives * n = 0000WWWW000ppppp000qqqqq000rrrrr * n>>16 = 00000000000000000000WWWW000ppppp * so sum = 0000WWWW000ppppp000sssss00tttttt * where s is at most 11000 and t is it most 100000 (32 decimal). * * Now we have t = r+p = (Z+Y)+(X+W) = ((h+g)+(f+e))+((d+c)+(b+a)), * or in other words, t is the number of bits set in the original * 32-bit longword. So all we have to do is return the low byte * (or low 6 bits, but `low byte' is typically just as easy if not * easier). * * This technique is also applicable to 64 and 128 bit words, but * 256 bit or larger word sizes require at least one more masking * step. */ int tG_sumbits(n) register unsigned long n; { n = (n & 0x55555555) + ((n >> 1) & 0x55555555); n = (n & 0x33333333) + ((n >> 2) & 0x33333333); n = (n + (n >> 4)) & 0x0f0f0f0f; n += n >> 8; n += n >> 16; return (n & 0xff); } /* * build a long random number. * The standard rand() returns at least a 15 bit number. * We use the top 9 of 15 (since the lower N bits of the usual rand() * repeat with a period of 2^N). */ unsigned long bigrand() { #define randbits() ((unsigned long)((rand() >> 6) & 0777)) register int r; r = randbits() << 27; r |= randbits() << 18; r |= randbits() << 9; r |= randbits(); return (r); } /* * Run the test many times. * You will need a running time in the 10s of seconds * for an accurate result. * * To give a fair comparison, the random number generator * is seeded the same for each measurement. * * Return value is seconds per iteration. */ #ifndef REPEAT #if defined(mips) || defined(sparc) #define REPEAT (1L<<20) #else #define REPEAT (1L<<18) #endif #endif double measure(func) register int (*func)(); { #ifdef USE_GETRUSAGE struct rusage ru0, ru1; register long j; srand(1); (void) getrusage(RUSAGE_SELF, &ru0); for (j = 0; j < REPEAT; ++j) func(bigrand()); (void) getrusage(RUSAGE_SELF, &ru1); ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec; if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) { ru1.ru_utime.tv_usec += 1000000; ru1.ru_utime.tv_sec--; } return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) / (double)REPEAT); #else register long j; struct tms start; struct tms finish; srand(1); times(&start); for (j = 0; j < REPEAT; ++j) func(bigrand()); times(&finish); return ((finish.tms_utime - start.tms_utime) / (double)REPEAT); #endif } struct table { char *name; int (*func)(); double measurement; int trial; }; struct table table[] = { { "hackmemmod", t0_hackmemmod }, { "hackmemloop", t1_hackmemloop }, { "hackmemunrolled", t2_hackmemunrolled }, { "rmlsbsub", t3_rmlsbsub }, { "rmlsbmask", t4_rmlsbmask }, { "testlsb", t5_testlsb }, { "testmsb", t6_testmsb }, { "testsignandshift", t7_testsignandshift }, { "testeachbit", t8_testeachbit }, { "testeachbit1shl", t9_testeachbit1shl }, { "tableshift", tA_tableshift }, { "tableuchar", tB_tableuchar }, { "tableshiftcast", tC_tableshiftcast }, { "itableshift", tD_itableshift }, { "itableuchar", tE_itableuchar }, { "itableshiftcast", tF_itableshiftcast }, { "sumbits", tG_sumbits }, }; main(argc, argv) int argc; char **argv; { double calibration, v, best; int j, k, m, verbose; verbose = argc > 1; /* * run a few tests to make sure they all agree */ srand(getpid()); for (j = 0; j < 10; ++j) { unsigned long n; int bad; n = bigrand(); for (k = 0; k < SIZEOF(table); ++k) table[k].trial = table[k].func(n); bad = 0; for (k = 0; k < SIZEOF(table); ++k) for (m = 0; m < SIZEOF(table); ++m) if (table[k].trial != table[m].trial) bad = 1; if (bad) { printf("wrong: %08lX", n); for (k = 0; k < SIZEOF(table); ++k) printf(" %3d", table[k].trial); printf("\n"); } } /* * calibrate the timing mechanism */ calibration = measure(calibrate); if (verbose) printf("calibration: %g\n", calibration); /* * time them all, keeping track of the best (smallest) */ for (j = 0; j < SIZEOF(table); ++j) { v = measure(table[j].func) - calibration; if (verbose) printf("%s: %g\n", table[j].name, v); table[j].measurement = v; if (!j || v < best) best = v; } (void) printf("%-24s %-14sratio\n", "function", "time"); for (j = 0; j < SIZEOF(table); ++j) { (void) printf("%-24s %#10.8g %6.3f\n", table[j].name, table[j].measurement, table[j].measurement / best); } exit(0); } -- In-Real-Life: Chris Torek, Univ of MD Comp Sci Dept (+1 301 405 2750) Domain: chris@cs.umd.edu Path: uunet!mimsy!chris
gene@zeno.mn.org (Gene H. Olson) (10/15/90)
chris@mimsy.umd.edu (Chris Torek) writes: >In article <826@neccan.oz> peter@neccan.oz (Peter Miller) sends out a >program to time various bit-counting functions. I decided to `improve' >the program a bit :-) and add a number of additional functions. I have >run the resulting program (which I will append below) on a number of >architectures. The `mod' version is never the fastest of the hackmem >variants. The fastest function is always table lookup, except on the >Sun-4. I also posted an "improved" version of this program that got horribly erratic and odd results. Looking at all this closely, it seems the erratic results on this "test" are due two 3 problems: 1) The original program had a bug in the procedure table which tested the wrong procedures. Both Chris and I fixed this. 2) The original program neglected to return a value in procedure bigrand(). On the Sun4, this means all the "random" data was zero. This kinda skewed the results. I just fixed this by putting in the return(). 3) The bigrand() procedure was called every invokation of the test procedure. Since for the fast routines it takes *MUCH* longer to generate these numbers than to count the bits, the noise level is extreme. I just fixed this by creating an array once, and then referencing it. The test runs about 6 times faster now, and it is much more consistent. So the bottom line is that most of the results posted so far are largely nonsense. So, in the spirit of throwing good net bandwidth after bad, I am posting the blasted thing again. I hope this discussion is useful to *someone*. Earlier I had added a test now called "suminplace" to the list which I believed was faster than any other way of doing it. (Except maybe a 16-bit lookup table). It turns out, that an 8-bit lookup table beats it. Blast the torpedos, I still like my method best. Maybe it will run faster on another machine or another compiler. It certainly is smaller. >>>>> SparcStation results: function time ratio hackmemmod 0.00093555450 16.082 hackmemloop 0.00013732910 2.361 hackmemunrolled 9.0599060e-05 1.557 rmlsbsub 0.00032043457 5.508 rmlsbmask 0.00026893616 4.623 testlsb 0.00064945221 11.164 testmsb 0.00065517426 11.262 testsignandshift 0.00065326691 11.230 testeachbit 0.00067424774 11.590 testeachbit1shl 0.00086975098 14.951 tableshift 5.8174133e-05 1.000 tableuchar 8.2015991e-05 1.410 suminplace 6.6757202e-05 1.148 >>>>> 386 results: function time ratio hackmemmod 0.00022506714 1.553 hackmemloop 0.00039672852 2.737 hackmemunrolled 0.00025939941 1.789 rmlsbsub 0.00074005127 5.105 rmlsbmask 0.00067138672 4.632 testlsb 0.0016098022 11.105 testmsb 0.0015449524 10.658 testsignandshift 0.0015525818 10.711 testeachbit 0.0015563965 10.737 testeachbit1shl 0.0020790100 14.342 tableshift 0.00014877319 1.026 tableuchar 0.00014495850 1.000 suminplace 0.00014877319 1.026 Perhaps Chris will find it worth his time to run the thing on his list of machines. -------------------- Yet another posting ---------------------- #include <stdio.h> #include <sys/types.h> #ifdef FD_SETSIZE # define USE_GETRUSAGE # include <sys/time.h> # include <sys/resource.h> #else # include <sys/times.h> #endif #define SIZEOF(a) (sizeof(a)/sizeof(a[0])) #define NUMRAND 16384 unsigned long rr[NUMRAND] ; /* * This function is used to calibrate the timing mechanism. * This way we can subtract the loop and call overheads. */ int calibrate(n) register unsigned long n; { return 0; } /* * This function counts the number of bits in a long. * It is limited to 63 bit longs, but a minor mod can cope with 511 bits. * * It is so magic, an explanation is required: * Consider a 3 bit number as being * 4a+2b+c * if we shift it right 1 bit, we have * 2a+b * subtracting this from the original gives * 2a+b+c * if we shift the original 2 bits right we get * a * and so with another subtraction we have * a+b+c * which is the number of bits in the original number. * Suitable masking allows the sums of the octal digits in a * 32 bit bumber to appear in each octal digit. This isn't much help * unless we can get all of them summed together. * This can be done by modulo arithmetic (sum the digits in a number by molulo * the base of the number minus one) the old "cating out nines" trick they * taught in school before calculators were invented. * Now, using mod 7 wont help us, because our number will very likely have * more than 7 bits set. So add the octal digits together to get base64 * digits, and use modulo 63. * (Those of you with 64 bit machines need to add 3 octal digits together to * get base512 digits, and use mod 511.) * * This is HACKMEM 169, as used in X11 sources. */ int t0_hackmemmod(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); return ((tmp + (tmp >> 3)) & 030707070707) % 63; } /* * This is the same as the above, but does not use the % operator. * Most modern machines have clockless division, and so the modulo is as * expensive as, say, an addition. */ int t1_hackmemloop(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); tmp = (tmp + (tmp >> 3)) & 030707070707; while (tmp > 63) tmp = (tmp & 63) + (tmp >> 6); return tmp; } /* * Above, without using while loop. It takes at most 5 iterations of the * loop, so we just do all 5 in-line. The final result is never 63 * (this is assumed above as well). */ int t2_hackmemunrolled(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); tmp = (tmp + (tmp >> 3)) & 030707070707; tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); return (tmp); } /* * This function counts the bits in a long. * * It removes the lsb and counting the number of times round the loop. * The expression (n & -n) yields the lsb of a number, * but it only works on 2's compliment machines. */ int t3_rmlsbsub(n) register unsigned long n; { register int count; for (count = 0; n; n -= (n & -n)) count++; return count; } int t4_rmlsbmask(n) register unsigned long n; { register int count; for (count = 0; n; count++) n &= n - 1; /* take away lsb */ return (count); } /* * This function counts the bits in a long. * * It works by shifting the number down and testing the bottom bit. */ int t5_testlsb(n) register unsigned long n; { register int count; for (count = 0; n; n >>= 1) if (n & 1) count++; return count; } /* * This function counts the bits in a long. * * It works by shifting the number left and testing the top bit. * On many machines shift is expensive, so it uses a cheap addition instead. */ int t6_testmsb(n) register unsigned long n; { register int count; for (count = 0; n; n += n) if (n & ~(~(unsigned long)0 >> 1)) count++; return count; } int t7_testsignandshift(n) register unsigned long n; { register int count; for (count = 0; n; n <<= 1) if ((long)n < 0) count++; return (count); } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the second most intuitively obvious method, * and is independent of the number of bits in the long. */ int t8_testeachbit(n) register unsigned long n; { register int count; register unsigned long mask; count = 0; for (mask = 1; mask; mask += mask) if (n & mask) count++; return count; } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the most intuitively obvious method, * but how do you a priori know how many bits in the long? * (except for ''sizeof(long) * CHAR_BITS'' expression) */ int t9_testeachbit1shl(n) register unsigned long n; { register int count; register int bit; count = 0; for (bit = 0; bit < 32; ++bit) if (n & ((unsigned long)1 << bit)) count++; return count; } static char nbits[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, }; int tA_tableshift(n) register unsigned long n; { return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] + nbits[(n >> 16) & 0xff] + nbits[n >> 24]); } int tB_tableuchar(n) unsigned long n; { register unsigned char *p = (unsigned char *)&n; return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]); } int tC_suminplace(n) register unsigned long n ; { n = ((n >> 1) & 0x55555555) + (n & 0x55555555) ; n = ((n >> 2) & 0x33333333) + (n & 0x33333333) ; n = ((n >> 4) + n) & 0x0F0F0F0F ; n = ((n >> 8) + n) ; return ((n >> 16) + n) & 0xFF ; } /* * build a long random number. * The standard rand() returns at least a 15 bit number. * We use the top 9 of 15 (since the lower N bits of the usual rand() * repeat with a period of 2^N). */ unsigned long bigrand() { #define randbits() ((unsigned long)((rand() >> 6) & 0777)) register int r; r = randbits() << 27; r |= randbits() << 18; r |= randbits() << 9; r |= randbits(); return(r) ; } /* * Run the test many times. * You will need a running time in the 10s of seconds * for an accurate result. * * To give a fair comparison, the random number generator * is seeded the same for each measurement. * * Return value is seconds per iteration. */ #if defined(mips) || defined(sparc) #define REPEAT (1L<<20) #else #define REPEAT (1L<<18) #endif double measure(func) register int (*func)(); { #ifdef USE_GETRUSAGE struct rusage ru0, ru1; register long j; (void) getrusage(RUSAGE_SELF, &ru0); for (j = 0; j < REPEAT; ++j) func(rr[j & (NUMRAND-1)]); (void) getrusage(RUSAGE_SELF, &ru1); ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec; if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) { ru1.ru_utime.tv_usec += 1000000; ru1.ru_utime.tv_sec--; } return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) / (double)REPEAT); #else register long j; struct tms start; struct tms finish; times(&start); for (j = 0; j < REPEAT; ++j) func(rr[j & (NUMRAND-1)]); times(&finish); return ((finish.tms_utime - start.tms_utime) / (double)REPEAT); #endif } struct table { char *name; int (*func)(); double measurement; int trial; }; struct table table[] = { { "hackmemmod", t0_hackmemmod }, { "hackmemloop", t1_hackmemloop }, { "hackmemunrolled", t2_hackmemunrolled }, { "rmlsbsub", t3_rmlsbsub }, { "rmlsbmask", t4_rmlsbmask }, { "testlsb", t5_testlsb }, { "testmsb", t6_testmsb }, { "testsignandshift", t7_testsignandshift }, { "testeachbit", t8_testeachbit }, { "testeachbit1shl", t9_testeachbit1shl }, { "tableshift", tA_tableshift }, { "tableuchar", tB_tableuchar }, { "suminplace", tC_suminplace }, }; main(argc, argv) int argc; char **argv; { double calibration, v, best; int j, k, m, verbose; verbose = argc > 1; /* * run a few tests to make sure they all agree */ srand(getpid()); for (j = 0; j < 10; ++j) { unsigned long n; int bad; n = bigrand(); for (k = 0; k < SIZEOF(table); ++k) table[k].trial = table[k].func(n); bad = 0; for (k = 0; k < SIZEOF(table); ++k) for (m = 0; m < SIZEOF(table); ++m) if (table[k].trial != table[m].trial) bad = 1; if (bad) { printf("wrong: %08lX", n); for (k = 0; k < SIZEOF(table); ++k) printf(" %3d", table[k].trial); printf("\n"); } } for (k = 0 ; k < NUMRAND ; k++) rr[k] = bigrand() ; /* * calibrate the timing mechanism */ calibration = measure(calibrate); if (verbose) printf("calibration: %g\n", calibration); /* * time them all, keeping track of the best (smallest) */ for (j = 0; j < SIZEOF(table); ++j) { v = measure(table[j].func) - calibration; if (verbose) printf("%s: %g\n", table[j].name, v); table[j].measurement = v; if (!j || v < best) best = v; } (void) printf("%-24s %-14sratio\n", "function", "time"); for (j = 0; j < SIZEOF(table); ++j) { (void) printf("%-24s %#10.8g %6.3f\n", table[j].name, table[j].measurement, table[j].measurement / best); } exit(0); } _________________________________________________________________________ __ / ) Gene H. Olson uunet!digibd!zeno!gene / __ _ __ _ Smartware Consulting (__/ _(/_//_(/_ Minneapolis, MN (612) 824-9108
moews@math.berkeley.edu (David Moews) (10/17/90)
I took Gene Olson's test program and stuck Torek's routines into it, as well as adding routines to count bits by using a table with a 16-bit index, and a slightly sped-up version of HAKMEM 169 (`hackmemallfold'). The 16-bit table lookup appears to be fastest, at least on a Sun-3/50 and a Sparcstation 1. `hackmemallfold' and `sumbits' both use a total of 16 arithmetic, logical, and shift operations. However, if gcc is used `hackmemallfold' is apparently faster (because gcc refuses to put constants in registers?) name technique used ---- -------------- hackmemmod HACKMEM 169, using % operator hackmemloop HACKMEM 169, using loop to implement % hackmemunrolled HACKMEM 169, with 5-step % (loop unrolled) hackmemallfold HACKMEM 169, no %, fold to the bitter end rmlsbsub remove lsb with `n -= (n & -n)' rmlsbmask remove lsb with `n &= n - 1' testlsb test n&1, then n>>=1 testmsb test n&MSB, then n+=n (rather than n<<=1) testsignandshift test n<0, then n<<=1 testeachbit test n&mask, then mask+=mask testeachbit1shl test n&(1<<bit) for bit in [0..32) tableshift nbits[n>>24] + nbits[(n>>16)&255] ... tableuchar set p=&n, nbits[p[0]] + nbits[p[1]] ... tableshiftcast nbits[n>>24] + nbits[(u_char)(n>>16)] ... itableshift same as tableshift, but table datatype is int itableuchar ditto itableshiftcast ditto (note, nbits table is `char') 16tableshift nbits16[n>>16] + nbits16[n&65535] 16tableushort set p=&n, nbits16[p[0]] + nbits16[p[1]] 16tableshiftcast nbits16[n>>16] + nbits16[(u_short)(n)] 16itableshift same as 16tableshift, but table datatype is int 16itableushort ditto 16itableshiftcast ditto sumbits Gene Olson's function (see code for comments) Sun-3/50, SunOS 4.1, cc -O function time ratio hackmemmod 1.0166168e-05 1.931 hackmemloop 1.2187958e-05 2.315 hackmemunrolled 1.3732910e-05 2.609 hackmemallfold 8.2588196e-06 1.569 rmlsbsub 1.9512177e-05 3.707 rmlsbmask 1.9512177e-05 3.707 testlsb 4.6901703e-05 8.909 testmsb 4.8561096e-05 9.225 testsignandshift 4.0779114e-05 7.746 testeachbit 4.6749115e-05 8.880 testeachbit1shl 6.6032410e-05 12.543 tableshift 1.6460419e-05 3.127 tableuchar 1.3256073e-05 2.518 tableshiftcast 1.1730194e-05 2.228 itableshift 1.6918182e-05 3.214 itableuchar 9.5176697e-06 1.808 itableshiftcast 1.2359619e-05 2.348 16tableshift 1.0337830e-05 1.964 16tableushort 6.0081482e-06 1.141 16tableshiftcast 6.1416626e-06 1.167 16itableshift 5.2642822e-06 1.000 16itableushort 6.6184998e-06 1.257 16itableshiftcast 1.1196136e-05 2.127 sumbits 7.1525574e-06 1.359 Sun-3/50, SunOS 4.1, gcc -O, ver. 1.37.1 function time ratio hackmemmod 1.0070801e-05 3.259 hackmemloop 1.1653900e-05 3.772 hackmemunrolled 1.0719299e-05 3.469 hackmemallfold 7.9727173e-06 2.580 rmlsbsub 1.9207001e-05 6.216 rmlsbmask 1.9226074e-05 6.222 testlsb 4.4364929e-05 14.358 testmsb 3.7155151e-05 12.025 testsignandshift 4.1427612e-05 13.407 testeachbit 4.8522949e-05 15.704 testeachbit1shl 5.4588318e-05 17.667 tableshift 1.1863708e-05 3.840 tableuchar 8.7928772e-06 2.846 tableshiftcast 1.6403198e-05 5.309 itableshift 9.8037720e-06 3.173 itableuchar 6.3896179e-06 2.068 itableshiftcast 1.3904572e-05 4.500 16tableshift 5.7983398e-06 1.877 16tableushort 4.4250488e-06 1.432 16tableshiftcast 5.3215027e-06 1.722 16itableshift 4.9591064e-06 1.605 16itableushort 3.0899048e-06 1.000 16itableshiftcast 9.1934204e-06 2.975 sumbits 9.6511841e-06 3.123 SparcStation 1, SunOS 4.0.3c, cc -O function time ratio hackmemmod 1.4197826e-05 18.551 hackmemloop 2.2125244e-06 2.891 hackmemunrolled 1.4662743e-06 1.916 hackmemallfold 1.0657310e-06 1.393 rmlsbsub 5.1355362e-06 6.710 rmlsbmask 4.2915344e-06 5.607 testlsb 1.0557175e-05 13.794 testmsb 1.0647774e-05 13.913 testsignandshift 1.0533333e-05 13.763 testeachbit 1.0833740e-05 14.156 testeachbit1shl 1.3933182e-05 18.206 tableshift 9.0837479e-07 1.187 tableuchar 1.3136864e-06 1.717 tableshiftcast 9.1314316e-07 1.193 itableshift 1.1110306e-06 1.452 itableuchar 1.5211105e-06 1.988 itableshiftcast 1.1157990e-06 1.458 16tableshift 8.2015991e-07 1.072 16tableushort 1.1849403e-06 1.548 16tableshiftcast 7.6532364e-07 1.000 16itableshift 1.6880035e-06 2.206 16itableushort 2.0956993e-06 2.738 16itableshiftcast 1.6403198e-06 2.143 sumbits 1.0561943e-06 1.380 SparcStation 1, SunOS 4.0.3c, gcc -O, ver. 1.37 function time ratio hackmemmod 1.1713505e-05 14.241 hackmemloop 2.4437904e-06 2.971 hackmemunrolled 1.4662743e-06 1.783 hackmemallfold 1.0633469e-06 1.293 rmlsbsub 5.1999092e-06 6.322 rmlsbmask 4.3869019e-06 5.333 testlsb 1.2927055e-05 15.716 testmsb 1.6040802e-05 19.501 testsignandshift 1.2907982e-05 15.693 testeachbit 1.1508465e-05 13.991 testeachbit1shl 1.4796257e-05 17.988 tableshift 1.0609627e-06 1.290 tableuchar 1.5187263e-06 1.846 tableshiftcast 1.0633469e-06 1.293 itableshift 1.2660027e-06 1.539 itableuchar 1.7237663e-06 2.096 itableshiftcast 1.2612343e-06 1.533 16tableshift 8.7499619e-07 1.064 16tableushort 1.1301041e-06 1.374 16tableshiftcast 8.2254410e-07 1.000 16itableshift 1.7356873e-06 2.110 16itableushort 1.9979477e-06 2.429 16itableshiftcast 1.6951561e-06 2.061 sumbits 1.3136864e-06 1.597 -- David Moews moews@math.berkeley.edu -----------------------------cut here for program------------------------------- #include <stdio.h> #include <sys/types.h> #ifdef FD_SETSIZE # define USE_GETRUSAGE # include <sys/time.h> # include <sys/resource.h> #else # include <sys/times.h> #endif #define SIZEOF(a) (sizeof(a)/sizeof(a[0])) #define NUMRAND 16384 unsigned long rr[NUMRAND]; /* * This function is used to calibrate the timing mechanism. * This way we can subtract the loop and call overheads. */ int calibrate(n) register unsigned long n; { return 0; } /* * This function counts the number of bits in a long. * It is limited to 63 bit longs, but a minor mod can cope with 511 bits. * * It is so magic, an explanation is required: * Consider a 3 bit number as being * 4a+2b+c * if we shift it right 1 bit, we have * 2a+b * subtracting this from the original gives * 2a+b+c * if we shift the original 2 bits right we get * a * and so with another subtraction we have * a+b+c * which is the number of bits in the original number. * Suitable masking allows the sums of the octal digits in a * 32 bit number to appear in each octal digit. This isn't much help * unless we can get all of them summed together. * This can be done by modulo arithmetic (sum the digits in a number by molulo * the base of the number minus one) the old "casting out nines" trick they * taught in school before calculators were invented. * Now, using mod 7 wont help us, because our number will very likely have * more than 7 bits set. So add the octal digits together to get base64 * digits, and use modulo 63. * (Those of you with 64 bit machines need to add 3 octal digits together to * get base512 digits, and use mod 511.) * * This is HACKMEM 169, as used in X11 sources. */ int t0_hackmemmod(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); return ((tmp + (tmp >> 3)) & 030707070707) % 63; } /* * This is the same as the above, but does not use the % operator. * Most modern machines have clockless division, and so the modulo is as * expensive as, say, [sic] an addition. */ int t1_hackmemloop(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); tmp = (tmp + (tmp >> 3)) & 030707070707; while (tmp > 63) tmp = (tmp & 63) + (tmp >> 6); return tmp; } /* * Above, without using while loop. It takes at most 5 iterations of the * loop, so we just do all 5 in-line. The final result is never 63 * (this is assumed above as well). */ int t2_hackmemunrolled(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); tmp = (tmp + (tmp >> 3)) & 030707070707; tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); return (tmp); } /* * As above, but no modulus nor simulation thereof. Instead, * we continue shifting and masking to the bitter end. */ int t3_hackmemallfold(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); tmp = (tmp + (tmp >> 3)) & 030707070707; tmp += tmp >> 6; return ((tmp + (tmp >> 12) + (tmp >> 24)) & 077); } /* * This function counts the bits in a long. * * It removes the lsb and counting the number of times round the loop. * The expression (n & -n) yields the lsb of a number, * but it only works on 2's compliment machines. */ int t4_rmlsbsub(n) register unsigned long n; { register int count; for (count = 0; n; n -= (n & -n)) count++; return count; } int t5_rmlsbmask(n) register unsigned long n; { register int count; for (count = 0; n; count++) n &= n - 1; /* take away lsb */ return (count); } /* * This function counts the bits in a long. * * It works by shifting the number down and testing the bottom bit. */ int t6_testlsb(n) register unsigned long n; { register int count; for (count = 0; n; n >>= 1) if (n & 1) count++; return count; } /* * This function counts the bits in a long. * * It works by shifting the number left and testing the top bit. * On many machines shift is expensive, so it uses a cheap addition instead. */ int t7_testmsb(n) register unsigned long n; { register int count; for (count = 0; n; n += n) if (n & ~(~(unsigned long)0 >> 1)) count++; return count; } int t8_testsignandshift(n) register unsigned long n; { register int count; for (count = 0; n; n <<= 1) if ((long)n < 0) count++; return (count); } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the second most intuitively obvious method, * and is independent of the number of bits in the long. */ int t9_testeachbit(n) register unsigned long n; { register int count; register unsigned long mask; count = 0; for (mask = 1; mask; mask += mask) if (n & mask) count++; return count; } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the most intuitively obvious method, * but how do you a priori know how many bits in the long? * (except for ''sizeof(long) * CHAR_BITS'' expression) */ int tA_testeachbit1shl(n) register unsigned long n; { register int count; register int bit; count = 0; for (bit = 0; bit < 32; ++bit) if (n & ((unsigned long)1 << bit)) count++; return count; } static char nbits[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, }; static int inbits[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, }; int tB_tableshift(n) register unsigned long n; { return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] + nbits[(n >> 16) & 0xff] + nbits[n >> 24]); } int tC_tableuchar(n) unsigned long n; { register unsigned char *p = (unsigned char *)&n; return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]); } int tD_tableshiftcast(n) register unsigned long n; { return nbits[(unsigned char) n] + nbits[(unsigned char) (n >> 8)] + nbits[(unsigned char) (n >> 16)] + nbits[n >> 24]; } int tE_itableshift(n) register unsigned long n; { return (inbits[n & 0xff] + inbits[(n >> 8) & 0xff] + inbits[(n >> 16) & 0xff] + inbits[n >> 24]); } int tF_itableuchar(n) unsigned long n; { register unsigned char *p = (unsigned char *)&n; return (inbits[p[0]] + inbits[p[1]] + inbits[p[2]] + inbits[p[3]]); } int tG_itableshiftcast(n) register unsigned long n; { return inbits[(unsigned char) n] + inbits[(unsigned char) (n >> 8)] + inbits[(unsigned char) (n >> 16)] + inbits[n >> 24]; } static char nbits16[65536]; static int inbits16[65536]; int tH_16tableshift(n) register unsigned long n; { return (nbits16[n & 0xffff] + nbits16[n >> 16]); } int tI_16tableushort(n) unsigned long n; { register unsigned short *p = (unsigned short *)&n; return (nbits16[p[0]] + nbits16[p[1]]); } int tJ_16tableshiftcast(n) register unsigned long n; { return nbits16[(unsigned short) n] + nbits16[n >> 16]; } int tK_16itableshift(n) register unsigned long n; { return (inbits16[n & 0xffff] + inbits16[n >> 16]); } int tL_16itableushort(n) unsigned long n; { register unsigned short *p = (unsigned short *)&n; return (inbits16[p[0]] + inbits16[p[1]]); } int tM_16itableshiftcast(n) register unsigned long n; { return inbits16[(unsigned short) n] + inbits16[n >> 16]; } /* * Explanation: * First we add 32 1-bit fields to get 16 2-bit fields. * Each 2-bit field is one of 00, 01, or 10 (binary). * We then add all the two-bit fields to get 8 4-bit fields. * These are all one of 0000, 0001, 0010, 0011, or 0100. * * Now we can do something different, becuase for the first * time the value in each k-bit field (k now being 4) is small * enough that adding two k-bit fields results in a value that * still fits in the k-bit field. The result is four 4-bit * fields containing one of {0000,0001,...,0111,1000} and four * more 4-bit fields containing junk (sums that are uninteresting). * Pictorially: * n = 0aaa0bbb0ccc0ddd0eee0fff0ggg0hhh * n>>4 = 00000aaa0bbb0ccc0ddd0eee0fff0ggg * sum = 0aaaWWWWiiiiXXXXjjjjYYYYkkkkZZZZ * where W, X, Y, and Z are the interesting sums (each at most 1000, * or 8 decimal). Masking with 0x0f0f0f0f extracts these. * * Now we can change tactics yet again, because now we have: * n = 0000WWWW0000XXXX0000YYYY0000ZZZZ * n>>8 = 000000000000WWWW0000XXXX0000YYYY * so sum = 0000WWWW000ppppp000qqqqq000rrrrr * where p and r are the interesting sums (and each is at most * 10000, or 16 decimal). The sum `q' is junk, like i, j, and * k above; but it is not necessarry to discard it this time. * One more fold, this time by sixteen bits, gives * n = 0000WWWW000ppppp000qqqqq000rrrrr * n>>16 = 00000000000000000000WWWW000ppppp * so sum = 0000WWWW000ppppp000sssss00tttttt * where s is at most 11000 and t is it most 100000 (32 decimal). * * Now we have t = r+p = (Z+Y)+(X+W) = ((h+g)+(f+e))+((d+c)+(b+a)), * or in other words, t is the number of bits set in the original * 32-bit longword. So all we have to do is return the low byte * (or low 6 bits, but `low byte' is typically just as easy if not * easier). * * This technique is also applicable to 64 and 128 bit words, but * 256 bit or larger word sizes require at least one more masking * step. */ int tN_sumbits(n) register unsigned long n; { n = (n & 0x55555555) + ((n >> 1) & 0x55555555); n = (n & 0x33333333) + ((n >> 2) & 0x33333333); n = (n + (n >> 4)) & 0x0f0f0f0f; n += n >> 8; n += n >> 16; return (n & 0xff); } /* * build a long random number. * The standard rand() returns at least a 15 bit number. * We use the top 9 of 15 (since the lower N bits of the usual rand() * repeat with a period of 2^N). */ unsigned long bigrand() { #define randbits() ((unsigned long)((rand() >> 6) & 0777)) register int r; r = randbits() << 27; r |= randbits() << 18; r |= randbits() << 9; r |= randbits(); return(r) ; } /* * Run the test many times. * You will need a running time in the 10s of seconds * for an accurate result. * * To give a fair comparison, the random number generator * is seeded the same for each measurement. * * Return value is seconds per iteration. */ #if defined(mips) || defined(sparc) #define REPEAT (1L<<22) #else #define REPEAT (1L<<20) #endif double measure(func) register int (*func)(); { #ifdef USE_GETRUSAGE struct rusage ru0, ru1; register long j; (void) getrusage(RUSAGE_SELF, &ru0); for (j = 0; j < REPEAT; ++j) func(rr[j & (NUMRAND-1)]); (void) getrusage(RUSAGE_SELF, &ru1); ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec; if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) { ru1.ru_utime.tv_usec += 1000000; ru1.ru_utime.tv_sec--; } return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) / (double)REPEAT); #else register long j; struct tms start; struct tms finish; times(&start); for (j = 0; j < REPEAT; ++j) func(rr[j & (NUMRAND-1)]); times(&finish); return ((finish.tms_utime - start.tms_utime) / (double)REPEAT); #endif } struct table { char *name; int (*func)(); double measurement; int trial; }; struct table table[] = { { "hackmemmod", t0_hackmemmod }, { "hackmemloop", t1_hackmemloop }, { "hackmemunrolled", t2_hackmemunrolled }, { "hackmemallfold", t3_hackmemallfold }, { "rmlsbsub", t4_rmlsbsub }, { "rmlsbmask", t5_rmlsbmask }, { "testlsb", t6_testlsb }, { "testmsb", t7_testmsb }, { "testsignandshift", t8_testsignandshift }, { "testeachbit", t9_testeachbit }, { "testeachbit1shl", tA_testeachbit1shl }, { "tableshift", tB_tableshift }, { "tableuchar", tC_tableuchar }, { "tableshiftcast", tD_tableshiftcast }, { "itableshift", tE_itableshift }, { "itableuchar", tF_itableuchar }, { "itableshiftcast", tG_itableshiftcast }, { "16tableshift", tH_16tableshift }, { "16tableushort", tI_16tableushort }, { "16tableshiftcast", tJ_16tableshiftcast }, { "16itableshift", tK_16itableshift }, { "16itableushort", tL_16itableushort }, { "16itableshiftcast", tM_16itableshiftcast }, { "sumbits", tN_sumbits } }; main(argc, argv) int argc; char **argv; { double calibration, v, best; int j, k, m, verbose; /* Initialize the 16-bit lookup table. */ for (j = 0; j < 65536; ++j) { nbits16[j] = inbits16[j] = tN_sumbits((unsigned long)j); } verbose = argc > 1; /* * run a few tests to make sure they all agree */ srand(getpid()); for (j = 0; j < 10; ++j) { unsigned long n; int bad; n = bigrand(); for (k = 0; k < SIZEOF(table); ++k) table[k].trial = table[k].func(n); bad = 0; for (k = 0; k < SIZEOF(table); ++k) for (m = 0; m < SIZEOF(table); ++m) if (table[k].trial != table[m].trial) bad = 1; if (bad) { printf("wrong: %08lX", n); for (k = 0; k < SIZEOF(table); ++k) printf(" %3d", table[k].trial); printf("\n"); } } for (k = 0 ; k < NUMRAND ; k++) rr[k] = bigrand() ; /* * calibrate the timing mechanism */ calibration = measure(calibrate); if (verbose) printf("calibration: %g\n", calibration); /* * time them all, keeping track of the best (smallest) */ for (j = 0; j < SIZEOF(table); ++j) { v = measure(table[j].func) - calibration; if (verbose) printf("%s: %g\n", table[j].name, v); table[j].measurement = v; if (!j || v < best) best = v; } (void) printf("%-24s %-14sratio\n", "function", "time"); for (j = 0; j < SIZEOF(table); ++j) { (void) printf("%-24s %#10.8g %6.3f\n", table[j].name, table[j].measurement, table[j].measurement / best); } exit(0); }
kdq@demott.COM (Kevin D. Quitt) (10/18/90)
Something is seriously wrong with this bit-counting benchmark. I'm running on a Motorola Delta 3600 (25MHz 68030), and my time are 2 orders of magnitude slower than those reported for a Sun 3/50!!!! function time ratio hackmemmod 0.00033760071 2.810 hackmemloop 0.00034141541 2.841 hackmemunrolled 0.00017452240 1.452 hackmemallfold 0.00026226044 2.183 rmlsbsub 0.00062084198 5.167 rmlsbmask 0.00061321259 5.103 testlsb 0.0015430450 12.841 testmsb 0.0012369156 10.294 testsignandshift 0.0014562607 12.119 testeachbit 0.0016183853 13.468 testeachbit1shl 0.0018253326 15.190 tableshift 0.00029087067 2.421 tableuchar 0.00018882751 1.571 tableshiftcast 0.00039577484 3.294 itableshift 0.00022602081 1.881 itableuchar 0.00012016296 1.000 itableshiftcast 0.00032424927 2.698 16tableshift 0.00033283234 2.770 16tableushort 0.00027084351 2.254 16tableshiftcast 0.00029563904 2.460 16itableshift 0.00035190582 2.929 16itableushort 0.00033092499 2.754 16itableshiftcast 0.00042724609 3.556 sumbits 0.00023365021 1.944 In order to get this to compile on my SYSV machine, I had to add the line: #undef FD_SETSIZE because there's no resource.h on this machine. Oh yes, it's gcc -O. -- _ Kevin D. Quitt demott!kdq kdq@demott.com DeMott Electronics Co. 14707 Keswick St. Van Nuys, CA 91405-1266 VOICE (818) 988-4975 FAX (818) 997-1190 MODEM (818) 997-4496 PEP last 96.37% of all statistics are made up.
ts@cup.portal.com (Tim W Smith) (10/24/90)
When benchmarking something like this, a good thing to do is to do a back-of-the-envelope estimate of what you expect. For example, the Mac II runs at 16 MHz. Assume a few wait states, and loop overhead, and stuff like that will limit us to 1 "useful" instruction every 8 clocks, so we get 2 "useful" MIPS. The operation x &= x-1 will take three operations. At 2 MIPS, an operation is 1/2 uSec, so we have 1.5 uSec per bit. The loop runs an average of 16 times, so we expect that we can do it in 24 uSec. The measured time was something like 20 uSec, so we pass. If you see numbers in the hundreds of microseconds, you can be almost sure that something is wrong with the program. Tim Smith