[comp.sys.sgi] C switch performance

kurt@cashew.asd.sgi.com (Kurt Akeley) (10/05/90)

In the interest of creating a _really_ fast display list format for the
VGX graphics, I have been experimenting with various token parsing schemes.
Of particular interest is a simple switch statement, looping through an
array of integer tokens.  To test performance, I wrote a test loop and
timed it with several different variations.  The base test code is:

*******************************************************************************

/*
 * Kurt Akeley
 * 4 October 1990
 *
 * Test performance of C switch statement
 */

#include <stdio.h>
#include <sys/types.h>
#include <sys/times.h>
#include <sys/param.h>

#define MAXTOKEN 20
#define ARRAYLEN 10000
#define MAXLOOP 1000

long dummy = 0;
long tokens[ARRAYLEN];

main() {
    register i;

    /* generate random values in an array */
    srand(1);
    for (i=0; i<(ARRAYLEN-1); i++)
	tokens[i] = ((float)rand() / 32767.0) * MAXTOKEN;
    tokens[ARRAYLEN-1] = MAXTOKEN;

    /* time the switch operation */
    startclock();
    for (i=0; i<MAXLOOP; i++)
	switchloop();
    stopclock(MAXLOOP,ARRAYLEN);

    /* print the result (to avoid optimization) */
    printf("dummy = %d\n",dummy);
}

switchloop() {
    register long i;
    register long localdummy;
    i = 0;
    localdummy = 0;
    while (1) {
	switch(tokens[i++]) {
	    case 0: localdummy += 0; break;
	    case 1: localdummy += 1; break;
	    case 2: localdummy += 2; break;
	    case 3: localdummy += 3; break;
	    case 4: localdummy += 4; break;
	    case 5: localdummy += 5; break;
	    case 6: localdummy += 6; break;
	    case 7: localdummy += 7; break;
	    case 8: localdummy += 8; break;
	    case 9: localdummy += 9; break;
	    case 10: localdummy += 10; break;
	    case 11: localdummy += 11; break;
	    case 12: localdummy += 12; break;
	    case 13: localdummy += 13; break;
	    case 14: localdummy += 14; break;
	    case 15: localdummy += 15; break;
	    case 16: localdummy += 16; break;
	    case 17: localdummy += 17; break;
	    case 18: localdummy += 18; break;
	    case 19: localdummy += 19; break;
	    case MAXTOKEN: dummy += localdummy; return;
	}
    }
}

struct tms tbuf;
long gtime;

startclock() {
    gtime = times(&tbuf);
}

stopclock(events,itemsperevent) {
    float period;
    float timeperevent;
    float rate;
    period = (float)(times(&tbuf) - gtime) / 100.0;
    timeperevent = period / (float)events;
    rate = 1.0 / timeperevent;
    fprintf(stdout,"  %4d events per second (%7d switches per second)\n",
	(int)rate,(int)(rate*itemsperevent));
    fflush(stdout);
}

************************************************************************
*********

Note that real computation is done throughout the code, so no part can be
legitimately optimized out.  Also, the different cases in the loop are called
equally often, and they are sorted to make the compiler's job easier.
My first surprise was that, when the inner loop

	switch(tokens[i++]) {

was replaced with pointer arithmetic, the performance dropped from 1.66 million
switches per second to 1.33 million switches per second.  OK, so array indexing
is faster than pointer arithmetic (that's not how I learned C coding, but so
what).  The big surprise, however, is that when the order of the cases is
randomized as in:

	switch(tokens[i++]) {
	    case 3: localdummy += 3; break;
	    case 16: localdummy += 16; break;
	    case 9: localdummy += 9; break;
	    case 5: localdummy += 5; break;
	    case 2: localdummy += 2; break;
	    case 4: localdummy += 4; break;
	    case 7: localdummy += 7; break;
	    case 8: localdummy += 8; break;
	    case 10: localdummy += 10; break;
	    case 11: localdummy += 11; break;
	    case 12: localdummy += 12; break;
	    case 14: localdummy += 14; break;
	    case 0: localdummy += 0; break;
	    case 18: localdummy += 18; break;
	    case 15: localdummy += 15; break;
	    case 6: localdummy += 6; break;
	    case 1: localdummy += 1; break;
	    case 17: localdummy += 17; break;
	    case 19: localdummy += 19; break;
	    case MAXTOKEN: dummy += localdummy; return;
	    case 13: localdummy += 13; break;
	}

the performance INCREASES to 1.77 million switches per second.  This strikes
me as being perverse.  Can anyone explain it?

-- kurt

kurt@cashew.asd.sgi.com (Kurt Akeley) (10/10/90)

Regarding the performance of switch statements on Silicon Graphics
R3000-based processors, there really is a performance loss when the
cases are sorted.  An unnecessary jump instruction is included in
this case.  Apparantly this bug had already been eliminated in the
SGI-internal compiler being prepared for a subsequent release.

Thanks for the many notes suggesting other explanations, and
special thanks to Dave Ciemiewicz, who had the real explanation.

-- kurt