jamie@archone.tamu.edu (James Price) (04/27/91)
Has anyone done any benchmarking of the SGI matrix functions?  I was curious
and wrote the program included below.  It does a number of 4x4 matrix
multiplies, first using software, and then using the geometry pipeline
functions (loadmatrix(), multmatrix(), getmatrix()).

Here are some typical results:

10000 iterations on fritz, with GL version: GL4DGT-3.3

Software - no optimization:    3.349 sec.
Software - some optimization:  1.130 sec.
Software - more optimization:  0.910 sec.
Hardware - preserve CTM:       2.379 sec.
Hardware - destroy CTM:        2.289 sec.
Hardware - abandon results:    0.580 sec.

The actual hardware multiplication is fast (0.580 sec/10000 multiplies),
but if we call getmatrix() to access the results, it slows things down by
around 400% (to 2.379 sec/10000 multiplies).  I was hoping to use the speed
of the hardware for my own matrix needs, but it looks like the getmatrix()
call is simply too costly.  Is there a better way?

Jim Price                      jamie@archone.tamu.edu
Visualization Laboratory
Texas A&M University

/**************************************************************************/
/*                                                                        */
/*  matperf.c - SGI GL matrix performance checker                        */
/*                                                                        */
/*  to compile:  cc -o matperf matperf.c -lgl_s -lm                      */
/*                                                                        */
/*  to run:      matperf n                                               */
/*               where n = number of matrix multiplies to perform        */
/*                                                                        */
/**************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/param.h>
#include <gl.h>

typedef float MAT44[4][4];

void Print44(MAT44 *pMat);
void Identity(MAT44 *pMat);
double Duration(struct timeval *ptv1, struct timeval *ptv2);
void SoftMult44_1(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2);
void SoftMult44_2(float pResult[], MAT44 *pm1, MAT44 *pm2);
void SoftMult44_3(float pResult[], MAT44 *pm1, MAT44 *pm2);
void HardMult44_1(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2);
void HardMult44_2(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2);
void HardMult44_3(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2);

void main(int argc, char *argv[])
{
    register long i;
    long iter;
    MAT44 m1, m2, result;
    char hwver[13], hostname[MAXHOSTNAMELEN+1];
    struct timeval tv1, tv2;
    struct timezone tz;

    if (argc != 2) {
        printf("Usage: matperf n\n");
        return;
    }
    iter = atoi(argv[1]);

    /* put in some numbers */
    Identity(m1);
    m1[0][1] = 1.0;
    m1[0][2] = 1.0;
    m1[0][3] = 1.0;
    Identity(m2);
    m2[0][0] = 5;
    m2[1][1] = 6;
    m2[2][2] = 7;
    m2[3][0] = 10;
    m2[3][1] = 20;
    m2[3][2] = 30;

    gethostname(hostname, MAXHOSTNAMELEN);
    gversion(hwver);

    /* winopen() necessary to use geometry pipeline */
    prefposition(500, 600, 500, 600);
    noport();
    winopen("perf");

    /* give window processes a chance to get up and running */
    sleep(5);

    printf("\n%ld iterations on %s, with GL version: %s\n", iter, hostname, hwver);

    gettimeofday(&tv1, &tz);
    for (i = 0; i < iter; i++)
        SoftMult44_1(result, m1, m2);
    gettimeofday(&tv2, &tz);
    printf("\nSoftware - no optimization: %7.3f sec.\n", Duration(&tv1, &tv2));

    gettimeofday(&tv1, &tz);
    for (i = 0; i < iter; i++)
        SoftMult44_2(result, m1, m2);
    gettimeofday(&tv2, &tz);
    printf("\nSoftware - some optimization: %7.3f sec.\n", Duration(&tv1, &tv2));

    gettimeofday(&tv1, &tz);
    for (i = 0; i < iter; i++)
        SoftMult44_3(result, m1, m2);
    gettimeofday(&tv2, &tz);
    printf("\nSoftware - more optimization: %7.3f sec.\n", Duration(&tv1, &tv2));

    gettimeofday(&tv1, &tz);
    for (i = 0; i < iter; i++)
        HardMult44_1(result, m1, m2);
    gettimeofday(&tv2, &tz);
    printf("\nHardware - preserve CTM: %7.3f sec.\n", Duration(&tv1, &tv2));

    gettimeofday(&tv1, &tz);
    for (i = 0; i < iter; i++)
        HardMult44_2(result, m1, m2);
    gettimeofday(&tv2, &tz);
    printf("\nHardware - destroy CTM: %7.3f sec.\n", Duration(&tv1, &tv2));

    gettimeofday(&tv1, &tz);
    for (i = 0; i < iter; i++)
        HardMult44_3(result, m1, m2);
    gettimeofday(&tv2, &tz);
    printf("\nHardware - abandon results: %7.3f sec.\n", Duration(&tv1, &tv2));

    printf("\nDone.");
}

/* convert gettimeofday() values to real number */
double Duration(struct timeval *ptv1, struct timeval *ptv2)
{
    return (((double)ptv2->tv_sec + (double)ptv2->tv_usec / 1000000.0) -
            ((double)ptv1->tv_sec + (double)ptv1->tv_usec / 1000000.0));
}

/* 4x4 no optimization */
void SoftMult44_1(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2)
{
    int i, j, k;

    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++) {
            (*pResult)[i][j] = 0.0;
            for (k = 0; k < 4; k++)
                (*pResult)[i][j] += (*pm1)[i][k] * (*pm2)[k][j];
        }
}

/* 4x4 some optimization */
void SoftMult44_2(float pResult[], MAT44 *pm1, MAT44 *pm2)
{
    register int i, j;

    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++) {
            *pResult = (*pm1)[i][0] * (*pm2)[0][j] +
                       (*pm1)[i][1] * (*pm2)[1][j] +
                       (*pm1)[i][2] * (*pm2)[2][j] +
                       (*pm1)[i][3] * (*pm2)[3][j];
            pResult++;
        }
}

/* 4x4 more optimization */
void SoftMult44_3(float pResult[], MAT44 *pm1, MAT44 *pm2)
{
    register int i;

    for (i = 0; i < 4; i++) {
        *pResult = (*pm1)[i][0] * (*pm2)[0][0] + (*pm1)[i][1] * (*pm2)[1][0] +
                   (*pm1)[i][2] * (*pm2)[2][0] + (*pm1)[i][3] * (*pm2)[3][0];
        pResult++;
        *pResult = (*pm1)[i][0] * (*pm2)[0][1] + (*pm1)[i][1] * (*pm2)[1][1] +
                   (*pm1)[i][2] * (*pm2)[2][1] + (*pm1)[i][3] * (*pm2)[3][1];
        pResult++;
        *pResult = (*pm1)[i][0] * (*pm2)[0][2] + (*pm1)[i][1] * (*pm2)[1][2] +
                   (*pm1)[i][2] * (*pm2)[2][2] + (*pm1)[i][3] * (*pm2)[3][2];
        pResult++;
        *pResult = (*pm1)[i][0] * (*pm2)[0][3] + (*pm1)[i][1] * (*pm2)[1][3] +
                   (*pm1)[i][2] * (*pm2)[2][3] + (*pm1)[i][3] * (*pm2)[3][3];
        pResult++;
    }
}

/* preserve CTM */
void HardMult44_1(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2)
{
    pushmatrix();
    loadmatrix(pm2);
    multmatrix(pm1);
    getmatrix(pResult);
    popmatrix();
}

/* destroy CTM */
void HardMult44_2(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2)
{
    loadmatrix(pm2);
    multmatrix(pm1);
    getmatrix(pResult);
}

/* preserve CTM, abandon results */
void HardMult44_3(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2)
{
    pushmatrix();
    loadmatrix(pm2);
    multmatrix(pm1);
    popmatrix();
}

void Print44(MAT44 *pMat)
{
    int i, j;

    for (i = 0; i < 4; i++) {
        printf("\n");
        for (j = 0; j < 4; j++)
            printf("%5.3f ", (*pMat)[i][j]);
    }
}

void Identity(MAT44 *pMat)
{
    int i, j;

    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++)
            (*pMat)[i][j] = (i == j) ? (1.0) : (0.0);
}
zombie@voodoo.UUCP (Mike York) (04/28/91)
In article <15407@helios.TAMU.EDU> jamie@archone.tamu.edu (James Price) writes:
>Has anyone done any benchmarking of the SGI matrix functions? I was curious
>and wrote the program included below. It does a number of 4x4 matrix
>multiplies, first using software, and then using the geometry pipeline
>functions (loadmatrix(), multmatrix(), getmatrix()).
>
>Here are some typical results:
>
>10000 iterations on fritz, with GL version: GL4DGT-3.3
                                                 ^^
Aha, a GT!  Our application used to run on 4D/60T's.  We had one 4D/70GT.
We found out that anything that required the graphics pipeline to
regurgitate information (picking, getmatrix, feedback goodies, etc.) was
very slow with the GT architecture (picking was up to six times slower with
our application).

We got rid of all our 60's and now have a bunch of 4D/25TG's.  I would
venture a guess that if you ran your program on a non-GT machine, you'd see
better results.
--
Mike York                     |
Boeing Computer Services      |  Support your local nanobrewer --
(206) 865-6577                |  No twist-off bottle caps.
zombie@voodoo.boeing.com      |
tarolli@westcoast.esd.sgi.com (Gary Tarolli) (04/29/91)
In article <15407@helios.TAMU.EDU>, jamie@archone.tamu.edu (James Price) writes:
> Has anyone done any benchmarking of the SGI matrix functions? I was curious
> and wrote the program included below. It does a number of 4x4 matrix
> multiplies, first using software, and then using the geometry pipeline
> functions (loadmatrix(), multmatrix(), getmatrix()).
>
> Here are some typical results:
>
> 10000 iterations on fritz, with GL version: GL4DGT-3.3
>
> Software - no optimization:    3.349 sec.
> Software - some optimization:  1.130 sec.
> Software - more optimization:  0.910 sec.
> Hardware - preserve CTM:       2.379 sec.
> Hardware - destroy CTM:        2.289 sec.
> Hardware - abandon results:    0.580 sec.
>
> The actual hardware multiplication is fast (0.580 sec/10000 multiplies)
> but if we call getmatrix() to access the results, it slows things down
> by around 400% (to 2.379 sec/10000 multiplies). I was hoping to use the
> speed of the hardware for my own matrix needs, but it looks like the
> getmatrix() call is simply too costly. Is there a better way?

It's possible to do a complete 4x4 matrix multiply in under 310 cycles on a
MIPS processor (in single precision).  At 33 MHz this works out to over
100,000 matrix multiplies per second, or about 0.1 sec for your benchmark
above, more than 5 times faster than the hardware!  I think one of the
reasons your software benchmark ran so slowly is that you may have forgotten
to compile with -float (and thus all floating point math was done in double
precision).

The theoretical limit for the matrix multiply would be 64*4 cycles plus a
few.  Of course, this requires writing very careful assembler code in order
to overlap all the adds and load/stores with the 4-cycle multiplies.  So I
suspect that you could improve upon the 310 number I actually measured by
about 10%.
--------------------
Gary Tarolli
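The post does not include Gary's measured routine (which was hand-scheduled
assembler), but a rough sketch of the kind of C that heads in that direction
is below.  FastMult44 and its calling convention are illustrative only; the
point is that keeping everything in single precision (and compiling with
-float) gives the compiler independent multiplies it can overlap with the
adds and loads, which is the effect described above.

/* Hypothetical sketch, not Gary's code: fully unrolled single-precision
 * 4x4 multiply, result = m1 * m2, row-major as in matperf.c.  Row i of
 * m1 is held in registers so each element is loaded only once. */
typedef float MAT44[4][4];

void FastMult44(MAT44 result, MAT44 m1, MAT44 m2)
{
    register int i;

    for (i = 0; i < 4; i++) {
        register float a0 = m1[i][0], a1 = m1[i][1],
                       a2 = m1[i][2], a3 = m1[i][3];

        result[i][0] = a0*m2[0][0] + a1*m2[1][0] + a2*m2[2][0] + a3*m2[3][0];
        result[i][1] = a0*m2[0][1] + a1*m2[1][1] + a2*m2[2][1] + a3*m2[3][1];
        result[i][2] = a0*m2[0][2] + a1*m2[1][2] + a2*m2[2][2] + a3*m2[3][2];
        result[i][3] = a0*m2[0][3] + a1*m2[1][3] + a2*m2[2][3] + a3*m2[3][3];
    }
}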
jmb@patton.wpd.sgi.com (Jim Barton) (05/02/91)
In all cases you must run the benchmark several times and average the
results to get a true performance number.  The reasons are many and varied,
but some of the more significant ones are:

1) When you first run a program, it takes awhile to fill up the processor
   cache.  Depending on context switching, etc., the cache can be more or
   less effective at various times during the run.

2) When you first execute a program, IRIX must read it from disk.  However,
   IRIX is fanatical about caching disk blocks in memory, and it is quite
   likely that the second execution just picks up the pages in memory, so
   execution time could be significantly faster.  This happens even when the
   timing is built into the program, since executables are almost always
   demand paged.

3) The way in which real memory pages are allocated to the process has a big
   impact on performance because the processor caches are direct mapped.
   For example, on a system with a 64Kb cache, real memory references modulo
   64Kb will map to the same cache location.  IRIX tries its best to
   allocate physical memory in a linear fashion so that the probability of
   cache thrashing is minimized, but in the final analysis the application
   memory access pattern will determine the performance.

4) The 4D/20 and 4D/25 have a 1-deep write buffer.  By default, C does all
   floating point in double precision (two words).  Thus, when the compiler
   writes out a double precision float, the first word is buffered, but the
   second stalls the processor until the first write has been retired.
   Single precision floats (the -float flag to the compiler) will eliminate
   this problem (unless you really need double precision).  The POWERSeries
   machines have a 4-deep write buffer, while the 4D/35 has an 8-deep write
   buffer.

Benchmarking is Art, not Science.  I suspect it always will be, despite the
best efforts of SPEC, etc.
--
Jim Barton
Silicon Graphics Computer Systems
jmb@sgi.com
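As a minimal sketch of the "several runs, averaged" advice, bolted onto the
Duration() helper and headers from matperf.c above: the harness below times
the same test repeatedly, discards the first (cold-cache) pass, and averages
the rest.  AverageTime, NRUNS, and the discard-the-warm-up policy are my own
choices for illustration, not anything from the posts.

/* Hypothetical harness: assumes matperf.c's #includes and Duration().
 * The test function is wrapped as a no-argument routine for simplicity. */
#define NRUNS 5

double AverageTime(void (*test)(void), long iter)
{
    struct timeval tv1, tv2;
    struct timezone tz;
    double total = 0.0;
    int run;
    long i;

    for (run = 0; run < NRUNS; run++) {
        gettimeofday(&tv1, &tz);
        for (i = 0; i < iter; i++)
            (*test)();
        gettimeofday(&tv2, &tz);
        if (run > 0)                  /* discard the warm-up run */
            total += Duration(&tv1, &tv2);
    }
    return total / (NRUNS - 1);       /* average the remaining runs */
}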