[comp.sys.sgi] SGI GL matrix performance

jamie@archone.tamu.edu (James Price) (04/27/91)

Has anyone done any benchmarking of the SGI matrix functions?  I was curious
and wrote the program included below.  It does a number of 4x4 matrix 
multiplies, first using software, and then using the geometry pipeline 
functions (loadmatrix(), multmatrix(), getmatrix()).  

Here are some typical results:

10000 iterations on fritz, with GL version: GL4DGT-3.3

Software - no optimization:     3.349 sec.

Software - some optimization:   1.130 sec.

Software - more optimization:   0.910 sec.

Hardware - preserve CTM:        2.379 sec.

Hardware - destroy CTM:         2.289 sec.

Hardware - abandon results:     0.580 sec.


The actual hardware multiplication is fast (0.580 sec/10000 multiplies) 
but if we call getmatrix() to access the results, it slows things down 
by around 400% (to 2.379 sec/10000 multiplies).  I was hoping to use the 
speed of the hardware for my own matrix needs, but it looks like the 
getmatrix() call is simply too costly.  Is there a better way?

Jim Price
jamie@archone.tamu.edu
Visualization Laboratory
Texas A&M University

/**************************************************************************/
/*                                                                        */
/* matperf.c - SGI GL matrix performance checker                          */
/*                                                                        */
/* to compile:  cc -o matperf matperf.c -lgl_s -lm                        */
/*                                                                        */
/* to run:  matperf n                                                     */
/*     where n = number of matrix multiplies to perform                   */
/*                                                                        */
/**************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/param.h>
#include <gl.h>

/* A 4x4 matrix of single-precision floats — same memory layout as the
   GL "Matrix" type, so it can be handed to the geometry pipeline calls. */
typedef float MAT44[4][4];

/* forward declarations for the benchmark helpers defined below */
void Print44(MAT44 *pMat);                                   /* dump a matrix to stdout */
void Identity(MAT44 *pMat);                                  /* load the identity matrix */
double Duration(struct timeval *ptv1, struct timeval *ptv2); /* tv2 - tv1, in seconds */
void SoftMult44_1(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2);   /* software: naive triple loop */
void SoftMult44_2(float pResult[], MAT44 *pm1, MAT44 *pm2);  /* software: inner loop unrolled */
void SoftMult44_3(float pResult[], MAT44 *pm1, MAT44 *pm2);  /* software: fully unrolled */
void HardMult44_1(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2);   /* pipeline: CTM saved/restored */
void HardMult44_2(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2);   /* pipeline: CTM clobbered */
void HardMult44_3(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2);   /* pipeline: result never read back */


/*
 * Run each of the six multiply variants 'iter' times and report the wall-clock
 * duration of each loop.  Usage: matperf n, where n > 0 is the iteration count.
 *
 * Fixes vs. the original posting:
 *  - int main with proper exit statuses (void main is nonstandard);
 *  - argv[1] parsed with strtol and validated instead of unchecked atoi;
 *  - matrix arguments passed as &m1/&m2/&result to match the declared
 *    MAT44* / float[] parameter types (the original passed the arrays
 *    directly, which decays to the wrong pointer type);
 *  - gethostname() result explicitly NUL-terminated;
 *  - final "Done." message gets a trailing newline.
 */
int main(int argc, char *argv[])
{
   register long i;
   long iter;
   MAT44 m1, m2, result;
   char hwver[13], hostname[MAXHOSTNAMELEN+1];
   struct timeval tv1, tv2;
   struct timezone tz;
   char *end;

   if (argc != 2)
      {
      printf("Usage: matperf n\n");
      return 1;
      }

   iter = strtol(argv[1], &end, 10);
   if (end == argv[1] || *end != '\0' || iter <= 0)
      {
      printf("matperf: n must be a positive integer\n");
      return 1;
      }

   /* put in some numbers (non-trivial operands so the multiply does work) */
   Identity(&m1);
   m1[0][1] = 1.0;
   m1[0][2] = 1.0;
   m1[0][3] = 1.0;

   Identity(&m2);
   m2[0][0] = 5;
   m2[1][1] = 6;
   m2[2][2] = 7;
   m2[3][0] = 10;
   m2[3][1] = 20;
   m2[3][2] = 30;

   gethostname(hostname, MAXHOSTNAMELEN);
   hostname[MAXHOSTNAMELEN] = '\0';  /* gethostname may not NUL-terminate */
   gversion(hwver);

   /* winopen() necessary to use geometry pipeline */
   prefposition(500,600,500,600);
   noport();
   winopen("perf");

   /* give window processes a chance to get up and running */
   sleep(5);

   printf("\n%ld iterations on %s, with GL version: %s\n",iter,hostname,hwver);

   gettimeofday(&tv1,&tz);
   for (i=0; i<iter; i++)
      SoftMult44_1(&result,&m1,&m2);
   gettimeofday(&tv2,&tz);

   printf("\nSoftware - no optimization:   %7.3f sec.\n",Duration(&tv1,&tv2));


   gettimeofday(&tv1,&tz);
   for (i=0; i<iter; i++)
      SoftMult44_2(&result[0][0],&m1,&m2);   /* float[] variant: pass first element */
   gettimeofday(&tv2,&tz);

   printf("\nSoftware - some optimization: %7.3f sec.\n",Duration(&tv1,&tv2));


   gettimeofday(&tv1,&tz);
   for (i=0; i<iter; i++)
      SoftMult44_3(&result[0][0],&m1,&m2);
   gettimeofday(&tv2,&tz);

   printf("\nSoftware - more optimization: %7.3f sec.\n",Duration(&tv1,&tv2));


   gettimeofday(&tv1,&tz);
   for (i=0; i<iter; i++)
      HardMult44_1(&result,&m1,&m2);
   gettimeofday(&tv2,&tz);

   printf("\nHardware - preserve CTM:      %7.3f sec.\n",Duration(&tv1,&tv2));


   gettimeofday(&tv1,&tz);
   for (i=0; i<iter; i++)
      HardMult44_2(&result,&m1,&m2);
   gettimeofday(&tv2,&tz);

   printf("\nHardware - destroy CTM:       %7.3f sec.\n",Duration(&tv1,&tv2));


   gettimeofday(&tv1,&tz);
   for (i=0; i<iter; i++)
      HardMult44_3(&result,&m1,&m2);
   gettimeofday(&tv2,&tz);

   printf("\nHardware - abandon results:   %7.3f sec.\n",Duration(&tv1,&tv2));
   printf("\nDone.\n");
   return 0;
}

/* convert gettimeofday() values to real number */          
/* Return the elapsed time between two gettimeofday() samples, in seconds,
   as a real number (tv2 minus tv1). */
double Duration(struct timeval *ptv1, struct timeval *ptv2)
{
   double start = (double)ptv1->tv_sec + (double)ptv1->tv_usec / 1000000.0;
   double stop  = (double)ptv2->tv_sec + (double)ptv2->tv_usec / 1000000.0;

   return stop - start;
}


/* 4x4 no optimization */
/* 4x4 no optimization: textbook triple loop, result = m1 * m2.
   pResult must not alias pm1 or pm2. */
void SoftMult44_1(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2)
{
   int row, col, term;

   for (row = 0; row < 4; row++)
      {
      for (col = 0; col < 4; col++)
	 {
	 float sum = 0.0;

	 for (term = 0; term < 4; term++)
	    sum += (*pm1)[row][term] * (*pm2)[term][col];
	 (*pResult)[row][col] = sum;
	 }
      }
}


/* 4x4 some optimization */
/* 4x4 some optimization: the dot-product loop is unrolled by hand and the
   16 results are written out in row-major order through pResult. */
void SoftMult44_2(float pResult[], MAT44 *pm1, MAT44 *pm2)
{
   register int row, col;
   int out = 0;

   for (row = 0; row < 4; row++)
      for (col = 0; col < 4; col++)
	 pResult[out++] = (*pm1)[row][0]*(*pm2)[0][col] +
	                  (*pm1)[row][1]*(*pm2)[1][col] +
	                  (*pm1)[row][2]*(*pm2)[2][col] +
	                  (*pm1)[row][3]*(*pm2)[3][col];
}


/* 4x4 more optimization */
/* 4x4 more optimization: both loops over the output columns are unrolled;
   the four elements of each row of m1 are cached in locals and reused for
   all four output columns.  Results are written row by row through pResult. */
void SoftMult44_3(float pResult[], MAT44 *pm1, MAT44 *pm2)
{
   register int row;

   for (row = 0; row < 4; row++)
      {
      float a0 = (*pm1)[row][0];
      float a1 = (*pm1)[row][1];
      float a2 = (*pm1)[row][2];
      float a3 = (*pm1)[row][3];

      pResult[0] = a0*(*pm2)[0][0] + a1*(*pm2)[1][0] +
                   a2*(*pm2)[2][0] + a3*(*pm2)[3][0];
      pResult[1] = a0*(*pm2)[0][1] + a1*(*pm2)[1][1] +
                   a2*(*pm2)[2][1] + a3*(*pm2)[3][1];
      pResult[2] = a0*(*pm2)[0][2] + a1*(*pm2)[1][2] +
                   a2*(*pm2)[2][2] + a3*(*pm2)[3][2];
      pResult[3] = a0*(*pm2)[0][3] + a1*(*pm2)[1][3] +
                   a2*(*pm2)[2][3] + a3*(*pm2)[3][3];
      pResult += 4;
      }
}


/* preserve CTM */
void HardMult44_1(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2)
{
   pushmatrix();
   loadmatrix(pm2);
   multmatrix(pm1);
   getmatrix(pResult);
   popmatrix();
}


/* destroy CTM */
void HardMult44_2(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2)
{
   loadmatrix(pm2);
   multmatrix(pm1);
   getmatrix(pResult);
}

/* preserve CTM, abandon results */
void HardMult44_3(MAT44 *pResult, MAT44 *pm1, MAT44 *pm2)
{
   pushmatrix();
   loadmatrix(pm2);
   multmatrix(pm1);
   popmatrix();
}

/* Write the 16 elements of a matrix to stdout, one row per line
   (each row preceded by a newline; no trailing newline). */
void Print44(MAT44 *pMat)
{
   int row, col;

   for (row = 0; row < 4; row++)
      {
      printf("\n");
      for (col = 0; col < 4; col++)
	 printf("%5.3f ",(*pMat)[row][col]);
      }
}


/* Overwrite *pMat with the 4x4 identity matrix. */
void Identity(MAT44 *pMat)
{
   int row, col;

   for (row = 0; row < 4; row++)
      for (col = 0; col < 4; col++)
	 {
	 if (row == col)
	    (*pMat)[row][col] = 1.0;
	 else
	    (*pMat)[row][col] = 0.0;
	 }
}

zombie@voodoo.UUCP (Mike York) (04/28/91)

In article <15407@helios.TAMU.EDU> jamie@archone.tamu.edu (James Price) writes:
>Has anyone done any benchmarking of the SGI matrix functions?  I was curious
>and wrote the program included below.  It does a number of 4x4 matrix 
>multiplies, first using software, and then using the geometry pipeline 
>functions (loadmatrix(), multmatrix(), getmatrix()).  
>
>Here are some typical results:
>
>10000 iterations on fritz, with GL version: GL4DGT-3.3
                                                 ^^
                                                 Aha, a GT!

Our application used to run on 4D/60T's.  We had one 4D/70GT.  We found out 
that anything that required the graphics pipeline to regurgitate information
(picking, getmatrix, feedback goodies, etc.), was very slow with the GT 
architecture (picking was up to 6 time slower with our application).  We got
rid of all our 60's and now have a bunch of 4D/25TG's.  I would venture a
guess that if you ran your program on a non-GT machine, you'd see better
results.  
-- 
    Mike York                     |
    Boeing Computer Services      |  Support your local nanobrewer --
    (206) 865-6577                |  No twist-off bottle caps.
    zombie@voodoo.boeing.com      |

tarolli@westcoast.esd.sgi.com (Gary Tarolli) (04/29/91)

In article <15407@helios.TAMU.EDU>, jamie@archone.tamu.edu (James Price) writes:
> Has anyone done any benchmarking of the SGI matrix functions?  I was curious
> and wrote the program included below.  It does a number of 4x4 matrix 
> multiplies, first using software, and then using the geometry pipeline 
> functions (loadmatrix(), multmatrix(), getmatrix()).  
> 
> Here are some typical results:
> 
> 10000 iterations on fritz, with GL version: GL4DGT-3.3
> 
> Software - no optimization:     3.349 sec.
> 
> Software - some optimization:   1.130 sec.
> 
> Software - more optimization:   0.910 sec.
> 
> Hardware - preserve CTM:        2.379 sec.
> 
> Hardware - destroy CTM:         2.289 sec.
> 
> Hardware - abandon results:     0.580 sec.
> 
> 
> The actual hardware multiplication is fast (0.580 sec/10000 multiplies) 
> but if we call getmatrix() to access the results, it slows things down 
> by around 400% (to 2.379 sec/10000 multiplies).  I was hoping to use the 
> speed of the hardware for my own matrix needs, but it looks like the 
> getmatrix() call is simply too costly.  Is there a better way?


It's possible to do a complete 4x4 matrix multiply in under 310 cycles on
a MIPS processor (in single precision).  At 33 Mhz this works out to over
100,000 matrix multiplies per second or .010 sec for your benchmark above,
more than 5 times faster than the hardware!

I think one of the reasons why your software benchmark ran so slow was
that you might have forgotten to compile with -float (and thus all floating
point math was done in double precision).

The theoretical limit for matrix multiply would be 64*4 cycles + a few.
Of course, this requires writing very careful assembler code in order
to overlap all the adds and load/stores with the 4 cycle multiplies.
So I suspect that you could improve upon the 310 number I actually
measured by about 10%.



--------------------
	Gary Tarolli

jmb@patton.wpd.sgi.com (Jim Barton) (05/02/91)

In all cases you must run the benchmark and average the results to get a true
performance number. The reasons are many and varied, but some of the more
significant ones are:

   1) When you first run a program, it takes a while to fill up the processor
      cache. Depending on context switching, etc., the cache can be more or
      less effective at various times during the run.

   2) When you first execute a program, IRIX must read it from disk. However,
      IRIX is fanatical about caching disk blocks in memory, and it is quite
      likely that the second execution just picks up the pages in memory, and
      execution time could be significantly faster. This happens even when the
      timing is built into the program, since executables are almost always
      demand paged.

   3) The way in which real memory pages are allocated to the process has a big
      impact on performance because the processor caches are direct mapped.
      For example, on a system with a 64Kb cache, real memory references
      modulo 64Kb will map to the same cache location. IRIX tries its best to
      allocate physical memory in a linear fashion, so that the probability of
      cache thrashing is minimized, but in the final analysis the application
      memory access pattern will determine the performance.

   4) The 4D/20 and 4D/25 have a 1-deep write buffer. By default, C does all
      floating point in double precision (two words). Thus, when the compiler
      writes out a double precision float, the first word is buffered, but
      the second stalls the processor until the first write has been retired.
      Single precision floats (-float flag to the compiler) will eliminate this
      problem (unless you really need double precision). The POWERSeries
      machines have a 4-deep write buffer, while the 4D35 has an 8 deep write
      buffer.

Benchmarking is Art, not Science. I suspect it always will be, despite the
best efforts of SPEC, etc.

-- Jim Barton
   Silicon Graphics Computer Systems
   jmb@sgi.com