[comp.windows.x] More Speedups for X11.2 Xsun

spaf@cs.purdue.EDU (Gene Spafford) (05/29/88)

Here's another body of speedups.  This doesn't produce an incredible
difference, but it does produce a small improvement in the performance
of my profiled Xsun.

The enclosed diffs modify the DoRop macro to check explicitly for the
two most commonly used rasterops (GXcopy and GXxor), then use a switch
statement for the rest.  The original code only checked for the most
frequent one (GXcopy), then did a binary search on the remaining ones.

Note that these diffs include some changes to maskbits.h, and the patch
file is relative to the original, unmodified sources rather than to my
earlier changes.  Thus, if you apply this via "patch" and you have
already applied my first set of changes, back them out first.

As before, comments and suggestions are welcomed!

--spaf


*** ./server/ddx/mfb/maskbits.h.orig	Wed May 25 22:38:33 1988
--- ./server/ddx/mfb/maskbits.h	Sat May 28 15:33:31 1988
***************
*** 28,35 ****
--- 28,39 ----
  
  extern int starttab[];
  extern int endtab[];
+ #ifndef PURDUE
  extern int startpartial[];
  extern int endpartial[];
+ #else PURDUE
+ extern unsigned partmasks[32][32];
+ #endif PURDUE
  extern int rmask[32];
  extern int mask[32];
  
***************
*** 206,213 ****
--- 210,222 ----
      else \
  	nlw = (w) >> 5;
  
+ #ifndef PURDUE
  #define maskpartialbits(x, w, mask) \
      mask = startpartial[(x) & 0x1f] & endpartial[((x) + (w)) & 0x1f];
+ #else PURDUE
+ #define maskpartialbits(x, w, mask) \
+     mask = partmasks[(x)&0x1f][(w)&0x1f];
+ #endif PURDUE
  
  #define mask32bits(x, w, startmask, endmask) \
      startmask = starttab[(x)&0x1f]; \
***************
*** 214,219 ****
--- 223,230 ----
      endmask = endtab[((x)+(w)) & 0x1f];
  
  
+ #ifndef PURDUE
+ 
  #define getbits(psrc, x, w, dst) \
  if ( ((x) + (w)) <= 32) \
  { \
***************
*** 269,274 ****
--- 280,344 ----
      t2 = DoRop(rop, t1, *((pdst) + 1)); \
      *((pdst)+1) = (*((pdst)+1) & starttab[n]) | (t2 & endtab[n]); \
  }
+ 
+ #else PURDUE
+ #define getbits(psrc, x, w, dst) \
+ if ( ((x) + (w)) <= 32) \
+ { \
+     dst = SCRLEFT((unsigned) *(psrc), (x)); \
+ } \
+ else \
+ { \
+     dst = (SCRLEFT((unsigned) *(psrc), (x))) | \
+ 	  (SCRRIGHT((unsigned) *((psrc)+1), 32-(x))); \
+ }
+ 
+ #define putbits(src, x, w, pdst) \
+ { \
+     int n = (x)+(w)-32; \
+     if (n <= 0) \
+     { \
+ 	unsigned tmpmask; \
+ 	maskpartialbits((x), (w), tmpmask); \
+ 	*(pdst) = (*(pdst) & ~tmpmask) | \
+ 		(SCRRIGHT((unsigned) src, x) & tmpmask); \
+     } \
+     else \
+     { \
+ 	unsigned int *ptmp_ = (pdst)+1; \
+ 	int m = 32-(x); \
+ 	*(pdst) = (*(pdst) & endtab[x]) | (SCRRIGHT((unsigned) (src), x)); \
+ 	*ptmp_ = (*ptmp_ & starttab[n]) | \
+ 		(SCRLEFT((unsigned) src, m) & endtab[n]); \
+     } \
+ }
+ 
+ #define putbitsrop(src, x, w, pdst, rop) \
+ if ( ((x)+(w)) <= 32) \
+ { \
+     int tmpmask; \
+     int t1, t2; \
+     maskpartialbits((x), (w), tmpmask); \
+     t1 = SCRRIGHT((src), (x)); \
+     DoRop(t2, rop, t1, *(pdst)); \
+     *(pdst) = (*(pdst) & ~tmpmask) | (t2 & tmpmask); \
+ } \
+ else \
+ { \
+     int m; \
+     int n; \
+     int t1, t2; \
+     m = 32-(x); \
+     n = (w) - m; \
+     t1 = SCRRIGHT((src), (x)); \
+     DoRop(t2, rop, t1, *(pdst)); \
+     *(pdst) = (*(pdst) & endtab[x]) | (t2 & starttab[x]); \
+     t1 = SCRLEFT((src), m); \
+     DoRop(t2, rop, t1, *((pdst) + 1)); \
+     *((pdst)+1) = (*((pdst)+1) & starttab[n]) | (t2 & endtab[n]); \
+ }
+ 
+ #endif PURDUE
  
  #define putbitsrrop(src, x, w, pdst, rop) \
  if ( ((x)+(w)) <= 32) \
*** ./server/ddx/mfb/mfb.h.orig	Sat May 28 15:49:34 1988
--- ./server/ddx/mfb/mfb.h	Sat May 28 20:37:16 1988
***************
*** 249,254 ****
--- 249,255 ----
  #define fnNAND(src, dst)	(~(src & dst))
  #define fnSET(src, dst)		(~0)
  
+ #ifndef PURDUE
  /* Binary search to figure out what to do for the raster op.  It may
   * do 5 comparisons, but at least it does no function calls 
   * Special cases copy because it's so frequent 
***************
*** 270,275 ****
--- 271,345 ----
         (((alu) >= GXandReverse) ? \
  	 (((alu) == GXandReverse) ? ((src) & ~(dst)) : (src)) : \
  	 (((alu) == GXand) ? ((src) & (dst)) : 0)))  ) )
+ #else PURDUE
+ /*  Using a "switch" statement is much faster in most cases
+  *  since the compiler can do a look-up table or multi-way branch
+  *  instruction, depending on the architecture.  The result on
+  *  a Sun 3/50 is at least 2.5 times faster, assuming a uniform
+  *  distribution of RasterOp operation types.
+  *
+  *  However, doing some profiling on a running system reveals
+  *  GXcopy is the operation over 99.5% of the time and
+  *  GXxor is the next most frequent (about .4%), so we make special
+  *  checks for those first.
+  *
+  *  Note that this requires a change to the "calling sequence"
+  *  since a "switch" is a statement and so cannot yield a value.
+  */
+ #define DoRop(result, alu, src, dst) \
+ { \
+     if (alu == GXcopy) \
+ 	result = fnCOPY (src, dst); \
+     else if (alu == GXxor) \
+         result = fnXOR (src, dst); \
+     else \
+ 	switch (alu) \
+ 	{ \
+ 	  case GXclear: \
+ 	    result = fnCLEAR (src, dst); \
+ 	    break; \
+ 	  case GXand: \
+ 	    result = fnAND (src, dst); \
+ 	    break; \
+ 	  case GXandReverse: \
+ 	    result = fnANDREVERSE (src, dst); \
+ 	    break; \
+ 	  case GXandInverted: \
+ 	    result = fnANDINVERTED (src, dst); \
+ 	    break; \
+ 	  case GXnoop: \
+ 	    result = fnNOOP (src, dst); \
+ 	    break; \
+ 	  case GXor: \
+ 	    result = fnOR (src, dst); \
+ 	    break; \
+ 	  case GXnor: \
+ 	    result = fnNOR (src, dst); \
+ 	    break; \
+ 	  case GXequiv: \
+ 	    result = fnEQUIV (src, dst); \
+ 	    break; \
+ 	  case GXinvert: \
+ 	    result = fnINVERT (src, dst); \
+ 	    break; \
+ 	  case GXorReverse: \
+ 	    result = fnORREVERSE (src, dst); \
+ 	    break; \
+ 	  case GXcopyInverted: \
+ 	    result = fnCOPYINVERTED (src, dst); \
+ 	    break; \
+ 	  case GXorInverted: \
+ 	    result = fnORINVERTED (src, dst); \
+ 	    break; \
+ 	  case GXnand: \
+ 	    result = fnNAND (src, dst); \
+ 	    break; \
+ 	  case GXset: \
+ 	    result = fnSET (src, dst); \
+ 	    break; \
+ 	} \
+ }
+ #endif PURDUE
  
  
  #define DoRRop(alu, src, dst) \
*** ./server/ddx/mfb/mfbbitblt.c.orig	Sat May 28 15:35:10 1988
--- ./server/ddx/mfb/mfbbitblt.c	Sat May 28 15:36:48 1988
***************
*** 615,621 ****
--- 615,625 ----
  		        while (nl--)
  		        {
  			    getbits(psrc, xoffSrc, 32, tmpSrc)
+ #ifndef PURDUE
  			    *pdst = DoRop(alu, tmpSrc, *pdst);
+ #else PURDUE
+ 			    DoRop(*pdst, alu, tmpSrc, *pdst);
+ #endif
  			    pdst++;
  			    psrc++;
  		        }
***************
*** 657,663 ****
--- 661,671 ----
  			    --psrc;
  			    --pdst;
  			    getbits(psrc, xoffSrc, 32, tmpSrc)
+ #ifndef PURDUE
  			    *pdst = DoRop(alu, tmpSrc, *pdst);
+ #else PURDUE
+ 			    DoRop(*pdst, alu, tmpSrc, *pdst);
+ #endif PURDUE
  		        }
  
  		        if (startmask)
*** ./server/ddx/mfb/mfbfillsp.c.orig	Sat May 28 15:36:52 1988
--- ./server/ddx/mfb/mfbfillsp.c	Sat May 28 15:37:52 1988
***************
*** 889,895 ****
--- 889,899 ----
  		    while(nlMiddle--)
  		    {
  			    getbits(psrc, nstart, 32, tmpSrc);
+ #ifndef PURDUE
  			    *pdst = DoRop(rop, tmpSrc, *pdst);
+ #else PURDUE
+ 			    DoRop(*pdst, rop, tmpSrc, *pdst);
+ #endif PURDUE
  			    pdst++;
  			    psrc++;
  		    }
*** ./server/ddx/mfb/mfbsetsp.c.orig	Sat May 28 15:37:58 1988
--- ./server/ddx/mfb/mfbsetsp.c	Sat May 28 15:38:43 1988
***************
*** 104,110 ****
--- 104,114 ----
  	while (nl--) 
  	{ 
  	    getbits(psrc, offSrc, 32, tmpSrc);
+ #ifndef PURDUE
  	    *pdst = DoRop(alu, tmpSrc, *pdst); 
+ #else PURDUE
+ 	    DoRop(*pdst, alu, tmpSrc, *pdst); 
+ #endif PURDUE
  	    pdst++; 
  	    psrc++; 
  	} 
*** ./server/ddx/mfb/mfbtile.c.orig	Sat May 28 15:38:48 1988
--- ./server/ddx/mfb/mfbtile.c	Sat May 28 15:49:28 1988
***************
*** 108,115 ****
--- 108,123 ----
  	    {
  		srcpix = psrc[iy];
  		iy = ++iy < tileHeight ? iy : 0;
+ #ifndef PURDUE
  		*p = (*p & ~startmask) |
  		     (DoRop(alu, srcpix, *p) & startmask);
+ #else PURDUE
+ 		{
+ 		    unsigned _p;
+ 		    DoRop(_p, alu, srcpix, *p);
+ 		    *p = (*p & ~startmask) | (_p & startmask);
+ 		}
+ #endif PURDUE
  		p += nlwExtra;
  	    }
  	}
***************
*** 126,141 ****
--- 134,169 ----
  		    srcpix = psrc[iy];
  		    iy = ++iy < tileHeight ? iy : 0;
  		    nlw = nlwMiddle;
+ #ifndef PURDUE
  		    *p = (*p & ~startmask) | 
  			 (DoRop(alu, srcpix, *p) & startmask);
+ #else PURDUE
+ 		    {
+ 			unsigned _p;
+ 			DoRop(_p, alu, srcpix, *p);
+ 			*p = (*p & ~startmask) | (_p & startmask);
+ 		    }
+ #endif PURDUE
  		    p++;
  		    while (nlw--)
  		    {
+ #ifndef PURDUE
  			*p = DoRop(alu, srcpix, *p);
+ #else PURDUE
+ 			DoRop(*p, alu, srcpix, *p);
+ #endif PURDUE
  			p++;
  		    }
+ #ifndef PURDUE
  		    *p = (*p & ~endmask) |
  		         (DoRop(alu, srcpix, *p) & endmask);
+ #else PURDUE
+ 		    {
+ 			unsigned _p;
+ 			DoRop(_p, alu, srcpix, *p);
+ 			*p = (*p & ~endmask) | (_p & endmask);
+ 		    }
+ #endif PURDUE
  		    p += nlwExtra;
  		}
  	    }
***************
*** 147,158 ****
--- 175,198 ----
  		    srcpix = psrc[iy];
  		    iy = ++iy < tileHeight ? iy : 0;
  		    nlw = nlwMiddle;
+ #ifndef PURDUE
  		    *p = (*p & ~startmask) | 
  			 (DoRop(alu, srcpix, *p) & startmask);
+ #else PURDUE
+ 		    {
+ 			unsigned _p;
+ 			DoRop(_p, alu, srcpix, *p);
+ 			*p = (*p & ~startmask) | (_p & startmask);
+ 		    }
+ #endif PURDUE
  		    p++;
  		    while (nlw--)
  		    {
+ #ifndef PURDUE
  			*p = DoRop(alu, srcpix, *p);
+ #else PURDUE
+ 			DoRop(*p, alu, srcpix, *p);
+ #endif PURDUE
  			p++;
  		    }
  		    p += nlwExtra;
***************
*** 167,177 ****
--- 207,229 ----
  		    nlw = nlwMiddle;
  		    while (nlw--)
  		    {
+ #ifndef PURDUE
  			*p = DoRop(alu, srcpix, *p);
+ #else PURDUE
+ 			DoRop(*p, alu, srcpix, *p);
+ #endif PURDUE
  			p++;
  		    }
+ #ifndef PURDUE
  		    *p = (*p & ~endmask) |
  		         (DoRop(alu, srcpix, *p) & endmask);
+ #else PURDUE
+ 		    {
+ 			unsigned _p;
+ 			DoRop(_p, alu, srcpix, *p);
+ 			*p = (*p & ~endmask) | (_p & endmask);
+ 		    }
+ #endif PURDUE
  		    p += nlwExtra;
  		}
  	    }
***************
*** 184,190 ****
--- 236,246 ----
  		    nlw = nlwMiddle;
  		    while (nlw--)
  		    {
+ #ifndef PURDUE
  			*p = DoRop(alu, srcpix, *p);
+ #else PURDUE
+ 			DoRop(*p, alu, srcpix, *p);
+ #endif PURDUE
  			p++;
  		    }
  		    p += nlwExtra;
-- 
Gene Spafford
NSF/Purdue/U of Florida  Software Engineering Research Center,
Dept. of Computer Sciences, Purdue University, W. Lafayette IN 47907-2004
Internet:  spaf@cs.purdue.edu	uucp:	...!{decwrl,gatech,ucbvax}!purdue!spaf

spaf@cs.purdue.EDU (Gene Spafford) (06/01/88)

A "shar" file of all the Purdue mods for the Sun 3/50 Xsun speed-up has
been placed on expo.lcs.mit.edu in the file
contrib/PURDUE-speedups.shar.

This includes the fixed version of the first three changes, and
the second posting with the modification to the DoRop macro.

An updated "README" file describing the changes, etc. has been
included.


-- 
Gene Spafford
NSF/Purdue/U of Florida  Software Engineering Research Center,
Dept. of Computer Sciences, Purdue University, W. Lafayette IN 47907-2004
Internet:  spaf@cs.purdue.edu	uucp:	...!{decwrl,gatech,ucbvax}!purdue!spaf