spaf@cs.purdue.EDU (Gene Spafford) (05/29/88)
Here's another body of speedups. This doesn't produce an incredible difference, but it does produce a small improvement in the performance of my profiled Xsun. The enclosed diffs modify the DoRop macro to check explicitly for the two most commonly used rasterops, and then use a switch statement for the rest. The original code only checked for the most frequent one, then did a binary search on the remaining ones. Note that these diffs include some changes to maskbits.h, and the patch file is relative to the original changes. Thus, if you apply this via "patch", and you applied my first set of changes, back them out first. As before, comments and suggestions are welcomed! --spaf *** ./server/ddx/mfb/maskbits.h.orig Wed May 25 22:38:33 1988 --- ./server/ddx/mfb/maskbits.h Sat May 28 15:33:31 1988 *************** *** 28,35 **** --- 28,39 ---- extern int starttab[]; extern int endtab[]; + #ifndef PURDUE extern int startpartial[]; extern int endpartial[]; + #else PURDUE + extern unsigned partmasks[32][32]; + #endif PURDUE extern int rmask[32]; extern int mask[32]; *************** *** 206,213 **** --- 210,222 ---- else \ nlw = (w) >> 5; + #ifndef PURDUE #define maskpartialbits(x, w, mask) \ mask = startpartial[(x) & 0x1f] & endpartial[((x) + (w)) & 0x1f]; + #else PURDUE + #define maskpartialbits(x, w, mask) \ + mask = partmasks[(x)&0x1f][(w)&0x1f]; + #endif PURDUE #define mask32bits(x, w, startmask, endmask) \ startmask = starttab[(x)&0x1f]; \ *************** *** 214,219 **** --- 223,230 ---- endmask = endtab[((x)+(w)) & 0x1f]; + #ifndef PURDUE + #define getbits(psrc, x, w, dst) \ if ( ((x) + (w)) <= 32) \ { \ *************** *** 269,274 **** --- 280,344 ---- t2 = DoRop(rop, t1, *((pdst) + 1)); \ *((pdst)+1) = (*((pdst)+1) & starttab[n]) | (t2 & endtab[n]); \ } + + #else PURDUE + #define getbits(psrc, x, w, dst) \ + if ( ((x) + (w)) <= 32) \ + { \ + dst = SCRLEFT((unsigned) *(psrc), (x)); \ + } \ + else \ + { \ + dst = (SCRLEFT((unsigned) *(psrc), (x))) | \ + (SCRRIGHT((unsigned) 
*((psrc)+1), 32-(x))); \ + } + + #define putbits(src, x, w, pdst) \ + { \ + int n = (x)+(w)-32; \ + if (n <= 0) \ + { \ + unsigned tmpmask; \ + maskpartialbits((x), (w), tmpmask); \ + *(pdst) = (*(pdst) & ~tmpmask) | \ + (SCRRIGHT((unsigned) src, x) & tmpmask); \ + } \ + else \ + { \ + unsigned int *ptmp_ = (pdst)+1; \ + int m = 32-(x); \ + *(pdst) = (*(pdst) & endtab[x]) | (SCRRIGHT((unsigned) (src), x)); \ + *ptmp_ = (*ptmp_ & starttab[n]) | \ + (SCRLEFT((unsigned) src, m) & endtab[n]); \ + } \ + } + + #define putbitsrop(src, x, w, pdst, rop) \ + if ( ((x)+(w)) <= 32) \ + { \ + int tmpmask; \ + int t1, t2; \ + maskpartialbits((x), (w), tmpmask); \ + t1 = SCRRIGHT((src), (x)); \ + DoRop(t2, rop, t1, *(pdst)); \ + *(pdst) = (*(pdst) & ~tmpmask) | (t2 & tmpmask); \ + } \ + else \ + { \ + int m; \ + int n; \ + int t1, t2; \ + m = 32-(x); \ + n = (w) - m; \ + t1 = SCRRIGHT((src), (x)); \ + DoRop(t2, rop, t1, *(pdst)); \ + *(pdst) = (*(pdst) & endtab[x]) | (t2 & starttab[x]); \ + t1 = SCRLEFT((src), m); \ + DoRop(t2, rop, t1, *((pdst) + 1)); \ + *((pdst)+1) = (*((pdst)+1) & starttab[n]) | (t2 & endtab[n]); \ + } + + #endif PURDUE #define putbitsrrop(src, x, w, pdst, rop) \ if ( ((x)+(w)) <= 32) \ *** ./server/ddx/mfb/mfb.h.orig Sat May 28 15:49:34 1988 --- ./server/ddx/mfb/mfb.h Sat May 28 20:37:16 1988 *************** *** 249,254 **** --- 249,255 ---- #define fnNAND(src, dst) (~(src & dst)) #define fnSET(src, dst) (~0) + #ifndef PURDUE /* Binary search to figure out what to do for the raster op. It may * do 5 comparisons, but at least it does no function calls * Special cases copy because it's so frequent *************** *** 270,275 **** --- 271,345 ---- (((alu) >= GXandReverse) ? \ (((alu) == GXandReverse) ? ((src) & ~(dst)) : (src)) : \ (((alu) == GXand) ? 
((src) & (dst)) : 0))) ) ) + #else PURDUE + /* Using a "switch" statement is much faster in most cases + * since the compiler can do a look-up table or multi-way branch + * instruction, depending on the architecture. The result on + * A Sun 3/50 is at least 2.5 times faster, assuming a uniform + * distribution of RasterOp operation types. + * + * However, doing some profiling on a running system reveals + * GXcopy is the operation over 99.5% of the time and + * GXxor is the next most frequent (about .4%), so we make special + * checks for those first. + * + * Note that this requires a change to the "calling sequence" + * since we can't engineer a "switch" statement to have an lvalue. + */ + #define DoRop(result, alu, src, dst) \ + { \ + if (alu == GXcopy) \ + result = fnCOPY (src, dst); \ + else if (alu == GXxor) \ + result = fnXOR (src, dst); \ + else \ + switch (alu) \ + { \ + case GXclear: \ + result = fnCLEAR (src, dst); \ + break; \ + case GXand: \ + result = fnAND (src, dst); \ + break; \ + case GXandReverse: \ + result = fnANDREVERSE (src, dst); \ + break; \ + case GXandInverted: \ + result = fnANDINVERTED (src, dst); \ + break; \ + case GXnoop: \ + result = fnNOOP (src, dst); \ + break; \ + case GXor: \ + result = fnOR (src, dst); \ + break; \ + case GXnor: \ + result = fnNOR (src, dst); \ + break; \ + case GXequiv: \ + result = fnEQUIV (src, dst); \ + break; \ + case GXinvert: \ + result = fnINVERT (src, dst); \ + break; \ + case GXorReverse: \ + result = fnORREVERSE (src, dst); \ + break; \ + case GXcopyInverted: \ + result = fnCOPYINVERTED (src, dst); \ + break; \ + case GXorInverted: \ + result = fnORINVERTED (src, dst); \ + break; \ + case GXnand: \ + result = fnNAND (src, dst); \ + break; \ + case GXset: \ + result = fnSET (src, dst); \ + break; \ + } \ + } + #endif PURDUE #define DoRRop(alu, src, dst) \ *** ./server/ddx/mfb/mfbbitblt.c.orig Sat May 28 15:35:10 1988 --- ./server/ddx/mfb/mfbbitblt.c Sat May 28 15:36:48 1988 *************** *** 615,621 
**** --- 615,625 ---- while (nl--) { getbits(psrc, xoffSrc, 32, tmpSrc) + #ifndef PURDUE *pdst = DoRop(alu, tmpSrc, *pdst); + #else PURDUE + DoRop(*pdst, alu, tmpSrc, *pdst); + #endif pdst++; psrc++; } *************** *** 657,663 **** --- 661,671 ---- --psrc; --pdst; getbits(psrc, xoffSrc, 32, tmpSrc) + #ifndef PURDUE *pdst = DoRop(alu, tmpSrc, *pdst); + #else PURDUE + DoRop(*pdst, alu, tmpSrc, *pdst); + #endif PURDUE } if (startmask) *** ./server/ddx/mfb/mfbfillsp.c.orig Sat May 28 15:36:52 1988 --- ./server/ddx/mfb/mfbfillsp.c Sat May 28 15:37:52 1988 *************** *** 889,895 **** --- 889,899 ---- while(nlMiddle--) { getbits(psrc, nstart, 32, tmpSrc); + #ifndef PURDUE *pdst = DoRop(rop, tmpSrc, *pdst); + #else PURDUE + DoRop(*pdst, rop, tmpSrc, *pdst); + #endif PURDUE pdst++; psrc++; } *** ./server/ddx/mfb/mfbsetsp.c.orig Sat May 28 15:37:58 1988 --- ./server/ddx/mfb/mfbsetsp.c Sat May 28 15:38:43 1988 *************** *** 104,110 **** --- 104,114 ---- while (nl--) { getbits(psrc, offSrc, 32, tmpSrc); + #ifndef PURDUE *pdst = DoRop(alu, tmpSrc, *pdst); + #else PURDUE + DoRop(*pdst, alu, tmpSrc, *pdst); + #endif PURDUE pdst++; psrc++; } *** ./server/ddx/mfb/mfbtile.c.orig Sat May 28 15:38:48 1988 --- ./server/ddx/mfb/mfbtile.c Sat May 28 15:49:28 1988 *************** *** 108,115 **** --- 108,123 ---- { srcpix = psrc[iy]; iy = ++iy < tileHeight ? iy : 0; + #ifndef PURDUE *p = (*p & ~startmask) | (DoRop(alu, srcpix, *p) & startmask); + #else PURDUE + { + unsigned _p; + DoRop(_p, alu, srcpix, *p); + *p = (*p & ~startmask) | (_p & startmask); + } + #endif PURDUE p += nlwExtra; } } *************** *** 126,141 **** --- 134,169 ---- srcpix = psrc[iy]; iy = ++iy < tileHeight ? 
iy : 0; nlw = nlwMiddle; + #ifndef PURDUE *p = (*p & ~startmask) | (DoRop(alu, srcpix, *p) & startmask); + #else PURDUE + { + unsigned _p; + DoRop(_p, alu, srcpix, *p); + *p = (*p & ~startmask) | (_p & startmask); + } + #endif PURDUE p++; while (nlw--) { + #ifndef PURDUE *p = DoRop(alu, srcpix, *p); + #else PURDUE + DoRop(*p, alu, srcpix, *p); + #endif PURDUE p++; } + #ifndef PURDUE *p = (*p & ~endmask) | (DoRop(alu, srcpix, *p) & endmask); + #else PURDUE + { + unsigned _p; + DoRop(_p, alu, srcpix, *p); + *p = (*p & ~endmask) | (_p & endmask); + } + #endif PURDUE p += nlwExtra; } } *************** *** 147,158 **** --- 175,198 ---- srcpix = psrc[iy]; iy = ++iy < tileHeight ? iy : 0; nlw = nlwMiddle; + #ifndef PURDUE *p = (*p & ~startmask) | (DoRop(alu, srcpix, *p) & startmask); + #else PURDUE + { + unsigned _p; + DoRop(_p, alu, srcpix, *p); + *p = (*p & ~startmask) | (_p & startmask); + } + #endif PURDUE p++; while (nlw--) { + #ifndef PURDUE *p = DoRop(alu, srcpix, *p); + #else PURDUE + DoRop(*p, alu, srcpix, *p); + #endif PURDUE p++; } p += nlwExtra; *************** *** 167,177 **** --- 207,229 ---- nlw = nlwMiddle; while (nlw--) { + #ifndef PURDUE *p = DoRop(alu, srcpix, *p); + #else PURDUE + DoRop(*p, alu, srcpix, *p); + #endif PURDUE p++; } + #ifndef PURDUE *p = (*p & ~endmask) | (DoRop(alu, srcpix, *p) & endmask); + #else PURDUE + { + unsigned _p; + DoRop(_p, alu, srcpix, *p); + *p = (*p & ~endmask) | (_p & endmask); + } + #endif PURDUE p += nlwExtra; } } *************** *** 184,190 **** --- 236,246 ---- nlw = nlwMiddle; while (nlw--) { + #ifndef PURDUE *p = DoRop(alu, srcpix, *p); + #else PURDUE + DoRop(*p, alu, srcpix, *p); + #endif PURDUE p++; } p += nlwExtra; -- Gene Spafford NSF/Purdue/U of Florida Software Engineering Research Center, Dept. of Computer Sciences, Purdue University, W. Lafayette IN 47907-2004 Internet: spaf@cs.purdue.edu uucp: ...!{decwrl,gatech,ucbvax}!purdue!spaf
spaf@cs.purdue.EDU (Gene Spafford) (06/01/88)
A "shar" file of all Purdue mods for Sun 3/50 Xsun speed-up has been placed on expo.lcs.mit.edu in the file contrib/PURDUE-speedups.shar. This includes the fixed version of the first three changes, and the second posting with the modification to the DoRop macro. An updated "README" file describing the changes, etc., has also been included. -- Gene Spafford NSF/Purdue/U of Florida Software Engineering Research Center, Dept. of Computer Sciences, Purdue University, W. Lafayette IN 47907-2004 Internet: spaf@cs.purdue.edu uucp: ...!{decwrl,gatech,ucbvax}!purdue!spaf