malcolm@spar.SPAR.SLB.COM (Malcolm Slaney) (01/30/88)
The times shown in the table below are number of seconds to calculate 10 iterations of a 1024 point FFT. In this study the C and Lisp version of the FFT routine are identical. I started with Harry Barrow's FFT benchmark (with declarations added by Lucid) and then translated it statement by statement into C (thanks to Dave Singer for double checking this.) I believe that this is as accurate a comparison as is possible. Note, all of the Sun tests were measured with a version of Lucid Common Lisp labelled "Development Environment 2.1.1". This lisp only does single precision arithmetic using the 68881. Thus the only two lines that can be directly compared are "Single 68881 C" versus "Lucid 2.1.1". I believe it is now reasonable to conclude that Lisp compilers are as fast as C compilers. I don't have any numbers for a prodution quality Lisp compiler for the Sun/4's. I don't know of any reason the SPARC machine would run compiled Lisp slower than C. Finally, a word about register declarations in C: The times shown here were measured by putting the variables I thought were most accessed into registers. I did this with the same effort I normally go to make my own programs fast. I did not do any exhaustive testing to find the best set. When all of the register declarations are removed the 3/160 times got slower by .8 seconds and all the 2/260 times got slower by about .4 seconds. The Sun 4/260 numbers did not change any indicating they are doing global register optimization or the 4/260 is being completely dominated by the floating point work. The Sun 3 no-register times make sense since only the loop iteration and array addresses were put in registers and thus they are independent of the floating point work. Malcolm *********************************************************** Time (in seconds) to execute 10 iterations of a 1024 point FFT 3/160 3/260 4/260 Double FPA C 2.4 1.5 0.8 (*) Single FPA C 1.3 0.8 0.3 (*) Double 68881 C 5.0 4.1 0.8 (*) Single 68881 C 4.8 4.1 0.3 (*) <====== Lucid 2.1.1 4.7 4.0 <====== Symbolics Machine 4.7 (No IFU, No FPA - From Gabriel - Duplicated @ SPAR) Symbolics Machine 3.9 (with either IFU or FPA - From Gabriel) * Note that Sun4/260's only have one floating point option. They use a pair of Weitek floating point chips. And finally, here is the actual C version of the Barrow/Gabriel/Lucid FFT benchmark. The original Lisp is in comments. *********************************************************** /* * ;; FFT -- This is an FFT benchmark written by Harry Barrow. * ;; It tests a variety of floating point operations, including array * ;; references. * ;; * ;; Originally written in Lisp. * ;; Translated to C by Malcolm@spar.slb.com January 1988. * Thanks to Dave Singer @ SPAR for double checking the translation. /* * * (defvar **fft-re** * (make-array 1025. :element-type 'single-float :initial-element 0.0)) * (defvar **fft-im** * (make-array 1025. :element-type 'single-float :initial-element 0.0)) */ float fft_re[1025], fft_im[1025]; /* * (defvar s-pi (float pi 0.0)) */ #define s_pi 3.141592653589792434 /* * (defun fft (areal aimag) * (declare (type (simple-array single-float (*)) areal aimag)) */ fft(areal, aimag) float areal[], aimag[]; { /* (prog* ((ar areal) * (ai aimag) * (i 1) * (j 0) * (k 0) * (m 0) ;compute m = log(n) * (n (1- (array-dimension ar 0))) * (nv2 (floor n 2)) * (le 0) (le1 0) (ip 0) * (ur 0.0) (ui 0.0) (wr 0.0) (wi 0.0) (tr 0.0) (ti 0.0)) * (declare (type fixnum i j k n nv2 m le le1 ip)) * (declare (type (simple-array single-float (*)) ar ai)) * (declare (single-float ur ui wr wi tr ti)) */ register float *ar = areal, *ai = aimag; register int i = 1, j = 0, k = 0, m = 0; int n = 1024, nv2 = 512, le = 0, le1 = 0, ip = 0; float ur = 0.0, ui = 0.0, wr = 0.0, wi = 0.0, tr = 0.0, ti = 0.0; register int l; /* l1 (cond ((< i n) * (setq m (the fixnum (1+ m)) * i (the fixnum (+ i i))) * (go l1))) */ l1 : if (i < n){ m += 1; i += i; goto l1; } /* * (cond ((not (equal n (the fixnum (expt 2 m)))) * (princ "error ... array size not a power of two.") * (read) * (return (terpri)))) */ /* * (setq j 1 ;interchange elements * i 1) ;in bit-reversed order */ j = 1; i = 1; /* * l3 (cond ((< i j) * (setq tr (aref ar j) * ti (aref ai j)) * (setf (aref ar j) (aref ar i)) * (setf (aref ai j) (aref ai i)) * (setf (aref ar i) tr) * (setf (aref ai i) ti))) */ l3: if (i < j){ tr = ar[j]; ti = ai[j]; ar[j] = ar[i]; ai[j] = ai[i]; ar[i] = tr; ai[i] = ti; } /* * (setq k nv2) * l6 (cond ((< k j) * (setq j (the fixnum (- j k)) * k (the fixnum (/ k 2))) * (go l6))) * (setq j (the fixnum (+ j k)) * i (the fixnum (1+ i))) */ k = nv2; l6: if (k < j){ j -= k; k /= 2; goto l6; } j += k; i += 1; /* * (cond ((< i n) * (go l3))) */ if (i < n) goto l3; /* * (do ((l 1 (the fixnum (1+ l)))) * ((> (the fixnum l) m)) ;loop thru stages * (declare (type fixnum l)) * (setq le (the fixnum (expt 2 l)) * le1 (floor le 2) ;;(the (values fixnum fixnum) (floor le 2)) * ur 1.0 * ui 0.0 ;This used to be fixnum 0 --smh * wr (cos (/ s-pi le1)) * wi (sin (/ s-pi le1))) */ for (l = 1; l <= m; l++){ le = 1 << l; le1 = le / 2; ur = 1.0; ui = 0.0; wr = cos(s_pi / le1); wi = sin(s_pi / le1); /* * (do ((j 1 (the fixnum (1+ (the fixnum j))))) * ((> (the fixnum j) le1)) ;loop thru butterflies * (declare (type fixnum j)) * (do ((i j (the fixnum (+ i le)))) * ((> (the fixnum i) n)) ;do a butterfly * (declare (type fixnum i)) * (setq ip (the fixnum (+ i le1)) * tr (- (* (aref ar ip) ur) * (* (aref ai ip) ui)) * ti (+ (* (aref ar ip) ui) * (* (aref ai ip) ur))) * (setf (aref ar ip) (- (aref ar i) tr)) * (setf (aref ai ip) (- (aref ai i) ti)) * (setf (aref ar i) (+ (aref ar i) tr)) * (setf (aref ai i) (+ (aref ai i) ti)))) * (setq tr (- (* ur wr) (* ui wi)) * ti (+ (* ur wi) (* ui wr)) * ur tr * ui ti)) * (return t))) */ for (j = 1; j <= le1; j++){ for (i = j; i <= n; i += le){ ip = i + le1; tr = ar[ip]*ur - ai[ip] * ui; ti = ar[ip]*ui + ai[ip] * ur; ar[ip] = ar[i] - tr; ai[ip] = ai[i] - ti; ar[i] += tr; ai[i] += ti; } } tr = ur * wr - ui * wi; ti = ur * wi + wi * wr; ur = tr; ui = ti; } } /* * (defun fft-bench () * (dotimes (i 10) * (fft **fft-re** **fft-im**))) */ fft_bench(){ int i; for (i=0; i< 10; i++) fft(fft_re, fft_im); } /* * (defun testfft () * (format *trace-output* "; doing fft: (fft-bench)~%") * (print (time (fft-bench)))) */ /* * (defun clear-fft () * (dotimes (i 1025) * (setf (aref **fft-re** i) 0.0 * (aref **fft-im** i) 0.0)) * (values)) */ clear_fft(){ int i; for (i=0;i<1025;i++) fft_re[i] = fft_im[i] = 0; } /* * (defun setup-fft-component (theta &optional (phase 0.0)) * (let ((f (* 2 pi theta)) * (c (cos (* 0.5 pi phase))) * (s (sin (* 0.5 pi phase)))) * (dotimes (i 1025) * (let ((x (sin (* f (/ i 1024.0))))) * (incf (aref **fft-re** i) (float (* c x) 0.0)) * (incf (aref **fft-im** i) (float (* s x) 0.0))))) * (values)) */ setup_fft_component(theta,phase) float theta, phase; { float f, c, s; int i; f = 2 * s_pi * theta; c = cos(0.5 * s_pi * phase); s = sin(0.5 * s_pi * phase); for (i = 0; i < 1025; i++){ float x; x = sin(f * (i / 1024.0)); fft_re[i] += c * x; fft_im[i] += s * x; } } /* * (defvar fft-delta 0.0001) */ #define fft_delta 0.0001 /* * (defun print-fft () * (dotimes (i 1025) * (let ((re (aref **fft-re** i)) * (im (aref **fft-im** i))) * (unless (and (< (abs re) fft-delta) (< (abs im) fft-delta)) * (format t "~4d ~10f ~10f~%" i re im)))) * (values)) */ print_fft(){ int i; for (i = 0; i< 1025; i++){ float re, im; re = fft_re[i]; im = fft_im[i]; if (abs(re) > fft_delta || abs(im) > fft_delta) printf("%4d %10f %10f\n", i, re, im); } } main(){ fft_bench(); }
eggert@sea.sm.unisys.com (Paul Eggert) (01/31/88)
Expires:
Sender:
Followup-To:
Distribution:
Keywords:
In article <536@spar.SPAR.SLB.COM> malcolm@spar.SPAR.SLB.COM (Malcolm Slaney)
gives a C version on of the Barrow/Gabriel/Lucid FFT benchmark, and concludes
that Lisp compilers now generate code as fast as the code that C compilers
generate. But his C version's register declarations could stand improvements;
in particular, the key scalar variables should be put in registers. Given the
changes to his program suggested at the end of this message, C's performance
goes up 15-20% on a Sun-3 with 68881, so his table should be revised. (I lack
access to the other kinds of Suns he benchmarked and so will omit their
figures.)
Time (in seconds) to execute 10 iterations of a 1024 point FFT
Sun-3/160 Sun-3/160
(original) (revised)
Double 68881 C 5.0 4.1
Single 68881 C 4.8 4.2
Single 68881 Lucid 2.1.1 4.7 4.7
Even with these changes, I would like to see a better benchmark for relating
the quality of C compilers versus Lisp compilers. The FFT benchmark is
dominated by floating point work, which is relatively insensitive to the choice
of language and compiler. Even if we were to pick a different Gabriel
benchmark, blindly translating an existing Lisp program into C (as Slaney has
done) biases the results in favor of Lisp.
*** fftorig.c Sat Jan 30 09:09:15 1988
--- fft.c Sat Jan 30 10:13:45 1988
***************
*** 46,52 ****
*/
register float *ar = areal, *ai = aimag;
! register int i = 1, j = 0, k = 0, m = 0;
! int n = 1024, nv2 = 512, le = 0,
! le1 = 0, ip = 0;
! float ur = 0.0, ui = 0.0, wr = 0.0, wi = 0.0, tr = 0.0, ti = 0.0;
register int l;
--- 46,53 ----
*/
+ register int i = 1, ip = 0;
register float *ar = areal, *ai = aimag;
! register float ur = 0, ui = 0, tr = 0, ti = 0;
! register int le1 = 0, le = 0, n = 1024, j = 0, k = 0, m = 0;
! register int nv2 = n>>1;
! register float wr = 0, wi = 0;
register int l;
malcolm@spar.SPAR.SLB.COM (Malcolm Slaney) (02/01/88)
In article <6@sea.sm.unisys.com> eggert@sea.sm.unisys.com (Paul Eggert) writes: > In article <536@spar.SPAR.SLB.COM> malcolm@spar.SPAR.SLB.COM (Malcolm Slaney) > gives a C version on of the Barrow/Gabriel/Lucid FFT benchmark, and concludes > that Lisp compilers now generate code as fast as the code that C compilers > generate. But his C version's register declarations could stand improvements; > in particular, the key scalar variables should be put in registers. Thanks for the better numbers. I feel much better now that C is coming out *slightly* ahead. I would like to make two points. First, as far as I can tell the Lucid compiler is not putting ANYTHING in registers (I wish their disassembler was 1/10th as good as their compiler!) I hope this means there is still plenty of room for improvement in the Lisp compilers. Secondly, we *were* seeing floating point run 10 times slower than Lisp because of the need for boxing (tags) and no type propogation. I'm VERY happy to see that the lisp compilers are improving so much. > Even with these changes, I would like to see a better benchmark for relating > the quality of C compilers versus Lisp compilers. The FFT benchmark is > dominated by floating point work, which is relatively insensitive to the > choice of language and compiler. Thank you...that is my entire point. I'm building systems to make signal processing easier. If floating point is relatively insensitive to the choice of language (as I believe it is starting to be) then why not use Lisp and get all the interactive and user interface advantages?????? > Even if we were to pick a different Gabriel benchmark, blindly translating > an existing Lisp program into C biases the results in favor of Lisp. I'm not sure why this is true. The only way to really compare two languages for performance is to lock two hackers into two different rooms, feed them equal amounts of caffeinated soda (:-) and see which one is faster after a month. I don't have the time or resources to do that. Instead I have to settle for "toy" benchmarks to identify problem areas and help understand the issues. I'm open to suggestions and more data. Malcolm
eggert@sea.sm.unisys.com (Paul Eggert) (02/07/88)
In article <557@spar.SPAR.SLB.COM> malcolm@spar.slb.com (Malcolm Slaney) writes: The only way to really compare two languages for performance is to lock two hackers into two different rooms, feed them equal amounts of caffeinated soda (:-) and see which one is faster after a month. I wonder whether Slaney would still say this if he was locked in a room with the job of translating dhrystone into Lisp? (:-) I still don't think this benchmark is a good one. But I'll play the game by his rules for a few seconds. If minor changes to code are permitted (see the end of this note for details), then plain Sun C can run the FFT benchmark about a third faster than Lucid 2.1.1: Time (in seconds) to execute 10 iterations of a 1024 point FFT Sun-3/160 68881 single double C (SunOS 3.4) 3.5 3.7 Lucid 2.1.1 4.7 ? Slaney also writes: ... we *were* seeing floating point run 10 times slower than Lisp because of the need for boxing (tags) and no type propogation. I'm VERY happy to see that the lisp compilers are improving so much.... I'm also happy to see Lucid Lisp improving, and it's important to say that floating point need not cause one to shun Lisp. But I'm not yet convinced that Lucid Lisp and Sun C have similar floating point performance, even ignoring the the 35% performance difference reported above. First, no Lucid times for FPA-equipped Sun-3s or for Sun-4s were reported; what is the problem here? Second, many Lisp systems don't support fast double precision, which is crucial for many applications. Can the question mark in the table above be replaced by a hard number, so that we can see how well Lucid handles double precision? ---- The following changes to Slaney's (original) benchmark generate the performance figures described above. The changes to lines 178 and 264 fix bugs that don't affect CPU time -- but they lead me to suspect that there are more bugs! 18d17 < float fft_re[1025], fft_im[1025]; 19a19,22 > #ifndef real > #define real float > #endif > real fft_re[1025], fft_im[1025]; 31c34 < float areal[], aimag[]; --- > real areal[], aimag[]; 47,51c50,55 < register float *ar = areal, *ai = aimag; < register int i = 1, j = 0, k = 0, m = 0; < int n = 1024, nv2 = 512, le = 0, < le1 = 0, ip = 0; < float ur = 0.0, ui = 0.0, wr = 0.0, wi = 0.0, tr = 0.0, ti = 0.0; --- > register int i = 1, ip; > register real *ar = areal, *ai = aimag; > register double r, s, ur, ui, tr, ti; > register int le1, le, n = 1024, j, k, m = 0; > register int nv2 = n>>1; > register double wr, wi; 169,174c173,182 < tr = ar[ip]*ur - ai[ip] * ui; < ti = ar[ip]*ui + ai[ip] * ur; < ar[ip] = ar[i] - tr; < ai[ip] = ai[i] - ti; < ar[i] += tr; < ai[i] += ti; --- > r = ar[ip]; > s = ai[ip]; > tr = r*ur - s*ui; > ti = r*ui + s*ur; > r = ar[i]; > s = ai[i]; > ar[ip] = r - tr; > ai[ip] = s - ti; > ar[i] = r += tr; > ai[i] = s += ti; 178c186 < ti = ur * wi + wi * wr; --- > ui = ur * wi + ui * wr; 180d187 < ui = ti; 229c236 < float theta, phase; --- > double theta, phase; 231c238 < float f, c, s; --- > double f, c, s; 237c244 < float x; --- > double x; 261c268 < float re, im; --- > double re, im; 264c271 < if (abs(re) > fft_delta || abs(im) > fft_delta) --- > if (fabs(re) > fft_delta || fabs(im) > fft_delta)