[comp.lang.lisp] Lisp vs. C Floating Point

malcolm@spar.SPAR.SLB.COM (Malcolm Slaney) (01/30/88)

The times shown in the table below are number of seconds to calculate 10
iterations of a 1024 point FFT.  In this study the C and Lisp version
of the FFT routine are identical.  I started with Harry Barrow's FFT 
benchmark (with declarations added by Lucid) and then translated it
statement by statement into C (thanks to Dave Singer for double checking
this.)  I believe that this is as accurate a comparison as is possible.

Note, all of the Sun tests were measured with a version of Lucid Common
Lisp labelled "Development Environment 2.1.1".  This lisp only does single
precision arithmetic using the 68881.  Thus the only two lines that can
be directly compared are "Single 68881 C" versus "Lucid 2.1.1".

I believe it is now reasonable to conclude that Lisp compilers are as
fast as C compilers.  I don't have any numbers for a prodution quality
Lisp compiler for the Sun/4's.  I don't know of any reason the SPARC 
machine would run compiled Lisp slower than C.

Finally, a word about register declarations in C:  The times shown here
were measured by putting the variables I thought were most accessed into
registers.  I did this with the same effort I normally go to make my own
programs fast.  I did not do any exhaustive testing to find the best set.
When all of the register declarations are removed the 3/160 times got slower
by .8 seconds and all the 2/260 times got slower by about .4 seconds.  The
Sun 4/260 numbers did not change any indicating they are doing global register
optimization or the 4/260 is being completely dominated by the floating point
work.  The Sun 3 no-register times make sense since only the loop iteration
and array addresses were put in registers and thus they are independent of 
the floating point work.
							Malcolm

	***********************************************************
	Time (in seconds) to execute 10 iterations of a 1024 point FFT

			3/160		3/260		4/260

Double FPA C		2.4		1.5		0.8 (*)
Single FPA C		1.3		0.8		0.3 (*)

Double 68881 C		5.0		4.1		0.8 (*)
Single 68881 C		4.8		4.1		0.3 (*)  <======
Lucid 2.1.1		4.7		4.0		         <======

Symbolics Machine	4.7 (No IFU, No FPA - From Gabriel - Duplicated @ SPAR)
Symbolics Machine	3.9 (with either IFU or FPA - From Gabriel)


*	Note that Sun4/260's only have one floating point option.  They use
	a pair of Weitek floating point chips.


And finally, here is the actual C version of the Barrow/Gabriel/Lucid
FFT benchmark.  The original Lisp is in comments.

	***********************************************************
/* 
 * ;; FFT -- This is an FFT benchmark written by Harry Barrow.
 * ;; It tests a variety of floating point operations, including array 
 * ;; references.
 * ;; 
 * ;; Originally written in Lisp.
 * ;; Translated to C by Malcolm@spar.slb.com    January 1988.

 * Thanks to Dave Singer @ SPAR for double checking the translation.

/* 
 *
 * (defvar **fft-re**
 *   (make-array 1025. :element-type 'single-float :initial-element 0.0))
 * (defvar **fft-im**
 *   (make-array 1025. :element-type 'single-float :initial-element 0.0))
 */
float fft_re[1025], fft_im[1025];


/* 
 * (defvar s-pi (float pi 0.0))
 */
#define	s_pi 3.141592653589792434

/*
 * (defun fft (areal aimag)
 *   (declare (type (simple-array single-float (*)) areal aimag))
 */
fft(areal, aimag)
float areal[], aimag[];
{
/*	  (prog* ((ar areal)
 *		  (ai aimag)
 *		  (i 1)
 *		  (j 0)
 *		  (k 0)
 *		  (m 0)				;compute m = log(n)
 *		  (n (1- (array-dimension ar 0)))
 *		  (nv2 (floor n 2))
 *		  (le 0) (le1 0) (ip 0)
 *		  (ur 0.0) (ui 0.0) (wr 0.0) (wi 0.0) (tr 0.0) (ti 0.0))
 *          (declare (type fixnum i j k n nv2 m le le1 ip))
 *          (declare (type (simple-array single-float (*)) ar ai))
 *          (declare (single-float ur ui wr wi tr ti))
 */
	register float *ar = areal, *ai = aimag;
	register int	i = 1, j = 0, k = 0, m = 0; 
	int	n = 1024, nv2 = 512, le = 0,
		le1 = 0, ip = 0;
	float	ur = 0.0, ui = 0.0, wr = 0.0, wi = 0.0, tr = 0.0, ti = 0.0;
	register int	l;

/*	    l1 (cond ((< i n)
 *		      (setq m (the fixnum (1+ m))
 *			    i (the fixnum (+ i i)))
 *		      (go l1)))
 */
	l1 :
		if (i < n){
			m += 1;
			i += i;
			goto l1;
		}

/*
 *	    (cond ((not (equal n (the fixnum (expt 2 m))))
 *		   (princ "error ... array size not a power of two.")
 *		   (read)
 *		   (return (terpri))))
 */

/*
 *	    (setq j 1				;interchange elements
 *		  i 1)				;in bit-reversed order
 */
	j = 1;
	i = 1;

/*
 *	    l3 (cond ((< i j)
 *		      (setq tr (aref ar j)
 *			    ti (aref ai j))
 *		      (setf (aref ar j) (aref ar i))
 *		      (setf (aref ai j) (aref ai i))
 *		      (setf (aref ar i) tr)
 *		      (setf (aref ai i) ti)))
 */
	l3:
		if (i < j){
			tr = ar[j];
			ti = ai[j];
			ar[j] = ar[i];
			ai[j] = ai[i];
			ar[i] = tr;
			ai[i] = ti;
		}

/*
 *	    (setq k nv2)
 *	    l6 (cond ((< k j)
 *		      (setq j (the fixnum (- j k))
 *			    k (the fixnum (/ k 2)))
 *		      (go l6)))
 *	    (setq j (the fixnum (+ j k))
 *		  i (the fixnum (1+ i)))
 */
		k = nv2;
	l6:
		if (k < j){
			j -= k;
			k /= 2;
			goto l6;
		}
	j += k;
	i += 1;

/*
 *	(cond ((< i n)
 *	       (go l3)))
 */
	if (i < n)
		goto l3;

/*
 *	    (do ((l 1 (the fixnum (1+ l))))
 *		((> (the fixnum l) m))		;loop thru stages
 *	      (declare (type fixnum l))
 *	      (setq le (the fixnum (expt 2 l))
 *		    le1 (floor le 2) ;;(the (values fixnum fixnum) (floor le 2))
 *		    ur 1.0
 *		    ui 0.0			;This used to be fixnum 0 --smh
 *		    wr (cos (/ s-pi le1))
 *		    wi (sin (/ s-pi le1)))
 */
	for (l = 1; l <= m; l++){
		le = 1 << l;
		le1 = le / 2;
		ur = 1.0;
		ui = 0.0;
		wr = cos(s_pi / le1);
		wi = sin(s_pi / le1);

/*
 *	      (do ((j 1 (the fixnum (1+ (the fixnum j)))))
 *		  ((> (the fixnum j) le1))	;loop thru butterflies
 *		(declare (type fixnum j))
 *		(do ((i j (the fixnum (+ i le))))
 *		    ((> (the fixnum i) n))	;do a butterfly
 *		  (declare (type fixnum i))
 *		  (setq ip (the fixnum (+ i le1))
 *			tr (- (* (aref ar ip) ur)
 *			      (* (aref ai ip) ui))
 *			ti (+ (* (aref ar ip) ui)
 *			      (* (aref ai ip) ur)))
 *		  (setf (aref ar ip) (- (aref ar i) tr))
 *		  (setf (aref ai ip) (- (aref ai i) ti))
 *		  (setf (aref ar i) (+ (aref ar i) tr))
 *		  (setf (aref ai i) (+ (aref ai i) ti))))
 *	      (setq tr (- (* ur wr) (* ui wi))
 *		    ti (+ (* ur wi) (* ui wr))
 *		    ur tr
 *		    ui ti))
 *	    (return t)))
 */
		for (j = 1; j <= le1; j++){
			for (i = j; i <= n; i += le){
				ip = i + le1;
				tr = ar[ip]*ur - ai[ip] * ui;
				ti = ar[ip]*ui + ai[ip] * ur;
				ar[ip] = ar[i] - tr;
				ai[ip] = ai[i] - ti;
				ar[i] += tr;
				ai[i] += ti;
			}
		}
		tr = ur * wr - ui * wi;
		ti = ur * wi + wi * wr;
		ur = tr;
		ui = ti;
	}
}

/*
 *	 (defun fft-bench ()
 *	  (dotimes (i 10)
 *	    (fft **fft-re** **fft-im**)))
 */

fft_bench(){
	int	i;
	for (i=0; i< 10; i++)
		fft(fft_re, fft_im);
}


/*
 *	(defun testfft ()
 *	  (format *trace-output* "; doing fft: (fft-bench)~%") 
 *	  (print (time (fft-bench))))
 */

/*
 *	(defun clear-fft ()
 *	  (dotimes (i 1025)
 *	    (setf (aref **fft-re** i) 0.0
 *		  (aref **fft-im** i) 0.0))
 *	  (values))
 */

clear_fft(){
	int	i;
	for (i=0;i<1025;i++)
		fft_re[i] = fft_im[i] = 0;
}

/*
 *	(defun setup-fft-component (theta &optional (phase 0.0))
 *	  (let ((f (* 2 pi theta))
 *		(c (cos (* 0.5 pi phase)))
 *		(s (sin (* 0.5 pi phase))))
 *	    (dotimes (i 1025)
 *	      (let ((x (sin (* f (/ i 1024.0)))))
 *		(incf (aref **fft-re** i) (float (* c x) 0.0))
 *		(incf (aref **fft-im** i) (float (* s x) 0.0)))))
 *	  (values))
 */
 setup_fft_component(theta,phase)
 float	theta, phase;
 {
	float	f, c, s;
	int	i;
	f = 2 * s_pi * theta;
	c = cos(0.5 * s_pi * phase);
	s = sin(0.5 * s_pi * phase);
	for (i = 0; i < 1025; i++){
		float	x;
		x = sin(f * (i / 1024.0));
		fft_re[i] += c * x;
		fft_im[i] += s * x;
	}
}

/*
 *	(defvar fft-delta 0.0001)
 */
#define	fft_delta 0.0001

/*
 *	(defun print-fft ()
 *	  (dotimes (i 1025)
 *	    (let ((re (aref **fft-re** i))
 *		  (im (aref **fft-im** i)))
 *	      (unless (and (< (abs re) fft-delta) (< (abs im) fft-delta))
 *		(format t "~4d  ~10f ~10f~%" i re im))))
 *	  (values))
 */
 print_fft(){
	int	i;
	for (i = 0; i< 1025; i++){
		float	re, im;
		re = fft_re[i];
		im = fft_im[i];
		if (abs(re) > fft_delta || abs(im) > fft_delta)
			printf("%4d  %10f %10f\n", i, re, im);
	}
}

main(){
	fft_bench();
}

eggert@sea.sm.unisys.com (Paul Eggert) (01/31/88)

Expires:

Sender:

Followup-To:

Distribution:

Keywords:


In article <536@spar.SPAR.SLB.COM> malcolm@spar.SPAR.SLB.COM (Malcolm Slaney)
gives a C version on of the Barrow/Gabriel/Lucid FFT benchmark, and concludes
that Lisp compilers now generate code as fast as the code that C compilers
generate.  But his C version's register declarations could stand improvements;
in particular, the key scalar variables should be put in registers.  Given the
changes to his program suggested at the end of this message, C's performance
goes up 15-20% on a Sun-3 with 68881, so his table should be revised.  (I lack
access to the other kinds of Suns he benchmarked and so will omit their
figures.)

		Time (in seconds) to execute 10 iterations of a 1024 point FFT

					Sun-3/160	Sun-3/160
					(original)	(revised)

	Double 68881 C			5.0		4.1
	Single 68881 C			4.8		4.2
	Single 68881 Lucid 2.1.1	4.7		4.7

Even with these changes, I would like to see a better benchmark for relating
the quality of C compilers versus Lisp compilers.  The FFT benchmark is
dominated by floating point work, which is relatively insensitive to the choice
of language and compiler.  Even if we were to pick a different Gabriel
benchmark, blindly translating an existing Lisp program into C (as Slaney has
done) biases the results in favor of Lisp.

*** fftorig.c	Sat Jan 30 09:09:15 1988
--- fft.c	Sat Jan 30 10:13:45 1988
***************
*** 46,52 ****
   */
  	register float *ar = areal, *ai = aimag;
! 	register int	i = 1, j = 0, k = 0, m = 0; 
! 	int	n = 1024, nv2 = 512, le = 0,
! 		le1 = 0, ip = 0;
! 	float	ur = 0.0, ui = 0.0, wr = 0.0, wi = 0.0, tr = 0.0, ti = 0.0;
  	register int	l;
--- 46,53 ----
   */
+ 	register int i = 1, ip = 0;
  	register float *ar = areal, *ai = aimag;
! 	register float ur = 0, ui = 0, tr = 0, ti = 0;
! 	register int le1 = 0, le = 0, n = 1024, j = 0, k = 0, m = 0; 
! 	register int nv2 = n>>1;
! 	register float wr = 0, wi = 0;
  	register int	l;

malcolm@spar.SPAR.SLB.COM (Malcolm Slaney) (02/01/88)

In article <6@sea.sm.unisys.com> eggert@sea.sm.unisys.com (Paul Eggert) writes:
> In article <536@spar.SPAR.SLB.COM> malcolm@spar.SPAR.SLB.COM (Malcolm Slaney)
> gives a C version on of the Barrow/Gabriel/Lucid FFT benchmark, and concludes
> that Lisp compilers now generate code as fast as the code that C compilers
> generate.  But his C version's register declarations could stand improvements;
> in particular, the key scalar variables should be put in registers.  
Thanks for the better numbers.  I feel much better now that C is coming out
*slightly* ahead.

I would like to make two points.  First, as far as I can tell the Lucid
compiler is not putting ANYTHING in registers (I wish their disassembler
was 1/10th as good as their compiler!)  I hope this means there is still
plenty of room for improvement in the Lisp compilers.  Secondly, we *were*
seeing floating point run 10 times slower than Lisp because of the need
for boxing (tags) and no type propogation.  I'm VERY happy to see that the
lisp compilers are improving so much.

> Even with these changes, I would like to see a better benchmark for relating
> the quality of C compilers versus Lisp compilers.  The FFT benchmark is
> dominated by floating point work, which is relatively insensitive to the 
> choice of language and compiler.  
Thank you...that is my entire point.  I'm building systems to make signal
processing easier.  If floating point is relatively insensitive to the choice
of language (as I believe it is starting to be) then why not use Lisp and
get all the interactive and user interface advantages??????

> Even if we were to pick a different Gabriel benchmark, blindly translating 
> an existing Lisp program into C biases the results in favor of Lisp.
I'm not sure why this is true.  The only way to really compare two languages
for performance is to lock two hackers into two different rooms, feed them
equal amounts of caffeinated soda (:-) and see which one is faster after a
month.  I don't have the time or resources to do that.  Instead I have to
settle for "toy" benchmarks to identify problem areas and help understand
the issues.

I'm open to suggestions and more data.

								Malcolm

eggert@sea.sm.unisys.com (Paul Eggert) (02/07/88)

In article <557@spar.SPAR.SLB.COM> malcolm@spar.slb.com (Malcolm Slaney) writes:

	The only way to really compare two languages for performance is to lock
	two hackers into two different rooms, feed them equal amounts of
	caffeinated soda (:-) and see which one is faster after a month.

I wonder whether Slaney would still say this if he was locked in a room with
the job of translating dhrystone into Lisp? (:-)  I still don't think this
benchmark is a good one.  But I'll play the game by his rules for a few
seconds.  If minor changes to code are permitted (see the end of this note for
details), then plain Sun C can run the FFT benchmark about a third faster than
Lucid 2.1.1:

	Time (in seconds) to execute 10 iterations of a 1024 point FFT
		Sun-3/160 68881
		single	double
C (SunOS 3.4)	3.5	3.7
Lucid 2.1.1	4.7	?

Slaney also writes:

	... we *were* seeing floating point run 10 times slower than Lisp
	because of the need for boxing (tags) and no type propogation.  I'm
	VERY happy to see that the lisp compilers are improving so much....

I'm also happy to see Lucid Lisp improving, and it's important to say that
floating point need not cause one to shun Lisp.  But I'm not yet convinced that
Lucid Lisp and Sun C have similar floating point performance, even ignoring the
the 35% performance difference reported above.  First, no Lucid times for
FPA-equipped Sun-3s or for Sun-4s were reported; what is the problem here?
Second, many Lisp systems don't support fast double precision, which is crucial
for many applications.  Can the question mark in the table above be replaced by
a hard number, so that we can see how well Lucid handles double precision?

----
The following changes to Slaney's (original) benchmark generate the performance
figures described above.  The changes to lines 178 and 264 fix bugs that don't
affect CPU time -- but they lead me to suspect that there are more bugs!

18d17
< float fft_re[1025], fft_im[1025];
19a19,22
> #ifndef real
> #define real float
> #endif
> real fft_re[1025], fft_im[1025];
31c34
< float areal[], aimag[];
---
> real areal[], aimag[];
47,51c50,55
< 	register float *ar = areal, *ai = aimag;
< 	register int	i = 1, j = 0, k = 0, m = 0; 
< 	int	n = 1024, nv2 = 512, le = 0,
< 		le1 = 0, ip = 0;
< 	float	ur = 0.0, ui = 0.0, wr = 0.0, wi = 0.0, tr = 0.0, ti = 0.0;
---
> 	register int i = 1, ip;
> 	register real *ar = areal, *ai = aimag;
> 	register double r, s, ur, ui, tr, ti;
> 	register int le1, le, n = 1024, j, k, m = 0;
> 	register int nv2 = n>>1;
> 	register double wr, wi;
169,174c173,182
< 				tr = ar[ip]*ur - ai[ip] * ui;
< 				ti = ar[ip]*ui + ai[ip] * ur;
< 				ar[ip] = ar[i] - tr;
< 				ai[ip] = ai[i] - ti;
< 				ar[i] += tr;
< 				ai[i] += ti;
---
> 				r = ar[ip];
> 				s = ai[ip];
> 				tr = r*ur - s*ui;
> 				ti = r*ui + s*ur;
> 				r = ar[i];
> 				s = ai[i];
> 				ar[ip] = r - tr;
> 				ai[ip] = s - ti;
> 				ar[i] = r += tr;
> 				ai[i] = s += ti;
178c186
< 		ti = ur * wi + wi * wr;
---
> 		ui = ur * wi + ui * wr;
180d187
< 		ui = ti;
229c236
<  float	theta, phase;
---
>  double	theta, phase;
231c238
< 	float	f, c, s;
---
> 	double	f, c, s;
237c244
< 		float	x;
---
> 		double	x;
261c268
< 		float	re, im;
---
> 		double	re, im;
264c271
< 		if (abs(re) > fft_delta || abs(im) > fft_delta)
---
> 		if (fabs(re) > fft_delta || fabs(im) > fft_delta)