[gnu.gcc] GCC performance

arman@oahu.cs.ucla.edu (09/23/88)

I just finished compiling a Prolog interpreter with GCC v1.28 on a
Sun/260. The interpreter compiled with GCC is, on the average, 15%
slower than the version which was compiled with the Sun 3.5 C
compiler. I had been under the impression that GCC produced better
code than the Sun C compiler. I have also tried smaller benchmarks,
but they also indicate that Sun CC produces better code.

Did I goof some place in the compilation of GCC, or does Sun CC
produce better code?? 

Thanx for your help,
arman.



-- Arman Bostani // UCLA Computer Science Department // +1 213-825-3194
	3417 Boelter Hall // Los Angeles, California 90024-1596 // USA
	arman@CS.UCLA.EDU   ...!(ucbvax,rutgers)!ucla-cs!arman

arman@oahu.cs.ucla.edu (09/27/88)

Machine 	= Sun 3/60
System  	= SunOS 3.5
CC/GCC Options	= -O

This is a follow-up to the message I posted about a week ago about the
quality of code produced by GCC. Since then, I have compiled many more
programs and benchmarks. It seems that GCC performs consistently better
than the Sun C compiler on programs with relatively simple flow
structures such as the Dhrystone benchmarks, especially in the absence
of register declarations.

Unfortunately, with programs that have complicated flow structures,
Sun cc seems to perform much better. The simplest example I have
found is Duff's device which was posted to the net some time ago. The
Sun compiler seems to produce code which on the average is 13% faster
than the code produced by GCC. In fact, when Duff's device is GCC'd
without the -O option, it runs faster! This leads us to believe that
there is possibly a bug in GCC's optimizer. I have included the source
for Duff's device and the optimized assembly code produced by GCC.

Cheers,
  arman.

  (windsor 203) cc -O duff.c -o duff.cc
  (windsor 204) gcc -O -v duff.c -o duff.gcc
gcc version 1.28
 /usr/local/lib/gcc-cpp -v -undef -D__GNU__ -D__GNUC__ -Dmc68000 -Dsun -Dunix -D__OPTIMIZE__ -D__HAVE_FPU__-Dmc68020 duff.c /tmp/cca07939.cpp
GNU CPP version 1.28
 /usr/local/lib/gcc-cc1 /tmp/cca07939.cpp -quiet -dumpbase duff.c -O -version -o /tmp/cca07939.s
GNU C version 1.28 (68k, MIT syntax) compiled by GNU C version 1.28.
 as -mc68020 /tmp/cca07939.s -o duff.o
 ld -o duff.gcc /lib/crt0.o /lib/Mcrt1.o duff.o /usr/local/lib/gcc-gnulib -lc
  (windsor 205) gcc -v duff.c -o duff.nop
gcc version 1.28
 /usr/local/lib/gcc-cpp -v -undef -D__GNU__ -D__GNUC__ -Dmc68000 -Dsun -Dunix -D__HAVE_FPU__-Dmc68020 duff.c /tmp/cca07949.cpp
GNU CPP version 1.28
 /usr/local/lib/gcc-cc1 /tmp/cca07949.cpp -quiet -dumpbase duff.c -version -o /tmp/cca07949.s
GNU C version 1.28 (68k, MIT syntax) compiled by GNU C version 1.28.
 as -mc68020 /tmp/cca07949.s -o duff.o
 ld -o duff.nop /lib/crt0.o /lib/Mcrt1.o duff.o /usr/local/lib/gcc-gnulib -lc
  (windsor 206) time duff.cc 10000
5.4u 0.0s 0:05 92% 0+8k 2+0io 0pf+0w
  (windsor 207) time duff.gcc 10000
6.1u 0.1s 0:06 96% 0+8k 2+0io 0pf+0w
  (windsor 208) time duff.nop 10000
5.8u 0.1s 0:06 92% 0+8k 2+0io 0pf+0w
  (windsor 209) 

---------------------------------- duff.c ----------------------------------

/* Number of elements in each global array; also the count passed to send(). */
#define	BLK	1024

/* Global buffers handed to send(): a is the destination, b the source. */
int a[BLK], b[BLK];

/*
	Benchmark driver: calls send(a, b, BLK) n times, where n is
	atoi(v[1]).  v[1] is read without checking the argument count,
	so the program must be run with one numeric argument.
*/
main(c,v)
char **v;
{
	/* n = repetition count taken from the first command-line argument */
	int i, n = atoi(v[1]);
	for(i=0; i<n; i++)
		/* the int arrays are passed where send() declares short * */
		send(a,b,BLK);
}

/*
	Duff's device: copy `count' shorts from `from' to *to with the
	copy loop unrolled eight times.

	n is the number of passes through the do-while, (count+7)/8.
	The switch on count%8 jumps into the middle of the loop body, so
	the first pass performs only the leftover count%8 copies (all
	eight when count%8 is 0) and every later pass performs eight.

	`to' is never incremented, so every store lands on the same
	location; only `from' advances.

	count is assumed to be positive: with count == 0 the switch
	selects case 0 and the body still runs once (eight copies)
	before --n>0 is tested.
*/
send(to, from, count)
register short *to, *from;
register count;
{
	/* number of loop passes, rounding count up to a multiple of 8 */
	register n=(count+7)/8;
	/* enter the unrolled loop part-way through, at the case matching count%8 */
	switch(count%8){
	case 0:	do{	*to = *from++;
	case 7:		*to = *from++;
	case 6:		*to = *from++;
	case 5:		*to = *from++;
	case 4:		*to = *from++;
	case 3:		*to = *from++;
	case 2:		*to = *from++;
	case 1:		*to = *from++;
		}while(--n>0);
	}
}

---------------------------------- duff.s ----------------------------------

#NO_APP
.text
	.even
| _main: GCC 1.28 -O output for main() in duff.c.
| 68k MIT syntax: source operand first, destination second.
| The `|' annotations were added for readers; the compiler emitted none.
| Register use: d3 = n (result of atoi), d2 = i (loop counter).
.globl _main
_main:
	link a6,#0		| set up the frame pointer; no local storage
	moveml #0x3000,sp@-	| save d2/d3 (restored at L6)
	movel a6@(12),a0	| a0 = v, the second argument
	movel a0@(4),sp@-	| push v[1]
	jbsr _atoi
	movel d0,d3		| n = atoi(v[1])
	clrl d2			| i = 0
	addqw #4,sp		| pop the atoi argument
	cmpl d2,d3
	jle L6			| n <= 0: skip the loop entirely
L5:
	pea 1024:w		| push BLK (count)
	pea _b			| push b (from)
	pea _a			| push a (to)
	jbsr _send
	addw #12,sp		| pop the three send arguments
	addql #1,d2		| i++
	cmpl d2,d3
	jgt L5			| repeat while n > i
L6:
	moveml a6@(-8),#0xc	| restore d2/d3
	unlk a6
	rts
	.even
| _send: GCC 1.28 -O output for send(to, from, count) in duff.c.
| 68k MIT syntax: source operand first, destination second.
| The `|' annotations were added for readers; the compiler emitted none.
| Register use: a1 = to, a0 = from, d2 = count, d1 = n,
|               d0 = count % 8 (switch index), d3 = scratch.
.globl _send
_send:
	link a6,#0		| set up the frame pointer; no local storage
	moveml #0x3000,sp@-	| save d2/d3 (restored at L9)
	movel a6@(8),a1		| a1 = to
	movel a6@(12),a0	| a0 = from
	movel a6@(16),d2	| d2 = count
	movel d2,d1
	addql #7,d1		| d1 = count + 7
	jpl L8			| if negative, add 7 more so the shift truncates toward zero
	addql #7,d1
L8:
	asrl #3,d1		| d1 = n = (count + 7) / 8
	movel d2,d0
	jge L23			| same adjustment for a negative count before masking
	addql #7,d0
L23:
	moveq #-8,d3
	andl d3,d0		| d0 = 8 * (count / 8)
	subl d2,d0
	negl d0			| d0 = count - 8 * (count / 8) = count % 8
	moveq #7,d3
	cmpl d3,d0
	jhi L9			| index outside 0..7 (unsigned compare): no case matches, return
LI21:
	movew pc@(L21-LI21-2:b,d0:l:2),d3	| d3 = 16-bit table entry number d0
	jmp pc@(2,d3:w)		| jump to L21 + d3
| Switch table: offsets relative to L21, indexed by count % 8.
L21:
	.word L10-L21		| case 0: top of the loop, all eight copies
	.word L20-L21		| case 1
	.word L19-L21		| case 2
	.word L18-L21		| case 3
	.word L17-L21		| case 4
	.word L16-L21		| case 5
	.word L15-L21		| case 6
	.word L14-L21		| case 7
| Unrolled do-while body: each movew is one `*to = *from++'.
| a0 is post-incremented; a1 is never advanced.
L10:
L11:
	movew a0@+,a1@
L14:
	movew a0@+,a1@
L15:
	movew a0@+,a1@
L16:
	movew a0@+,a1@
L17:
	movew a0@+,a1@
L18:
	movew a0@+,a1@
L19:
	movew a0@+,a1@
L20:
	movew a0@+,a1@
	subql #1,d1		| --n
	| Mask the condition codes down to N and Z (clears X, V, C) before
	| the signed branch.  The 09/28/88 reply in this thread singles out
	| this instruction as the cause of the slowdown.
	andb #0xc,cc
	jgt L11			| loop while n > 0
L9:
	moveml a6@(-8),#0xc	| restore d2/d3
	unlk a6
	rts
.comm _b,4096
.comm _a,4096

-- Arman Bostani // UCLA Computer Science Department // +1 213-825-3194
	3417 Boelter Hall // Los Angeles, California 90024-1596 // USA
	arman@CS.UCLA.EDU   ...!(ucbvax,rutgers)!ucla-cs!arman

usenet@cps3xx.UUCP (Usenet file owner) (09/28/88)

  The problem is that GCC is putting an "andb #0xc,cc" into the inner
loop.  I don't have time to delve into the mysteries of the code, but
it seems to be caused by something (???) setting the CC_NO_OVERFLOW
flag, so that gcc thinks it has to mask off some bits.
  If there are any gcc gurus out there, does someone want to look into
this?  I looked through the m68k.md and tm-m68k.h files, but there's
just too much going on.
  If you take out the "andb" they're both just as fast, by the way.

+----------------------------------+------------------------+
| Anton Rang (grad student)	   | "VMS forever!"	    |
| Michigan State University	   | rang@cpswh.cps.msu.edu |
+----------------------------------+------------------------+