apratt@AMES.ARC.NASA.GOV (Allan Pratt) (06/08/89)
I believe I have found a bug in GCC, either in the library source for
__qmult in dflonum.c or in the optimizer.  dflonum.c contains two versions
of __qmult: one which gets compiled when SLOW_KLUDGEY_QMULT is defined, and
another, presumably better one.  The fast one contains a lot of inline
assembly, including a couple of lines which generate addx.l instructions.

In one case, with no -O option, the compiler (GCC 1.35) generates
addx.l a0@-,a2@-, which is not unreasonable.  However, with -O, it
generates addx.l a0@-,a0@- in the same place, and, at least in some
circumstances, this code causes big trouble: the area of the stack which
holds the caller's registers (a5 etc.) gets clobbered.  I don't know
enough about GCC's inline assembly syntax to figure this out on my own.
The macro in question is XXADDL, which is used only once in dflonum.c.
I enclose the source (flonum.h and the __qmult part of dflonum.c) below.

Please let me know if I have provided you with enough information on this.
Also, if there is an address I can write to for information about GCC
(especially the copyleft), please tell me what it is; I have some questions.

============================================
Opinions expressed above do not necessarily -- Allan Pratt, Atari Corp.
reflect those of Atari Corp. or anyone else.       ...ames!atari!apratt
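One possible reading of the bad addx.l, offered only as a guess from the
constraint strings and not a verified diagnosis: the second asm statement
in XXADDL predecrements and writes through both address registers, but its
constraints list target only as an input and never tie the output operand
to the register holding zero.  Nothing then stops the optimizer, under -O,
from giving the output operand and the target input the same register,
which is exactly the addx.l a0@-,a0@- pattern observed.  A minimal,
untested sketch of a reworked macro follows (hypothetical name
XXADDL_FIXED; it assumes GCC 1.35 accepts multiple output operands here,
and it uses the same matching-constraint device, "0"/"1", that flonum.h
already uses in MUL and DIV):

static long constant_zero_kludge = 0;

/* Hypothetical, untested rework: tell the compiler that both pointers are
   read AND written (predecremented) by tying each output to its input
   with a matching constraint, so no two operands can end up sharing one
   address register under -O. */
#define XXADDL_FIXED(partial, target) \
{ register unsigned long * zero = (unsigned long * )(&constant_zero_kludge + 1); \
  asm volatile("addl %2,%0@" : "=a" (target) : "0" (target), "g" (partial)); \
  asm volatile("addxl %0@-,%1@-" : "=a" (zero), "=a" (target) \
				 : "0" (zero), "1" (target)); }

Whether this matches what the register allocator is actually doing under
-O would still need to be confirmed against the generated assembly.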
/************************************************************************/
/* begin flonum.h from my distribution */
/************************************************************************/

/* Defs and macros for floating point code.  This stuff is heavily based
   on Scott McCauley's code, except that this version works :-} */

/* These definitions work for machines where an SF value is returned
   in the same register as an int. */

#ifndef FLONUM_H
#define FLONUM_H

#ifndef SFVALUE
#define SFVALUE int
#endif

#ifndef INTIFY
#define INTIFY(FLOATVAL) (intify.f = (FLOATVAL), intify.i)
#endif

/* quasi-IEEE floating point number definitions */

struct bitfloat {
    unsigned long sign : 1;
    unsigned long exp : 8;
    unsigned long mant : 23;
};

struct bitdouble {
    unsigned long sign : 1;
    unsigned long exp : 11;
    unsigned long mant1 : 20;
    unsigned long mant2;
};

union double_di {
    double d;
    long i[2];
};

union flt_or_int {
    long i;
    float f;
};

#ifdef WORDS_BIG_ENDIAN
#define HIGH 0
#define LOW 1
#else
#define HIGH 1
#define LOW 0
#endif

/* start of symbolic asm definitions */

/* you may have to change the g's to d's if you start getting
   illegal operands from as */

#define MUL(a, b) asm volatile ("mulu %2,%0" : "=d" (b) : "0" (b) , "g" (a))
#define DIV(a, b) asm volatile ("divu %2,%0" : "=d" (b) : "0" (b) , "g" (a))
#define SWAP(a) asm volatile ("swap %0" : "=r" (a) : "r" (a) , "r" (a) )

#define ASL2(r1, r2) { asm volatile ("asll #1,%0" : "=d" (r2) : "d" (r2));\
    asm volatile ("roxll #1,%0" : "=d" (r1) : "d" (r1)); }

#define ASL3(r1, r2, r3) { asm volatile ("asll #1,%0" : "=d" (r3) : "d" (r3));\
    asm volatile ("roxll #1,%0" : "=d" (r2) : "d" (r2));\
    asm volatile ("roxll #1,%0" : "=d" (r1) : "d" (r1)); }

#define ASR2(r1, r2) { asm volatile ("asrl #1,%0" : "=d" (r1) : "d" (r1));\
    asm volatile ("roxrl #1,%0" : "=d" (r2) : "d" (r2)); }

#define ASR3(r1, r2, r3) { asm volatile ("asrl #1,%0" : "=d" (r1) : "d" (r1));\
    asm volatile ("roxrl #1,%0" : "=d" (r2) : "d" (r2));\
    asm volatile ("roxrl #1,%0" : "=d" (r3) : "d" (r3)); }

#define ASR4(r1, r2, r3, r4) { asm volatile ("asrl #1,%0" : "=d" (r1) : "d" (r1));\
    asm volatile ("roxrl #1,%0" : "=d" (r2) : "d" (r2));\
    asm volatile ("roxrl #1,%0" : "=d" (r3) : "d" (r3));\
    asm volatile ("roxrl #1,%0" : "=d" (r4) : "d" (r4)); }

#define ADD2(r1, r2, r3, r4) \
{ asm volatile ("addl %2,%0": "=g" (r4) : "0" (r4) , "g" (r2)); \
  asm volatile ("addxl %2,%0": "=g" (r3) : "0" (r3) , "g" (r1)); }

/* y <- y - x */
#define SUB3(x1, x2, x3, y1, y2, y3) \
{ asm volatile ("subl %2,%0": "=g" (y3) : "g" (y3) , "d" (x3)); \
  asm volatile ("subxl %2,%0": "=g" (y2) : "g" (y2) , "d" (x2));\
  asm volatile ("subxl %2,%0": "=g" (y1) : "g" (y1) , "d" (x1)); }

/* sub4 here is rather complex, as the compiler is overwhelmed by me
   wanting to have 8 data registers allocated for mantissa accumulators.
   Help it out by declaring a temp that it can move stuff in and out of. */
#define SUB4(x1, x2, x3, x4, y1, y2, y3, y4) \
{ register long temp = y4; \
  asm volatile ("subl %2,%0": "=d" (temp) : "d" (temp) , "d" (x4)); \
  y4 = temp; temp = y3; \
  asm volatile ("subxl %2,%0": "=d" (temp) : "d" (temp) , "d" (x3));\
  y3 = temp; temp = y2; \
  asm volatile ("subxl %2,%0": "=d" (temp) : "d" (temp) , "d" (x2));\
  y2 = temp; temp = y1; \
  asm volatile ("subxl %2,%0": "=d" (temp) : "d" (temp) , "d" (x1));\
  y1 = temp; }

#define NEG(r1, r2) { asm volatile ("negl %0" : "=d" (r2) : "d" (r2)); \
    asm volatile ("negxl %0" : "=d" (r1) : "d" (r1)); }

/* switches for which routines to compile.  All the single-float and
   long-int arithmetic routines are turned off here, as they were all
   done in assembly language last year. */

/*
#define L_umulsi3
#define L_mulsi3
#define L_udivsi3
#define L_divsi3
#define L_umodsi3
#define L_modsi3
#define L_lshrsi3
#define L_lshlsi3
#define L_ashrsi3
#define L_ashlsi3
*/

#define L_divdf3
#define L_muldf3
#define L_negdf2
#define L_adddf3
#define L_subdf3
#define L_cmpdf2
#define L_fixunsdfsi
#define L_fixunsdfdi
#define L_fixdfdi
#define L_floatsidf
#define L_floatdidf

/*
#define L_addsf3
#define L_negsf2
#define L_subsf3
#define L_cmpsf2
#define L_mulsf3
#define L_divsf3
*/

#define L_truncdfsf2
#define L_extendsfdf2

#endif /* FLONUM_H */
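For reading the multiply code in the dflonum.c excerpt that follows, it
may help to see what the two-register shift macros compute, restated in
portable C.  This is only a sketch, not part of flonum.h (hypothetical
names asl2_ref and asr2_ref; it assumes 32-bit longs and an arithmetic
right shift of signed values, as on the 68000):

/* ASL2(hi, lo): shift the 64-bit value hi:lo left one bit; the roxl
   feeds the bit shifted out of lo back into hi. */
static void asl2_ref(unsigned long *hi, unsigned long *lo)
{
    unsigned long carry = (*lo >> 31) & 1;    /* bit shifted out of lo */
    *lo <<= 1;
    *hi = (*hi << 1) | carry;
}

/* ASR2(hi, lo): shift hi:lo right one bit; the asr keeps the sign bit of
   hi, and the roxr feeds the bit shifted out of hi into the top of lo. */
static void asr2_ref(unsigned long *hi, unsigned long *lo)
{
    unsigned long carry = *hi & 1;            /* bit shifted out of hi */
    *hi = (unsigned long)((long)*hi >> 1);    /* assumes arithmetic >> */
    *lo = (*lo >> 1) | (carry << 31);
}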
/************************************************************************/
/* begin excerpt from dflonum.c */
/************************************************************************/

/* A working Gnulib68k, thanks to Scott McCauley for the original version,
   and John Dunning (jrd@STONY-BROOK.SCRC.Symbolics.COM) who got this
   working.  Please note that only the double-float stuff is picked up
   from here; the other long-int and single-float stuff comes from
   fixnum.s and sflonum.s (see flonum.h for the appr. #defs).  ++jrb */

/* Subroutines needed by GCC output code for the 68000/20.
   Compile using the -O flag with gcc.  Use the -m68000 flag if you have
   a 68000.

   This package includes 32-bit integer and 64-bit floating point routines.

   WARNING: alpha-test version (July 1988) -- probably full of bugs.
   If you find a bug, send bug reports to jsm@phoenix.princeton.edu, or

	Scott McCauley
	PPPL
	P. O. Box 451
	Princeton NJ 08543

   Known bugs/features:

   1) Depending on the version of GCC, this may produce a lot of warnings
      about conversion clashes, etc.  It should still compile as is.
      Also, there appears to be a bug in the register-allocation parts of
      gcc that makes the compiler sometimes core dump or run into an
      infinite loop.  This version works -- if you decide to change
      routines, these compiler bugs can bite very hard....

   2) all single-precision gets converted to double precision in the math
      routines (in accordance with K&R), so doing things in double
      precision is faster than single precision....  (not any more -- jrd)

   3) double precision floating point division may not be accurate to the
      last four or so bits.  Most other routines round to the lsb.
      (not any more -- jrd)

   4) no support of NaN and Inf

   5) beware of undefined operations: i.e. a float to integer conversion
      when the float is larger than MAXINT.
*/

#include "flonum.h"

#ifdef __GCC_OLD__
#define umultl	_umulsi
#define multl	_mulsi3
#define udivl	_udivsi3
#define divl	_divsi3
#define ddiv	_divdf3
#define qmult	_qmult
#define dmult	_muldf3
#define dneg	_negdf2
#define dadd	_adddf3
#define dcmp	_cmpdf2
#define dtoul	_fixunsdfsi
#define dtol	_fixdfsi
#define ltod	_floatsidf
#else
#define umultl	__umulsi
#define multl	__mulsi3
#define udivl	__udivsi3
#define divl	__divsi3
#define ddiv	__divdf3
#define qmult	__qmult
#define dmult	__muldf3
#define dneg	__negdf2
#define dadd	__adddf3
#define dcmp	__cmpdf2
#define dtoul	__fixunsdfsi
#define dtol	__fixdfsi
#define ltod	__floatsidf
#endif

#ifdef L_muldf3
/*
double _muldf3 (a, b)
double a, b;
{
  return a * b;
}
*/

#ifdef SLOW_KLUDGEY_QMULT
__qmult(m11, m12, m21, m22, p1, p2)
unsigned long m11, m12, m21, m22, *p1, *p2;
{
    /* register unsigned long d2 = m11; */
    register long d2 = m11;
    register unsigned long d3 = m12, d4 = m21, d5 = m22, d6 = 0, d7 = 0;
    int i;

    /* guess what happens if you delete the next line.... */
    /* &i; */

    for (i = 0; i < 11; i++) ASL2(d2, d3);
    for (i = 0; i < 9; i++) ASL2(d4, d5);
    for (i = 0; i < 64; i++) {
	if (d2 < 0) { ADD2(d4, d5, d6, d7); }
	ASL2(d2, d3);
	ASR2(d4, d5);
    }
    d2 = d6;
    d3 = d7;
    for (i = 0; i < 9; i++) ASR2(d2, d3);
    *p1 = d2;
    *p2 = d3;
}

#else /* new qmult */

/* a set of totally local kludges, for summing partial results into
   result vector */

#define XADDL(partial, target_ptr) \
{ register unsigned long temp = *target_ptr; \
  asm volatile("addl %2,%0" : "=d" (temp) : "d" (temp), "g" (partial)); \
  *target_ptr-- = temp; temp = *target_ptr; \
  asm volatile("addxl #0,%0" : "=d" (temp) : "d" (temp)); \
  *target_ptr = temp; }

static long constant_zero_kludge = 0;

#define XXADDL(partial, target) \
{ register unsigned long * zero = &constant_zero_kludge + 1; \
  asm volatile("addl %2,%0@" : "=a" (target) : "a" (target), "g" (partial)); \
  asm volatile("addxl %0@-,%1@-" : "=a" (zero) : "a" (target), "a" (zero)); }

/*
#define ADDL(partial, target_ptr) \
{ register unsigned long temp = *target_ptr; \
  asm volatile("addl %2,%0" : "=d" (temp) : "d" (temp), "g" (partial)); \
  *target_ptr-- = temp }

#define ADDXL(partial, target_ptr) \
{ register unsigned long temp = *target_ptr; \
  asm volatile("addxl %2,%0" : "=d" (temp) : "d" (temp), "g" (partial)); \
  *target_ptr-- = temp }

#define ADDW(partial, target_ptr) \
{ register unsigned short temp = *(unsigned short * )target_ptr; \
  asm volatile("addw %2,%0" : "=d" (temp) : "d" (temp), "g" (partial)); \
  *(unsigned short * )target_ptr-- = temp }

#define ADDXW(partial, target_ptr) \
{ register unsigned short temp = *(unsigned short * )target_ptr; \
  asm volatile("addxw %2,%0" : "=d" (temp) : "d" (temp), "g" (partial)); \
  *(unsigned short * )target_ptr-- = temp }
*/
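/* The following reference routine is NOT part of the original dflonum.c;
   it is an untested restatement (hypothetical name xxaddl_ref, assuming
   32-bit unsigned long) of what XXADDL(partial, target) is meant to do:
   add a 32-bit partial product into the result vector at *target and
   propagate the carry into the preceding long. */
static void xxaddl_ref(unsigned long partial, unsigned long *target)
{
    unsigned long old = *target;

    *target = old + partial;	/* the "addl %2,%0@" step */
    if (*target < old)		/* carry out of that add? */
	target[-1] += 1;	/* the "addxl" step: fold the carry into
				   the preceding long */
}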
static char component_index[] = {
    3, 3,	/* last ones */
    2, 3,	/* next to last x last */
    3, 2,	/* ... */
    1, 3,
    2, 2,
    3, 1,
    0, 3,
    1, 2,
    2, 1,
    3, 0,
    0, 2,
    1, 1,
    2, 0,
    0, 1,
    1, 0,
    0, 0
};

qmult(m1_hi, m1_lo, m2_hi, m2_lo, result_hi, result_lo)
unsigned long m1_hi, m1_lo, m2_hi, m2_lo, * result_hi, * result_lo;
{
    unsigned short * m1 = (unsigned short * )(&m1_hi);
    unsigned short * m2 = (unsigned short * )(&m2_hi);
    unsigned short result[10];		/* two extra for XADDL */
    register unsigned long mult_register;
    register unsigned long res1, res2, res3;
    long * kludge;
    short i, j;
    char * idx_p = (char * )&component_index;

/*  fprintf(stderr, " qmult: %08X:%08X * %08X:%08X\n",
	    m1_hi, m1_lo, m2_hi, m2_lo);  */

    for (i = 0 ; i < 10 ; i++)
	result[i] = 0;

    /* walk thru 'vectors' of mantissa pieces, doing unsigned multiplies
       and summing results into result vector.  Note that the order is
       chosen to guarantee that no more adds than generated by XADDL
       are needed. */
    for ( ; ; )
    {
	i = *idx_p++;
	j = *idx_p++;
	mult_register = m1[i];
	MUL(m2[j], mult_register);
	kludge = (long * )(&result[i + j + 2]);
	XXADDL(mult_register, kludge);
/*	fprintf(stderr, " qmult: %d[%04X] * %d[%04X] -> %08X\n",
		i, m1[i], j, m2[j], mult_register);
	fprintf(stderr, "   %04X %04X %04X %04X %04X %04X %04X %04X\n",
		result[2], result[3], result[4], result[5],
		result[6], result[7], result[8], result[9]);  */
	if ((i == 0) && (j == 0)) break;
    }

    /* get the result shifted to the right place.  Unfortunately, we need
       the 53 bits that's 22 bits down in the result vec.  sigh */
    kludge = (long * )(&result[2]);
    res1 = *kludge++;
    res2 = *kludge++;
    res3 = *kludge;
    for (i = 0 ; i < 12 ; i++)
	ASL3(res1, res2, res3);

    /* now put the result back */
    *result_hi = res1;
    *result_lo = res2;
}

#endif
#endif
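As a cross-check on the assembly version above, the partial-product
accumulation that component_index drives can be restated in portable C
roughly as follows.  This is a sketch only (hypothetical name qmult_ref,
assuming 32-bit unsigned long and the same big-endian 16-bit limb order);
it ripples carries all the way up instead of relying on the evaluation
order, so it does not need the XXADDL trick:

/* Schoolbook multiply of two 64-bit mantissas held as four 16-bit limbs
   each (most significant limb first), accumulating the 128-bit product
   into result[2..9], with result[0..1] as guard words, the same layout
   the assembly version uses. */
static void qmult_ref(const unsigned short m1[4], const unsigned short m2[4],
		      unsigned short result[10])
{
    int i, j, k;

    for (k = 0; k < 10; k++)
	result[k] = 0;

    for (i = 3; i >= 0; i--) {
	for (j = 3; j >= 0; j--) {
	    /* 16 x 16 -> 32 bit partial product */
	    unsigned long partial = (unsigned long)m1[i] * m2[j];
	    /* add it in at limb position i+j+2, rippling the carry up */
	    for (k = i + j + 3; partial != 0 && k >= 0; k--) {
		unsigned long sum = (unsigned long)result[k] + (partial & 0xFFFF);
		result[k] = (unsigned short)sum;
		partial = (partial >> 16) + (sum >> 16);
	    }
	}
    }
}

The real qmult then loads result[2] through result[7] into three longs,
shifts them left twelve bits with ASL3, and stores the top two longs
through result_hi and result_lo.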