[comp.arch] long = short*short ;

dgh%dgh@Sun.COM (David Hough) (07/22/89)
Sophomores of all ages have been recently aflame at the suggestion that
C perhaps may not be ideally suited to common tasks, because C provides
ready access to part of the hardware of some machines, while many
machines are designed to only implement what C will provide access to.

Part of the argument has been as to whether a double-precision product
of single-precision operands, or a single-precision quotient/remainder
pair resulting from a double-precision dividend and a single-precision
divisor, are worth providing in EFFICIENT hardware and supporting by
EFFICIENT language constructs.  A fair number of people have suggested
that these operations are too rare to consider.  I guess they don't run
ps or use printf or display floating-point numbers on their
spreadsheets.

I used to hear that at mainframe computer centers supporting beginning
(Fortran) programming courses for engineers, the biggest single use of
machine resources was formatting floating-point numbers for printing.
Computing is less centralized these days so it would be more trouble to
confirm that proposition.  I wouldn't have thought base conversion
would be in the inner loop of ANY realistic scientific applications
until a number of workstation customers convinced me otherwise.

The inner loops follow from a correctly-rounded binary<->decimal
floating-point conversion package that will eventually appear in a
commercial product.  The 1984 UCB thesis of Jerome Coonen discusses
fast methods and correctly-rounded methods but not fast
correctly-rounded methods; that's where the art comes in; it doesn't
show up in these inner loops but at a higher level.   That art is in a
proprietary implementation but I hope to eventually make available
publicly a test program that manifests when the art's missing, as it
usually is.

If you had to maintain such code for three or more CPU architectures
wouldn't you rather write it in a readily available language in a
natural form that was readily compiled into efficient code?   If so, I
am sorry to report that the machine-independent C code may tend to look
like the following:

/*
 * Fundamental utilities that multiply two shorts into a unsigned long, add
 * carry, compute quotient and remainder in underlying base, and return
 * quo<<16 | rem as  a unsigned long.
 */

/*
 * C compilers tend to generate bad code - forcing full unsigned long by
 * unsigned long multiplies when what is really wanted is the unsigned long
 * product of half-long operands. Similarly the quotient and remainder are
 * all half-long. So these functions should really be implemented by inline
 * expansion templates.
 */

unsigned long
_umac(x, y, c)			/* p = x * y + c ; return p */
	_BIG_FLOAT_DIGIT x, y;
	unsigned long   c;
{
	return x * (unsigned long) y + c;
}

unsigned long
_carry_in_b10000(x, c)		/* p = x + c ; return (p/10000 << 16 |
				 * p%10000) */
	_BIG_FLOAT_DIGIT x;
	long unsigned   c;
{
	unsigned long   p = x + c;

	return ((p / 10000) << 16) | (p % 10000);
}

void
_carry_propagate_two(carry, psignificand)
	unsigned long   carry;
	_BIG_FLOAT_DIGIT *psignificand;
{
	/*
	 * Propagate carries in a base-2**16 significand.
	 */

	long unsigned   p;
	int             j;

	j = 0;
	while (carry != 0) {
		p = _carry_in_b65536(psignificand[j], carry);
		psignificand[j++] = (_BIG_FLOAT_DIGIT) (p & 0xffff);
		carry = p >> 16;
	}
}

void
_carry_propagate_ten(carry, psignificand)
	unsigned long   carry;
	_BIG_FLOAT_DIGIT *psignificand;
{
	/*
	 * Propagate carries in a base-10**4 significand.
	 */

	int             j;
	unsigned long   p;

	j = 0;
	while (carry != 0) {
		p = _carry_in_b10000(psignificand[j], carry);
		psignificand[j++] = (_BIG_FLOAT_DIGIT) (p & 0xffff);
		carry = p >> 16;
	}
}

void
_fourdigitsquick(t, d)
	short unsigned  t;
	char           *d;

/* Converts t < 10000 into four ascii digits at *pc.     */

{
	register short  i;

	i = 3;
	do {
		d[i] = '0' + t % 10;
		t = t / 10;
	}
	while (--i != -1);
}

void
_multiply_base_two_vector(n, px, py, product)
	short unsigned  n, *py;
	_BIG_FLOAT_DIGIT *px, product[3];

{
	/*
	 * Given xi and yi, base 2**16 vectors of length n, computes dot
	 * product
	 * 
	 * sum (i=0,n-1) of x[i]*y[n-1-i]
	 * 
	 * Product may fill as many as three short-unsigned buckets. Product[0]
	 * is least significant, product[2] most.
	 */

	unsigned long   acc, p;
	short unsigned  carry;
	int             i;

	acc = 0;
	carry = 0;
	for (i = 0; i < n; i++) {
		p = _umac(px[i], py[n - 1 - i], acc);
		if (p < acc)
			carry++;
		acc = p;
	}
	product[0] = (_BIG_FLOAT_DIGIT) (acc & 0xffff);
	product[1] = (_BIG_FLOAT_DIGIT) (acc >> 16);
	product[2] = (_BIG_FLOAT_DIGIT) (carry);
}

void
_multiply_base_ten_vector(n, px, py, product)
	short unsigned  n, *py;
	_BIG_FLOAT_DIGIT *px, product[3];

{
	/*
	 * Given xi and yi, base 10**4 vectors of length n, computes dot
	 * product
	 * 
	 * sum (i=0,n-1) of x[i]*y[n-1-i]
	 * 
	 * Product may fill as many as three short-unsigned buckets. Product[0]
	 * is least significant, product[2] most.
	 */

#define ABASE	3000000000	/* Base of accumulator. */

	unsigned long   acc;
	short unsigned  carry;
	int             i;

	acc = 0;
	carry = 0;
	for (i = 0; i < n; i++) {
		acc = _umac(px[i], py[n - 1 - i], acc);
		if (acc >= (unsigned long) ABASE) {
			carry++;
			acc -= ABASE;
		}
	}
	/*
	 * NOTE: because acc * <= ABASE-1, acc/10000 <= 299999 which would
	 * overflow a short unsigned
	 */
	product[0] = (_BIG_FLOAT_DIGIT) (acc % 10000);
	acc /= 10000;
	product[1] = (_BIG_FLOAT_DIGIT) (acc % 10000);
	acc /= 10000;
	product[2] = (_BIG_FLOAT_DIGIT) (acc + (ABASE / 100000000) * carry);
}

David Hough

na.hough@na-net.stanford.edu