[comp.arch] machine architecture influence on memory copy code

chris@mimsy.UUCP (Chris Torek) (08/01/88)
[The following was sent to me as mail to work around a silly
restriction.  I, Chris Torek, am merely forwarding this; I did not
write it.  The following is, rather, in reply to something I wrote.]

You said that the obvious bcopy() loop can possibly be improved; I wrote
a procedure to implement bcopy() efficiently long ago, and you will see
how many compile-time parameters it has.

One of these, which is very, very important, is the size of the cache
line for cached machines that do write-through. Consider copying just one
byte at a time on a machine like the 780, where the line size is 8 bytes:
for every byte written, the cache initiates an SBI transaction to write
the 8-byte line containing that byte back to memory.

Notice  that  I  am using several #define's and typedef'ed names; I hope
that their meaning is obvious.

/*
    This function is handed pointers to two memory areas, and copies
    as  many  units  as it is told from the second to the first. The
    two  areas  are  expected to begin at any byte boundary, and the
    size  is  given  in  bytes  too. 

    If  the memory subsystem of the machine handles more efficiently
    naturally aligned requests in clusters (multiples of a unit), we
    try to take advantage of that. Since we cannot take advantage of
    moving clusters for both source and destination, we optimize the
    writing of clusters, of course...
*/

#include "Cpu.h"
#include "Comp.h"
#include "Os.h"
#include "Unix.h"
#include "Where.h"

#include "Sizes.h"
#include "Types.h"
#include "Convert.h"

#include "Extend.h"

/*
    CoreCopy(to,from,bytes)

    Copy `bytes' bytes from `from' to `to' in ascending address
    order and return `to' advanced past the last byte written.
    A count of zero copies nothing. Overlapping areas get no
    special treatment.

    Two bodies are selected at compile time: a plain byte loop,
    or a version that aligns the DESTINATION on a cluster boundary
    and then moves whole clusters. In the clustered version only
    the destination is aligned; the source is read through a
    Cluster pointer at whatever alignment it happens to have.
    NOTE(review): that presumes the target machine tolerates
    misaligned reads -- confirm for each port.
*/
public pointer		CoreCopy(to,from,bytes)

#if (ClusterBYTES == 1 || !(MemFEATURE & MemDCACHE))
    /*
	No clustering, or no data cache: the byte loop is all
	we need.
	NOTE(review): in this file ClusterBYTES is only defined
	further down, in the #else branch; unless an included
	header defines it, it counts as 0 in the test above.
    */
    register pointer	to;
    register pointer	from;
    register offset	bytes;
{
    while (bytes > 0)
	*(byte *) to++ = *(byte *) from++, bytes--;

    return to;
}
#else
    /*
	If the size of a physical memory line is not defined, we
	assume it is the size of the smallest addressable mem unit
    */
#   if (!defined MemSZLINE)
#	define ClusterBITS	(CpuUNIT)
#   else
#	define ClusterBITS	(MemSZLINE*CpuUNIT)
#   endif

    /*
	Choose the C type that moves one cluster. The tests are
	mutually exclusive, widest type first, so Cluster is
	defined exactly once even when two integral types share
	a width (int == long, or short == int). Defining it twice
	with different bodies is an error for an ISO preprocessor.
	A line that is a multiple of a long is moved one long at
	a time, hence ClusterBITS is clamped to LongBITS.
    */
#   if (ClusterBITS >= LongBITS && (ClusterBITS % LongBITS) == 0)
#	define Cluster		long
#	undef ClusterBITS
#	define ClusterBITS	LongBITS
#   endif
#   if (!defined Cluster && ClusterBITS == IntBITS)
#	define Cluster	int
#   endif
#   if (!defined Cluster && ClusterBITS == ShortBITS)
#	define Cluster	short
#   endif

    /*
	The bogus #include forces a compile-time failure.
    */
#   if (!defined Cluster)
#	include "ERROR: cannot define a sensible Cluster"
#   endif

#   if ((ClusterBITS % ByteBITS) == 0)
#	define ClusterBYTES	BYTESOFBITS(ClusterBITS)
#   else
#	include "ERROR: Cluster is not an even # of bytes"
#   endif

    register pointer	to;
    register pointer	from;
    offset			bytes;
{
    register offset		pieces;
    offset			headbytes,clusters,tailbytes;

    /*
	Below this many bytes the head/cluster/tail split is not
	worth its setup cost. The numerical factors here are highly
	machine (and compiler) dependent.
    */
#   if (MemFEATURE & MemWTHRU)
#	define ClusterADVANTAGE	(ClusterBYTES*3)
#   else
#	define ClusterADVANTAGE (ClusterBYTES*6)
#   endif

    if ((pieces = bytes) < ClusterADVANTAGE)
    {
	while (pieces > 0)
	    *(byte *) to++ = *(byte *) from++, pieces--;

	return to;
    }

    /*
	headbytes: bytes needed to bring `to' up to the next
	cluster boundary (0 if already aligned).
	tailbytes: bytes left over after the last whole cluster.
	Once the head is copied the destination is aligned, so
	the tail is simply what remains modulo the cluster size.
	Computing it from the counts rather than from the value
	of to+bytes keeps it non-negative whatever the signedness
	of offset and wherever the address lies.
	bytes >= ClusterADVANTAGE >= 3 clusters here, and head
	plus tail is under 2 clusters, so clusters is at least 1.
    */
    headbytes	= (ClusterBYTES-((offset) to % ClusterBYTES)) % ClusterBYTES;
    tailbytes	= (bytes-headbytes) % ClusterBYTES;
    clusters	= (bytes-headbytes-tailbytes) / ClusterBYTES;

    /*
	Why not a for(){} ? Because on many machines we have
	a decrement and jump on zero/not zero opcode...
    */
    if ((pieces = headbytes) != 0)
	do *(byte *) to++ = *(byte *) from++;
	while (--pieces != 0);

    /*
	This   is  hard  to  believe.  If  the  CPU  has  scaled
	postincrement,  why ought we to do it manually ? Because
	we  might  have to create two new register pointers with
	the  right  scale,  and  maybe  there are  no other spare
	registers.
    */
    if ((pieces = clusters) != 0)
#	if (CpuIS & (CpuVAX11|CpuPDP11|CpuNS32000)		\
	    && (CodeREGISTER >= 5 || CodePREGISTER >= 4))
	{
	    /* Enough registers: use properly typed pointers. */
	    register Cluster	*target;
	    register Cluster	*source;
    
	    target = (Cluster *) to; source = (Cluster *) from;
	    do *target++ = *source++;
	    while (--pieces != 0);
	    to = (pointer) target; from = (pointer) source;
	}
#	else
	    /* Otherwise cast in place and step by hand. */
	    do
	    {
		*(Cluster *) to = *(Cluster *) from;
		to += ClusterBYTES; from += ClusterBYTES;
	    }
	    while (--pieces != 0);
#	endif

    if ((pieces = tailbytes) != 0)
	do *(byte *) to++ = *(byte *) from++;
	while (--pieces != 0);

    return to;
}
#endif


-- 
In-Real-Life: Chris Torek, Univ of MD Comp Sci Dept (+1 301 454 7163)
Domain:	chris@mimsy.umd.edu	Path:	uunet!mimsy!chris