chris@mimsy.UUCP (Chris Torek) (08/01/88)
[The following was sent to me as mail to work around a silly
restriction. I, Chris Torek, am merely forwarding this; I did not
write it. The following is, rather, in reply to something I wrote.]
You said that the obvious bcopy() loop can possibly be improved; I wrote
a procedure to implement bcopy() efficiently long ago, and you will see
how many compile-time parameters it has.
One of these, which is very, very important, is the size of the cache
line on cached machines that do write-through. Consider copying just one
byte at a time on a machine like the 780, where the line size is 8: for
every byte written, the cache will initiate an SBI transaction to write
back to memory the 8-byte line that the byte is in.
Notice that I am using several #define's and typedef'ed names; I hope
that their meaning is obvious.
/*
This function is handed pointers to two memory areas, and copies
as many units as it is told from the second to the first. The
two areas are expected to begin at any byte boundary, and the
size is given in bytes too.
If the memory subsystem of the machine handles more efficiently
naturally aligned requests in clusters (multiples of a unit), we
try to take advantage of that. Since we cannot take advantage of
moving clusters for both source and destination, we optimize the
writing of clusters, of course...
*/
#include "Cpu.h"
#include "Comp.h"
#include "Os.h"
#include "Unix.h"
#include "Where.h"
#include "Sizes.h"
#include "Types.h"
#include "Convert.h"
#include "Extend.h"
/*
    CoreCopy(to,from,bytes)

    Copies `bytes' bytes from `from' to `to', lowest address first,
    and returns `to' as advanced by the copy, i.e. the address just
    past the last destination byte written.  A count that is zero or
    negative copies nothing and returns `to' unchanged.

    Two implementations are selected at compile time:

      - a plain byte-at-a-time loop;

      - a clustered copy, which moves single bytes until the
	destination is Cluster aligned, then whole Clusters, then
	the remaining single bytes.
*/
public pointer CoreCopy(to,from,bytes)
#if (ClusterBYTES == 1 || !(MemFEATURE & MemDCACHE))
    /*
	No clustering, and no data cache either...

	NOTE(review): ClusterBYTES is only #define'd further down,
	in the #else arm.  Unless one of the included headers also
	defines it, the preprocessor evaluates it as 0 here and this
	arm is chosen by the MemDCACHE test alone -- TODO confirm.
    */
    register pointer	to;
    register pointer	from;
    register offset	bytes;
{
    while (bytes > 0)
	*(byte *) to++ = *(byte *) from++, bytes--;

    return to;
}
#else
    /*
	If the size of a physical memory line is not defined, we
	assume it is the size of the smallest addressable mem unit
    */
#   if (!defined MemSZLINE)
#	define ClusterBITS	(CpuUNIT)
#   else
#	define ClusterBITS	(MemSZLINE*CpuUNIT)
#   endif

    /*
	Pick the C type used to move one Cluster.  More than one of
	the tests below can succeed when two integer types have the
	same width, so each later match #undef's the earlier choice
	before redefining it; the last (widest) match wins and the
	macro is never redefined with a different body.
    */
#   if (ClusterBITS == ShortBITS)
#	define Cluster		short
#   endif

#   if (ClusterBITS == IntBITS)
#	undef  Cluster
#	define Cluster		int
#   endif

    /*
	A line that is a whole number of longs is moved one long at
	a time, so ClusterBITS is cut down to LongBITS.
    */
#   if (ClusterBITS >= LongBITS && (ClusterBITS % LongBITS) == 0)
#	undef  Cluster
#	define Cluster		long
#	undef  ClusterBITS
#	define ClusterBITS	LongBITS
#   endif

#   if (!defined Cluster)
#	include "ERROR: cannot define a sensible Cluster"
#   endif

#   if ((ClusterBITS % ByteBITS) == 0)
#	define ClusterBYTES	BYTESOFBITS(ClusterBITS)
#   else
#	include "ERROR: Cluster is not an even # of bytes"
#   endif

    register pointer	to;
    register pointer	from;
    offset		bytes;
{
    register offset	pieces;
    offset		headbytes,clusters,tailbytes;

    /*
	The numerical factors here are highly machine (and compiler)
	dependent
    */
#   if (MemFEATURE & MemWTHRU)
#	define ClusterADVANTAGE	(ClusterBYTES*3)
#   else
#	define ClusterADVANTAGE	(ClusterBYTES*6)
#   endif

    /*
	Below the threshold the setup is not worth it: copy bytes.
    */
    if ((pieces = bytes) < ClusterADVANTAGE)
    {
	while (pieces > 0)
	    *(byte *) to++ = *(byte *) from++, pieces--;

	return to;
    }

    /*
	headbytes is the number of single bytes needed to bring the
	destination up to a Cluster boundary.  What is left after
	that is split into whole Clusters and a tail of single
	bytes.  Here bytes >= ClusterADVANTAGE, which is several
	Clusters, so bytes-headbytes is not negative.

	The split is taken from the remaining count rather than from
	the end address: the definition of `offset' is not visible
	here, and if it is a signed type a destination address with
	the top bit set could make an address-based remainder
	negative, which the "--pieces != 0" tail loop below would
	not survive.
    */
    headbytes = (ClusterBYTES-((offset) to % ClusterBYTES)) % ClusterBYTES;
    clusters  = (bytes-headbytes) / ClusterBYTES;
    tailbytes = (bytes-headbytes) % ClusterBYTES;

    /*
	Why not a for(){} ? Because on many machines we have
	a decrement and jump on zero/not zero opcode...
    */

    if ((pieces = headbytes) != 0)
	do *(byte *) to++ = *(byte *) from++;
	while (--pieces != 0);

    /*
	This is hard to believe. If the CPU has scaled
	postincrement, why ought we to do it manually ? Because
	we might have to create two new register pointers with
	the right scale, and maybe there are no other spare
	registers.

	Only the destination has been aligned; the source is read
	through a Cluster pointer at whatever alignment it happens
	to have.
	NOTE(review): presumably every configured target tolerates
	unaligned Cluster reads -- confirm before adding a new CPU.
    */
    if ((pieces = clusters) != 0)
#   if (CpuIS & (CpuVAX11|CpuPDP11|CpuNS32000) \
	    && (CodeREGISTER >= 5 || CodePREGISTER >= 4))
    {
	register Cluster    *target;
	register Cluster    *source;

	target = (Cluster *) to; source = (Cluster *) from;

	do *target++ = *source++;
	while (--pieces != 0);

	to = (pointer) target; from = (pointer) source;
    }
#   else
	do
	{
	    *(Cluster *) to = *(Cluster *) from;
	    to += ClusterBYTES; from += ClusterBYTES;
	}
	while (--pieces != 0);
#   endif

    if ((pieces = tailbytes) != 0)
	do *(byte *) to++ = *(byte *) from++;
	while (--pieces != 0);

    return to;
}
#endif
--
In-Real-Life: Chris Torek, Univ of MD Comp Sci Dept (+1 301 454 7163)
Domain: chris@mimsy.umd.edu Path: uunet!mimsy!chris