chris@mimsy.UUCP (Chris Torek) (08/01/88)
[The following was sent to me as mail to work around a silly restriction. I, Chris Torek, am merely forwarding this; I did not write it. The following is, rather, in reply to something I wrote.] You said that the obvious bcopy() loop can be possibly improved; I have long ago written a procedure to implement bcopy() efficiently, and you will see how many compile time parameters it has. One of these, which is very very important, is the size of the cache line for cached machines that do write thru. Consider copying just one byte at a time on amchine like the 780 where the line size is 8: for every byte written the cache shall initiate an SBI transaction to write back to memory the 8 bytes line it is in. Notice that I am using several #define's and typedef'ed names; I hope that their meaning is obvious.

/*
 * CoreCopy -- copy `bytes' bytes from `from' to `to'.
 *
 * This function is handed pointers to two memory areas, and copies as
 * many bytes as it is told from the second to the first.  The two areas
 * are expected to begin at any byte boundary, and the size is given in
 * bytes too.
 *
 * If the memory subsystem of the machine handles more efficiently
 * naturally aligned requests in clusters (multiples of a unit), we try
 * to take advantage of that.  Since we cannot take advantage of moving
 * clusters for both source and destination, we optimize the writing of
 * clusters, of course: the destination is brought to a cluster
 * boundary, the source is left wherever it happens to fall.
 *
 * Parameters:
 *   to     destination area; advanced as bytes are stored
 *   from   source area; advanced as bytes are fetched
 *   bytes  number of bytes to copy
 *
 * Returns `to' as advanced by the copy, i.e. the address just past the
 * last byte stored.
 *
 * The copy always proceeds in ascending address order and nothing here
 * checks for overlapping areas.
 *
 * Exactly one of the two function bodies below is compiled, chosen by
 * the first #if.  The types (pointer, offset, byte) and the
 * configuration macros (MemFEATURE, CpuIS, ShortBITS, ...) are not
 * defined in this file; presumably they come from the headers included
 * here -- confirm there.
 */

#include "Cpu.h"
#include "Comp.h"
#include "Os.h"
#include "Unix.h"
#include "Where.h"
#include "Sizes.h"
#include "Types.h"
#include "Convert.h"
#include "Extend.h"

public pointer CoreCopy(to,from,bytes)

/*
 * NOTE(review): ClusterBYTES is tested here, but this file only defines
 * it further down, inside the #else branch.  An identifier that is not
 * defined evaluates to 0 in #if, so unless a header supplies
 * ClusterBYTES the first comparison is always false and only the
 * MemDCACHE test matters -- confirm which was intended.
 */
#if (ClusterBYTES == 1 || !(MemFEATURE & MemDCACHE))

    /*
        No clustering, or no data cache either: there is nothing to
        gain from wide stores, so a plain byte-at-a-time loop is used.
    */

    register pointer	to;
    register pointer	from;
    register offset	bytes;
{
    while (bytes > 0)
	*(byte *) to++ = *(byte *) from++, bytes--;

    return to;
}

#else

    /*
        If the size of a physical memory line is not defined, we assume
        it is the size of the smallest addressable mem unit
    */

#   if (!defined MemSZLINE)
#	define ClusterBITS	(CpuUNIT)
#   else
#	define ClusterBITS	(MemSZLINE*CpuUNIT)
#   endif

    /*
        Pick the C type whose width matches the cluster.  The three
        tests are independent of each other.
        NOTE(review): if ShortBITS == IntBITS, or IntBITS == LongBITS,
        more than one test fires and Cluster is #define'd again with a
        different body -- confirm the compilers in use accept that.
    */

#   if (ClusterBITS == ShortBITS)
#	define Cluster		short
#   endif

#   if (ClusterBITS == IntBITS)
#	define Cluster		int
#   endif

    /*
        A cluster that is a whole number of longs is moved one long at
        a time, so ClusterBITS is clamped down to LongBITS.
    */

#   if (ClusterBITS >= LongBITS && (ClusterBITS % LongBITS) == 0)
#	define Cluster		long
#	undef ClusterBITS
#	define ClusterBITS	LongBITS
#   endif

    /*
        The #include of a quoted "ERROR: ..." name is used to stop the
        compilation with a readable message; presumably it stands in
        for #error on preprocessors that lack it.
    */

#   if (!defined Cluster)
#	include "ERROR: cannot define a sensible Cluster"
#   endif

#   if ((ClusterBITS % ByteBITS) == 0)
#	define ClusterBYTES	BYTESOFBITS(ClusterBITS)
#   else
#	include "ERROR: Cluster is not an even # of bytes"
#   endif

    register pointer	to;
    register pointer	from;
    offset		bytes;
{
    /* pieces is the one countdown variable shared by every loop below */
    register offset	pieces;
    offset		headbytes,clusters,tailbytes;

    /*
        The numerical factors here are highly machine (and compiler)
        dependent.  ClusterADVANTAGE is the copy length below which the
        head/cluster/tail split is not attempted at all.
    */

#   if (MemFEATURE & MemWTHRU)
#	define ClusterADVANTAGE    (ClusterBYTES*3)
#   else
#	define ClusterADVANTAGE    (ClusterBYTES*6)
#   endif

    /* Short copy: plain byte loop, then return the advanced destination */
    if ((pieces = bytes) < ClusterADVANTAGE)
    {
	while (pieces > 0)
	    *(byte *) to++ = *(byte *) from++, pieces--;

	return to;
    }

    /*
        Split the destination into three parts:
          headbytes  bytes needed to bring `to' up to the next
                     ClusterBYTES boundary (0 if already on one);
          tailbytes  bytes lying past the last ClusterBYTES boundary
                     at the end of the destination;
          clusters   whole clusters in between.
        E.g. ClusterBYTES 4, `to' one byte short of a boundary,
        bytes 20: headbytes 1, tailbytes 3, clusters 4.
        bytes is at least ClusterADVANTAGE (3 clusters or more) here,
        while headbytes and tailbytes are each below ClusterBYTES, so
        the subtraction cannot go negative.
    */

    headbytes	= (ClusterBYTES-((offset) to % ClusterBYTES)) % ClusterBYTES;
    tailbytes	= ((offset) to + bytes) % ClusterBYTES;
    clusters	= (bytes-headbytes-tailbytes) / ClusterBYTES;

    /*
        Why not a for(){} ? Because on many machines we have a
        decrement and jump on zero/not zero opcode...
    */

    /* Head: byte stores up to the first cluster boundary of `to' */
    if ((pieces = headbytes) != 0)
	do *(byte *) to++ = *(byte *) from++;
	while (--pieces != 0);

    /*
        This is hard to believe. If the CPU has scaled postincrement,
        why ought we to do it manually ? Because we might have to
        create two new register pointers with the right scale, and
        maybe there are no other spare registers.

        Either way the body moves one Cluster per iteration.  `to' is
        on a cluster boundary at this point.
        NOTE(review): nothing here aligns `from'; the Cluster-sized
        fetch assumes the machine tolerates a source at any byte
        offset -- confirm for each supported CPU.
    */

    if ((pieces = clusters) != 0)
#   if (CpuIS & (CpuVAX11|CpuPDP11|CpuNS32000) \
	&& (CodeREGISTER >= 5 || CodePREGISTER >= 4))
    {
	/* Enough registers: use typed pointers and postincrement */
	register Cluster	*target;
	register Cluster	*source;

	target = (Cluster *) to;
	source = (Cluster *) from;

	do *target++ = *source++;
	while (--pieces != 0);

	/* Hand the advanced positions back for the tail loop */
	to	= (pointer) target;
	from	= (pointer) source;
    }
#   else
	/*
	    Step the original pointers by hand instead.
	    Assumes arithmetic on `pointer' is in byte units -- TODO
	    confirm against its typedef.
	*/
	do
	{
	    *(Cluster *) to = *(Cluster *) from;
	    to += ClusterBYTES; from += ClusterBYTES;
	}
	while (--pieces != 0);
#   endif

    /* Tail: byte stores past the last whole cluster */
    if ((pieces = tailbytes) != 0)
	do *(byte *) to++ = *(byte *) from++;
	while (--pieces != 0);

    return to;
}

#endif
-- In-Real-Life: Chris Torek, Univ of MD Comp Sci Dept (+1 301 454 7163) Domain: chris@mimsy.umd.edu Path: uunet!mimsy!chris