ewilts%Ins.MRC.AdhocNet.CA%Stasis.MRC.AdhocNet.CA%UNCAEDU.@CORNELLC.CCS.CORNELL.EDU.BITNET (Ed Wilts) (06/24/88)
$Part08: $ File_is="ARCLZW.C" $ Check_Sum_is=606678970 $ Copy SYS$Input VMS_SHAR_DUMMY.DUMMY Xstatic char *RCSid = "$Header: arclzw.c,v 1.2 86/07/15 07:53:20 turner Exp $"; X X/* X * $Log:`009arclzw.c,v $ X * Hack-attack 1.3 86/12/20 01:23:45 wilhite@usceast.uucp X * `009Bludgeoned into submission for VAX 11/780 BSD4.2 X *`009(ugly code, but fewer core dumps) X * X * Revision 1.2 86/07/15 07:53:20 turner X * X * X * Revision 1.1 86/06/26 15:00:26 turner X * initial version X * X * X */ X X/* ARC - Archive utility - ARCLZW X X$define(tag,$$segment(@1,$$index(@1,=)+1))# X$define(version,Version $tag( XTED_VERSION DB =1.88), created on $tag( XTED_DATE DB =01/20/86) at $tag( XTED_TIME DB =16:47:04))# X$undefine(tag)# X $version X X(C) COPYRIGHT 1985 by System Enhancement Associates; ALL RIGHTS RESERVED X X By: Thom Henderson X X Description: X This file contains the routines used to implement Lempel-Zev X data compression, which calls for building a coding table on X the fly. This form of compression is especially good for encoding X files which contain repeated strings, and can often give dramatic X improvements over traditional Huffman SQueezing. X X Language: X Computer Innovations Optimizing C86 X X Programming notes: X In this section I am drawing heavily on the COMPRESS program X from UNIX. The basic method is taken from "A Technique for High X Performance Data Compression", Terry A. Welch, IEEE Computer X Vol 17, No 6 (June 1984), pp 8-19. Also see "Knuth's Fundamental X Algorithms", Donald Knuth, Vol 3, Section 6.4. X X As best as I can tell, this method works by tracing down a hash X table of code strings where each entry has the property: X X if <string> <char> is in the table X then <string> is in the table. X*/ X#include <stdio.h> X#include "arc.h" X X/* definitions for older style crunching */ X X#define FALSE 0 X#define TRUE !FALSE X#define TABSIZE 4096 X#define NO_PRED 0xFFFF X#define EMPTY 0xFFFF X#define NOT_FND 0xFFFF X Xstatic unsigned INT inbuf; /* partial input code storage */ Xstatic INT sp; /* current stack pointer */ X Xstatic struct entry /* string table entry format */ X{ char used; /* true when this entry is in use */ X unsigned INT next; /* ptr to next in collision list */ X unsigned INT predecessor; /* code for preceeding string */ X unsigned char follower; /* char following string */ X} string_tab[TABSIZE]; /* the code string table */ X X X/* definitions for the new dynamic Lempel-Zev crunching */ X X#define BITS 12 /* maximum bits per code */ X#define HSIZE 5003 /* 80% occupancy */ X#define INIT_BITS 9 /* initial number of bits/code */ X Xstatic INT n_bits; /* number of bits/code */ Xstatic INT maxcode; /* maximum code, given n_bits */ X#define MAXCODE(n) ((1<<(n)) - 1) /* maximum code calculation */ Xstatic INT maxcodemax = 1 << BITS; /* largest possible code (+1) */ X Xstatic unsigned char buf[BITS]; /* input/output buffer */ X Xstatic unsigned char lmask[9] = /* left side masks */ X{ 0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00 }; Xstatic unsigned char rmask[9] = /* right side masks */ X{ 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff}; X Xstatic INT offset; /* byte offset for code output */ Xstatic long in_count; /* length of input */ Xstatic long bytes_out; /* length of compressed output */ Xstatic unsigned INT ent; X X/* To save much memory (which we badly need at this point), we overlay X * the table used by the previous version of Lempel-Zev with those used X * by the new version. Since no two of these routines will be used X * together, we can safely do this. Note that the tables used for Huffman X * squeezing may NOT overlay these, since squeezing and crunching are done X * in parallel. X */ X X#if MSDOS Xstatic long *htab = string_tab; /* hash code table (crunch) */ X#endif X#if BSD | ST Xstatic long htab[HSIZE]; /* hash code table (crunch) */ X#endif Xstatic unsigned INT codetab[HSIZE]; /* string code table (crunch) */ X Xstatic unsigned INT *prefix = codetab; /* prefix code table (uncrunch) */ X X#if MSDOS Xstatic unsigned char *suffix = string_tab; /* suffix table (uncrunch) */ X#endif X#if BSD | ST Xstatic unsigned char suffix[HSIZE]; /* suffix table (uncrunch) */ X#endif Xstatic INT free_ent; /* first unused entry */ Xstatic INT firstcmp; /* true at start of compression */ Xstatic unsigned char stack[HSIZE]; /* local push/pop stack */ X X/* X * block compression parameters -- after all codes are used up, X * and compression rate changes, start over. X */ X Xstatic INT clear_flg; Xstatic long ratio; X#define CHECK_GAP 10000 /* ratio check interval */ Xstatic long checkpoint; X X/* X * the next two codes should not be changed lightly, as they must not X * lie within the contiguous general code space. X */ X#define FIRST 257 /* first free entry */ X#define CLEAR 256 /* table clear output code */ X Xstatic INT cl_block(t) /* table clear for block compress */ XFILE *t; /* our output file */ X{ X long rat; X INT putcode(); X X checkpoint = in_count + CHECK_GAP; X X if(in_count > 0x007fffff) /* shift will overflow */ X { rat = bytes_out >> 8; X if(rat == 0) /* Don't divide by zero */ X rat = 0x7fffffff; X else rat = in_count / rat; X } X else rat = (in_count<<8)/bytes_out;/* 8 fractional bits */ X X if(rat > ratio) X ratio = rat; X else X { ratio = 0; X setmem`009(htab,HSIZE*sizeof(long),0xff); X free_ent = FIRST; X clear_flg = 1; X putcode(CLEAR,t); X } X} X X/***************************************************************** X * X * Output a given code. X * Inputs: X * code: A n_bits-bit integer. If == -1, then EOF. This assumes X * that n_bits =< (long)wordsize - 1. X * Outputs: X * Outputs code to the file. X * Assumptions: X * Chars are 8 bits long. X * Algorithm: X * Maintain a BITS character long buffer (so that 8 codes will X * fit in it exactly). When the buffer fills up empty it and start over. X */ X Xstatic INT putcode(code,t) /* output a code */ XINT code; /* code to output */ XFILE *t; /* where to put it */ X{ X INT r_off = offset; /* right offset */ X INT bits = n_bits; /* bits to go */ X unsigned char *bp = buf; /* buffer pointer */ X INT n; /* index */ X X if(code >= 0) /* if a real code */ X { /* X * Get to the first byte. X */ X bp += (r_off >> 3); X r_off &= 7; X X /* X * Since code is always >= 8 bits, only need to mask the first X * hunk on the left. X */ X *bp = (*bp&rmask[r_off]) | (code<<r_off) & lmask[r_off]; X bp++; X bits -= (8 - r_off); X code >>= (8 - r_off); X X /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */ X if(bits >= 8) X { *bp++ = code; X code >>= 8; X bits -= 8; X } X X /* Last bits. */ X if(bits) X *bp = code; X X offset += n_bits; X X if(offset == (n_bits << 3)) X { bp = buf; X bits = n_bits; X bytes_out += bits; X do X putc_pak(*bp++,t); X while(--bits); X offset = 0; X } X X /* X * If the next entry is going to be too big for the code size, X * then increase it, if possible. X */ X if(free_ent>maxcode || clear_flg>0) X { /* X * Write the whole buffer, because the input side won't X * discover the size increase until after it has read it. X */ X if(offset > 0) X { bp = buf; /* reset pointer for writing */ X bytes_out += n = n_bits; X while(n--) X putc_pak(*bp++,t); X } X offset = 0; X X if(clear_flg) /* reset if clearing */ X { maxcode = MAXCODE(n_bits = INIT_BITS); X clear_flg = 0; X } X else /* else use more bits */ X { n_bits++; X if(n_bits == BITS) X maxcode = maxcodemax; X else X maxcode = MAXCODE(n_bits); X } X } X } X X else /* dump the buffer on EOF */ X { bytes_out += n = (offset+7) / 8; X X if(offset > 0) X while(n--) X putc_pak(*bp++,t); X offset = 0; X } X} X X/***************************************************************** X * X * Read one code from the standard input. If EOF, return -1. X * Inputs: X * cmpin X * Outputs: X * code or -1 is returned. X */ X Xstatic INT getcode(f) /* get a code */ XFILE *f; /* file to get from */ X{ X INT code; X static INT offset = 0, size = 0; X INT r_off, bits; X unsigned char *bp = buf; X X if(clear_flg > 0 || offset >= size || free_ent > maxcode) X { /* X * If the next entry will be too big for the current code X * size, then we must increase the size. This implies reading X * a new buffer full, too. X */ X if(free_ent > maxcode) X { n_bits++; X if(n_bits == BITS) X maxcode = maxcodemax; /* won't get any bigger now */ X else maxcode = MAXCODE(n_bits); X } X if(clear_flg > 0) X { maxcode = MAXCODE(n_bits = INIT_BITS); X clear_flg = 0; X } X X for(size=0; size<n_bits; size++) X { if((code=getc_unp(f))==EOF) X break; X else buf[size] = code; X } X if(size <= 0) X return -1; /* end of file */ X X offset = 0; X /* Round size down to integral number of codes */ X size = (size << 3)-(n_bits - 1); X } X r_off = offset; X bits = n_bits; X X /* X * Get to the first byte. X */ X bp +=(r_off >> 3); X r_off &= 7; X X /* Get first part (low order bits) */ X code = (*bp++ >> r_off); X bits -= 8 - r_off; X r_off = 8 - r_off; /* now, offset into code word */ X X /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */ X if(bits >= 8) X { code |= *bp++ << r_off; X r_off += 8; X bits -= 8; X } X /* high order bits. */ X code |= (*bp & rmask[bits]) << r_off; X offset += n_bits; X X return code; X} X X/* X * compress a file X * X * Algorithm: use open addressing double hashing (no chaining) on the X * prefix code / next character combination. We do a variant of Knuth's X * algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime X * secondary probe. Here, the modular division first probe is gives way X * to a faster exclusive-or manipulation. Also do block compression with X * an adaptive reset, where the code table is cleared when the compression X * ratio decreases, but after the table fills. The variable-length output X * codes are re-sized at this point, and a special CLEAR code is generated X * for the decompressor. X */ X XINT init_cm(f,t) /* initialize for compression */ XFILE *f; /* file we will be compressing */ XFILE *t; /* where we will put it */ X{ X offset = 0; X bytes_out = 1; X clear_flg = 0; X ratio = 0; X in_count = 1; X checkpoint = CHECK_GAP; X maxcode = MAXCODE(n_bits = INIT_BITS); X free_ent = FIRST; X setmem(htab,HSIZE*sizeof(long),0xff); X n_bits = INIT_BITS; /* set starting code size */ X X putc_pak(BITS,t); /* note our max code length */ X X firstcmp = 1; /* next byte will be first */ X} X XINT putc_cm(c,t) /* compress a character */ Xunsigned char c; /* character to compress */ XFILE *t; /* where to put it */ X{ X static long fcode; X static INT hshift; X register INT i; X register INT disp; X X if(firstcmp) /* special case for first byte */ X { ent = c; /* remember first byte */ X X hshift = 0; X for(fcode=(long)HSIZE; fcode<65536L; fcode*=2L) X hshift++; X hshift = 8 - hshift; /* set hash code range bund */ X X firstcmp = 0; /* no longer first */ X return; X } X X in_count++; X fcode =(long)(((long)c << BITS)+ent); X i = (c<<hshift)^ent; /* xor hashing */ X X if(htab[i]==fcode) X { ent = codetab[i]; X return; X } X else if(htab[i]<0) /* empty slot */ X goto nomatch; X disp = HSIZE - i; /* secondary hash (after G.Knott) */ X if(i == 0) X disp = 1; X Xprobe: X if((i -= disp) < 0) X i += HSIZE; X X if(htab[i] == fcode) X { ent = codetab[i]; X return; X }