[comp.os.vms] ARC_C.SHAR07_OF_19

ewilts%Ins.MRC.AdhocNet.CA%Stasis.MRC.AdhocNet.CA%UNCAEDU.@CORNELLC.CCS.CORNELL.EDU.BITNET (Ed Wilts) (06/24/88)
$Part08:
$ File_is="ARCLZW.C"
$ Check_Sum_is=606678970
$ Copy SYS$Input VMS_SHAR_DUMMY.DUMMY
Xstatic char *RCSid = "$Header: arclzw.c,v 1.2 86/07/15 07:53:20 turner Exp $";
X
X/*
X * $Log:`009arclzw.c,v $
X * Hack-attack 1.3  86/12/20  01:23:45  wilhite@usceast.uucp
X * `009Bludgeoned into submission for VAX 11/780 BSD4.2
X *`009(ugly code, but fewer core dumps)
X *
X * Revision 1.2  86/07/15  07:53:20  turner
X *
X *
X * Revision 1.1  86/06/26  15:00:26  turner
X * initial version
X *
X *
X */
X
X/*  ARC - Archive utility - ARCLZW
X
X$define(tag,$$segment(@1,$$index(@1,=)+1))#
X$define(version,Version $tag(
XTED_VERSION DB =1.88), created on $tag(
XTED_DATE DB =01/20/86) at $tag(
XTED_TIME DB =16:47:04))#
X$undefine(tag)#
X    $version
X
X(C) COPYRIGHT 1985 by System Enhancement Associates; ALL RIGHTS RESERVED
X
X    By:  Thom Henderson
X
X    Description:
X         This file contains the routines used to implement Lempel-Zev
X         data compression, which calls for building a coding table on
X         the fly.  This form of compression is especially good for encoding
X         files which contain repeated strings, and can often give dramatic
X         improvements over traditional Huffman SQueezing.
X
X    Language:
X         Computer Innovations Optimizing C86
X
X    Programming notes:
X         In this section I am drawing heavily on the COMPRESS program
X         from UNIX.  The basic method is taken from "A Technique for High
X         Performance Data Compression", Terry A. Welch, IEEE Computer
X         Vol 17, No 6 (June 1984), pp 8-19.  Also see "Knuth's Fundamental
X         Algorithms", Donald Knuth, Vol 3, Section 6.4.
X
X         As best as I can tell, this method works by tracing down a hash
X         table of code strings where each entry has the property:
X
X              if <string> <char> is in the table
X              then <string> is in the table.
X*/
X#include <stdio.h>
X#include "arc.h"
X
X/* definitions for older style crunching */
X
X#define FALSE    0
X#define TRUE     !FALSE
X#define TABSIZE  4096
X#define NO_PRED  0xFFFF
X#define EMPTY    0xFFFF
X#define NOT_FND  0xFFFF
X
Xstatic unsigned INT inbuf;             /* partial input code storage */
Xstatic INT sp;                         /* current stack pointer */
X
Xstatic struct entry                    /* string table entry format */
X{   char used;                         /* true when this entry is in use */
X    unsigned INT next;                 /* ptr to next in collision list */
X    unsigned INT predecessor;          /* code for preceeding string */
X    unsigned char follower;            /* char following string */
X}   string_tab[TABSIZE];               /* the code string table */
X
X
X/* definitions for the new dynamic Lempel-Zev crunching */
X
X#define BITS   12                      /* maximum bits per code */
X#define HSIZE  5003                    /* 80% occupancy */
X#define INIT_BITS 9                    /* initial number of bits/code */
X
Xstatic INT n_bits;                     /* number of bits/code */
Xstatic INT maxcode;                    /* maximum code, given n_bits */
X#define MAXCODE(n)      ((1<<(n)) - 1) /* maximum code calculation */
Xstatic INT maxcodemax =  1 << BITS;    /* largest possible code (+1) */
X
Xstatic unsigned char buf[BITS];        /* input/output buffer */
X
Xstatic unsigned char lmask[9] =        /* left side masks */
X{   0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00 };
Xstatic unsigned char rmask[9] =        /* right side masks */
X{   0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff};
X
Xstatic INT offset;                     /* byte offset for code output */
Xstatic long in_count;                  /* length of input */
Xstatic long bytes_out;                 /* length of compressed output */
Xstatic unsigned INT ent;
X
X/* To save much memory (which we badly need at this point), we overlay
X * the table used by the previous version of Lempel-Zev with those used
X * by the new version.  Since no two of these routines will be used
X * together, we can safely do this.  Note that the tables used for Huffman
X * squeezing may NOT overlay these, since squeezing and crunching are done
X * in parallel.
X */
X
X#if MSDOS
Xstatic long *htab = string_tab;        /* hash code table   (crunch) */
X#endif
X#if BSD | ST
Xstatic long htab[HSIZE];               /* hash code table   (crunch) */
X#endif
Xstatic unsigned INT codetab[HSIZE];    /* string code table (crunch) */
X
Xstatic unsigned INT *prefix = codetab; /* prefix code table (uncrunch) */
X
X#if MSDOS
Xstatic unsigned char *suffix = string_tab;  /* suffix table (uncrunch) */
X#endif
X#if BSD | ST
Xstatic unsigned char suffix[HSIZE];    /* suffix table (uncrunch) */
X#endif
Xstatic INT free_ent;                   /* first unused entry */
Xstatic INT firstcmp;                   /* true at start of compression */
Xstatic unsigned char stack[HSIZE];     /* local push/pop stack */
X
X/*
X * block compression parameters -- after all codes are used up,
X * and compression rate changes, start over.
X */
X
Xstatic INT clear_flg;
Xstatic long ratio;
X#define CHECK_GAP 10000                /* ratio check interval */
Xstatic long checkpoint;
X
X/*
X * the next two codes should not be changed lightly, as they must not
X * lie within the contiguous general code space.
X */
X#define FIRST   257                    /* first free entry */
X#define CLEAR   256                    /* table clear output code */
X
Xstatic INT cl_block(t)                     /* table clear for block compress */
XFILE *t;                               /* our output file */
X{
X    long rat;
X    INT putcode();
X
X    checkpoint = in_count + CHECK_GAP;
X
X    if(in_count > 0x007fffff)          /* shift will overflow */
X    {    rat = bytes_out >> 8;
X         if(rat == 0)                  /* Don't divide by zero */
X              rat = 0x7fffffff;
X         else rat = in_count / rat;
X    }
X    else rat = (in_count<<8)/bytes_out;/* 8 fractional bits */
X
X    if(rat > ratio)
X         ratio = rat;
X    else
X    {    ratio = 0;
X         setmem`009(htab,HSIZE*sizeof(long),0xff);
X         free_ent = FIRST;
X         clear_flg = 1;
X         putcode(CLEAR,t);
X    }
X}
X
X/*****************************************************************
X *
X * Output a given code.
X * Inputs:
X *      code:   A n_bits-bit integer.  If == -1, then EOF.  This assumes
X *              that n_bits =< (long)wordsize - 1.
X * Outputs:
X *      Outputs code to the file.
X * Assumptions:
X *      Chars are 8 bits long.
X * Algorithm:
X *      Maintain a BITS character long buffer (so that 8 codes will
X * fit in it exactly).  When the buffer fills up empty it and start over.
X */
X
Xstatic INT putcode(code,t)                 /* output a code */
XINT code;                              /* code to output */
XFILE *t;                               /* where to put it */
X{
X INT r_off = offset;                /* right offset */
X INT bits = n_bits;                 /* bits to go */
X unsigned char *bp = buf;           /* buffer pointer */
X INT n;                             /* index */
X
X    if(code >= 0)                      /* if a real code */
X    {    /*
X          * Get to the first byte.
X          */
X         bp += (r_off >> 3);
X         r_off &= 7;
X
X         /*
X          * Since code is always >= 8 bits, only need to mask the first
X          * hunk on the left.
X          */
X         *bp = (*bp&rmask[r_off]) | (code<<r_off) & lmask[r_off];
X         bp++;
X         bits -= (8 - r_off);
X         code >>= (8 - r_off);
X
X         /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */
X         if(bits >= 8)
X         {    *bp++ = code;
X              code >>= 8;
X              bits -= 8;
X         }
X
X         /* Last bits. */
X         if(bits)
X              *bp = code;
X
X         offset += n_bits;
X
X         if(offset == (n_bits << 3))
X         {    bp = buf;
X              bits = n_bits;
X              bytes_out += bits;
X              do
X                   putc_pak(*bp++,t);
X              while(--bits);
X              offset = 0;
X         }
X
X         /*
X          * If the next entry is going to be too big for the code size,
X          * then increase it, if possible.
X          */
X         if(free_ent>maxcode || clear_flg>0)
X         {    /*
X               * Write the whole buffer, because the input side won't
X               * discover the size increase until after it has read it.
X               */
X              if(offset > 0)
X              {    bp = buf;           /* reset pointer for writing */
X                   bytes_out += n = n_bits;
X                   while(n--)
X                        putc_pak(*bp++,t);
X              }
X              offset = 0;
X
X              if(clear_flg)            /* reset if clearing */
X              {    maxcode = MAXCODE(n_bits = INIT_BITS);
X                   clear_flg = 0;
X              }
X              else                     /* else use more bits */
X              {    n_bits++;
X                   if(n_bits == BITS)
X                        maxcode = maxcodemax;
X                   else
X                        maxcode = MAXCODE(n_bits);
X              }
X         }
X    }
X
X    else                               /* dump the buffer on EOF */
X    {    bytes_out += n = (offset+7) / 8;
X
X         if(offset > 0)
X              while(n--)
X                   putc_pak(*bp++,t);
X         offset = 0;
X    }
X}
X
X/*****************************************************************
X *
X * Read one code from the standard input.  If EOF, return -1.
X * Inputs:
X *      cmpin
X * Outputs:
X *      code or -1 is returned.
X */
X
Xstatic INT getcode(f)                  /* get a code */
XFILE *f;                               /* file to get from */
X{
X INT code;
X    static INT offset = 0, size = 0;
X INT r_off, bits;
X    unsigned char *bp = buf;
X
X    if(clear_flg > 0 || offset >= size || free_ent > maxcode)
X    {    /*
X          * If the next entry will be too big for the current code
X          * size, then we must increase the size.  This implies reading
X          * a new buffer full, too.
X          */
X         if(free_ent > maxcode)
X         {    n_bits++;
X              if(n_bits == BITS)
X                   maxcode = maxcodemax;    /* won't get any bigger now */
X              else maxcode = MAXCODE(n_bits);
X         }
X         if(clear_flg > 0)
X         {    maxcode = MAXCODE(n_bits = INIT_BITS);
X              clear_flg = 0;
X         }
X
X         for(size=0; size<n_bits; size++)
X         {    if((code=getc_unp(f))==EOF)
X                   break;
X              else buf[size] = code;
X         }
X         if(size <= 0)
X              return -1;               /* end of file */
X
X         offset = 0;
X         /* Round size down to integral number of codes */
X         size = (size << 3)-(n_bits - 1);
X    }
X    r_off = offset;
X    bits = n_bits;
X
X    /*
X     * Get to the first byte.
X     */
X    bp +=(r_off >> 3);
X    r_off &= 7;
X
X    /* Get first part (low order bits) */
X    code = (*bp++ >> r_off);
X    bits -= 8 - r_off;
X    r_off = 8 - r_off;                 /* now, offset into code word */
X
X    /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */
X    if(bits >= 8)
X    {    code |= *bp++ << r_off;
X         r_off += 8;
X         bits -= 8;
X    }
X    /* high order bits. */
X    code |= (*bp & rmask[bits]) << r_off;
X    offset += n_bits;
X
X    return code;
X}
X
X/*
X * compress a file
X *
X * Algorithm:  use open addressing double hashing (no chaining) on the
X * prefix code / next character combination.  We do a variant of Knuth's
X * algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime
X * secondary probe.  Here, the modular division first probe is gives way
X * to a faster exclusive-or manipulation.  Also do block compression with
X * an adaptive reset, where the code table is cleared when the compression
X * ratio decreases, but after the table fills.  The variable-length output
X * codes are re-sized at this point, and a special CLEAR code is generated
X * for the decompressor.
X */
X
XINT init_cm(f,t)                           /* initialize for compression */
XFILE *f;                               /* file we will be compressing */
XFILE *t;                               /* where we will put it */
X{
X    offset = 0;
X    bytes_out = 1;
X    clear_flg = 0;
X    ratio = 0;
X    in_count = 1;
X    checkpoint = CHECK_GAP;
X    maxcode = MAXCODE(n_bits = INIT_BITS);
X    free_ent = FIRST;
X    setmem(htab,HSIZE*sizeof(long),0xff);
X    n_bits = INIT_BITS;                /* set starting code size */
X
X    putc_pak(BITS,t);                  /* note our max code length */
X
X    firstcmp = 1;                      /* next byte will be first */
X}
X
XINT putc_cm(c,t)                           /* compress a character */
Xunsigned char c;                       /* character to compress */
XFILE *t;                               /* where to put it */
X{
X    static long fcode;
X    static INT hshift;
X register INT i;
X register INT disp;
X
X    if(firstcmp)                       /* special case for first byte */
X    {    ent = c;                      /* remember first byte */
X
X         hshift = 0;
X         for(fcode=(long)HSIZE;  fcode<65536L; fcode*=2L)
X              hshift++;
X         hshift = 8 - hshift;          /* set hash code range bund */
X
X         firstcmp = 0;                 /* no longer first */
X         return;
X    }
X
X    in_count++;
X    fcode =(long)(((long)c << BITS)+ent);
X    i = (c<<hshift)^ent;               /* xor hashing */
X
X    if(htab[i]==fcode)
X    {    ent = codetab[i];
X         return;
X    }
X    else if(htab[i]<0)                 /* empty slot */
X         goto nomatch;
X    disp = HSIZE - i;                  /* secondary hash (after G.Knott) */
X    if(i == 0)
X         disp = 1;
X
Xprobe:
X    if((i -= disp) < 0)
X         i += HSIZE;
X
X    if(htab[i] == fcode)
X    {    ent = codetab[i];
X         return;
X    }