[net.sources] File compression utility

thomas@utah-gr.UUCP (07/05/84)
: So you can just pipe this message into sh :
cat <<'EOF' >README

Well, with all this discussion about file compression (for news batching
in particular) going around, I decided to implement the text compression
algorithm described in the June Computer magazine.  The author claimed
blinding speed and good compression ratios.  It's certainly faster than
compact (but, then, what wouldn't be), but it's also the same speed as
pack, and gets better compression than both of them.  On 350K bytes of
unix-wizards, compact took about 8 minutes of CPU, pack took about 80
seconds, and compress (herein) also took 80 seconds.  But, compact and
pack got about 30% compression, whereas compress got over 50%.  So, I
decided I had something, and that others might be interested, too.

As is probably true of compact and pack (although I haven't checked),
the byte order within a word is probably relevant here, but as long as
you stay on a single machine type, you should be ok.  (Can anybody
elucidate on this?)  There are a couple of asm's in the code (extv and
insv instructions), so anyone porting it to another machine will have to
deal with this anyway (and could probably make it compatible with Vax
byte order at the same time).  Anyway, I've linted the code (both with
and without -p), so it should run elsewhere.  Note the longs in the
code, you can take these out if you reduce BITS to <= 15.

Have fun, and as always, if you make good enhancements, or bug fixes,
I'd like to see them.

=Spencer (thomas@utah-20, {harpo,hplabs,arizona}!utah-cs!thomas)
EOF
: Run this shell script with "sh" not "csh"
PATH=:/bin:/usr/bin:/usr/ucb
export PATH
all=FALSE
if [ $1x = -ax ]; then
	all=TRUE
fi
/bin/echo 'Extracting compress.l'
sed 's/^X//' <<'//go.sysin dd *' >compress.l
X.PU
X.TH COMPRESS 1
X.SH NAME
compress  \-  compress files
X.SH SYNOPSIS
X.B compress
X.B [\-d]
[filename]
X.br
X.B uncompress
[filename]
X.SH DESCRIPTION
Compresses the specified file or standard input.
The result is written to standard output.
Compressed files can be restored
to their original form by specifying the
X.B \-d
option, or by running
X.I uncompress.
X.PP
X.I Compress
uses the modified Lempel-Ziv algorithm described in
"A Technique for High Performance Data Compression",
Terry A. Welch,
X.I IEEE Computer
Vol 17, No 6 (June 1984), pp 8-19.
X.PP
The amount of compression obtained depends on the size of the
input file, and the distribution of character sequences.
Typically, text files, such as C programs,
are reduced to 40\-50% of their original size;
X.SH BUGS
Even if the file gets bigger after compression, the output is still produced.
Some of the code is highly VAX dependent.
//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 664 compress.l
	/bin/echo -n '	'; /bin/ls -ld compress.l
fi
/bin/echo 'Extracting uncompress.l'
sed 's/^X//' <<'//go.sysin dd *' >uncompress.l
X.so manl/compress.l
//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 664 uncompress.l
	/bin/echo -n '	'; /bin/ls -ld uncompress.l
fi
/bin/echo 'Extracting compress.c'
sed 's/^X//' <<'//go.sysin dd *' >compress.c
X/* 
 * compress.c - File compression ala IEEE Computer June 1984.
 * 
 * Author:	Spencer W. Thomas
 * 		Computer Science Dept.
 * 		University of Utah
 * Date:	Wed Jul  4 1984
 * Copyright (c) 1984 Spencer W. Thomas
 * 
 * WARNING: due to cc -O bug (dealing with ext instruction), do not
 *	compile this program with -O (4.2bsd, at least).
 * 
 * $Header: compress.c,v 1.4 84/07/05 03:11:11 thomas Exp $
 * $Log:	compress.c,v $
 * Revision 1.4  84/07/05  03:11:11  thomas
 * Clean up the code a little and lint it.  (Lint complains about all
 * the regs used in the asm, but I'm not going to "fix" this.)
 * 
 * Revision 1.3  84/07/05  02:06:54  thomas
 * Minor fixes.
 * 
 * Revision 1.2  84/07/05  00:27:27  thomas
 * Add variable bit length output.
 * 
 */
static char rcs_ident[] = "$Header: compress.c,v 1.4 84/07/05 03:11:11 thomas Exp $";

#include "stdio.h"
#include "ctype.h"

#define	BITS	16			/* maximum number of bits/code */
int maxbits = BITS;			/* user settable max # bits/code */
long int maxmaxcode = 1 << BITS;
int n_bits = 9;				/* initial number of bits/code */
long int maxcode = 511;			/* 1 << n_bits - 1 */
long int bytes_out = 0;			/* count how many are written */

X/* 
 * One code could conceivably represent (1<<BITS) characters, but
 * to get a code of length N requires an input string of at least
 * N*(N-1)/2 characters.  With 10000 chars in the stack, an input
 * file would have to contain a 50Mb string of a single character.
 * This seems unlikely.
 */
#define MAXSTACK    10000		/* size of output stack */

struct c_ent {
    struct c_ent * next,		/* chain of entries with same prefix */
		 * chain;		/* chain prefixed with this entry */
    unsigned int prefix : BITS,		/* prefix code for this entry */
		 code : BITS;		/* code for this entry */
    unsigned char suffix;		/* last char in this entry */
} c_tab[1<<BITS];			/* a table full of them */
long int free_ent = 0;			/* first unused entry */

long int getcode();

int debug = 0;

X/*****************************************************************
 * TAG( main )
 * 
 * Algorithm from "A Technique for High Performance Data Compression",
 * Terry A. Welch, IEEE Computer Vol 17, No 6 (June 1984), pp 8-19.
 * 
 * Usage: compress [-d] [file]
 * Inputs:
 *	-d:	    If given, decompression is done instead.
 * 
 * 	file:	    File to be compressed.  If none specified, stdin
 *		    is used.
 * Outputs:
 * 	stdout:	    Compressed form of file.
 * 
 * Assumptions:
 * 	File is better off compressed.  Goes ahead and compresses even
 * if the result is bigger.
 * Algorithm:
 * 	Modified Lempel-Ziv method (LZW).  Basically finds common
 * substrings and replaces them with a variable size code.  This is
 * deterministic, and can be done on the fly.  Thus, the decompression
 * procedure needs no input table, but tracks the way the table was
 * built.
 */


main( argc, argv )
char **argv;
{
    FILE * infile;
    char * fname = NULL;
    int decompress_flg = 0;
    int verbose = 0;

    /* 4.2BSD dependent - take it out if not */
    setlinebuf( stderr );
#ifndef NO_SCANARGS
    if ( scanargs( argc, argv, "compress D%- d%- v%- b%-maxbits!d file%s",
		    &debug, &decompress_flg, &verbose,
		    &maxbits, &maxbits, &fname ) == 0 )
	exit( 1 );
#else
    /* 
     * Roll your own here using getopt or whatever.  All flags are
     * optional.
     * -D => debug
     * -d => decompress_flg
     * -v => verbose
     * -b maxbits => maxbits.  If -b is specified, then maxbits MUST be
     *	    given also.
     * if a string is left, must be filename.
     */
#endif

    if ( maxbits > BITS )
	maxbits = BITS;
    maxmaxcode = 1 << maxbits;

    if ( fname != NULL )
    {
	if ( ( infile = fopen( fname, "r" ) ) == NULL )
	{
	    perror( fname );
	    exit( 1 );
	}
    }
    else
	infile = stdin;

    if ( decompress_flg == 0 )
	compress( infile );
    else if ( debug == 0 )
	decompress( infile );
    else
    {
	/* 
	 * Just print out codes from input file.  Mostly for debugging.
	 */
	long int code;
	int col = 0, bits = n_bits;

	free_ent = 255;
	while ( ( code = getcode( infile ) ) >= 0 )
	{
	    if ( free_ent < maxmaxcode )
		free_ent++;
	    if ( bits != n_bits )
	    {
		printf( "\nChange to %d bits\n", n_bits );
		bits = n_bits;
		col = 0;
	    }
	    printf( "%5d%c", code, (col+=6) >= 74 ? (col = 0, '\n') : ' ' );
	}
	putchar( '\n' );
	exit( 0 );
    }

    /* If requested (verbose), dump string table */
    if ( verbose )
	dump_tab();
}


X/*****************************************************************
 * TAG( compress )
 * 
 * Actually does the compression.
 * Inputs:
 * 	infile:	    File to compress.
 * Outputs:
 * 	Writes compressed file to stdout.
 * Assumptions:
 *	[None]
 * Algorithm:
 * 	See above.
 */

compress( infile )
FILE *infile;
{
    int ic;
    unsigned char c;
    register struct c_ent * ent, * n_ent;
    long int in_count = 1;
#ifdef STATS
    int out_count = 0;
#endif

    /* 
     * Initialize the compression table to contain all 8-bit values
     * initially.  These don't need to be chained, they can be looked
     * up directly.
     */
    for ( free_ent = 0, ent = c_tab; free_ent < 256; free_ent++, ent++ )
    {
	ent->next = ent->chain = NULL;
	ent->prefix = 0;
	ent->code = free_ent;
	ent->suffix = free_ent;
    }

    c = getc( infile );
    ent = &c_tab[c];			/* initial entry */
    while ( !feof( infile ) && (ic = getc(infile)) != EOF )
    {
	in_count++;
	c = ic;
	/* 
	 * Find the entry corresponding to the current entry suffixed
	 * with this char.
	 */
	for ( n_ent = ent->chain; n_ent != NULL; n_ent = n_ent->next )
	    if ( n_ent->suffix == c )
		break;			/* found it */

	/* 
	 * If no such entry, do 2 things:
	 * 1. Put out code for current prefix string.
	 * 2. Add the new string to the table.
	 *    If the table is full, just start over with current input
	 *    character.
	 */
	if ( n_ent == NULL )
	{
	    output( (long int)ent->code );
#ifdef STATS
	    out_count++;
#endif
	    if ( free_ent < maxmaxcode )
	    {
		n_ent = &c_tab[free_ent];
		n_ent->chain = NULL;
		n_ent->prefix = ent->code;
		n_ent->suffix = c;
		n_ent->code = free_ent;
		n_ent->next = ent->chain;
		ent->chain = n_ent;
		free_ent++;
	    }
	    n_ent = &c_tab[c];
	}
	ent = n_ent;
    }
    /* 
     * Put out the final code.
     */
    output( (long int)ent->code );
#ifdef STATS
    out_count++;
#endif
    output( -1L );			/* finish up output if necessary */

    /* 
     * Print out stats on stderr
     */
#ifdef STATS
    fprintf( stderr,
	"%ld chars in, %ld codes (%ld bytes) out, compression factor %g\n",
		in_count, out_count, bytes_out,
		(double)in_count / (double)bytes_out );
    fprintf( stderr, "\tCompression as in compact: %5.2f%%\n",
		100.0 * ( in_count - bytes_out ) / (double) in_count );
    fprintf( stderr, "\tLargest code was %d (%d bits)\n", free_ent - 1, n_bits );
#else
    fprintf( stderr, "\tCompression: %5.2f%%\n",
		100.0 * ( in_count - bytes_out ) / (double) in_count );
#endif
}


X/*****************************************************************
 * TAG( output )
 * 
 * Output the given code.
 * Inputs:
 * 	code:	A n_bits-bit integer.  If == -1, then EOF.  This assumes
 *		that n_bits =< (long)wordsize - 1.
 * 
 * Outputs:
 * 	Outputs code to the file.
 * Assumptions:
 *	Chars are 8 bits long.
 * Algorithm:
 * 	Maintain a BITS character long buffer (so that 8 codes will
 * fit in it exactly).  Use the VAX insv instruction to insert each
 * code in turn.  When the buffer fills up empty it and start over.
 */

output( code )
long int  code;
{
    static char buf[BITS];
    static int offset = 0;
    static int col = 0;
    /* 
     * On the VAX, it is important to have the register declarations
     * in exactly the order given, or the asm will break.
     */
    register int r_off = offset, bits= n_bits;
    register char * bp = buf;

    if ( code >= 0 )
    {
	/* VAX DEPENDENT!! Implementation on other machines may be
	 * difficult.
	 * 
	 * Translation: Insert BITS bits from the argument starting at
	 * offset bits from the beginning of buf.
	 */
#ifndef vax
	You lose
#endif
	asm( "insv	4(ap),r11,r10,(r9)" );
	offset += n_bits;
	if ( offset == n_bits*8 )
	{
	    fwrite( buf, 1, n_bits, stdout );
	    offset = 0;
	    bytes_out += n_bits;
	}
	if ( debug )
	    fprintf( stderr, "%5d%c", code,
		    (col+=6) >= 74 ? (col = 0, '\n') : ' ' );

	/* 
	 * If the next entry is going to be too big for the code size,
	 * then increase it, if possible.
	 */
	if ( free_ent > maxcode )
	{
	    /* 
	     * Write the whole buffer, because the input side won't
	     * discover the size increase until after it has read it.
	     */
	    if ( offset > 0 )
	    {
		fwrite( buf, 1, n_bits, stdout );
		bytes_out += n_bits;
	    }
	    offset = 0;
		
	    n_bits++;
	    if ( n_bits == maxbits )
		maxcode = maxmaxcode;
	    else
		maxcode = (1 << n_bits) - 1;
	    if ( debug )
	    {
		fprintf( stderr, "\nChange to %d bits\n", n_bits );
		col = 0;
	    }
	}
    }
    else
    {
	/* 
	 * At EOF, write the rest of the buffer.
	 */
	if ( offset > 0 )
	    fwrite( buf, 1, (offset + 7) / 8, stdout );
	bytes_out += (offset + 7) / 8;
	offset = 0;
	fflush( stdout );
	if ( debug )
	    fprintf( stderr, "\n" );
    }
}


X/*****************************************************************
 * TAG( decompress )
 * 
 * Decompress the input file.
 * 
 * Inputs:
 * 	infile:	    File to be decompressed.
 * Outputs:
 * 	Writes decompressed file to stdout.
 * Assumptions:
 * 	Input file was created with compress (could use a magic #, I
 *	guess).
 * Algorithm:
 * 	See article cited above.
 */

decompress( infile )
FILE *infile;
{
    unsigned char stack[MAXSTACK];
    int stack_top = MAXSTACK;
    long int code, oldcode, incode;
    unsigned char finchar;
    register struct c_ent * ent;

    /* 
     * As above, initialize the first 256 entries in the table.  
     */
    for ( free_ent = 0; free_ent < 256; free_ent++ )
    {
	ent = &c_tab[free_ent];
	ent->next = ent->chain = NULL;
	ent->prefix = 0;
	ent->code = free_ent;
	ent->suffix = free_ent;
    }

    code = oldcode = getcode( infile );
    putchar( (char)code );		/* first code must be 8 bits = char */
    finchar = code;

    while ( ( code = getcode( infile ) ) != -1 )
    {
	incode = code;
	/* 
	 * Special case for KwKwK string.
	 */
	if ( code >= free_ent )
	{
	    stack[--stack_top] = finchar;
	    code = oldcode;
	}

	/* 
	 * Generate output characters in reverse order
	 */
	ent = &c_tab[code];
	while ( ent->code >= 256 )
	{
	    stack[--stack_top] = ent->suffix;
	    ent = &c_tab[ent->prefix];
	}
	stack[--stack_top] = finchar = ent->suffix;

	/* 
	 * And put them out in forward order
	 */
	fwrite( &stack[stack_top], 1, MAXSTACK - stack_top, stdout );
	stack_top = MAXSTACK;

	/* 
	 * Generate the new entry.
	 */
	if ( free_ent < maxmaxcode )
	{
	    ent = &c_tab[free_ent];
	    ent->prefix = oldcode;
	    ent->suffix = finchar;
	    ent->code = free_ent;
	    free_ent++;
	}
	/* 
	 * Remember previous code.
	 */
	oldcode = incode;
    }
    fflush( stdout );
}


X/*****************************************************************
 * TAG( getcode )
 * 
 * Read one code from the input file.  If EOF, return -1.
 * Inputs:
 * 	infile:	    Input file.
 * Outputs:
 * 	code or -1 is returned.
 * Assumptions:
 *	[None]
 * Algorithm:
 * 	For now, use scanf to read ascii input.
 */

long int
getcode( infile )
FILE *infile;
{
    /* 
     * On the VAX, it is important to have the register declarations
     * in exactly the order given, or the asm will break.
     */
    register long int code;
    static int offset = 0, size = 0;
    static char buf[BITS];
    register int r_off, bits;
    register char * bp = buf;

    if ( offset >= size || free_ent > maxcode )
    {
	/* 
	 * If the next entry will be too big for the current code
	 * size, then we must increase the size.  This implies reading
	 * a new buffer full, too.
	 */
	if ( free_ent > maxcode )
	{
	    n_bits++;
	    if ( n_bits == maxbits )
		maxcode = maxmaxcode;	/* won't get any bigger now */
	    else
		maxcode = (1 << n_bits) - 1;
	}
	size = fread( buf, 1, n_bits, infile );
	if ( size <= 0 )
	    return -1;			/* end of file */
	offset = 0;
	/* Round size down to integral number of codes */
	size = size * 8 - (n_bits - 1);
    }
    /* 
     * Get it into a register for the following VAX dependent code.
     */
    r_off = offset;
    bits = n_bits;
#ifndef vax
    You lose
#endif
    asm( "extzv   r10,r9,(r8),r11" );
    offset += n_bits;

    return code;
}


X/*****************************************************************
 * TAG( dump_tab )
 * 
 * Dump the string table.
 * Inputs:
 *	[None]
 * Outputs:
 *	[None]
 * Assumptions:
 *	[None]
 * Algorithm:
 *	[None]
 */

dump_tab()
{
    register int i;
    register struct c_ent * ent;
    char stack[4 * MAXSTACK];	/* \nnn makes it 4 times bigger */
    int stack_top = 4 * MAXSTACK;

    for ( i = 0; i < free_ent; i++ )
    {
	ent = &c_tab[i];
	if ( isascii(ent->suffix) && isprint(ent->suffix) )
	    fprintf( stderr, "%5d: %5d/'%c'  \"",
			ent->code, ent->prefix, ent->suffix );
	else
	    fprintf( stderr, "%5d: %5d/\\%03o \"",
			ent->code, ent->prefix, ent->suffix );
	stack[--stack_top] = '\n';
	stack[--stack_top] = '"';
	for ( ; ent != NULL;
		ent = (ent->code >= 256 ? &c_tab[ent->prefix] : NULL) )
	{
	    if ( isascii(ent->suffix) && isprint(ent->suffix) )
		stack[--stack_top] = ent->suffix;
	    else
	    {
		switch( ent->suffix )
		{
		case '\n': stack[--stack_top] = 'n'; break;
		case '\t': stack[--stack_top] = 't'; break;
		case '\b': stack[--stack_top] = 'b'; break;
		case '\f': stack[--stack_top] = 'f'; break;
		case '\r': stack[--stack_top] = 'r'; break;
		default:
		    stack[--stack_top] = '0' + ent->suffix % 8;
		    stack[--stack_top] = '0' + (ent->suffix / 8) % 8;
		    stack[--stack_top] = '0' + ent->suffix / 64;
		    break;
		}
		stack[--stack_top] = '\\';
	    }
	}
	fwrite( &stack[stack_top], 1, 4 * MAXSTACK - stack_top, stderr );
	stack_top = 4 * MAXSTACK;
    }
}
//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 444 compress.c
	/bin/echo -n '	'; /bin/ls -ld compress.c
fi
/bin/echo 'Extracting uncompress.sh'
sed 's/^X//' <<'//go.sysin dd *' >uncompress.sh
#! /bin/sh
PATH=/bin:/usr/bin:/usr/local
compress -d $*
//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 664 uncompress.sh
	/bin/echo -n '	'; /bin/ls -ld uncompress.sh
fi