[comp.compression] Painless hard-wired arithmetic coding

Peter_Gutmann@kcbbs.gen.nz (Peter Gutmann) (03/27/91)

Last night I read Ross Williams' posting on "Painless hard-wired arithmetic  
coding", which describes a scheme similar to uuencoding and its variants which
is both simple to implement and quite fast.  I've always wanted to have a more
efficient text-encoding program than uuencode/atob/xxencode so this afternoon 
wrote pgencode and pgdecode (the name is a subtle dig at some friends of mine 
who hassle my coding style by saying that I don't write programs, I pgencode  
them, and they have to be pgdecoded before they can be understood by normal  
humans.  The pgencode and pgdecode programs do the same thing for archived  
programs).  Anyway, they are a direct replacement for uuencode/decode etc.  
They encode at 50K/s and decode at 60K/s on my 386 box, but I could probably  
improve the speed to around 200K/s by recoding them in assembly language  
(80386, so all you Cray owners will have to write your own version :-).  The  
code seems to be fairly efficient - most lines translate directly into a singl
CISC instruction, and I think a complex table lookup scheme would waste more  
time in memory accesses than any possible speed gain is worth.  
  
The following code is un-Copyright Peter Gutmann, 1991.  Do what you want with
it (but if you find any bugs please let me know).  
  
------------------------- Cut carefully with chainsaw ------------------------
  
/* PGCODE.H - un(c) Peter Gutmann 1991 */  
  
/* The number of bits in the short and long codes */  
  
#define SHORT_CODE_BITS     6  
#define LONG_CODE_BITS      7  
  
/* A mask used to extract the short code */  
  
#define SHORT_CODE_MASK     ( ( 1 << SHORT_CODE_BITS ) - 1 )  
  
/* The offset of the first long code, and the first code due to a 1 bit  
   following a long code */  
  
#define LONG_CODE_OFFSET    34  
#define HIGH_LONG_OFFSET    30  
  
/* The base encoded value */  
  
#define CODE_BASE           ( ' ' + 1 )  
  
/* The maximum length for output lines */  
  
#define LINE_LEN    78  
  
------------------------------------------------------------------------------
  
/* PGENCODE.C - un(c) Peter Gutmann 1991 */  
  
#include <limits.h>  
#include <stdio.h>  
#include <stdlib.h>  
#include "pgcode.h"  
  
/* Standard useful defines - normally in a header file */  
  
#define BYTE    unsigned char  
#define WORD    unsigned int    /* Your mileage may vary on this one */  
  
#define TRUE    1  
  
#define ERROR   1  
#define OK      0  
  
/* Variables to handle the encoding */  
  
static int bitCount = 0;  
static WORD bitBuffer = 0;  
  
/* The data are encoded as follows:  
  
   code <- get 6 bits;  
   if( code < 34 )  
       // Output as straight code 0..33  
       output( code );  
   else  
       // Output as code in range 34..63 or 64..93 depending on next bit  
       bit <- get 1 bit;  
       if( bit == 1 )  
           code += HIGH_LONG_OFFSET;  
       output( code );  
*/  
  
static void encodeFile( FILE *inFilePtr, FILE *outFilePtr )  
    {  
    int outByteCount = 0;  
    BYTE theCode;  
  
    while( TRUE )  
        {  
        /* Make sure there is enough data in the buffer to read from */  
        while( bitCount < LONG_CODE_BITS )  
            {  
            bitBuffer |= getc( inFilePtr ) << bitCount;  
            bitCount += CHAR_BIT;  
            }  
  
        /* Build the output code */  
        theCode = bitBuffer & SHORT_CODE_MASK;  
        bitBuffer >>= SHORT_CODE_BITS;  
        bitCount -= SHORT_CODE_BITS;  
        if( theCode >= LONG_CODE_OFFSET )  
            {  
            if( bitBuffer & 0x0001 )  
                theCode += HIGH_LONG_OFFSET;  
            bitBuffer >>= 1;  
            bitCount--;  
            }  
  
        /* Output the code as text */  
        putc( theCode + CODE_BASE, outFilePtr );  
        if( outByteCount++ == LINE_LEN )  
            {  
            putc( '\n', outFilePtr );  
            outByteCount = 0;  
            }  
  
        if( feof( inFilePtr ) )  
            break;  
        }  
  
    putc( '\n', outFilePtr );  
    }  
  
/* The main program */  
  
int main( const int argc, const char *argv[] )  
    {  
    FILE *inFilePtr, *outFilePtr;  
  
    if( argc == 3 )  
        {  
        if( ( inFilePtr = fopen( argv[ 1 ], "rb" ) ) == NULL )  
            {  
            perror( argv[ 1 ] );  
            exit( ERROR );  
            }  
        if( ( outFilePtr = fopen( argv[ 2 ], "w" ) ) == NULL )  
            {  
            perror( argv[ 2 ] );  
            exit( ERROR );  
            }  
        }  
    else  
        {  
        printf( "Usage: pgencode <infile> <outfile>\n" );  
        exit( ERROR );  
        }  
  
    /* Print std. start-of-file title */  
    fprintf( outFilePtr, "pgbegin 666 %s\n", argv[ 1 ] );  
  
    encodeFile( inFilePtr, outFilePtr );  
  
    /* Print end-of-file title */  
    fputs( "end\n", outFilePtr );  
  
    fclose( inFilePtr );  
    fclose( outFilePtr );  
    return( OK );  
    }  
  
------------------------------------------------------------------------------
  
/* PGDECODE.C - un(c) Peter Gutmann 1991 */  
  
#include <limits.h>  
#include <stdio.h>  
#include <stdlib.h>  
#include <string.h>  
#include "pgcode.h"  
  
/* Standard useful defines - normally in a header file */  
  
#define BYTE    unsigned char  
#define WORD    unsigned int    /* Your mileage may vary on this one */  
  
#define TRUE    1  
  
#define ERROR   1  
#define OK      0  
  
/* Variables to handle the decoding */  
  
static int bitCount = 0;  
static WORD bitBuffer = 0;  
  
/* The data are decoded as follows:  
  
   code <- read char;  
   if( code > 33 + 30 )  
       out( 1 );  
       out( code );  
   else  
       if( code > 33 )  
           out( 0 );  
           out( code );  
       else  
           out( code );  
*/  
  
void decodeFile( FILE *inFilePtr, FILE *outFilePtr )  
    {  
    int noBits, inByteCount = 0;  
    BYTE theCode;  
  
    while( TRUE )  
        {  
        noBits = SHORT_CODE_BITS;  
        theCode = getc( inFilePtr ) - CODE_BASE;  
  
        /* Check for end of data */  
        if( theCode == ( BYTE ) ( '\n' - CODE_BASE ) )  
            break;  
  
        /* Add in a 0 or 1 bit if necessary */  
        if( theCode >= LONG_CODE_OFFSET )  
            {  
            if( theCode >= LONG_CODE_OFFSET + HIGH_LONG_OFFSET )  
                {  
                theCode -= HIGH_LONG_OFFSET;  
                theCode |= 1 << SHORT_CODE_BITS;  
                }  
            noBits++;  
            }  
  
        /* Output the code */  
        bitBuffer |= theCode << bitCount;  
        bitCount += noBits;  
        while( bitCount >= CHAR_BIT )  
            {  
            putc( bitBuffer & 0xFF, outFilePtr );  
            bitBuffer >>= CHAR_BIT;  
            bitCount -= CHAR_BIT;  
            }  
  
        /* Skip EOL */  
        if( inByteCount++ == LINE_LEN )  
            {  
            getc( inFilePtr );  
            inByteCount = 0;  
            }  
  
        /* Make sure we don't run forever on corrupt file */  
        if( feof( inFilePtr ) )  
            {  
            puts( "File corrupted" );  
            exit( ERROR );  
            }  
        }  
    }  
  
/* The main program */  
  
int main( const int argc, const char *argv[] )  
    {  
    FILE *inFilePtr, *outFilePtr;  
    char fileName[ 64 ], buffer[ 80 ];  
    int permiss;  
  
    /* Open input file */  
    if( argc == 2 )  
        {  
        if( ( inFilePtr = fopen( argv[ 1 ], "r" ) ) == NULL )  
            {  
            perror( argv[ 1 ] );  
            exit( ERROR );  
            }  
        }  
    else  
        {  
        puts( "Usage: pgdecode <infile>" );  
        exit( ERROR );  
        }  
  
    /* Find header line, if necessary skipping useless message header info */ 
    while( TRUE )  
        {  
        /* Complain if we reach EOF without finding a begin line */  
        if( fgets( buffer, sizeof( buffer ), inFilePtr ) == NULL )  
            {  
            puts( "No begin line" );  
            exit( ERROR );  
            }  
  
        /* We've hit a begin line, exit */  
        if( !strncmp( buffer, "pgbegin ", 6 ) )  
            break;  
        }  
  
    sscanf( buffer, "pgbegin %3d %s", &permiss, fileName );  
  
    /* Create output file */  
    if( ( outFilePtr = fopen( fileName, "wb" ) ) == NULL )  
        {  
        perror( fileName );  
        exit( ERROR );  
        }  
  
    decodeFile( inFilePtr, outFilePtr );  
  
    /* Nitpick to make sure there's an end line */  
    if( fgets( buffer, sizeof( buffer ), inFilePtr ) == NULL || strcmp( buffer
        {  
        puts( "No end line" );  
        exit( ERROR );  
        }  
  
    fclose( inFilePtr );  
    fclose( outFilePtr );  
    return( OK );  
    }  
  
-------------------------------- CODE_SEG ENDS -------------------------------
  
 Peter_Gutmann@kcbbs.gen.nz || peter@nacjack.gen.nz || pgut1@cs.aukuni.ac.nz  
                     (In order of decreasing reliability)  
Warning!  
  Something large, scaly, and with fangs a foot long lives between <yoursite> 
and <mysite>.  Every now and then it kills and eats messages.  If you don't  
receive a reply within a week, try resending...