[comp.compression] looking for arith. codeing refs.

xanthian@zorch.SF-Bay.ORG (Kent Paul Dolan) (04/22/91)

 wardrb@cs.utexas.edu (Richard Byron Ward) writes:
> laszlo@hppad.waterloo.hp.com (Joe Laszlo) writes:

>> I'm looking for a few good references on arithmetic encoding
>> (practical as well as theoretical if possible). I'd also appreciate
>> any code you could direct me to.

>> Thanks in advance.

> "Arithmetic Coding for Data Compression", Witten, Radford and Cleary,
> Communications of the ACM, Volume 30, #6, June 1987

> This paper has a good description of arithmetic coding as well as
> working C code. I'm sure that the code is online somewhere, but I
> couldn't tell you where.

I'll probably get harassed for this, but here it is. I added ANSI C
prototypes, and for complicated reasons this exact Makefile is untested,
and it needs unpacking on a system with long file name capability, or
else hand editing to shorten the names before unpacking (and in the
files, too.)

Compiles and packs and unpacks files unchanged on two systems I use,
which guarantees exactly zip, of course.

It's fun to play with and learn from, and there's not much sense fifty
more people typing it in from the paper reference, since I already did.

Kent, the man from xanth.
<xanthian@Zorch.SF-Bay.ORG> <xanthian@well.sf.ca.us>

#! /bin/sh
# This is a shell archive.  Remove anything before this line, then unpack
# it by saving it into a file and typing "sh file".  To overwrite existing
# files, type "sh file -c".  You can also feed this as standard input via
# unshar, or by typing "sh <file", e.g..  If this archive is complete, you
# will see the following message at the end:
#		"End of shell archive."
# Contents:  Makefile adaptive_model.c aed_prototypes.h
#   arithmetic_coding.h arithmetic_decode.c arithmetic_encode.c
#   bit_input.c bit_output.c decode.c encode.c model.h
# Wrapped by xanthian@zorch on Mon Apr 22 05:56:20 1991
PATH=/bin:/usr/bin:/usr/ucb ; export PATH
if test -f 'Makefile' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'Makefile'\"
else
echo shar: Extracting \"'Makefile'\" \(1264 characters\)
sed "s/^X//" >'Makefile' <<'END_OF_FILE'
Xall:			encode decode
X
Xencode:			encode.o arithmetic_encode.o \
X			bit_output.o adaptive_model.o
X			cc -o encode \
X			encode.o arithmetic_encode.o \
X			bit_output.o adaptive_model.o
X
Xdecode:			decode.o arithmetic_decode.o \
X			bit_input.o adaptive_model.o
X			cc -o decode \
X			decode.o arithmetic_decode.o \
X			bit_input.o adaptive_model.o
X
Xencode.o:		aed_prototypes.h model.h encode.c
X			cc -c encode.c
X
Xarithmetic_encode.o:	aed_prototypes.h arithmetic_coding.h arithmetic_encode.c
X			cc -c arithmetic_encode.c
X
Xbit_output.o:		aed_prototypes.h bit_output.c
X			cc -c bit_output.c
X
Xadaptive_model.o:	aed_prototypes.h model.h adaptive_model.c
X			cc -c adaptive_model.c
X
Xdecode.o:		aed_prototypes.h model.h decode.c
X			cc -c decode.c
X
Xarithmetic_decode.o:	aed_prototypes.h arithmetic_coding.h arithmetic_decode.c
X			cc -c arithmetic_decode.c
X
Xbit_input.o:		aed_prototypes.h arithmetic_coding.h bit_input.c
X			cc -c bit_input.c
X
Xlintfree:
X			echo "linting encode"
X			lint aed_prototypes.h model.h arithmetic_coding.h \
X			encode.c arithmetic_encode.c bit_output.c \
X			adaptive_model.c
X			echo "linting decode"
X			lint aed_prototypes.h \
X			model.h arithmetic_coding.h decode.c \
X			arithmetic_decode.c bit_input.c adaptive_model.c
X
Xclean:
X			/bin/rm *.o encode decode
END_OF_FILE
if test 1264 -ne `wc -c <'Makefile'`; then
    echo shar: \"'Makefile'\" unpacked with wrong size!
fi
# end of 'Makefile'
fi
if test -f 'adaptive_model.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'adaptive_model.c'\"
else
echo shar: Extracting \"'adaptive_model.c'\" \(1733 characters\)
sed "s/^X//" >'adaptive_model.c' <<'END_OF_FILE'
X/* The adaptive source model */
X#include "aed_prototypes.h"
X#include "model.h"
Xint freq[ No_of_symbols + 1 ]; /* Symbol frequencies */
X/* Initialize the model. */
Xvoid
Xstart_model()
X{
X  int i;
X
X  /* Set up tables that translate between symbol indexes and characters. */
X  for ( i = 0; i < No_of_chars; i++ )
X  {
X    char_to_index[ i ] = i + 1;
X    index_to_char[ i + 1 ] = i;
X  }
X  /* Set up initial frequency counts to be one for all symbols. */
X  for ( i = 0; i <= No_of_symbols; i++ )
X  {
X    freq[ i ] = 1;
X    cum_freq[ i ] = No_of_symbols - i;
X  }
X  freq[ 0 ] = 0;   /* Freq[ 0 ] must not be the same as freq[ 1 ]. */
X}
X/* Update the model to account for a new symbol. */
Xvoid
Xupdate_model( int symbol /* Index of new symbol */ )
X{
X  int i;               /* New index for symbol */
X
X  /* See if frequency counts are at their maximum */
X  if ( cum_freq[ 0 ] == Max_frequency )
X  {
X    int cum;
X    cum = 0;
X
X    /* If so, halve all the counts (keeping them non-zero). */
X    for ( i = No_of_symbols; i >= 0; i-- )
X    {
X      freq[ i ] = ( freq[ i ] + 1 ) / 2;
X      cum_freq[ i ] = cum;
X      cum += freq[ i ];
X    }
X  }
X  /* Find symbol's new index. */
X  for ( i = symbol; freq[ i ] == freq[ i - 1 ]; i-- );
X
X  /* Update the translation tables if the symbol has moved. */
X  if ( i < symbol )
X  {
X    int ch_i, ch_symbol;
X    ch_i = index_to_char[ i ];
X    ch_symbol = index_to_char[ symbol ];
X    index_to_char[ i ] = ch_symbol;
X    index_to_char[ symbol ] = ch_i;
X    char_to_index[ ch_i ] = symbol;
X    char_to_index[ ch_symbol ] = i;
X  }
X  /*
X    Increment the frequency count for the symbol and update the cumulative 
X    frequencies.
X  */
X  freq[ i ] += 1;
X  while ( i > 0 )
X  {
X    i -= 1;
X    cum_freq[ i ] += 1;
X  }
X}
END_OF_FILE
if test 1733 -ne `wc -c <'adaptive_model.c'`; then
    echo shar: \"'adaptive_model.c'\" unpacked with wrong size!
fi
# end of 'adaptive_model.c'
fi
if test -f 'aed_prototypes.h' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'aed_prototypes.h'\"
else
echo shar: Extracting \"'aed_prototypes.h'\" \(420 characters\)
sed "s/^X//" >'aed_prototypes.h' <<'END_OF_FILE'
Xint main();
Xvoid encode();
Xvoid decode();
Xvoid start_outputing_bits();
Xvoid output_bit( int bit );
Xvoid done_outputing_bits();
Xvoid start_inputing_bits();
Xint input_bit();
Xvoid start_encoding();
Xvoid encode_symbol( int symbol, int cum_freq[] );
Xvoid done_encoding();
Xstatic void bit_plus_follow( int bit );
Xvoid start_decoding();
Xint decode_symbol( int cum_freq[] );
Xvoid start_model();
Xvoid update_model( int symbol );
END_OF_FILE
if test 420 -ne `wc -c <'aed_prototypes.h'`; then
    echo shar: \"'aed_prototypes.h'\" unpacked with wrong size!
fi
# end of 'aed_prototypes.h'
fi
if test -f 'arithmetic_coding.h' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'arithmetic_coding.h'\"
else
echo shar: Extracting \"'arithmetic_coding.h'\" \(643 characters\)
sed "s/^X//" >'arithmetic_coding.h' <<'END_OF_FILE'
X/* Declarations used for arithmetic encoding and decoding */
X/* Size of arithmetic code values. */
X#define Code_value_bits 16           /* Number of bits in a code value */
Xtypedef long code_value;             /* Type of an arithmetic code value */
X
X                                     /* Largest code value */
X#define Top_value ( ( ( long ) 1 << Code_value_bits ) - 1 )
X/* Half and quarter points in the code value range. */
X#define First_qtr ( Top_value / 4 + 1 ) /* Point after first quarter */
X#define Half      ( 2 * First_qtr )     /* Point after first half    */
X#define Third_qtr ( 3 * First_qtr )     /* Point after Third quarter */
END_OF_FILE
if test 643 -ne `wc -c <'arithmetic_coding.h'`; then
    echo shar: \"'arithmetic_coding.h'\" unpacked with wrong size!
fi
# end of 'arithmetic_coding.h'
fi
if test -f 'arithmetic_decode.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'arithmetic_decode.c'\"
else
echo shar: Extracting \"'arithmetic_decode.c'\" \(2139 characters\)
sed "s/^X//" >'arithmetic_decode.c' <<'END_OF_FILE'
X/* Arithmetic decoding algorithm. */
X#include "aed_prototypes.h"
X#include "arithmetic_coding.h"
X/* Current state of the decoding. */
Xstatic code_value value;     /* Currently-seen code value */
Xstatic code_value low, high; /* Ends of current code region */
X/* Start decoding a stream of symbols. */
Xvoid
Xstart_decoding()
X{
X  int i;
X
X  /* Input bits to fill the code value. */
X  value = 0;
X  for ( i = 1; i <= Code_value_bits; i++ )
X  {
X    value = 2 * value + input_bit();
X  }
X  low = 0;          /* Full code range. */
X  high = Top_value;
X}
X/* Decode the next symbol. */
Xint decode_symbol( int cum_freq[] /* Cumulative symbol frequencies */ )
X{
X  long range;                         /* Size of current code region */
X  int cum;                            /* Cumulative frequency calculated */
X  int symbol;                         /* Symbol decoded */
X
X  range = ( long )( high - low ) + 1;
X  /* Find cumulative frequency for value. */
X  cum = ( ( ( long ) ( value - low ) + 1 ) * cum_freq[ 0 ] - 1 ) / range;
X  /* Then find symbol. */
X  for ( symbol = 1; cum_freq[ symbol ] > cum; symbol++ );
X  /* Narrow the code region to that allotted to this symbol. */
X  high = low + ( ( range * cum_freq[ symbol - 1 ] ) / cum_freq[ 0 ] ) - 1;
X  low = low +    ( range * cum_freq[ symbol     ] ) / cum_freq[ 0 ];
X  for (;;)                            /* Loop to get rid of bits. */
X  {
X    if ( high < Half )                /* Expand low half. */
X    {
X      /* nothing */
X    }
X    else if ( low >= Half )           /* Expand high half. */
X    {
X      value -= Half;                  /* Subtract offset to top. */
X      low -= Half;
X      high -= Half;
X    }
X    else if ( low >= First_qtr && high < Third_qtr )
X    {                                 /* Expand middle half. */
X      value -= First_qtr;             /* Subtract offset to middle. */
X      low -= First_qtr;
X      high -= First_qtr;
X    }
X    else break;                       /* Otherwise exit loop. */
X    low = 2 * low;                    /* Scale up code range. */
X    high = 2 * high + 1;
X    value = 2 * value + input_bit();  /* Move in next input bit. */
X  }
X  return symbol;
X}
END_OF_FILE
if test 2139 -ne `wc -c <'arithmetic_decode.c'`; then
    echo shar: \"'arithmetic_decode.c'\" unpacked with wrong size!
fi
# end of 'arithmetic_decode.c'
fi
if test -f 'arithmetic_encode.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'arithmetic_encode.c'\"
else
echo shar: Extracting \"'arithmetic_encode.c'\" \(2342 characters\)
sed "s/^X//" >'arithmetic_encode.c' <<'END_OF_FILE'
X/* Arithmetic encoding algorithm. */
X#include "aed_prototypes.h"
X#include "arithmetic_coding.h"
X/* Current state of the encoding. */
Xstatic code_value low, high;    /* Ends of the current code region */
Xstatic long bits_to_follow;     /* Number of opposite bits to output after */
X                                /* the next bit. */
X/* Start encoding a stream of symbols. */
Xvoid
Xstart_encoding()
X{
X  low = 0;                      /* Full code range. */
X  high = Top_value;
X  bits_to_follow = 0;           /* No bits to follow next. */
X}
X/* Encode a symbol. */
Xvoid
Xencode_symbol(
X                int symbol,     /* Symbol to encode */
X                int cum_freq[]  /* Cumulative symbol frequencies */
X             )
X{
X  long range;                   /* Size of the current code region */
X
X  range = ( long )( high - low ) + 1;
X  /* Narrow the code region to that allotted to this symbol. */
X  high = low + ( ( range * cum_freq[ symbol - 1 ] ) / cum_freq[ 0 ] ) - 1;
X  low = low + ( range * cum_freq[ symbol ] ) / cum_freq[ 0 ];
X  for (;;)                      /* Loop to output bits */
X  {
X    if ( high < Half )          /* Output zero if in low half */
X    {
X      bit_plus_follow( 0 );
X    }
X    else if ( low >= Half )     /* Output 1 if in high half. */
X    {
X      bit_plus_follow( 1 );
X      low -= Half;
X      high -= Half;             /* Subtract offset to top. */
X    }
X    else if (low >= First_qtr   /* Output an opposite bit */
X          && high < Third_qtr ) /* later if in middle half. */
X    {
X      bits_to_follow += 1;
X      low -= First_qtr;         /* Subtract offset to middle. */
X      high -= First_qtr;
X    }
X    else break;                 /* Otherwise exit loop. */
X    low = 2 * low;
X    high = 2 * high + 1;        /* Scale up code range. */
X  }
X}
X/* Finish encoding the stream. */
Xvoid
Xdone_encoding()
X{
X  /*
X    Output two bits that select the quarter that the current code range 
X    contains.
X  */
X  bits_to_follow += 1;
X  if ( low < First_qtr ) bit_plus_follow( 0 );
X  else bit_plus_follow( 1 );
X}
X/* Output bits plus following opposite bits. */
Xstatic void bit_plus_follow( int bit )
X{
X  output_bit( bit );             /* Output the bit. */
X  /* Output bits_to_follow opposite bits.  Set bits_to_follow to zero */
X  while ( bits_to_follow > 0 )
X  {
X    output_bit( ! bit );
X    bits_to_follow -= 1;
X  }
X}
END_OF_FILE
if test 2342 -ne `wc -c <'arithmetic_encode.c'`; then
    echo shar: \"'arithmetic_encode.c'\" unpacked with wrong size!
fi
# end of 'arithmetic_encode.c'
fi
if test -f 'bit_input.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'bit_input.c'\"
else
echo shar: Extracting \"'bit_input.c'\" \(1421 characters\)
sed "s/^X//" >'bit_input.c' <<'END_OF_FILE'
X/* Bit input routines. */
X#include <stdio.h>
X#include "aed_prototypes.h"
X#include "arithmetic_coding.h"
X/* The bit buffer. */
Xstatic int buffer;       /* Bits waiting to be input */
Xstatic int bits_to_go;   /* Number of bits still in buffer */
Xstatic int garbage_bits; /* Number of bits past end-of-file */
X/* Initialize bit input. */
Xvoid
Xstart_inputing_bits()
X{
X  bits_to_go = 0;        /* Buffer starts out with no bits in it. */
X  garbage_bits = 0;
X}
X/* Input a bit. */
Xint input_bit()
X{
X  int t;
X
X  /* Read the next byte if no bits are left in buffer. */
X  if ( bits_to_go == 0 )
X  {
X    buffer = getc( stdin );
X    /*
X      This next check is misplaced; it should be before the "buffer >>= 1;
X      bits_to_go -=1;" lines below, and those two lines should be in an else
X      clause of this if statement.  As written, it counts bytes, not bits,
X      and permits multiple "getc( stdin )" calls after EOF is encountered,
X      because bits_to_go keeps getting decremented. Kent Paul Dolan.
X    */
X    if ( buffer == EOF )
X    {
X      garbage_bits += 1; /* Return arbitrary bits after EOF, but check */
X      if ( garbage_bits > Code_value_bits - 2 ) /* for too many such. */
X      {
X        fprintf( stderr, "Bad input file\n" );
X        exit( -1 );
X      }
X    }
X    bits_to_go = 8;
X  }
X  t = buffer & 1;        /* Return the next bit from the bottom of the byte */
X  buffer >>= 1;
X  bits_to_go -= 1;
X  return t;
X}
END_OF_FILE
if test 1421 -ne `wc -c <'bit_input.c'`; then
    echo shar: \"'bit_input.c'\" unpacked with wrong size!
fi
# end of 'bit_input.c'
fi
if test -f 'bit_output.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'bit_output.c'\"
else
echo shar: Extracting \"'bit_output.c'\" \(788 characters\)
sed "s/^X//" >'bit_output.c' <<'END_OF_FILE'
X/* Bit output routines. */
X#include <stdio.h>
X#include "aed_prototypes.h"
X/* The bit buffer. */
Xstatic int buffer;       /* Bits buffered for output */
Xstatic int bits_to_go;   /* Number of bits free in buffer */
X/* Initialize for bit output. */
Xvoid
Xstart_outputing_bits()
X{
X  buffer = 0;  /* Buffer is empty to start with. */
X  bits_to_go = 8;
X}
X/* Output a bit. */
Xvoid
Xoutput_bit( int bit )
X{
X  buffer >>= 1;              /* put bit in top of buffer */
X  if ( bit ) buffer |= 0x80;
X  bits_to_go -= 1;
X  if ( bits_to_go == 0 )     /* Output buffer if it is now full. */
X  {
X    (void) putc( buffer, stdout );
X    bits_to_go = 8;
X  }
X}
X/* Flush out the last bits. */
Xvoid
Xdone_outputing_bits()
X{
X  ( void ) putc( buffer >> bits_to_go, stdout );
X  /* This might be an empty buffer. */
X}
END_OF_FILE
if test 788 -ne `wc -c <'bit_output.c'`; then
    echo shar: \"'bit_output.c'\" unpacked with wrong size!
fi
# end of 'bit_output.c'
fi
if test -f 'decode.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'decode.c'\"
else
echo shar: Extracting \"'decode.c'\" \(724 characters\)
sed "s/^X//" >'decode.c' <<'END_OF_FILE'
X/* Main program for decoding. */
X#include <stdio.h>
X#include "aed_prototypes.h"
X#define aed_allocate_storage
X#include "model.h"
Xint
Xmain()
X{
X  start_model();                         /* Set up other modules. */
X  start_inputing_bits();
X  start_decoding();
X  for (;;)                               /* loop through characters. */
X  {
X    int ch;
X    int symbol;
X    symbol = decode_symbol( cum_freq );  /* Decode next symbol. */
X    if ( symbol == EOF_symbol ) break;   /* Exit loop if EOF symbol. */
X    ch = index_to_char[ symbol ];        /* Translate to a character. */
X    (void) putc( ch, stdout );           /* Write that character. */
X    update_model( symbol );              /* Update the model. */
X  }
X  exit( 0 );
X}
END_OF_FILE
if test 724 -ne `wc -c <'decode.c'`; then
    echo shar: \"'decode.c'\" unpacked with wrong size!
fi
# end of 'decode.c'
fi
if test -f 'encode.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'encode.c'\"
else
echo shar: Extracting \"'encode.c'\" \(902 characters\)
sed "s/^X//" >'encode.c' <<'END_OF_FILE'
X/* Main program for encoding. */
X#include <stdio.h>
X#include "aed_prototypes.h"
X#define aed_allocate_storage
X#include "model.h"
Xint
Xmain()
X{
X  start_model();                          /* Set up other modules. */
X  start_outputing_bits();
X  start_encoding();
X  for (;;)                                /* Loop through characters. */
X  {
X    int ch;
X    int symbol;
X
X    ch = getc( stdin );                   /* Read the next character. */
X    if ( ch == EOF ) break;               /* Exit loop on end-of-file. */
X    symbol = char_to_index[ ch ];         /* Translate to an index. */
X    encode_symbol( symbol, cum_freq );    /* Encode that symbol. */
X    update_model(symbol);                 /* Update the model. */
X  }
X  encode_symbol( EOF_symbol, cum_freq );  /* Encode the EOF symbol. */
X  done_encoding();                        /* Send the last few bits. */
X  done_outputing_bits();
X  exit( 0 );
X}
END_OF_FILE
if test 902 -ne `wc -c <'encode.c'`; then
    echo shar: \"'encode.c'\" unpacked with wrong size!
fi
# end of 'encode.c'
fi
if test -f 'model.h' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'model.h'\"
else
echo shar: Extracting \"'model.h'\" \(1142 characters\)
sed "s/^X//" >'model.h' <<'END_OF_FILE'
X/* Interface to the model. */
X/* The set of symbols that may be encoded. */
X#define No_of_chars 256                   /* Number of character symbols */
X#define EOF_symbol  ( No_of_chars + 1 )   /* Index of EOF symbol */
X#define No_of_symbols ( No_of_chars + 1 ) /* Total Number of symbols */
X/* Translation tables between characters and symbol indexes. */
X#ifdef aed_allocate_storage
Xint char_to_index[ No_of_chars ];                 /* To index from character */
Xunsigned char index_to_char[ No_of_symbols + 1 ]; /* To character from index */
X#else
Xextern int char_to_index[ No_of_chars ];          /* To index from character */
Xextern unsigned char index_to_char[ No_of_symbols + 1 ];
X                                                  /* To character from index */
X#endif
X/* Cumulative frequency table. */
X#define Max_frequency 16383               /* Maximum allowed frequency count */
X                                          /* 2^14 - 1 */
X#ifdef aed_allocate_storage
Xint cum_freq[ No_of_symbols + 1 ];        /* Cumulative symbol frequencies */
X#else
Xextern int cum_freq[ No_of_symbols + 1 ]; /* Cumulative symbol frequencies */
X#endif
END_OF_FILE
if test 1142 -ne `wc -c <'model.h'`; then
    echo shar: \"'model.h'\" unpacked with wrong size!
fi
# end of 'model.h'
fi
echo shar: End of shell archive.
exit 0