[alt.sources] code for reading from .?Q? files

nelson@sun.soe.clarkson.edu (Russ Nelson) (11/16/90)

In article <1422@tharr.UUCP> gtoal@tharr.UUCP (Graham Toal) writes:

   This posting consists of a set of routines which roughly simulate
   fopen, fgetc, fgets, and fclose.  The difference between these and
   the originals is that these will read data from a .Z compressed
   file, decompressing it on the fly.  It does *not* uses pipes,
   processes, or intermediate files.  This makes it useful to add to
   any programs which read large text files sequentially.

I had a very similar need, except that I needed to SEEK also.  With
compress, this is a problem.  But with Huffman encoding, seeking isn't
a problem, so long as you seek to a bit boundary rather than a byte
boundary.

Since other people may need to seek into squeezed files, here's my
version.  It reads files produced in the format of the "SQ" standard
first seen on CP/M.  It's derived from
simtel20.army.mil:pd1:<unix-c.cpm>xsq.tar-z, and is kind of useless
without it, because you need to modify xsq.c or xusq.c to convince it to tell
you the code offsets.  I've enclosed my xusqi.c program, which looks
through a specially formatted-file (seven-line records), and maps a byte
offset into the original file into a bit offset into the squeezed file.

This code has been written for MSDOS, but should be portable to Unix.

#! /bin/sh
# This is a shell archive.  Remove anything before this line, then unpack
# it by saving it into a file and typing "sh file".  To overwrite existing
# files, type "sh file -c".  You can also feed this as standard input via
# unshar, or by typing "sh <file", e.g..  If this archive is complete, you
# will see the following message at the end:
#		"End of shell archive."
# Contents:  sqread.c sqread.h xusqi.c
# Wrapped by nelson@image.soe.clarkson.edu on Thu Nov 15 09:59:12 1990
PATH=/bin:/usr/bin:/usr/ucb ; export PATH
if test -f 'sqread.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'sqread.c'\"
else
echo shar: Extracting \"'sqread.c'\" \(4903 characters\)
sed "s/^X//" >'sqread.c' <<'END_OF_FILE'
X#include <stdio.h>
X#include <stdlib.h>
X#include <signal.h>
X#include <ctype.h>
X#include <limits.h>
X#include <mem.h>
X#define INTERNAL
X#include "sqread.h"
X
X#define ERROR (-1)
X#define PATHLEN	312	/* Number of characters allowed in pathname */
X#define OK 0
X
X#define RECOGNIZE 0xFF76	/* unlikely pattern */
X#define DLE 0x90		/* repeat byte flag */
X#define SPEOF 256		/* special endfile token */
X#define NUMVALS 257		/* 256 data values plus SPEOF*/
X
Xstatic int getuhuff(SQFILE *sqp);
Xstatic int portgetw(FILE *f);
X
X/* BUG: you cannot select an arbitrary position in the output stream
X * to start your output (unless you make provisions for that).  This is
X * because of the repeat compression.  You can make provision for seeking
X * to the middle of a repeat sequence by terminating the repeat when
X * compressing the file.
X */
X
XSQFILE *
Xsqopen    (const char *path)
X{
X	FILE *fp;
X	SQFILE *sqp;
X	int numnodes;
X	int i;
X
X	fp = fopen(path, "rb");
X	if (!fp)
X		return NULL;
X	sqp = malloc(sizeof(SQFILE));
X	if (!sqp)
X		return NULL;
X	sqp->file = fp;
X	if(portgetw(fp) != (int) RECOGNIZE) {/* Process header */
X		rewind(sqp->file);
X		sqp->issq = 0;
X		return sqp;
X	}
X	sqp->issq = 1;
X	(void) portgetw(fp);		/* checksum */
X	while (getc(fp))		/* Read and discard the name */
X		;
X
X	numnodes = portgetw(fp);
X	if(numnodes < 0 || numnodes >= NUMVALS) {
X		/* invalid decode tree */
X		fclose(fp);
X		return NULL;
X	}
X	/* Initialize for possible empty tree (SPEOF only) */
X	sqp->sqleaf[0].children[0] = -(SPEOF + 1);
X	sqp->sqleaf[0].children[1] = -(SPEOF + 1);
X
X	for(i = 0; i < numnodes; ++i) {	/* Get decoding tree from file */
X		sqp->sqleaf[i].children[0] = portgetw(fp);
X		sqp->sqleaf[i].children[1] = portgetw(fp);
X	}
X	sqp->rewpos = ftell(sqp->file);
X	sqp->bpos = 7;			/* force initial read */
X	sqp->repct = 0;			/* start with no repeats */
X	return sqp;
X}
X
Xvoid
Xsqrewind(SQFILE *sqp)
X{
X	sqp->bpos = 7;			/* force initial read */
X	sqp->repct = 0;			/* start with no repeats */
X	fseek(sqp->file, sqp->rewpos, SEEK_SET);
X}
X
X
X/* The "offset" is actually a bit offset within the file.  Offset 0 is
X * bit 0, byte 0.  Offset 8 is bit 0, byte 1.  Etc...
X */
Xlong
Xsqseek(SQFILE *sqp, long offset)
X{
X	int bpos;
X	long value;
X
X	if (!sqp->issq)
X		return fseek(sqp->file, offset, SEEK_SET);
X
X	bpos = offset & 7;
X	value = fseek(sqp->file, offset >> 3, SEEK_SET);
X	if (bpos == 0) {
X		sqp->bpos = 7;
X	} else {
X		sqp->bpos = bpos - 1;
X		sqp->curin = getc(sqp->file) >> (bpos - 1);
X	}
X	sqp->repct = 0;			/* start with no repeats */
X	return value;
X}
X
X
X/* Get bytes with decoding - this decodes repetition,
X * calls getuhuff to decode file stream into byte
X * level code with only repetition encoding.
X *
X * The code is simple passing through of bytes except
X * that DLE is encoded as DLE-zero and other values
X * repeated more than twice are encoded as value-DLE-count.
X */
X
Xint
Xsqgetc(SQFILE *sqp)
X{
X	int c;
X
X	if (!sqp->issq)
X		return getc(sqp->file);
X
X	if(sqp->repct > 0) {
X		/* Expanding a repeated char */
X		--sqp->repct;
X		return(sqp->value);
X	} else {
X		/* Nothing unusual */
X		if((c = getuhuff(sqp)) != DLE) {
X			/* It's not the special delimiter */
X			sqp->value = c;
X			if(sqp->value == EOF)
X				sqp->repct = INT_MAX;
X			return(sqp->value);
X		} else {
X			/* Special token */
X			if((sqp->repct = getuhuff(sqp)) == 0)
X				/* DLE, zero represents DLE */
X				return(DLE);
X			else {
X				/* Begin expanding repetition */
X				sqp->repct -= 2;	/* 2nd time */
X				return(sqp->value);
X			}
X		}
X	}
X}
X
X
X/* Decode file stream into a byte level code with only
X * repetition encoding remaining.
X */
Xstatic int
Xgetuhuff(SQFILE *sqp)
X{
X	int i;
X
X	/* Follow bit stream in tree to a leaf*/
X	i = 0;	/* Start at root of tree */
X	do {
X		if(++sqp->bpos > 7) {
X			if((sqp->curin = getc(sqp->file)) == EOF)
X				return(ERROR);
X			sqp->bpos = 0;
X			/* move a level deeper in tree */
X			i = sqp->sqleaf[i].children[1 & sqp->curin];
X		} else
X			i = sqp->sqleaf[i].children[1 & (sqp->curin >>= 1)];
X	} while(i >= 0);
X
X	/* Decode fake node index to original data value */
X	i = -(i + 1);
X	/* Decode special endfile token to normal EOF */
X	return(i == SPEOF) ? EOF : i;
X}
X
X
Xchar
X*sqgets    (char *s, int n, SQFILE *sqp)
X{
X	int c = '\0';
X
X	while (c != '\n' && n > 1) {
X		if ((c = sqgetc(sqp)) == EOF)
X			return NULL;
X		if (c == '\r')
X			continue;
X		*s++ = c;
X		--n;
X	}
X	*s = '\0';
X	return s;
X}
X
X
Xlong
Xsqtell    (SQFILE *sqp)
X{
X	if (!sqp->issq)
X		return ftell(sqp->file);
X	return (ftell(sqp->file) << 3) + sqp->bpos;
X}
X
X
Xlong
Xsqsize    (SQFILE *sqp)
X{
X	fseek(sqp->file, 0, SEEK_END);
X	if (!sqp->issq)
X		return ftell(sqp->file);
X	return (ftell(sqp->file) << 3) + 7;
X}
X
X
Xint
Xsqclose   (SQFILE *sqp)
X{
X	int value = fclose(sqp->file);
X	free(sqp);
X	return value;
X}
X
X
X/*
X * Machine independent getw which always gets bytes in the same order
X *  as the CP/M version of SQ wrote them
X */
Xstatic int
Xportgetw(f)
XFILE *f;
X{
X	int c;
X
X	c = getc(f) & 0377;
X	return(c | (getc(f) << 8));
X}
END_OF_FILE
if test 4903 -ne `wc -c <'sqread.c'`; then
    echo shar: \"'sqread.c'\" unpacked with wrong size!
fi
# end of 'sqread.c'
fi
if test -f 'sqread.h' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'sqread.h'\"
else
echo shar: Extracting \"'sqread.h'\" \(709 characters\)
sed "s/^X//" >'sqread.h' <<'END_OF_FILE'
Xstruct sqleaf {		/* Decoding tree */
X	int children[2];	/* left, right */
X};
X
Xtypedef struct {
X#ifdef INTERNAL
X	FILE *file;
X	struct sqleaf sqleaf[256];
X	int issq;		/* non-zero if this file's squeezed */
X	int bpos;		/* last bit position read */
X	int curin;		/* last byte value read */
X	int repct;		/* Number of times to return value */
X	int value;		/* current byte value or EOF */
X	long rewpos;		/* rewind seek position */
X#endif
X} SQFILE;
X
XSQFILE	*sqopen    (const char *path);
Xvoid	 sqrewind  (SQFILE *stream);
Xlong	 sqseek    (SQFILE *stream, long offset);
Xchar	*sqgets    (char *s, int n, SQFILE *stream);
Xlong	 sqtell    (SQFILE *stream);
Xlong	 sqsize    (SQFILE *stream);
Xint	 sqclose   (SQFILE *stream);
END_OF_FILE
if test 709 -ne `wc -c <'sqread.h'`; then
    echo shar: \"'sqread.h'\" unpacked with wrong size!
fi
# end of 'sqread.h'
fi
if test -f 'xusqi.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'xusqi.c'\"
else
echo shar: Extracting \"'xusqi.c'\" \(5092 characters\)
sed "s/^X//" >'xusqi.c' <<'END_OF_FILE'
Xstatic char *sccsid = "@(#)usq.c        1.7u (UCF) 82/12/15";
X/*
X * 	usq.c - CP/M compatible file unsqueezer utility
X *
X *	compile as follows:
X *	cc [-DVAX] -O usq.c -o usq
X *	   (define VAX only if running on VAX)
X */
X
X#include <stdio.h>
X#include <stdlib.h>
X#include <limits.h>
X#include <signal.h>
X#include <ctype.h>
X
X#define ERROR (-1)
X#define PATHLEN	312	/* Number of characters allowed in pathname */
X#define OK 0
X
X#define RECOGNIZE 0xFF76	/* unlikely pattern */
X#define DLE 0x90		/* repeat byte flag */
X#define SPEOF 256		/* special endfile token */
X#define NUMVALS 257		/* 256 data values plus SPEOF*/
X
X/* 16 bit ints, please */
Xtypedef int INT16;
Xtypedef unsigned UNSIGNED;
Xtypedef long LONG32;
X
Xstruct sqleaf {		/* Decoding tree */
X	INT16 children[2];	/* left, right */
X};
Xstruct sqleaf Dnode[NUMVALS - 1];
X
XINT16 Bpos;		/* last bit position read */
XINT16 Curin;		/* last byte value read */
XINT16 Repct;		/* Number of times to return value */
XINT16 Value;		/* current byte value or EOF */
X
XINT16 getcr(void);
XINT16 getuhuff(void);
XINT16 portgetw(FILE *f);
XLONG32 remap(LONG32);
X
Xmain(int argc, char *argv[])
X{
X	squeeze(argv[1]);
X}
X
X/*
X	The following code is primarily from typesq.c and utr.c.  Typesq
Xis a modification of USQ by Dick Greenlaw.  Those modifications (usq
Xto typesq) were made by Bob Mathias, I am responsible for the butchery
Xdone to make it work with cat.
X
X*/
X
XFILE *in;
Xsqueeze(fname)
Xchar *fname;
X{
X	register INT16 i, c;
X	register char *p;
X	register INT16 numnodes;			/* size of decoding tree */
X	register UNSIGNED crc;
X	UNSIGNED filecrc;
X	char inl[BUFSIZ];
X	INT16 inlcnt;
X
X	init_cr(); init_huff(); crc=0;
X
X	if((in=fopen( fname, "rb"))==NULL) {
X		fprintf(stderr, "usq: can't open %s\n", fname);
X		return ERROR;
X	}
X	if(portgetw(in) != (INT16) RECOGNIZE) {/* Process header */
X		fprintf(stderr, "usq: %s is not a SQueezed file\n", fname);
X		return(ERROR);
X	}
X	filecrc = (UNSIGNED) portgetw(in);	/* checksum */
X	while (getc(in))		/* Read and discard the name */
X		;
X
X	numnodes = portgetw(in);
X	if(numnodes < 0 || numnodes >= NUMVALS) {
X		fprintf(stderr, "usq: %s has invalid decode tree\n", fname);
X		fclose(in);
X		return(ERROR);
X	}
X	/* Initialize for possible empty tree (SPEOF only) */
X	Dnode[0].children[0] = -(SPEOF + 1);
X	Dnode[0].children[1] = -(SPEOF + 1);
X
X	for(i = 0; i < numnodes; ++i) {	/* Get decoding tree from file */
X		Dnode[i].children[0] = portgetw(in);
X		Dnode[i].children[1] = portgetw(in);
X	}
X	inlcnt = 0;
X	while (fgets(inl, sizeof(inl), stdin)){
X		if (++inlcnt == 7) {
X			sprintf(inl, "%ld\n", remap(atol(inl)));
X			inlcnt = 0;
X		}
X		fputs(inl, stdout);
X	}
X	fclose(in);
X	return(OK);
X}
X
X/* Map the original source position into the compressed bit position.
X *
X */
XLONG32
Xremap(LONG32 pos)
X{
X	static LONG32 inpos = 0L;
X	int c;
X
X	while (pos != inpos) {
X		if ((c = getcr()) == EOF)
X			abort();
X#ifdef notdef
X		putchar(c);
X#endif
X		inpos++;
X	}
X	fprintf(stderr, "ftell() = %ld, Bpos = %d\n", ftell(in), Bpos);
X	if (Bpos == 99)
X		return (ftell(in) << 3) + ((Bpos == 99)?0:(Bpos+1));
X	return ((ftell(in)-1) << 3) + Bpos+1;
X}
X
X
X/*** from utr.c - */
X/* initialize decoding functions */
X
Xinit_cr()
X{
X	Repct = 0;
X}
X
Xinit_huff()
X{
X	Bpos = 99;	/* force initial read */
X}
X
X/* Get bytes with decoding - this decodes repetition,
X * calls getuhuff to decode file stream into byte
X * level code with only repetition encoding.
X *
X * The code is simple passing through of bytes except
X * that DLE is encoded as DLE-zero and other values
X * repeated more than twice are encoded as value-DLE-count.
X */
X
XINT16
Xgetcr()
X{
X	register INT16 c;
X
X	if(Repct > 0) {
X		/* Expanding a repeated char */
X		--Repct;
X		return(Value);
X	} else {
X		/* Nothing unusual */
X		if((c = getuhuff()) != DLE) {
X			/* It's not the special delimiter */
X			Value = c;
X			if(Value == EOF)
X				Repct = INT_MAX;
X			return(Value);
X		} else {
X			/* Special token */
X			if((Repct = getuhuff()) == 0)
X				/* DLE, zero represents DLE */
X				return(DLE);
X			else {
X				/* Begin expanding repetition */
X				Repct -= 2;	/* 2nd time */
X				return(Value);
X			}
X		}
X	}
X}
X/* Decode file stream into a byte level code with only
X * repetition encoding remaining.
X */
X
XINT16
Xgetuhuff()
X{
X	register INT16 i;
X
X	/* Follow bit stream in tree to a leaf*/
X	i = 0;	/* Start at root of tree */
X	do {
X		if(++Bpos > 7) {
X			if((Curin = getc(in)) == EOF)
X				return(ERROR);
X			Bpos = 0;
X			/* move a level deeper in tree */
X			i = Dnode[i].children[1 & Curin];
X		} else
X			i = Dnode[i].children[1 & (Curin >>= 1)];
X	} while(i >= 0);
X
X	/* Decode fake node index to original data value */
X	i = -(i + 1);
X	/* Decode special endfile token to normal EOF */
X	return(i == SPEOF) ? EOF : i;
X}
X/*
X * Machine independent getw which always gets bytes in the same order
X *  as the CP/M version of SQ wrote them
X */
XINT16
Xportgetw(f)
XFILE *f;
X{
X	register INT16 c;
X
X	c = getc(f) & 0377;
X	return(c | (getc(f) << 8));
X}
END_OF_FILE
if test 5092 -ne `wc -c <'xusqi.c'`; then
    echo shar: \"'xusqi.c'\" unpacked with wrong size!
fi
# end of 'xusqi.c'
fi
echo shar: End of shell archive.
exit 0
--
--russ (nelson@clutx [.bitnet | .clarkson.edu])  FAX 315-268-7600
It's better to get mugged than to live a life of fear -- Freeman Dyson
I joined the League for Programming Freedom, and I hope you'll join too.