[comp.sources.unix] v11i066: Word counts, checksums, etc.

rsalz@uunet.UU.NET (Rich Salz) (09/22/87)

Submitted-by: Alan Silverstein <hpda!hpfcla!hpfcdt!ajs>
Posting-number: Volume 11, Issue 66
Archive-name: vitals

This program was developed by Hewlett-Packard and will be part of our
HP-UX product offering.  We have found it useful.  It is most useful
when most widely and commonly shared, so here it is.

We have not tested it except on HP-UX, which is mainly AT&T-compatible.
However, it should be pretty portable.  Caveat emptor.  Oh, and the
usual disclaimer that I'm not really an official HP spokesperson.

Alan Silverstein, Hewlett-Packard Systems Software Operation, Fort Collins,
Colorado; {ihnp4 | hplabs}!hpfcla!ajs; 303-229-3053; (lat-long on request :-)



# This is a shell archive.  Remove anything before this line,
# then unpack it by saving it in a file and typing "sh file".
#
# This archive contains:
#	Makefile	vitals.1	vitals.c	
#
# Error checking via wc(1) will be performed.

# Unpack the Makefile.  Unlike the other members, it gets no wc(1) check.
echo 'x - Makefile'
cat <<'@EOF' >Makefile
all:	vitals
install:	all vitals.1
	@echo install in appropriate directories.
vitals:	vitals.c
	$(CC) $(CFLAGS) -o vitals vitals.c
@EOF
echo 'Assuming Makefile is OK -- the moderator did not repack the kit...'
echo x - vitals.1
cat >vitals.1 <<'@EOF'
.\" $Header: vitals.1,v 1.1 87/08/10 12:07:47 $
.TH VITALS 1 ""
.ad b
.SH NAME
vitals \- crc, sum, line, word, and character counts
.SH SYNOPSIS
.B vitals
[
.B \-rslwcb
] [
.I file ...
]
.SH DESCRIPTION
.I Vitals
checks data integrity by
computing vital statistics related to the data in the given
file(s) or standard input (by default).
The statistics include a four-digit hex CRC, a 16-bit byte sum (similar to
.IR sum (1)
without the block count)
and line, word, and character counts (similar to
.IR wc (1)).
One line is printed for each input file or standard input, consisting of
five statistics fields 
followed by the file name, if known.
If a file is specified as "\fB\-\fR", the program reads standard 
input at that
point and shows "\fB\-\fR" as the file name.
.SS Options 
.TP 15
.B \-r
Compute only the CRC:  a true 16-bit cyclic redundancy check
that can
detect, among other things, exchanged characters.
.TP 
.B \-s
Compute only the byte sum:  the modulo-2\(**\(**16, unsigned, circular sum
of all the bytes, each taken as an
.B int
(therefore normally in the range 0-255).
.TP
.B \-l
Compute only the line count:  the number of newline characters in the input.
.TP
.B \-w
Compute only the word count:  the number of 
character sequences
delimited by blanks, tabs, or newlines.
As in
.IR wc (1),
non-printable characters are totally ignored when looking for words.
.TP
.B \-c
Compute only the character count.
.TP
.B \-b
Print only file basenames, not full path names.
.PP
These options can be used in any combination.
The default is
.BR \-rslwc ,
that is, all but
.BR \-b .
.PP
.I Vitals 
is more efficient than separate commands tied together
with a shell script because the input data is read only once.
Unlike
.IR wc (1),
this program does not compute totals, since it is intended for fast data
validation, not size counting.
.SH DIAGNOSTICS
If any file open or read fails, the program writes a message to standard
error when the problem is encountered, and continues with the next file.
It ultimately returns non-zero if any error occurs, else zero.
.SH AUTHOR
.I Vitals
was developed by Hewlett-Packard.
.SH "SEE ALSO"
sum(1), wc(1).
@EOF
# Verify the unpacked manual page against the expected line, word, and
# character counts.  wc(1) pads its columns differently from system to
# system, so squeeze the output down to single blanks (unquoted echo does
# this) before comparing; otherwise a correct file can be reported as bad.
counts=`wc -lwc <vitals.1`
counts=`echo $counts`
if test "$counts" != '80 375 2154'
then
	echo ERROR: wc results of vitals.1 are $counts should be 80 375 2154
fi

chmod 444 vitals.1

echo x - vitals.c
cat >vitals.c <<'@EOF'
static char *HPUX_ID = "@(#)27.1   85/02/04";
/* HPUX_ID: @(#)vitals.c	27.1     85/02/04  */
/*
 * Compute vital statistics of data:  crc, sum, line, word, and character
 * counts.  See the manual entry for details.
 *
 * Compile with -DTABLE to produce an alternate program, which prints a
 * CRC table for use in this program.
 *
 * The CRC code was lifted from a USENET posting:
 *
 * hpfcla:net.sources / hcrvax!sft /  8:17 pm  Nov 15, 1984
 *
 * The following program is a command to calculate the CRC of files.
 * It is useful for the same purposes as sum(1).  It calculates the
 * true CRC16 (unlike CP/M utilities that say they calculate CRCs
 * but really just hash).  Crc detects more errors than old sum(1);
 * for example, it detects exchanges of characters.  It is also
 * (now) in the public domain.
 *
 *  CRC16 polynomial: x**0 + x**2 + x**15 + x**16 (0xA001)
 * (CCITT polynomial: x**0 + x**5 + x**12 + x**16 (0x8408))
 * Initial condition: 0
 *
 * D. Hugh Redelmeier, 1983 April 15; latest change: 1984 April 2.
 */


#include <stdio.h>

/* printf-style format of the usage message; %s receives the program name */
char	*USAGE = "usage: %s [-rslwcb] [files...]\n";

/*
 * Local shorthands:  "proc" expands to nothing and only marks procedure
 * definitions; the rest are named constants for null and boolean values.
 */
#define	proc				/* null; easy to grep for procs */
#define	chNull	   ('\0')
#define	chNewline  ('\n')
#define	sbNull	   ((char *) NULL)
#define	fileNull   ((FILE *) NULL)
#define	false	   0
#define	true	   1

/*
 * Program name and option flags.  main() sets these while parsing the
 * command line; if none of rflag, sflag, lflag, wflag, or cflag is set
 * after parsing, main() turns all five on.
 */
char	*myname;				/* how command invoked	*/
int	rflag = false;				/* -r (crc) option	*/
int	sflag = false;				/* -s (sum) option	*/
int	lflag = false;				/* -l (lines) option	*/
int	wflag = false;				/* -w (words) option	*/
int	cflag = false;				/* -c (chars) option	*/
int	bflag = false;				/* -b (basenames) opt	*/

/* Exit status; set to 1 when a file cannot be opened or a read fails */
int	retval = 0;				/* return value		 */
char	*defaultargs[] = {"-", sbNull};		/* read stdin by default */

/*
 * PrintErr (part1, part2):  write "<myname>: <part1> <part2>" followed by a
 * newline to stderr.  Both arguments must be strings.
 *
 * The expansion is a single expression with no trailing semicolon; the
 * caller's own ';' ends the statement.  That keeps "PrintErr (a, b);" a
 * single statement, so it is safe in an unbraced if/else.
 */
#define	PrintErr(part1,part2) \
		fprintf (stderr, "%s: %s %s\n", myname, part1, part2)


/************************************************************************
 * M A I N
 *
 * Initialize, check arguments, open files, and call another routine to
 * do each file.  Exits with retval:  0 if every file was opened and read
 * successfully, 1 otherwise (also 1 after a usage error).
 *
 * When compiled with -DTABLE, just prints the CRC table and exits 0.
 */

proc main (argc, argv)
register int	argc;
register char	**argv;
{
extern	 int	optind;			/* for getopt()  */
	 int	optchar;		/* from getopt() */
register FILE	*filep;			/* file to read	 */

#ifdef TABLE					/* just print table */
	PrintTable();
	exit (0);				/* defined status for callers */
#else

/*
 * CHECK ARGUMENTS:
 *
 * Each of -r -s -l -w -c selects one statistic; if none of them is given,
 * all five are selected.  -b only affects how file names are printed.
 */

	myname = *argv;

	while ((optchar = getopt (argc, argv, "rslwcb")) != EOF)
	    switch (optchar)
	    {
	    case 'r':	rflag = true;	break;
	    case 's':	sflag = true;	break;
	    case 'l':	lflag = true;	break;
	    case 'w':	wflag = true;	break;
	    case 'c':	cflag = true;	break;
	    case 'b':	bflag = true;	break;
	    default:	fprintf (stderr, USAGE, myname); exit (1);
	    }

	if (! (rflag || sflag || lflag || wflag || cflag))
	    rflag = sflag = lflag = wflag = cflag = true;

	argc -= optind;				/* now counts file args only */
	argv += optind;

	if (argc < 1)				/* use default arguments */
	    argv = defaultargs;

/*
 * OPEN AND DO EACH INPUT FILE:
 *
 * Be careful to keep stdin open for filenames of "-".
 * Argc is not altered inside the loop; if < 1, it means no file args
 * were given, and DoFile() is told not to print a file name.
 */

	for ( ; *argv != sbNull; argv++)		/* each argument */
	{
	    if (strcmp (*argv, "-") == 0)		/* read stdin */
	    {
		filep = stdin;

		/*
		 * An earlier "-" leaves the end-of-file (or error) indicator
		 * set on stdin.  Clear it so this pass really tries to read
		 * again, and so an old read error is not reported twice.
		 */
		clearerr (stdin);
	    }
	    else if ((filep = fopen (*argv, "r")) == fileNull)
	    {
		PrintErr ("can't open", *argv);
		retval = 1;
		continue;
	    }

	    DoFile (filep, (argc < 1), *argv);

	    if (filep != stdin)			/* keep stdin open for reuse */
		fclose (filep);
	}
	exit (retval);

#endif /* not TABLE */

} /* main */


#ifdef TABLE

/************************************************************************
 * P R I N T   T A B L E
 *
 * Write the 256-entry CRC lookup table to standard output as a C array
 * declaration, eight entries per line, ready to be pasted into this
 * program.  Building the table in memory at startup would also work;
 * printing it once and compiling it in saves that bit of startup time.
 *
 * Entry i is i run through eight shift steps; each step shifts right one
 * bit and, when the bit shifted out was set, XORs in 0xA001.
 *
 * Assumes unsigned and short are at least 16 bits.
 */

proc PrintTable()
{
register unsigned slot;			/* table position	*/
register unsigned value;		/* entry being built	*/
register int	  bit;			/* shift step counter	*/

	printf ("unsigned short CRCtable [256] = {");

	for (slot = 0; slot < 256; slot++)
	{
	    if (slot != 0)			/* separate from previous */
		putchar (',');

	    if ((slot & 7) == 0)		/* eight entries per line */
		putchar (chNewline);

	    value = slot;

	    for (bit = 0; bit < 8; bit++)
		value = (value & 1) ? ((value >> 1) ^ 0xA001) : (value >> 1);

	    printf ("\t0x%4.4x", value);
	}

	printf ("\n};\n");

} /* PrintTable */


#else /* not TABLE */


/************************************************************************
 * CRC TABLE AND BYTE SUM STRUCTS
 *
 * CRCtable[], output from PrintTable(), is used for CRC calculation.
 * It is indexed by the low eight bits of (crc ^ input byte); entry i is
 * the value i after eight right-shift steps, each of which XORs in 0xA001
 * when the bit shifted out was set.  Do not edit it by hand; regenerate
 * it by compiling this file with -DTABLE.
 *
 * The structures declared after the table are for circular mod-2**16
 * byte sums.  They assume that two shorts == one long!
 */

unsigned short CRCtable [256] = {
	0x0000,	0xc0c1,	0xc181,	0x0140,	0xc301,	0x03c0,	0x0280,	0xc241,
	0xc601,	0x06c0,	0x0780,	0xc741,	0x0500,	0xc5c1,	0xc481,	0x0440,
	0xcc01,	0x0cc0,	0x0d80,	0xcd41,	0x0f00,	0xcfc1,	0xce81,	0x0e40,
	0x0a00,	0xcac1,	0xcb81,	0x0b40,	0xc901,	0x09c0,	0x0880,	0xc841,
	0xd801,	0x18c0,	0x1980,	0xd941,	0x1b00,	0xdbc1,	0xda81,	0x1a40,
	0x1e00,	0xdec1,	0xdf81,	0x1f40,	0xdd01,	0x1dc0,	0x1c80,	0xdc41,
	0x1400,	0xd4c1,	0xd581,	0x1540,	0xd701,	0x17c0,	0x1680,	0xd641,
	0xd201,	0x12c0,	0x1380,	0xd341,	0x1100,	0xd1c1,	0xd081,	0x1040,
	0xf001,	0x30c0,	0x3180,	0xf141,	0x3300,	0xf3c1,	0xf281,	0x3240,
	0x3600,	0xf6c1,	0xf781,	0x3740,	0xf501,	0x35c0,	0x3480,	0xf441,
	0x3c00,	0xfcc1,	0xfd81,	0x3d40,	0xff01,	0x3fc0,	0x3e80,	0xfe41,
	0xfa01,	0x3ac0,	0x3b80,	0xfb41,	0x3900,	0xf9c1,	0xf881,	0x3840,
	0x2800,	0xe8c1,	0xe981,	0x2940,	0xeb01,	0x2bc0,	0x2a80,	0xea41,
	0xee01,	0x2ec0,	0x2f80,	0xef41,	0x2d00,	0xedc1,	0xec81,	0x2c40,
	0xe401,	0x24c0,	0x2580,	0xe541,	0x2700,	0xe7c1,	0xe681,	0x2640,
	0x2200,	0xe2c1,	0xe381,	0x2340,	0xe101,	0x21c0,	0x2080,	0xe041,
	0xa001,	0x60c0,	0x6180,	0xa141,	0x6300,	0xa3c1,	0xa281,	0x6240,
	0x6600,	0xa6c1,	0xa781,	0x6740,	0xa501,	0x65c0,	0x6480,	0xa441,
	0x6c00,	0xacc1,	0xad81,	0x6d40,	0xaf01,	0x6fc0,	0x6e80,	0xae41,
	0xaa01,	0x6ac0,	0x6b80,	0xab41,	0x6900,	0xa9c1,	0xa881,	0x6840,
	0x7800,	0xb8c1,	0xb981,	0x7940,	0xbb01,	0x7bc0,	0x7a80,	0xba41,
	0xbe01,	0x7ec0,	0x7f80,	0xbf41,	0x7d00,	0xbdc1,	0xbc81,	0x7c40,
	0xb401,	0x74c0,	0x7580,	0xb541,	0x7700,	0xb7c1,	0xb681,	0x7640,
	0x7200,	0xb2c1,	0xb381,	0x7340,	0xb101,	0x71c0,	0x7080,	0xb041,
	0x5000,	0x90c1,	0x9181,	0x5140,	0x9301,	0x53c0,	0x5280,	0x9241,
	0x9601,	0x56c0,	0x5780,	0x9741,	0x5500,	0x95c1,	0x9481,	0x5440,
	0x9c01,	0x5cc0,	0x5d80,	0x9d41,	0x5f00,	0x9fc1,	0x9e81,	0x5e40,
	0x5a00,	0x9ac1,	0x9b81,	0x5b40,	0x9901,	0x59c0,	0x5880,	0x9841,
	0x8801,	0x48c0,	0x4980,	0x8941,	0x4b00,	0x8bc1,	0x8a81,	0x4a40,
	0x4e00,	0x8ec1,	0x8f81,	0x4f40,	0x8d01,	0x4dc0,	0x4c80,	0x8c41,
	0x4400,	0x84c1,	0x8581,	0x4540,	0x8701,	0x47c0,	0x4680,	0x8641,
	0x8201,	0x42c0,	0x4380,	0x8341,	0x4100,	0x81c1,	0x8081,	0x4040
};

/*
 * Two unsigned shorts laid side by side.  Which member ends up holding the
 * upper half of an overlaid long depends on the machine's byte order, so
 * the names are only labels.
 */
struct shorts {			/* as used, order is irrelevant */
	unsigned short	high, low;
};

/*
 * Overlay of one unsigned long with a struct shorts, so that a value stored
 * through "sum" can be read back as two 16-bit pieces through "shorts".
 * The pieces cover the whole value only where a long is exactly the size
 * of two shorts.
 */
typedef union {			/* to map one long to two shorts */
	unsigned long	sum;
	struct	 shorts	shorts;
} convert;


/************************************************************************
 * D O   F I L E
 *
 * Read one file to end of file, calculate the selected values, and print
 * one output line:  the selected statistics in the fixed order crc, sum,
 * lines, words, chars, then the file name unless nullname is set (with
 * -b, only the part of the name after the last '/').
 *
 * If the read fails, a message is written to stderr instead of the
 * output line, and retval is set to 1.
 *
 * ch is int, not char, to be sure it can distinguish 0xff and EOF.
 * As a result, all normal values of ch should be positive, 0..255.
 */

proc DoFile (filep, nullname, filename)
register FILE	*filep;		/* file to read	    */
	 int	nullname;	/* ignore filename? */
	 char	*filename;	/* name of filep    */
{
register int	ch;		/* current char	*/
register unsigned crc = 0;	/* crc sum	*/
register unsigned long sum = 0L; /* byte sum	*/
register long	lines = 0L;	/* line count	*/
register long	words = 0L;	/* word count	*/
register long	chars = 0L;	/* char count	*/

	 unsigned long folded;	/* 16-bit sum	*/
register int notword = true;	/* not in word?	*/

/*
 * READ FILE AND COMPUTE SUMS:
 *
 * CRCtable[] values have 16 bits, so the masking is necessary before each
 * repeated index into the array.  sum is allowed to increment to more than
 * 16 bits; overflow is handled later.  It is unsigned so that a very large
 * total wraps around in a defined way.  Line and char counts are
 * accumulated regardless whether they are needed; it's faster not to
 * check.  Words are counted in the same strange way as wc(1), ignoring
 * special chars:  a word starts at the first character between ' ' and
 * '\177' (exclusive) seen after a blank, tab, or newline; any other
 * character neither starts nor ends a word.
 */

	while ((ch = getc (filep)) != EOF)
	{
	    if (rflag)
		crc = (crc >> 8) ^ (CRCtable [(crc ^ ch) & 0xff]);

	    if (sflag)
		sum += ch;

	    if (ch == chNewline)
		lines++;

	    chars++;

	    if (wflag)
	    {
		if ((' ' < ch) && (ch < '\177'))	/* word char */
		{
		    if (notword)			/* start of word */
		    {
			words++;
			notword = false;
		    }
		}
		else if ((ch == ' ') || (ch == '\t') || (ch == chNewline))
		{
		    notword = true;
		}
	    }
	} /* while */

/*
 * REPORT ERROR OR PRINT TOTALS:
 */

	if (ferror (filep))
	{
	    PrintErr ("read failed from", nullname ? "stdin" : filename);
	    retval = 1;
	}
	else
	{
	    /*
	     * Fold the byte sum to 16 bits by adding the overflow back in
	     * (end-around carry).  Only the low 32 bits take part, which is
	     * what a 32-bit long held historically, so results do not vary
	     * with the size or byte order of long.  The first fold can itself
	     * carry out of 16 bits (0xffff + 0xffff = 0x1fffe), so fold
	     * twice; after the second fold the value always fits.
	     */

	    folded = sum & 0xffffffffL;
	    folded = (folded & 0xffffL) + (folded >> 16);
	    folded = (folded & 0xffffL) + (folded >> 16);

	    if (rflag)	printf (" %4.4x", crc);
	    if (sflag)	printf (" %5u",   (unsigned) folded);
	    if (lflag)	printf (" %6ld",  lines);
	    if (wflag)	printf (" %6ld",  words);
	    if (cflag)	printf (" %6ld",  chars);

	    if (! nullname)		/* have name to print */
	    {
		if (bflag)		/* basename only */
		{
		    char *cp = filename;

		    while (*cp != chNull)	/* till end of name */
			if (*cp++ == '/')	/* directory level  */
			    filename = cp;	/* set past '/'	    */
		}
		printf (" %s", filename);
	    }
	    putchar (chNewline);

	} /* else */

} /* DoFile */

#endif /* not TABLE */
@EOF
# Verify the unpacked C source against the expected line, word, and
# character counts.  wc(1) pads its columns differently from system to
# system, so squeeze the output down to single blanks (unquoted echo does
# this) before comparing; otherwise a correct file can be reported as bad.
counts=`wc -lwc <vitals.c`
counts=`echo $counts`
if test "$counts" != '338 1577 9781'
then
	echo ERROR: wc results of vitals.c are $counts should be 338 1577 9781
fi

chmod 444 vitals.c

exit 0