[net.sources] Fast wc

stevesu@copper.UUCP (04/11/87)
Here's something I came across in my bin directory.  According to
the modification time on the file, I must have written it back in
1984.  It's just like the "standard" wc (4.[23]bsd, anyway) with
the following two improvements:

     1.	It doesn't count what you don't ask for.  Therefore,
	wc -l is faster and wc -c is _m_u_c_h faster than when it has
	to count words (which is harder).  It also seems to be
	considerably faster than /usr/ucb/wc.

     2. It has the -v (verbose) and -p (count pages) options that
	some old version of wc (4.1? 2.9?) that I got used to had.

     3.	(Three! Three improvements! _N_obody expects...) It prints
	the fields in the order you ask for (i.e. wc -cwl gives
	you the reverse of the usual order).  (This isn't
	terribly important, and I've never made use of it, but
	for some reason I wrote it that way.)

There is also a -s flag that lets you set the page size used in
calculating page counts with -p.

Here is a timing comparison (on a 780 running Ultrix):

	$ cd /usr/dict
	$ time /usr/ucb/wc words web*
	   24259   24259  198596 words
	  234936  234936 2486813 web2
	   76205  121847 1012730 web2a
	  335400  381042 3698139 total
	      3:29.2 real        56.9 user        10.8 sys

	$ time wc.new words web* > /dev/null
	      1:44.1 real        26.7 user        11.7 sys

	$ time wc.new -w words web* > /dev/null
	      1:50.0 real        26.1 user        11.9 sys

	$ time wc.new -l words web* > /dev/null
	      1:08.5 real        14.6 user        11.4 sys

	$ time wc.new -c words web* > /dev/null
	        25.8 real         0.2 user         9.7 sys

Of course, if all you really care about is the character count,
an ls -l is faster still (although it will give you a different
answer if the file contains bad blocks, but I digress).

The word-counting algorithm probably isn't the one I would have
chosen, but it matches the one that /usr/ucb/wc uses.

If you're picky about plug compatibility, you should note that
the error handling is a bit different than the standard version.
(There is, regrettably, no "usage:" message.)

Following my signature are the source and man page.

                                           Steve Summit
                                           stevesu@copper.tek.com

cat > wc.c <<\%EOF%
/*
 *  wc [ -lwcp ] [ -spagesize ] [ -v ] [ files... ]
 *
 *  Count lines, words, characters, and pages.
 *
 *  Runs faster by doing less work if it doesn't have to count
 *  all quantities.
 *
 *  Use this program as you wish, but please leave this header intact.
 *
 *  Steve Summit 12/4/84
 */

#include <stdio.h>

/* Boolean values used for the flag variables in this file. */
#define TRUE 1
#define FALSE 0

/*
 *  Running totals over every file processed.  wc() adds each
 *  file's counts to these; main() prints them on the "total" line.
 */
long int totchars = 0;
long int totwords = 0;
long int totlines = 0;
long int totpages = 0;

/* Bit masks (octal) naming the quantities that can be counted. */
#define LINES	04
#define WORDS	02
#define CHARS	01

/*
 *  count is the OR of the masks for the quantities to be counted;
 *  wc() switches on it to pick a counting loop.  want is the list
 *  of field letters (l, w, c, p) to print, in printing order; it
 *  has room for nine letters plus the terminating '\0'.
 */
int count = LINES | WORDS | CHARS;
char want[10] = "lwc";

/* Set by -v: print a line of column headers before the counts. */
int verbose = FALSE;

/* Number of lines per page for the page count; changed by -s. */
int pagelen = 66;

/* Number of open and read errors so far; main() exits with it. */
int errs = 0;

/* ASCII digit test and digit-to-value conversion, used to parse -s. */
#define Isdigit(c) ((c) >= '0' && (c) <= '9')
#define Ctod(c) ((c) - '0')

/*
 *  WantFull() is true when want[] already holds as many field
 *  letters as it has room for (sizeof(want) - 1).  It looks only at
 *  the next-to-last byte: want[] starts out zero-filled past its
 *  initial contents and the macros below add one letter at a time,
 *  so that byte is nonzero exactly when the array is full.  This
 *  relies on every letter argument being a one-character string.
 */
#define WantFull()	(want[sizeof(want) - 2] != '\0')

/*
 *  Append(mask, letter) - note a request for a counted field.
 *  The first explicit request (deflt still true) replaces the
 *  default selection; later ones add to it.  Expects count, want,
 *  progname and a local flag named deflt to be in scope where it is
 *  used.  A letter that would overflow want[] is reported and
 *  dropped rather than written past the end of the array.
 *
 *  Both macros are wrapped in do { } while(0) so that each use,
 *  followed by a semicolon, is exactly one statement.
 */
#define Append(mask, letter)	do	{				\
				if(deflt)				\
					{				\
					count = mask;			\
					(void)strcpy(want, letter);	\
					deflt = FALSE;			\
					}				\
				else	{				\
					count |= mask;			\
					if(WantFull())			\
						fprintf(stderr,		\
						  "%s: too many output fields\n", \
						  progname);		\
					else	(void)strcat(want, letter); \
					}				\
				} while(0)

/*
 *  Append2(letter) - like Append, but for a field that has no
 *  mask of its own; count is left untouched.
 */
#define Append2(letter)		do	{				\
				if(deflt)				\
					{				\
					(void)strcpy(want, letter);	\
					deflt = FALSE;			\
					}				\
				else if(WantFull())			\
					fprintf(stderr,			\
					  "%s: too many output fields\n", \
					  progname);			\
				else	(void)strcat(want, letter);	\
				} while(0)

/*
 *  Name used to prefix diagnostics.  main() replaces this default
 *  with the last path component of argv[0].
 */
char *progname = "wc";

/*
 *  String routines used below, declared by hand because the only
 *  header this file includes is <stdio.h>.
 */
extern char *rindex();
extern char *strcat();
extern char *strcpy();

main(argc, argv)
int argc;
char *argv[];
{
int fd;
int deflt = TRUE;
int argi;
char *p;
int totals;

if(argc > 0)
	{
	p = rindex(argv[0], '/');
	if(p != NULL)
		progname = p + 1;
	else	progname = argv[0];
	}

for(argi = 1; argi < argc && argv[argi][0] == '-'; argi++)
	{
	for(p = &argv[argi][1]; *p != '\0'; p++)
		{
		switch(*p)
			{
			case 'l':
				Append(LINES, "l");
				break;

			case 'w':
				Append(WORDS, "w");
				break;

			case 'c':
				Append(CHARS, "c");
				break;

			case 'p':
				Append2("p");
				break;

			case 'v':
				verbose = TRUE;
				if(deflt)
					(void)strcpy(want, "lwcp");
				break;

			case 's':
				pagelen = 0;
				while(Isdigit(*(p + 1)))
					pagelen = 10 * pagelen + Ctod(*++p);
				break;

			default:
				fprintf(stderr, "%s: unknown option -%c\n",
								progname, *p);
			}
		}
	}

if(verbose)
	{
	for(p = want; *p != '\0'; p++)
		{
		switch(*p)
			{
			case 'l':
				printf("   lines");
				break;

			case 'w':
				printf("   words");
				break;

			case 'c':
				printf("   chars");
				break;

			case 'p':
				printf("   pages");
				break;
			}
		}

	putchar('\n');
	}

if(argi >= argc)
	wc("", 0);
else	{
	totals = (argi + 1) < argc;

	for(; argi < argc; argi++)
		{
		if((fd = open(argv[argi], 0)) < 0)
			{
			fprintf(stderr, "%s: can't open %s\n", progname,
								argv[argi]);
			perror("");
			errs++;
			continue;
			}
		wc(argv[argi], fd);
		(void)close(fd);
		}

	if(totals)
		{
		printit(totlines, totwords, totchars, totpages);
		printf(" total\n");
		}
	}

exit(errs);
}

/*
 *  Helper macros for the counting loops in wc().  They refer
 *  directly to wc()'s locals p, r, inword, lines, words and chars.
 *
 *  Set() makes a flag nonzero by incrementing it (it is only used
 *  on a flag that is currently zero); Clear() assigns FALSE.
 */
#define Set(flag)	flag++
#define Clear(flag)	flag = FALSE

/*
 *  Count a line if *p is a newline.  The expansion is an if with
 *  no else, so that Checkword2() can follow it and supply one.
 */
#define Checkline()	if(*p == '\n')					\
				lines++

/*
 *  Word characters are those strictly between ' ' and '\177'.
 *  The first one seen while not in a word counts a new word.
 *  Note the hidden `continue': for a word character the rest of
 *  the enclosing loop body is skipped.
 */
#define Checkword()	if(' ' < *p && *p < '\177')			\
				{					\
				if(!inword)				\
					{				\
					words++;			\
					Set(inword);			\
					}				\
				continue;				\
				}

/*
 *  Must come directly after "Checkline();" -- the leading else
 *  pairs with Checkline()'s if.  For a character that is not a
 *  newline, space or tab it does `continue', leaving inword
 *  unchanged; for newline, space or tab it falls through to
 *  Clear(inword), ending the current word.
 */
#define Checkword2()	else if(*p != ' ' && *p != '\t') 		\
				continue;				\
			Clear(inword)

/*
 *  Stand-alone form used when lines are not being counted:
 *  space, newline or tab ends the current word.
 */
#define Checkword3()	if(*p == ' ' || *p == '\n' || *p == '\t')	\
				Clear(inword)

/* Add the size of the buffer just read (r) to the character count. */
#define Dochars()	chars += r

/*
 *  wc - count one open file, print its counts, and add them to
 *  the running totals.
 *
 *  name	file name printed after the counts and used in
 *		diagnostics; "" means standard input, and then no
 *		name is printed.
 *  fd		descriptor to read from; the caller opens and
 *		closes it.
 *
 *  The global count selects one of seven loops, each testing only
 *  what is needed for the quantities requested, so nothing is
 *  counted that was not asked for.  A read error is reported and
 *  added to errs; whatever was counted before it is still printed.
 */
wc(name, fd)
char *name;
int fd;
{
char buf[BUFSIZ];
register char *bufend;
int r = 0;		/* defined even if no case below matches */
long int lines, words, chars, pages;
register char *p;
register int inword;

lines = words = chars = pages = 0;

Clear(inword);

/*
 *  In the loops below Checkword() does a `continue' for word
 *  characters, so the macros after it see only the other ones.
 */
switch(count)
	{
	case LINES:
		while((r = read(fd, buf, BUFSIZ)) > 0)
			{
			bufend = buf + r;
			for(p = buf; p < bufend; p++)
				Checkline();
			}
		break;

	case WORDS:
		while((r = read(fd, buf, BUFSIZ)) > 0)
			{
			bufend = buf + r;
			for(p = buf; p < bufend; p++)
				{
				Checkword();
				Checkword3();
				}
			}
		break;

	case CHARS:
		/* The data itself is never examined here. */
		while((r = read(fd, buf, BUFSIZ)) > 0)
			Dochars();
		break;

	case LINES|CHARS:
		while((r = read(fd, buf, BUFSIZ)) > 0)
			{
			Dochars();

			bufend = buf + r;
			for(p = buf; p < bufend; p++)
				Checkline();
			}
		break;

	case LINES|WORDS:
		while((r = read(fd, buf, BUFSIZ)) > 0)
			{
			bufend = buf + r;
			for(p = buf; p < bufend; p++)
				{
				Checkword();
				Checkline();
				Checkword2();
				}
			}
		break;

	case WORDS|CHARS:
		while((r = read(fd, buf, BUFSIZ)) > 0)
			{
			Dochars();

			bufend = buf + r;
			for(p = buf; p < bufend; p++)
				{
				Checkword();
				Checkword3();
				}
			}
		break;

	case LINES|WORDS|CHARS:
		while((r = read(fd, buf, BUFSIZ)) > 0)
			{
			Dochars();

			bufend = buf + r;
			for(p = buf; p < bufend; p++)
				{
				Checkword();
				Checkline();
				Checkword2();
				}
			}
		break;
	}

if(r < 0)
	{
	fprintf(stderr, "%s: %s: read error\n", progname,
				*name != '\0' ? name : "standard input");
	perror("");
	errs++;
	}

/*
 *  Round up so that a partial page counts as a page.  A page
 *  length of zero (e.g. from "-s0") would divide by zero, so in
 *  that case the page count is simply left at 0.
 */
if(pagelen > 0)
	pages = lines / pagelen + (lines % pagelen != 0 ? 1 : 0);

printit(lines, words, chars, pages);

if(*name != '\0')
	printf(" %s", name);
putchar('\n');

totlines += lines;
totwords += words;
totchars += chars;
totpages += pages;
}

/*
 *  printit - print one set of counts, without a trailing newline.
 *
 *  The fields are written in the order their letters appear in
 *  the global want string, each as a space followed by a
 *  right-justified 7-column number.  Letters other than l, w, c
 *  and p print nothing.
 */
printit(lines, words, chars, pages)
long int lines, words, chars, pages;
{
register char *field;
long int value;

for(field = want; *field != '\0'; field++)
	{
	if(*field == 'l')
		value = lines;
	else if(*field == 'w')
		value = words;
	else if(*field == 'c')
		value = chars;
	else if(*field == 'p')
		value = pages;
	else	continue;

	printf(" %7ld", value);
	}
}
%EOF%
cat > wc.1 <<\%EOF%
.TH WC 1
.SH NAME
wc \- word count
.SH SYNOPSIS
.B wc
[
.B \-lwcp
] 
[
.B \-s\c
.I pagesize
] 
[
.B \-v
] 
[ name ... ]
.SH DESCRIPTION
.PP
.I Wc
counts lines, words, characters and (optionally) pages
in the named files, or in the standard input if no name appears.
A word is a maximal string of characters delimited by spaces, tabs or newlines.
.PP
If an argument beginning with one of ``lwcp'' is present,
the specified counts (lines, words, characters, or pages)
are selected by the letters
.BR l ,
.BR w ,
.BR c ,
or
.BR p .
The default is
.B \-lwc
unless
.B \-v
is specified.
.PP
The
.B \-s
option specifies that pages are
.I pagesize
lines long instead of the default 66.
.PP
The
.B \-v
option asks for a verbose output format, with column headers and
including pages by default.

.SH BUGS
%EOF%