[comp.os.minix] A spelling checker for MINIX

ast@cs.vu.nl (Andy Tanenbaum) (01/07/88)

I have written a spelling checker for MINIX.  It is a very simple one.
The program 'spell' is a shell script:

   prep document | sort -u | comm -23 - dictionary

The program 'prep' strips off some of the troff commands, if any, and then
outputs the words onto standard output, one word per line.  Then 'sort'
sorts the list, stripping duplicates.  Finally, 'comm' produces a list of
all words present in the input file but missing from the dictionary.

One problem is that the MINIX sort has a bug in it.  When sorting files
with more than 20000 lines it sometimes gets into trouble and inserts a
line of garbage in the file.  I have been unable to find the bug, but it
is consistent enough that if somebody wants to hunt for it, I can probably
send you a file that steps on the bug consistently.  In any event, I wrote
a little utility, ascii.c, that greps a file pulling out all the lines
containing 100% ASCII text, and omitting the junk lines.  With the -n flag
it only pulls out the junk, so you can use it in this mode to see if a file
has junk.  This is admittedly an awful kludge, but I couldn't find the bug.
Will someone please help?

Since this strategy does not know about suffixes and prefixes and plural forms
and conjugations and declensions and adjective agreement and all that kind of
wonderful stuff, the dictionary has to list all the words.  Thus it is not
sufficient to list multiply, you have to list multiply, multiplies, 
multiplied, multiplying, multiplier, and so on.  I have built up such a 
dictionary, which is being posted separately.  This dictionary is NOT based on
the UNIX dictionary so it is free of AT&T copyright.  I built the dictionary
from three sources.  First, I started by sorting and uniq'ing some public
domain dictionaries.  Second, as some of you probably know, I have written
somewhere between 3 and 6 books (depending on precisely what you count) and
an additional 50 published papers on operating systems, networks, compilers,
languages, etc.  This data base, which is online, is nonnegligible :-)
Finally, I added a number of words that I thought ought to be in the dictionary
including all the U.S. states, all the European and some other major countries,
principal U.S. and world cities, and a bunch of technical terms.  I don't want 
my spelling checker to barf on arpanet, diskless, modem, login, internetwork,
subdirectory, superuser, vlsi, or winchester just because Webster wouldn't
approve of them.  All in all, the dictionary is over 40,000 words.  If you
have any suggestions for additions or deletions, please post them.  But please
be sure you are not infringing on anyone's copyright in doing so.

Andy Tanenbaum (ast@cs.vu.nl)

: This is a shar archive.  Extract with sh, not csh.
: This archive ends with exit, so do not worry about trailing junk.
: --------------------------- cut here --------------------------
PATH=/bin:/usr/bin
echo Extracting \s\p\e\l\l
sed 's/^X//' > \s\p\e\l\l << '+ END-OF-FILE '\s\p\e\l\l
Xprep $1 | sort -u | comm -23 - /usr/lib/dictionary
X
+ END-OF-FILE spell
chmod 'u=rwx,g=rx,o=rx' \s\p\e\l\l
set `sum \s\p\e\l\l`
sum=$1
case $sum in
63799)	:;;
*)	echo 'Bad sum in '\s\p\e\l\l >&2
esac
echo Extracting \p\r\e\p\.\c
sed 's/^X//' > \p\r\e\p\.\c << '+ END-OF-FILE '\p\r\e\p\.\c
X/* prep - prepare file for statistics 	Author: Andy Tanenbaum */
X
X#include <stdio.h>
X#include <ctype.h>
X
X#define TROFF_CHAR	'.'	/* troff commands begin with this char */
X#define EOL		'\n'	/* end of line char */
X#define APOSTROPHE	047	/* single quote */
X#define BACKSLASH       '\\'	/* troff code */
X
Xint lfread;			/* set when last char read was lf */
Xint lfwritten;			/* set when last char written was lf */
X
Xmain(argc, argv)
Xint argc;
Xchar *argv[];
X{
X/* Read the file named by argv[1] (or stdin if there is none) and write its
X * words to stdout, one per line, folded to lower case.  A word is a run of
X * letters and apostrophes; every other character acts as a separator.
X * Lines that start with TROFF_CHAR are dropped, and backslash escapes are
X * handed to backslash().  Diagnostics go to stderr, because stdout is
X * normally piped into sort.
X */
X
X  int c;			/* int, not char, so EOF is distinguishable */
X  char backslash();
X  FILE *freopen();
X
X  if (argc > 2) usage();
X  if (argc == 2) {
X	if (freopen(argv[1], "r", stdin) == NULL) {
X		fprintf(stderr, "prep: cannot open %s\n", argv[1]);
X		exit(1);
X	}
X  }
X
X  lfread = 1;			/* the first char of the input starts a line */
X  lfwritten = 1;		/* no word pending, so no separator is due */
X
X  while ( (c = getchar()) != EOF) {
X	/* Lines beginning with "." are troff commands -- skip them. */
X	if (lfread && c == TROFF_CHAR) {
X		skipline();
X		continue;
X	}
X
X	/* Eat troff stuff.  backslash() returns a char; mask it so the
X	 * ctype tests below never see a negative value.
X	 */
X	if (c == BACKSLASH) c = backslash() & 0377;
X
X	if (isupper(c) || islower(c) || c == APOSTROPHE) {
X		putchar(isupper(c) ? tolower(c) : c);
X		lfwritten = 0;
X		lfread = 0;
X		continue;
X	}
X
X	/* Separator: end the current word, but never emit an empty line. */
X	lfread = (c == EOL ? 1 : 0);
X	if (lfwritten) continue;
X	putchar(EOL);
X	lfwritten = 1;
X  }
X
X  /* Terminate a final word that was not followed by any separator. */
X  if (!lfwritten) putchar(EOL);
X  exit(0);
X}
X
X
Xskipline()
X{
X/* Discard the rest of the current input line, including its newline.
X * Also stops at end of file, so a last line that lacks a newline cannot
X * hang the program.
X */
X
X  int c;			/* int so that EOF can be told from data */
X
X  while ( (c = getchar()) != EOL)
X	if (c == EOF) break;
X}
X
X	
Xchar backslash()
X{
X/* A backslash has been seen.  Eat the troff escape that follows it and
X * return the first character after the escape.  Recognized forms are
X * \fx, \sN, \sNN, \nx, \n(xx, \*x, \*(xx and \(xx.  For any other escape
X * the character following the backslash is returned unchanged.  If the
X * input ends inside an escape, EOL is returned; the caller's next read
X * then sees EOF.
X */
X
X  int c;			/* int so that EOF can be told from data */
X  int skip;			/* # chars of the escape still to discard */
X
X  c = getchar();
X  switch(c) {
X	case 'f':		/* \fx */
X		skip = 1;
X		break;
X
X	case 's':		/* \s7  or \s14 */
X		c = getchar();			/* first digit */
X		c = getchar();
X		if (isdigit(c)) c = getchar();	/* was a second digit */
X		return(c == EOF ? EOL : c);
X
X	case 'n':		/* \na or \n(xx  */
X	case '*':		/* \*a or \*(xx  */
X		c = getchar();
X		skip = (c == '(' ? 2 : 0);
X		break;
X
X	case '(':		/* troff 4-character escape sequence */
X		skip = 2;
X		break;
X
X	default:		/* unrecognized escape, or EOF */
X		return(c == EOF ? EOL : c);
X  }
X
X  /* Drop what is left of the escape, then fetch the char after it. */
X  while (skip-- > 0 && c != EOF) c = getchar();
X  if (c != EOF) c = getchar();
X  return(c == EOF ? EOL : c);
X}
X
Xusage()
X{
X/* Print the calling convention and give up.  The text goes to stderr so
X * that it cannot end up in a pipeline that reads prep's word list.
X */
X
X  fprintf(stderr, "Usage: prep [file]\n");
X  exit(1);
X}
X
+ END-OF-FILE prep.c
chmod 'u=rw,g=r,o=r' \p\r\e\p\.\c
set `sum \p\r\e\p\.\c`
sum=$1
case $sum in
36979)	:;;
*)	echo 'Bad sum in '\p\r\e\p\.\c >&2
esac
echo Extracting \a\s\c\i\i\.\c
sed 's/^X//' > \a\s\c\i\i\.\c << '+ END-OF-FILE '\a\s\c\i\i\.\c
X/* ascii - list lines with/without ASCII chars	Author: Andy Tanenbaum */
X
X#define BUFSIZE 30000		/* size of buf, in chars */
X
Xchar buf[BUFSIZE];		/* input buffer */
Xchar *next;			/* start of the current line within buf */
Xchar *limit;			/* one past the last char of the current line */
Xint count;			/* # chars in buffer not yet processed */
Xint used;			/* # chars kept at start of buf; load() reads in after them */
Xint eof;			/* set when read() returns zero or less */
Xint nflag;			/* set if -n option given */
Xint exitstatus;			/* 0 if pure ASCII, 1 if junk seen */
X
Xmain(argc, argv)
Xint argc;
Xchar *argv[];
X{
X/* Copy selected input lines to stdout.  Without -n, the lines for which
X * getline() returns 1 are copied; with -n, the lines for which it returns
X * 0.  Input is the file named by the last argument, or stdin if no file
X * is given.  The program exits with the value of exitstatus.
X */
X
X  int yes;
X
X  if (argc > 3) usage();
X  /* argv[1] may only be examined when an argument was actually given. */
X  if (argc > 1 && strcmp(argv[1], "-n") == 0) nflag++;
X  /* Two arguments are only valid in the form "-n file". */
X  if (argc == 3 && nflag == 0) usage();
X
X  if ((argc == 2 && nflag == 0) || argc == 3) {
X	close(0);
X	if (open(argv[argc-1], 0) < 0) {
X		std_err("ascii: cannot open ");
X		std_err(argv[argc-1]);	/* the file; argv[1] may be -n */
X		std_err("\n");
X		exit(1);
X	}
X  }
X
X  while(eof == 0) {
X	yes = getline();
X	if (nflag != yes) output();
X	next = limit;
X  }
X  exit(exitstatus);
X}
X
Xint getline()
X{
X/* Find the end of the line that starts at 'next', refilling buf as needed,
X * and leave 'limit' pointing just past its last character.  Returns 1 if
X * every character seen had the 0200 bit clear, and 0 otherwise (in which
X * case exitstatus is set to 1 as well).  Exits the program if the input is
X * already exhausted on entry.  A final line without a newline is still
X * delivered, and a line too long for buf is delivered in buffer-sized
X * pieces.
X */
X
X  char *p, c;
X  int asc = 1;
X
X  if (count == 0) load();
X  if (eof) exit(exitstatus);
X
X  p = next;
X  while (count > 0) {
X	c = *p++;
X	if (c & 0200) {asc = 0; exitstatus = 1;}
X	count--;
X	if (c == '\n') break;
X	if (count == 0) {
X		/* The buffer ran dry in mid-line.  Only the chars from next
X		 * up to p are valid; read() may have delivered less than a
X		 * full buffer, so measure from p rather than from the end
X		 * of buf.
X		 */
X		used = p - next;
X		if (used == BUFSIZE) {
X			/* No room left to read into: hand back this piece. */
X			used = 0;
X			break;
X		}
X		/* Move the residual characters to the bottom of buf */
X		copy(next, buf, used);
X		load();
X		p = &buf[used];
X		used = 0;
X		/* At eof count is <= 0, which ends the loop. */
X	}
X  }
X  limit = p;
X  return(asc);
X}
X
Xload()
X{
X/* Refill buf from file descriptor 0, placing the new data after the first
X * 'used' chars, and point 'next' back at the start of buf.  'count' gets
X * the result of read(); a result of zero or less raises the eof flag.
X */
X
X  next = buf;
X  count = read(0, buf + used, BUFSIZE - used);
X  if (count < 1) eof = 1;
X}
X
Xoutput()
X{
X/* Write the chars from 'next' up to, but not including, 'limit' to file
X * descriptor 1.  The result of write() is not examined.
X */
X
X  char *from;
X
X  from = next;
X  write(1, from, limit - from);
X}
X
Xusage()
X{
X/* Report the calling convention on the error output and exit with status
X * 1.  The file argument is shown as optional because main() reads standard
X * input when no file is named.
X */
X
X  std_err("Usage: ascii [-n] [file]\n");
X  exit(1);
X}
X
Xcopy(s,d,ct)
Xregister char *s, *d;
Xint ct;
X{
X/* Copy ct chars from s to d, lowest address first. */
X
X  for ( ; ct != 0; ct--) *d++ = *s++;
X}
X
+ END-OF-FILE ascii.c
chmod 'u=rw,g=r,o=r' \a\s\c\i\i\.\c
set `sum \a\s\c\i\i\.\c`
sum=$1
case $sum in
39792)	:;;
*)	echo 'Bad sum in '\a\s\c\i\i\.\c >&2
esac
exit 0