ast@cs.vu.nl (Andy Tanenbaum) (01/07/88)
I have written a spelling checker for MINIX. It is a very simple one. The program 'spell' is a shell script: prep document | sort -u | comm -23 - dictionary The program 'prep' strips off some of the troff commands, if any, and then outputs the words onto standard output, one word per line. Then 'sort' sorts the list, stripping duplicates. Finally, 'comm' produces a list of all words present in the input file but missing from the dictionary. One problem is that the MINIX sort has a bug in it. When sorting files with more than 20000 lines it sometimes gets into trouble and inserts a line of garbage in the file. I have been unable to find the bug, but it is consistent enough that if somebody wants to hunt for it, I can probably send you a file that steps on the bug consistently. In any event, I wrote a little utility, ascii.c, that greps a file pulling out all the lines containing 100% ASCII text, and omitting the junk lines. With the -n flag it only pulls out the junk, so you can use it in this mode to see if a file has junk. This is admittedly an awful kludge, but I couldn't find the bug. Will someone please help? Since this strategy does not know about suffixes and prefixes and plural forms and conjugations and declensions and adjective agreement and all that kind of wonderful stuff, the dictionary has to list all the words. Thus it is not sufficient to list multiply; you have to list multiply, multiplies, multiplied, multiplying, multiplier, and so on. I have built up such a dictionary, which is being posted separately. This dictionary is NOT based on the UNIX dictionary so it is free of AT&T copyright. I built the dictionary from three sources. First, I started by sorting and uniq'ing some public domain dictionaries. Second, as some of you probably know, I have written somewhere between 3 and 6 books (depending on precisely what you count) and an additional 50 published papers on operating systems, networks, compilers, languages, etc. 
This data base, which is online, is nonnegligible :-) Finally, I added a number of words that I thought ought to be in the dictionary including all the U.S. states, all the European and some other major countries, principal U.S. and world cities, and a bunch of technical terms. I don't want my spelling checker to barf on arpanet, diskless, modem, login, internetwork, subdirectory, superuser, vlsi, or winchester just because Webster wouldn't approve of them. All in all, the dictionary is over 40,000 words. If you have any suggestions for additions or deletions, please post them. But please be sure you are not infringing on anyone's copyright in doing so.

Andy Tanenbaum (ast@cs.vu.nl)

: This is a shar archive.  Extract with sh, not csh.
: This archive ends with exit, so do not worry about trailing junk.
: The sum checks of the original posting have been dropped because the
: embedded sources below were corrected and no longer match those sums.
: --------------------------- cut here --------------------------
PATH=/bin:/usr/bin
echo Extracting \s\p\e\l\l
sed 's/^X//' > \s\p\e\l\l << '+ END-OF-FILE '\s\p\e\l\l
Xprep $1 | sort -u | comm -23 - /usr/lib/dictionary
X
+ END-OF-FILE spell
chmod 'u=rwx,g=rx,o=rx' \s\p\e\l\l
echo Extracting \p\r\e\p\.\c
sed 's/^X//' > \p\r\e\p\.\c << '+ END-OF-FILE '\p\r\e\p\.\c
X/* prep - prepare file for statistics 	Author: Andy Tanenbaum */
X
X/* Reads a document (named file or standard input), drops troff request
X * lines and a few troff escapes, and writes the words found to standard
X * output in lower case, one word per line.  A word is a run of letters
X * and apostrophes; every other character acts as a separator.
X */
X
X#include <stdio.h>
X#include <ctype.h>
X
X#define TROFF_CHAR '.'		/* troff commands begin with this char */
X#define EOL '\n'		/* end of line char */
X#define APOSTROPHE 047		/* single quote */
X#define BACKSLASH '\\'		/* troff code */
X
X/* Both flags start out set: the very first character of the input is at
X * the start of a line (so a troff request on line 1 is skipped), and no
X * word has been written yet (so leading separators do not produce an
X * empty first output line).
X */
Xint lfread = 1;			/* set when last char read was lf */
Xint lfwritten = 1;		/* set when last char written was lf */
X
Xmain(argc, argv)
Xint argc;
Xchar *argv[];
X{
X  int c;			/* int, not char: must be able to hold EOF */
X  int backslash();
X  FILE *freopen();
X
X  if (argc > 2) usage();
X  if (argc == 2) {
X    if (freopen(argv[1], "r", stdin) == NULL) {
X      /* stdout normally feeds sort, so diagnostics go to stderr. */
X      fprintf(stderr, "prep: cannot open %s\n", argv[1]);
X      exit(1);
X    }
X  }
X
X  while ( (c = getchar()) != EOF) {
X    /* Lines beginning with "." are troff commands -- skip them. */
X    if (lfread && c == TROFF_CHAR) {
X      skipline();
X      continue;
X    }
X    if (c == BACKSLASH) {
X      c = backslash();		/* eat troff stuff */
X      if (c == EOF) break;	/* input ended inside an escape */
X    }
X
X    /* Letters (folded to lower case) and apostrophes form words. */
X    if (isupper(c)) c = tolower(c);
X    if (islower(c) || c == APOSTROPHE) {
X      putchar(c);
X      lfwritten = 0;
X      lfread = 0;
X      continue;
X    }
X
X    /* Anything else separates words; emit at most one EOL per run. */
X    lfread = (c == EOL ? 1 : 0);
X    if (lfwritten) continue;
X    putchar(EOL);
X    lfwritten = 1;
X  }
X
X  /* Terminate the last word if the input did not end with a separator. */
X  if (!lfwritten) putchar(EOL);
X  exit(0);
X}
X
X
Xskipline()
X{
X/* Discard the rest of the current input line.  Also stop at EOF, so that
X * a final troff request without a trailing newline cannot loop forever.
X */
X
X  int c;
X
X  while ( (c = getchar()) != EOL && c != EOF) ;
X}
X
X
Xint backslash()
X{
X/* A backslash has been seen.  Eat troff stuff.  The escape is consumed
X * and the character following it is returned, or EOF if the input ends
X * first.  Unrecognized escapes are taken to be two characters long and
X * yield a blank, so that the escape letter is not mistaken for text.
X */
X
X  int c;
X
X  c = getchar();
X  switch(c) {
X    case 'f':			/* \fB */
X      c = getchar();
X      c = getchar();
X      return(c);
X
X    case 's':			/* \s7 or \s14 */
X      c = getchar();
X      c = getchar();
X      if (isdigit(c)) c = getchar();
X      return(c);
X
X    case 'n':			/* \na or \n(xx */
X    case '*':			/* \*a or \*(xx */
X      c = getchar();
X      if (c == '(') {
X        c = getchar();
X        c = getchar();
X      }
X      c = getchar();
X      return(c);
X
X    case '(':			/* troff 4-character escape sequence */
X      c = getchar();
X      c = getchar();
X      c = getchar();
X      return(c);
X
X    default:
X      /* EOF must reach the caller; a newline must still mark the
X       * start of a line.  Everything else becomes a separator.
X       */
X      if (c == EOF || c == EOL) return(c);
X      return(' ');
X  }
X}
X
Xusage()
X{
X/* Print a usage message on stderr and give up. */
X
X  fprintf(stderr, "Usage: prep [file]\n");
X  exit(1);
X}
X
+ END-OF-FILE prep.c
chmod 'u=rw,g=r,o=r' \p\r\e\p\.\c
echo Extracting \a\s\c\i\i\.\c
sed 's/^X//' > \a\s\c\i\i\.\c << '+ END-OF-FILE '\a\s\c\i\i\.\c
X/* ascii - list lines with/without ASCII chars 	Author: Andy Tanenbaum */
X
X/* Copies to standard output the input lines that consist purely of
X * 7-bit ASCII characters, or, with -n, only the lines that do not.
X * Input is the named file, or standard input if no file is given.
X * A line longer than BUFSIZE is examined and written in BUFSIZE pieces.
X */
X
X#define BUFSIZE 30000
X
Xchar buf[BUFSIZE];		/* input buffer */
Xchar *next;			/* start of line */
Xchar *limit;			/* one past the last char of line */
Xint count;			/* # chars in buffer not yet processed */
Xint used;			/* how many chars used at start of buf */
Xint eof;			/* set when eof seen */
Xint nflag;			/* set if -n option given */
Xint exitstatus;			/* 0 if pure ASCII, 1 if junk seen */
X
Xmain(argc, argv)
Xint argc;
Xchar *argv[];
X{
X  int yes;
X
X  if (argc > 3) usage();
X  /* argv[1] only exists when at least one argument was given. */
X  if (argc > 1 && strcmp(argv[1], "-n") == 0) nflag++;
X  /* Two arguments are only legal in the form "-n file". */
X  if (argc == 3 && nflag == 0) usage();
X
X  if ((argc == 2 && nflag == 0) || argc == 3) {
X    /* Reopen the file as descriptor 0 so load() can read from it. */
X    close(0);
X    if (open(argv[argc-1], 0) < 0) {
X      std_err("ascii: cannot open ");
X      std_err(argv[argc-1]);
X      std_err("\n");
X      exit(1);
X    }
X  }
X
X  while(eof == 0) {
X    yes = getline();
X    /* Without -n print the clean lines, with -n the junk lines. */
X    if (nflag != yes) output();
X    next = limit;
X  }
X  exit(exitstatus);
X}
X
Xint getline()
X{
X/* Find the end of the line starting at 'next', refilling the buffer as
X * needed, and set 'limit' just past it.  Returns 1 if the line is pure
X * ASCII and 0 if a character with the high bit set was seen.  Exits the
X * program when called with no input left.
X */
X
X  char *p, c;
X  int asc = 1;
X  int residual;
X
X  if (count == 0) load();
X  if (eof) exit(exitstatus);
X
X  p = next;
X  while (count > 0) {
X    c = *p++;
X    if (c & 0200) {asc = 0; exitstatus = 1;}
X    count--;
X    if (c == '\n') {
X      limit = p;
X      return(asc);
X    }
X    if (count == 0) {
X      /* The buffer ran out in mid-line.  Only the characters
X       * from 'next' up to 'p' are valid; read() may have
X       * returned fewer than were asked for, so the end of
X       * buf must not be used to compute the length.
X       */
X      residual = (int) (p - next);
X      if (residual == BUFSIZE) {
X        /* No room to read more: hand back this piece. */
X        limit = p;
X        return(asc);
X      }
X
X      /* Move the residual characters to the bottom of buf */
X      copy(next, buf, residual);
X      used = residual;
X      load();
X      used = 0;
X      p = &buf[residual];
X      if (eof) {
X        /* Last line has no newline; it is still a line. */
X        limit = p;
X        return(asc);
X      }
X    }
X  }
X  limit = p;
X  return(asc);
X}
X
Xload()
X{
X/* Refill buf behind the first 'used' characters and reset 'next' to the
X * start of buf.  Sets 'eof' on end of file or read error.
X */
X
X  count = read(0, &buf[used], BUFSIZE-used);
X  if (count <= 0) eof = 1;
X  next = buf;
X}
X
Xoutput()
X{
X/* Write the current line, i.e. the characters from 'next' to 'limit'. */
X
X  write(1, next, limit-next);
X}
X
Xusage()
X{
X/* Print a usage message on stderr and give up. */
X
X  std_err("Usage: ascii [-n] [file]\n");
X  exit(1);
X}
X
Xcopy(s,d,ct)
Xregister char *s, *d;
Xint ct;
X{
X/* Copy ct characters from s to d, lowest address first.  This is safe
X * for overlapping areas as long as d is not above s.
X */
X
X  while (ct--) *d++ = *s++;
X}
X
+ END-OF-FILE ascii.c
chmod 'u=rw,g=r,o=r' \a\s\c\i\i\.\c
exit 0