ken@boring.UUCP (08/28/85)
References: Sender: ken@mcvax.UUCP (Ken Yap) Reply-To: ken@mcvax.UUCP (Ken Yap) Followup-To: net.sources.bugs Distribution: net Organization: Amoeba Project, CWI, Amsterdam Keywords: #! /bin/sh # This is a shell archive, meaning: # 1. Remove everything above the #! /bin/sh line. # 2. Save the resulting text in a file. # 3. Execute the file with /bin/sh (not csh) to create the files: # README # correct.1 # Makefile # correct.v0.c # word.c # word.h # This archive created: Wed Aug 28 10:28:39 1985 # By: Ken Yap (Amoeba Project, CWI, Amsterdam) export PATH; PATH=/bin:$PATH echo shar: extracting "'README'" '(811 characters)' if test -f 'README' then echo shar: will not over-write existing file "'README'" else cat << \SHAR_EOF > 'README' This is a simplistic spelling corrector. It takes the list of words on the command line (or one line of standard input), applies small perturbations to them and checks the variants against a standard dictionary (via the spell program). The survivors are then suggested as corrections for the presumably mispelled word. For example: $ correct calender arithmatic arithmetic calendar This idea came from "Computer Programs for Spelling Correction", Peterson, Springer-Verlag LNCS. Its deficiencies are noted in the manual page. I am working on a better version, but would be glad to hear of bug reports or improvements. I don't promise to do anything about such reports though. Ken 28th August 1985 Centrum voor Wiskunde en Informatica, Kruislaan 413, 1098 SJ Amsterdam, Netherlands. ken@mcvax.UUCP SHAR_EOF fi # end of overwriting check echo shar: extracting "'correct.1'" '(1609 characters)' if test -f 'correct.1' then echo shar: will not over-write existing file "'correct.1'" else cat << \SHAR_EOF > 'correct.1' .TH CORRECT 1 "2 August 1985" .SH NAME correct, lookup \- spelling corrector .SH SYNOPSIS .B correct [ .B \-D ] [ .B \-S ] [ .B \-f ] [ .B \-s ] [ .B \-d hlist ] [ words ] .PP .B lookup [ .B \-f ] [ words ] .SH DESCRIPTION .I Correct takes the presumably mispelled words, applies small perturbations to them and looks up the perturbations in a hashed dictionary. If these perturbations are found they are suggested as corrections for the mispelled word. If no words are given on the command line, correct reads one line from the standard input. .PP Under the .B \-f option, words are folded to lower case before processing. .PP Under the .B \-s option, sorting and duplicate filtering are supressed. .PP Under the .B \-S option, server mode is entered. .I Correct is run in the background and enquiries are sent to it by .I lookup. This requires the Amoeba (C) transaction library. .PP The .B \-D option turns on some debugging messages. .PP The hashed dictionary used may be specified by the argument following the .BR \-d option. .SH FILES /usr/dict/hlist[ab] hashed correcting lists, American & British, default for .B \-d .br /tmp/correct\(** temporary file .br .SH SEE ALSO spell(1), spellout(1), deroff(1), sort(1), tee(1), sed(1) .SH AUTHOR Ken Yap (Centrum voor Wiskunde en Informatica, Amsterdam) .SH BUGS Coverage of words in the dictionary is uneven. Absence of output may mean that the intended word was not found rather than that the spelling was correct. .PP Long words often have permutations that cause spurious hits on the dictionary. Take the output of this program with a grain of salt. SHAR_EOF fi # end of overwriting check echo shar: extracting "'Makefile'" '(523 characters)' if test -f 'Makefile' then echo shar: will not over-write existing file "'Makefile'" else cat << \SHAR_EOF > 'Makefile' # # Makefile for dictionary server # # Ken Yap, June 1985 # # Sources SRC = correct.v0.c word.c dict.c DICT = \"/usr/dict/hlistb\" CFLAGS = -O -DDEFAULT_DICT=$(DICT) correct.v0: correct.v0.o getopt.o word.o cc -o correct correct.v0.o getopt.o word.o lookup: lookup.o getopt.o trans.o cc -o lookup lookup.o getopt.o trans.o correct.v0.o: word.h word.o: word.h lint: lint -DDEFAULT_DICT=$(DICT) $(SRC) quietly: @rm -f nohup.out sh -ce 'nohup make &' backup: tar cf ../correct.tar *.c *.h *.1 Makefile SHAR_EOF fi # end of overwriting check echo shar: extracting "'correct.v0.c'" '(4367 characters)' if test -f 'correct.v0.c' then echo shar: will not over-write existing file "'correct.v0.c'" else cat << \SHAR_EOF > 'correct.v0.c' /* ** (C) Centrum voor Wiskunde en Informatica, 1985 ** ** This software may be freely distributed and used, save ** for profit or military purposes, provided always this notice ** is retained. ** ** No warranty is made on the suitability of this software ** for any purpose whatsoever. ** ** Last modified: ** ** Ken Yap (CWI) August 1985 */ /* ** A program to generate alternate spellings from a mispelled word ** and return those that are in the dictionary. ** ** Ken Yap, CWI, July 1985 */ #include <sys/types.h> #include <sys/file.h> #include <ctype.h> #include <stdio.h> #include <signal.h> #ifdef AMOEBA #include "amoeba.h" #endif AMOEBA #include "word.h" static char *dictfile = DEFAULT_DICT; static int server = 0; static int debug = 0; static int fold = 0; static int sortuniq = 1; static char ibuf[1024], buf[10240]; #ifdef AMOEBA header hdr; #endif AMOEBA /* ** Print message and exit on error */ chkerror(cc, msg) int cc; char *msg; { if (cc < 0) { perror(msg); exit(1); } } cleanup() { exit(0); } /* ** Generate one word's permutations ** Reject words containing non-alphabetics */ int altgen(word, buf, len) char *word, *buf; int len; { register int op; register char *p; for (p = word; *p != '\0'; p++) if (!isalpha(*p)) return (0); p = buf; for (op = DEL1CHAR; op <= ADD1CHAR; op++) { transform(word, INIT, p); while (transform(word, op, p)) { p += strlen(p); *p++ = '\n'; if (p - buf > len - 20) return (p - buf); } } *p = '\0'; return (p - buf); } /* ** Pick up one word from buf, returning updated position in buf */ char *getword(buf, word, wlen) char *buf, *word; int wlen; { while (isspace(*buf) && *buf != '\0') buf++; while (!isspace(*buf) && *buf != '\0') { if (wlen-- <= 0) break; *word++ = *buf++; } *word = '\0'; return (buf); } /* ** Lookup several words */ int lookup(words, alternates, altlen) char *words, *alternates; int altlen; { register int l, ch; register char *p, *tempfile; register FILE *tempf, *cmdpipe; char word[64]; int dup2(); char *getword(), *mktemp(); FILE *fopen(), *popen(); tempfile = mktemp("/tmp/correctXXXXXX"); if ((tempf = fopen(tempfile, "w")) == NULL) chkerror(-1, tempfile); p = words; while (*(p = getword(p, word, sizeof(word))) != '\0') { if (debug) printf("<%s>\n", word); l = altgen(word, alternates, altlen); fwrite(alternates, sizeof(char), l, tempf); } fclose(tempf); sprintf(word, "spellout -d %s < %s %s", dictfile, tempfile, sortuniq ? "| sort -u" : ""); if ((cmdpipe = popen(word, "r")) == NULL) return (-1); p = alternates; while ((ch = getc(cmdpipe)) != EOF) { *p++ = ch; if (p - alternates > altlen) break; } pclose(cmdpipe); unlink(tempfile); return (p - alternates); } #ifdef AMOEBA dictserver() { register int n; int amoeba_init(), getreq(), putrep(), lookup(); strncpy((char *)&hdr.h_port, "bodict", PORTSIZE); chkerror(amoeba_init(&hdr.h_port), "init"); for (;;) { do { if ((n = getreq(&hdr, ibuf, sizeof(ibuf))) < 0) { perror("getreq"); continue; } ibuf[n] = '\0'; n = lookup(ibuf, buf, sizeof(buf)); if (putrep(&hdr, buf, n) < 0) perror("putrep"); } while (n > 0); } } #endif AMOEBA lower(p) char *p; { for ( ; *p != '\0'; p++) if (isupper(*p)) *p = tolower(*p); } main(argc, argv) int argc; char *argv[]; { register int i; /* the option flag name */ register char *words; extern int optind; /* defined in getopt */ extern char *optarg; /* defined in getopt */ int getopt(); while ((i = getopt (argc, argv, "DSd:fs")) != EOF) { switch (i) { case 'D': debug++; break; case 'S': server++; break; case 's': sortuniq = 0; break; case 'd': dictfile = optarg; break; case 'f': fold++; break; default: fprintf (stderr, "usage: %s [-DSfs] [-d dictfile] [words]\n", argv[0]); exit (1); } } signal(SIGTERM, cleanup); #ifdef AMOEBA if (server) dictserver(); else #endif AMOEBA { words = ibuf; for (argc -= optind, argv += optind; argc > 0; argc--, argv++) { strcpy(words, *argv); words += strlen(words); *words++ = ' '; } i = (words == ibuf) ? (fgets(ibuf, sizeof(ibuf), stdin), strlen(words)) : words - ibuf; words[i] = '\0'; if (fold) lower(ibuf); chkerror((i = lookup(ibuf, buf, sizeof(buf))), "pipe"); write(1, buf, i); } } SHAR_EOF fi # end of overwriting check echo shar: extracting "'word.c'" '(2399 characters)' if test -f 'word.c' then echo shar: will not over-write existing file "'word.c'" else cat << \SHAR_EOF > 'word.c' /* ** (C) Centrum voor Wiskunde en Informatica, 1985 ** ** This software may be freely distributed and used, save ** for profit or military purposes, provided always this notice ** is retained. ** ** No warranty is made on the suitability of this software ** for any purpose whatsoever. ** ** Last modified: ** ** Ken Yap (CWI) August 1985 */ #include <ctype.h> #include "word.h" int transform(word, op, result) char *word, *result; int op; { register int i; static struct { int len, pos; char let; } context; switch (op) { case INIT: context.len = strlen(word); context.pos = 0; context.let = isupper(*word) ? 'A' : 'a'; break; case DEL1CHAR: if (context.pos >= context.len) return (0); for (i = 0; i < context.pos; i++) *result++ = word[i]; for (i = context.pos + 1; i < context.len; i++) *result++ = word[i]; context.pos++; break; case SWAP2CHARS: nextpos: if (context.pos >= context.len - 1) return (0); for (i = 0; i < context.pos; i++) *result++ = word[i]; if (word[i] == word[i+1]) { context.pos++; goto nextpos; } *result++ = word[i+1]; *result++ = word[i]; for (i = context.pos + 2; i < context.len; i++) *result++ = word[i]; context.pos++; break; case CHG1CHAR: if (context.pos >= context.len) return (0); for (i = 0; i < context.pos; i++) *result++ = word[i]; *result++ = context.let; for (i = context.pos + 1; i < context.len; i++) *result++ = word[i]; if (context.let == 'Z' || context.let == 'z') { context.pos++; context.let = isupper(word[context.pos]) ? 'A' : 'a'; } else context.let++; break; case ADD1CHAR: if (context.pos > context.len) return (0); for (i = 0; i < context.pos; i++) *result++ = word[i]; *result++ = context.let; for (i = context.pos; i < context.len; i++) *result++ = word[i]; if (context.let == 'Z' || context.let == 'z') { context.pos++; context.let = isupper(word[context.pos]) ? 'A' : 'a'; } else context.let++; break; default: ; } *result = '\0'; return (1); } #ifdef TEST main(argc, argv) int argc; char *argv[]; { register int op; char buf[5120]; if (argc <= 1) exit(1); for (op = DEL1CHAR; op <= SWAP2CHARS; op++) { printf("Transformation #%d\n", op); transform(argv[1], INIT, buf); while (transform(argv[1], op, buf)) printf("%s\n", buf); } exit(0); } #endif TEST SHAR_EOF fi # end of overwriting check echo shar: extracting "'word.h'" '(94 characters)' if test -f 'word.h' then echo shar: will not over-write existing file "'word.h'" else cat << \SHAR_EOF > 'word.h' #define INIT 0 #define DEL1CHAR 1 #define SWAP2CHARS 2 #define CHG1CHAR 3 #define ADD1CHAR 4 SHAR_EOF fi # end of overwriting check # End of shell archive exit 0 -- UUCP: ..!{seismo,okstate,garfield,decvax,philabs}!mcvax!ken Voice: Ken! Mail: Centrum voor Wiskunde en Informatica, Kruislaan 413, 1098 SJ, Amsterdam.