[net.sources] A spelling corrector

ken@boring.UUCP (08/28/85)
References:
Sender: ken@mcvax.UUCP (Ken Yap)
Reply-To: ken@mcvax.UUCP (Ken Yap)
Followup-To: net.sources.bugs
Distribution: net
Organization: Amoeba Project, CWI, Amsterdam
Keywords: 

#! /bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #! /bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create the files:
#	README
#	correct.1
#	Makefile
#	correct.v0.c
#	word.c
#	word.h
# This archive created: Wed Aug 28 10:28:39 1985
# By:	Ken Yap (Amoeba Project, CWI, Amsterdam)
export PATH; PATH=/bin:$PATH
echo shar: extracting "'README'" '(811 characters)'
if test -f 'README'
then
	echo shar: will not over-write existing file "'README'"
else
cat << \SHAR_EOF > 'README'
This is a simplistic spelling corrector. It takes the list of words on
the command line (or one line of standard input), applies small
perturbations to them and checks the variants against a standard
dictionary (via the spell program). The survivors are then suggested as
corrections for the presumably misspelled word.

For example:

$ correct calender arithmatic
arithmetic
calendar

This idea came from "Computer Programs for Spelling Correction",
Peterson, Springer-Verlag LNCS.

Its deficiencies are noted in the manual page. I am working on a better
version, but would be glad to hear of bug reports or improvements.  I
don't promise to do anything about such reports though.

	Ken

	28th August 1985
	Centrum voor Wiskunde en Informatica,
	Kruislaan 413, 1098 SJ Amsterdam,
	Netherlands.

	ken@mcvax.UUCP
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'correct.1'" '(1609 characters)'
if test -f 'correct.1'
then
	echo shar: will not over-write existing file "'correct.1'"
else
cat << \SHAR_EOF > 'correct.1'
.TH CORRECT 1 "2 August 1985"
.SH NAME
correct, lookup \- spelling corrector
.SH SYNOPSIS
.B correct
[
.B \-D
] [
.B \-S
] [
.B \-f
] [
.B \-s
] [
.B \-d
hlist ]
[ words ]
.PP
.B lookup
[
.B \-f
] [ words ]
.SH DESCRIPTION
.I Correct
takes the presumably misspelled words, applies small perturbations to
them and looks up the perturbations in a hashed dictionary.
If these perturbations are found
they are suggested as corrections for the misspelled word.
If no words are given on the command line,
correct reads one line from the standard input.
.PP
Under the
.B \-f
option, words are folded to lower case before processing.
.PP
Under the 
.B \-s
option, sorting and duplicate filtering are suppressed.
.PP
Under the 
.B \-S
option, server mode is entered.
.I Correct
is run in the background and enquiries are
sent to it by
.I lookup.
This requires the Amoeba (C) transaction library.
.PP
The
.B \-D
option turns on some debugging messages.
.PP
The hashed dictionary used may be specified by
the argument following the
.BR \-d
option.
.SH FILES
/usr/dict/hlist[ab]	hashed correcting lists, American & British, default for
.B \-d
.br
/tmp/correct\(**		temporary file
.br
.SH SEE ALSO
spell(1), spellout(1), deroff(1), sort(1), tee(1), sed(1)
.SH AUTHOR
Ken Yap (Centrum voor Wiskunde en Informatica, Amsterdam)
.SH BUGS
Coverage of words in the dictionary is uneven.
Absence of output may mean that the intended word
was not found rather than that the spelling was correct.
.PP
Long words often have permutations that cause spurious hits
on the dictionary.
Take the output of this program with a grain of salt.
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'Makefile'" '(523 characters)'
if test -f 'Makefile'
then
	echo shar: will not over-write existing file "'Makefile'"
else
cat << \SHAR_EOF > 'Makefile'
#
# Makefile for dictionary server
#
# Ken Yap, June 1985
#

# Sources
# SRC is referenced only by the lint target below.
# NOTE(review): dict.c is not among the files this archive extracts -- confirm
# that it exists before running "make lint".
SRC = correct.v0.c word.c dict.c

# Default dictionary.  The quotes are escaped so that DEFAULT_DICT reaches the
# compiler as a C string literal (correct.v0.c uses it to initialise dictfile).
DICT = \"/usr/dict/hlistb\"

CFLAGS = -O -DDEFAULT_DICT=$(DICT)

# The corrector itself (first target, so it is what a plain "make" builds).
# The executable is written to "correct", not to the target name.
# NOTE(review): since no file called correct.v0 is ever created, make will
# presumably relink on every run -- confirm whether that is intended.
# NOTE(review): getopt.c is not part of this archive; getopt.o has to come
# from elsewhere.
correct.v0:	correct.v0.o getopt.o word.o
		cc -o correct correct.v0.o getopt.o word.o

# Client for server mode.
# NOTE(review): lookup.c and trans.c are not part of this archive.
lookup:		lookup.o getopt.o trans.o
		cc -o lookup lookup.o getopt.o trans.o

# Both objects include word.h
correct.v0.o:	word.h

word.o:		word.h

# Run lint over SRC with the same DEFAULT_DICT definition the compiler gets
lint:
		lint -DDEFAULT_DICT=$(DICT) $(SRC)

# Run make in the background under nohup after removing an old nohup.out
quietly:
		@rm -f nohup.out
		sh -ce 'nohup make &'

# Archive the sources, headers, manual page and Makefile in ../correct.tar
backup:
		tar cf ../correct.tar *.c *.h *.1 Makefile
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'correct.v0.c'" '(4367 characters)'
if test -f 'correct.v0.c'
then
	echo shar: will not over-write existing file "'correct.v0.c'"
else
cat << \SHAR_EOF > 'correct.v0.c'
/*
**	(C) Centrum voor Wiskunde en Informatica, 1985
**
**	This software may be freely distributed and used, save
**	for profit or military purposes, provided always this notice
**	is retained.
**
**	No warranty is made on the suitability of this software
**	for any purpose whatsoever.
**
**	Last modified:
**
**	Ken Yap (CWI) August 1985
*/

/*
**	A program to generate alternate spellings from a misspelled word
**	and return those that are in the dictionary.
**
**	Ken Yap, CWI, July 1985
*/

#include	<sys/types.h>
#include	<sys/file.h>
#include	<ctype.h>
#include	<stdio.h>
#include	<signal.h>

#ifdef	AMOEBA
#include	"amoeba.h"
#endif	AMOEBA

#include	"word.h"

static char	*dictfile	= DEFAULT_DICT;
static int	server		= 0;
static int	debug		= 0;
static int	fold		= 0;
static int	sortuniq	= 1;
static char	ibuf[1024], buf[10240];

#ifdef	AMOEBA
header		hdr;
#endif	AMOEBA

/*
**	Abort the program when a call has reported failure.
**
**	cc	status returned by the call being checked; any negative
**		value counts as a failure
**	msg	prefix handed to perror() for the diagnostic
**
**	Comes back to the caller only when cc is zero or positive;
**	otherwise the diagnostic is printed and the program exits
**	with status 1.
*/
chkerror(cc, msg)
	int	cc;
	char	*msg;
{
	int	failed;

	failed = (cc < 0);
	if (failed)
	{
		perror(msg);
		exit(1);
	}
}

/*
**	Installed by main() as the SIGTERM handler: ends the program
**	with a zero exit status.
*/
cleanup()
{
	int	status = 0;

	exit(status);
}

/*
**	Generate every candidate spelling of one word.
**
**	word	the word to vary; a word containing anything but
**		letters is rejected
**	buf	receives the candidates, each followed by a newline
**	len	size of buf in bytes
**
**	The operations DEL1CHAR through ADD1CHAR of transform() are
**	applied in turn.  Generation stops early, keeping what has been
**	produced so far, as soon as buf can no longer be guaranteed to
**	hold another candidate.
**
**	Returns the number of bytes stored in buf (a NUL follows them
**	but is not counted), or 0 if the word was rejected or buf is
**	too small for even one candidate.
*/
int altgen(word, buf, len)
	char		*word, *buf;
	int		len;
{
	register int	op, room;
	register char	*p;

	/* the cast keeps negative char values away from isalpha() */
	for (p = word; *p != '\0'; p++)
		if (!isalpha((unsigned char)*p))
			return (0);
	/*
	**	Worst case per candidate: the word with one letter added,
	**	its newline, and the NUL that closes the list.  A fixed
	**	margin would be too small for long words.
	*/
	room = (int)(p - word) + 3;
	if (len < room)
		return (0);
	p = buf;
	for (op = DEL1CHAR; op <= ADD1CHAR; op++)
	{
		transform(word, INIT, p);
		while (transform(word, op, p))
		{
			p += strlen(p);
			*p++ = '\n';	/* overwrites the candidate's NUL */
			if (p - buf > len - room)
			{
				*p = '\0';
				return ((int)(p - buf));
			}
		}
	}
	*p = '\0';
	return ((int)(p - buf));
}

/*
**	Copy the next white-space delimited word out of buf.
**
**	buf	NUL-terminated text to scan
**	word	receives the word, always NUL-terminated when wlen > 0
**	wlen	size of word in bytes, terminator included
**
**	Leading white space is skipped.  At most wlen - 1 characters
**	are copied; if the word is longer, copying stops there and the
**	remainder is left in buf, so the next call picks it up as a
**	word of its own.  An empty word means the end of buf was
**	reached.
**
**	Returns the position in buf just after the characters consumed.
*/
char *getword(buf, word, wlen)
	char		*buf, *word;
	int		wlen;
{
	/* isspace('\0') is false, so this cannot run past the end */
	while (isspace((unsigned char)*buf))
		buf++;
	if (wlen <= 0)		/* no room even for the terminator */
		return (buf);
	while (*buf != '\0' && !isspace((unsigned char)*buf))
	{
		if (--wlen <= 0)	/* last byte is kept for the NUL */
			break;
		*word++ = *buf++;
	}
	*word = '\0';
	return (buf);
}

/*
**	Look up the correction candidates for several words.
**
**	words		NUL-terminated list of words separated by white
**			space
**	alternates	caller's buffer; used as scratch space while the
**			candidates are generated and, on return, holding
**			the output of the dictionary check
**	altlen		size of alternates in bytes
**
**	The candidates altgen() produces for each word are collected in
**	a temporary file, which is fed to "spellout -d <dictfile>"
**	(piped through "sort -u" unless sortuniq is off).  Whatever
**	that command prints is copied into alternates.
**
**	Returns the number of bytes stored in alternates (at most
**	altlen, not NUL-terminated), or -1 if the temporary file could
**	not be written or the command could not be built or started.
**	Failure to create the temporary file ends the program.
*/
int lookup(words, alternates, altlen)
	char		*words, *alternates;
	int		altlen;
{
	register int	l, ch, fd;
	register char	*p;
	register FILE	*tempf, *cmdpipe;
	char		word[64];
	/* a writable array: the template is modified in place */
	char		tempfile[] = "/tmp/correctXXXXXX";
	char		cmd[1024];
	int		mkstemp(), close(), unlink(), pclose();
	char		*getword();
	FILE		*fdopen(), *popen();

	/* mkstemp() creates and opens the file in one step */
	fd = mkstemp(tempfile);
	chkerror(fd, tempfile);
	if ((tempf = fdopen(fd, "w")) == NULL)
	{
		perror(tempfile);
		close(fd);
		unlink(tempfile);
		exit(1);
	}
	/*
	**	An empty word marks the end of the list, so the last word
	**	is handled even when no white space follows it.
	*/
	p = words;
	for (;;)
	{
		p = getword(p, word, (int)sizeof(word));
		if (word[0] == '\0')
			break;
		if (debug) printf("<%s>\n", word);
		l = altgen(word, alternates, altlen);
		if (l > 0)
			fwrite(alternates, sizeof(char), l, tempf);
	}
	l = ferror(tempf);
	if (fclose(tempf) == EOF || l)
	{
		perror(tempfile);
		unlink(tempfile);
		return (-1);
	}
	/* dictfile may be any length (-d option): never overrun cmd */
	l = snprintf(cmd, sizeof(cmd), "spellout -d %s < %s %s", dictfile, tempfile,
		sortuniq ? "| sort -u" : "");
	if (l < 0 || l >= (int)sizeof(cmd)
	    || (cmdpipe = popen(cmd, "r")) == NULL)
	{
		unlink(tempfile);
		return (-1);
	}
	/* test for room before storing, so altlen is never exceeded */
	p = alternates;
	while (p - alternates < altlen && (ch = getc(cmdpipe)) != EOF)
		*p++ = ch;
	pclose(cmdpipe);
	unlink(tempfile);
	return ((int)(p - alternates));
}

#ifdef	AMOEBA
/*
**	Server mode: answer lookup requests for ever.
**
**	Registers under the port name "bodict", then for each request
**	received in ibuf runs lookup() on it and sends back the contents
**	of buf as the reply.  Failures of getreq() and putrep() are
**	reported with perror() and the loop carries on; only a failing
**	amoeba_init() ends the program.
**
**	NOTE(review): getreq() is assumed to return the number of bytes
**	it stored (at most the size it was given) -- confirm against
**	the Amoeba transaction library.
*/
dictserver()
{
	register int	n;
	int		amoeba_init(), getreq(), putrep(), lookup();

	/* the port is a fixed-size field, so strncpy()'s padding is wanted */
	strncpy((char *)&hdr.h_port, "bodict", PORTSIZE);
	chkerror(amoeba_init(&hdr.h_port), "init");
	for (;;)
	{
		/* one byte of ibuf is held back for the terminating NUL */
		if ((n = getreq(&hdr, ibuf, (int)sizeof(ibuf) - 1)) < 0)
		{
			perror("getreq");
			continue;
		}
		ibuf[n] = '\0';
		n = lookup(ibuf, buf, (int)sizeof(buf));
		/* a failed lookup is answered with an empty reply */
		if (n < 0)
			n = 0;
		if (putrep(&hdr, buf, n) < 0)
			perror("putrep");
	}
}
#endif	AMOEBA

/*
**	Fold a NUL-terminated string to lower case in place.
**
**	p	the string to convert
**
**	Characters are passed to tolower() as unsigned char values,
**	since a negative char is outside the range that the <ctype.h>
**	functions accept.  Nothing is returned.
*/
void lower(p)
	char		*p;
{
	for ( ; *p != '\0'; p++)
		*p = tolower((unsigned char)*p);
}

/*
**	Command line entry point.
**
**	Options: -D debug output, -S server mode, -s no sorting and
**	duplicate removal, -d file to name the hashed dictionary,
**	-f fold the words to lower case.
**
**	The words to correct are taken from the remaining arguments or,
**	when there are none, from one line of standard input.  They are
**	collected in ibuf separated by blanks, handed to lookup(), and
**	the result is written to standard output.
**
**	Without AMOEBA defined the -S flag is accepted but has no
**	effect.  Exits with status 1 on a usage error, when the words
**	do not fit in ibuf, or when lookup() or the final write fails.
*/
int main(argc, argv)
	int		argc;
	char		*argv[];
{
	register int 	i;		/* option letter, later a byte count */
	register char	*words;		/* next free byte in ibuf */
	char		*progname;
	extern int	optind;		/* defined in getopt */
	extern char	*optarg;	/* defined in getopt */
	int		getopt();

	progname = argv[0];
	while ((i = getopt (argc, argv, "DSd:fs")) != EOF)
	{
		switch (i)
		{
		case 'D':	debug++; break;
		case 'S':	server++; break;
		case 's':	sortuniq = 0; break;
		case 'd':	dictfile = optarg; break;
		case 'f':	fold++; break;
		default:
				fprintf (stderr, "usage: %s [-DSfs] [-d dictfile] [words]\n", argv[0]);
				exit (1);
		}
	}
	signal(SIGTERM, cleanup);
#ifdef	AMOEBA
	if (server)
		dictserver();
	else
#endif	/* AMOEBA */
	{
		words = ibuf;
		for (argc -= optind, argv += optind; argc > 0; argc--, argv++)
		{
			i = strlen(*argv);
			/* room for the word, its blank and the final NUL */
			if (i + 2 > (int)sizeof(ibuf) - (int)(words - ibuf))
			{
				fprintf (stderr, "%s: word list too long\n", progname);
				exit (1);
			}
			strcpy(words, *argv);
			words += i;
			*words++ = ' ';
		}
		if (words == ibuf)
		{
			/* no words given: read one line of standard input */
			if (fgets(ibuf, sizeof(ibuf), stdin) == NULL)
				ibuf[0] = '\0';
		}
		else
			*words = '\0';	/* words already points at the end */
		if (fold) lower(ibuf);
		chkerror((i = lookup(ibuf, buf, (int)sizeof(buf))), "pipe");
		chkerror(write(1, buf, i), "write");
	}
	return (0);
}
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'word.c'" '(2399 characters)'
if test -f 'word.c'
then
	echo shar: will not over-write existing file "'word.c'"
else
cat << \SHAR_EOF > 'word.c'
/*
**	(C) Centrum voor Wiskunde en Informatica, 1985
**
**	This software may be freely distributed and used, save
**	for profit or military purposes, provided always this notice
**	is retained.
**
**	No warranty is made on the suitability of this software
**	for any purpose whatsoever.
**
**	Last modified:
**
**	Ken Yap (CWI) August 1985
*/

#include	<ctype.h>
#include	"word.h"

/*
**	First letter of the alphabet in the same case as c: 'A' when c
**	is an upper case letter, 'a' otherwise.
*/
static int startlet(c)
	int		c;
{
	return (isupper((unsigned char)c) ? 'A' : 'a');
}

/*
**	Produce, one per call, the variants of a word that a single
**	editing operation yields.
**
**	word	the word being varied; must be the same string on every
**		call of one sequence
**	op	INIT to start a sequence, otherwise the operation:
**		DEL1CHAR	drop one letter
**		SWAP2CHARS	exchange two neighbouring letters (pairs
**				of equal letters are skipped)
**		CHG1CHAR	replace one letter by each letter of the
**				alphabet in turn; this includes the
**				letter already there, so the word itself
**				is among the variants
**		ADD1CHAR	insert each letter of the alphabet in
**				turn at each position, the end included
**	result	receives the NUL-terminated variant; needs room for
**		strlen(word) + 2 bytes
**
**	For CHG1CHAR and ADD1CHAR the letters tried have the case of the
**	character found in word at the position being worked on.
**
**	Returns 1 after storing a variant (INIT and unknown operations
**	store an empty string), 0 when the sequence for op is exhausted;
**	result is then left untouched.
**
**	Progress is kept in a static context: only one sequence can be
**	in progress at a time, and INIT must be issued before each new
**	word or operation.
*/
int transform(word, op, result)
	char		*word, *result;
	int		op;
{
	register int	i;
	static struct {
		int		len, pos;
		char		let;
		} context;

	switch (op)
	{
	case INIT:
		context.len = strlen(word);
		context.pos = 0;
		context.let = startlet(*word);
		break;
	case DEL1CHAR:
		if (context.pos >= context.len)
			return (0);
		for (i = 0; i < context.pos; i++)
			*result++ = word[i];
		for (i = context.pos + 1; i < context.len; i++)
			*result++ = word[i];
		context.pos++;
		break;
	case SWAP2CHARS:
		/*
		**	Swapping two equal letters gives the word back.
		**	Such pairs are skipped before anything is stored,
		**	so the prefix is copied into result only once.
		*/
		while (context.pos < context.len - 1
		    && word[context.pos] == word[context.pos + 1])
			context.pos++;
		if (context.pos >= context.len - 1)
			return (0);
		for (i = 0; i < context.pos; i++)
			*result++ = word[i];
		*result++ = word[i+1];
		*result++ = word[i];
		for (i = context.pos + 2; i < context.len; i++)
			*result++ = word[i];
		context.pos++;
		break;
	case CHG1CHAR:
		if (context.pos >= context.len)
			return (0);
		for (i = 0; i < context.pos; i++)
			*result++ = word[i];
		*result++ = context.let;
		for (i = context.pos + 1; i < context.len; i++)
			*result++ = word[i];
		if (context.let == 'Z' || context.let == 'z')
		{
			/* alphabet done here: on to the next position */
			context.pos++;
			context.let = startlet(word[context.pos]);
		}
		else
			context.let++;
		break;
	case ADD1CHAR:
		if (context.pos > context.len)
			return (0);
		for (i = 0; i < context.pos; i++)
			*result++ = word[i];
		*result++ = context.let;
		for (i = context.pos; i < context.len; i++)
			*result++ = word[i];
		if (context.let == 'Z' || context.let == 'z')
		{
			context.pos++;
			/* past the end there is no character to look at */
			if (context.pos <= context.len)
				context.let = startlet(word[context.pos]);
		}
		else
			context.let++;
		break;
	default:
		;
	}
	*result = '\0';
	return (1);
}

#ifdef	TEST
/*
**	Test driver, compiled only when TEST is defined.
**
**	Prints the variants transform() produces for the first argument
**	under each operation from DEL1CHAR up to SWAP2CHARS, every group
**	preceded by a heading giving the operation number.  Exits with
**	status 1 when no argument is given, otherwise with status 0.
*/
main(argc, argv)
	int		argc;
	char		*argv[];
{
	char		variant[5120];
	char		*subject;
	register int	kind;

	if (argc < 2)
		exit(1);
	subject = argv[1];
	kind = DEL1CHAR;
	while (kind <= SWAP2CHARS)
	{
		printf("Transformation #%d\n", kind);
		/* restart the sequence before each operation */
		transform(subject, INIT, variant);
		for (;;)
		{
			if (!transform(subject, kind, variant))
				break;
			printf("%s\n", variant);
		}
		kind++;
	}
	exit(0);
}
#endif	TEST
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'word.h'" '(94 characters)'
if test -f 'word.h'
then
	echo shar: will not over-write existing file "'word.h'"
else
cat << \SHAR_EOF > 'word.h'
/*
**	Operation codes for the op argument of transform() in word.c.
**	altgen() in correct.v0.c loops from DEL1CHAR to ADD1CHAR, so the
**	four editing operations must keep consecutive values.
*/
/* start a new sequence of variants for a word */
#define	INIT		0
/* delete one character */
#define	DEL1CHAR	1
/* exchange two neighbouring characters */
#define	SWAP2CHARS	2
/* replace one character by another letter */
#define	CHG1CHAR	3
/* insert one extra letter */
#define	ADD1CHAR	4
SHAR_EOF
fi # end of overwriting check
#	End of shell archive
exit 0
-- 
UUCP: ..!{seismo,okstate,garfield,decvax,philabs}!mcvax!ken Voice: Ken!
Mail: Centrum voor Wiskunde en Informatica, Kruislaan 413, 1098 SJ, Amsterdam.