[alt.sources] Spelling utilities; crossword-helper & typo fixer.

gtoal@tharr.UUCP (Graham Toal) (06/29/90)

Archive-Name: dawgutils/update1.shar

To people who received the last post and got a bit annoyed at the
lack of a makefile, I apologise.  It was my first net posting.  I'll
know better next time.  Meanwhile here's an emergency fix;
cc dawg.c; mv a.out dawg
cc pdawg.c; mv a.out pdawg
cc pack.c; mv a.out pack
cc ppack.c; mv a.out ppack
cc dwgcheck.c; mv a.out dwgcheck
cc pckcheck.c; mv a.out pckcheck
cc tell.c; mv a.out tell
cc proot.c; mv a.out proot
dawg /usr/dict/words dict
pack dict.dwg dict.pck
dwgcheck this and that wroang wurd and another wurd
pckcheck this and that wroang wurd and another wurd
proot meta
tell stoopid speled wurds

Hope that helps a bit.  Now for todays additions:
cc typo.c; mv a.out typo
cc cross.c; mv a.out cross
typo spel
cross p\?zz\?e

#!/bin/sh-----cut here-----cut here-----cut here-----cut here-----
# shar:	Shell Archiver
#	Run the following text with /bin/sh to create:
#	README #	cross.c #	typo.c 
cat - << \SHAR_EOF > README
This is probably rather early for a follow up to my posting on
spelling checker utilities, but a couple of people have asked for
these...

(These programs need dawgutils/* as posted a couple of days ago)

typo.c
   A sample program showing how to use the dawg structure to efficiently
correct typos.

cross.c
   A 'crossword puzzle' solver - effectively the same as grepping the
word list allowing '?' as a single wild char, eg   cross p?zz?e
would return 'puzzle'.

I've also had some interest in my throwaway comments on a replacement
for Soundex; enough to convince me to document it properly and post it
here (a week away at the earliest).  Meanwhile, may I ask again:
If anyone has a dictionary of words and their phonetic representations
could they get in touch please?  Many thanks if you can.

Share & Enjoy,

Graham Toal (grA@m tOl) <gtoal%uk.ac.ed@nsfnet-relay.ac.uk>
SHAR_EOF
cat - << \SHAR_EOF > cross.c
/* On brain-dead PC's, with MICROSOFT, link with /ST:30000 */
/*

    File:          cross.c
    Author:        Graham Toal
    Purpose:       match words with single-char wildcards (cr?ssw?rd p?zzl?s)
    Creation date: 28/06/90 14:01:34
    Lastedit:      28/06/90 14:05:45

    Description:

    (The nice thing about the dawg structure is that it makes utilities
     like this easy to write; none of these small programs has taken
     more than about an hour. This one took five minutes ;-))

*/


/* Manadatory header files */
#include <stdio.h>
#include "dawg.h"
#include "grope.h"
#include "utils.c"

/* Headers here as needed on per-program basis */

/* Spelling library utilities */
#include "init.c"      /* Loading dicts */

#ifdef SYS_MAC
/* To compile with THINK C 4.0, place all the relevant .h and .c
   files in a folder.  Then create a project which contains this main.c
   and the libraries unix and ANSI.
*/
#include <unix.h>
#include <stdlib.h>
#include <console.h>
#endif

/* This one just gets wrong letters */
int
#ifdef PROTOTYPES
fix_cross(
  NODE PCCRAP *dawg, INDEX i,
  char *word, char *res,
  int len, int *found)
#else
fix_cross(dawg, i, word, res, len, found)
NODE PCCRAP *dawg;
INDEX i;
char *word;
char *res;
int len;
int *found;
#endif
{
int endsword, last, ch, target;
NODE node;
INDEX link;

  for (;;) {
    node = dawg[i++];
    ch = (int)((node >> V_LETTER) & M_LETTER);
    last = ((node & (INDEX)M_END_OF_NODE) != 0);
    endsword = ((node & M_END_OF_WORD) != 0);
    link = node & M_NODE_POINTER;

    res[len] = ch; res[len+1] = '\0';
    target = ((int)*word)&255;
    if (ch != 0) {
    if (ch == target || target == '?') {
      if (endsword && *(word+1) == '\0') {
        fprintf(stdout, "word: %s\n", res); (*found)++;
      }
      if (*(word+1) != '\0' && link != 0)
        (void) fix_cross(dawg, link, word+1, res, len+1, found);
    }
    }
    if (last) break;
  }
  return(0==0);
}

int
#ifdef PROTOTYPES
crossword(NODE PCCRAP *dawg, char *word)
#else
crossword(dawg, word)
NODE PCCRAP *dawg;
char *word;
#endif
{
char result[MAX_WORD_LEN];
int i = 0;
  (void)fix_cross(dawg, (INDEX)ROOT_NODE, word, result, 0, &i);
  return(i);
}

int
#ifdef PROTOTYPES
main(int argc, char **argv)
#else
main(argc, argv)
int argc;
char **argv;
#endif
{
NODE PCCRAP *dawg;
INDEX edges;
int each;

#ifdef SYS_MAC
  argc = ccommand(&argv);
#endif

  /* Your program goes here... */
  if (argc == 1) {
    fprintf(stderr, "usage: %s mispeled wurdz\n", argv[0]);
    exit(EXIT_ERROR);
  }
  if (!dawg_init("", &dawg, &edges)) exit(EXIT_ERROR);
  for (each = 1; each < argc; each++) {
    fprintf(stderr, "* Matches:\n");
    if (!crossword(dawg, argv[each])) {
      fprintf(stderr, "(none found)\n");
    }
    if (each+1 != argc) fprintf(stderr, "\n");
  }

  exit(EXIT_OK);
}
SHAR_EOF
cat - << \SHAR_EOF > typo.c
/* On brain-dead PC's, with MICROSOFT, link with /ST:30000 */
/*

    File:          typo.c
    Author:        Graham Toal
    Purpose:       offer correct spelling
    Creation date: 27/06/90
    Lastedit:      28/06/90 13:27:03

    Description:

       Like the unix 'spelltell' command but only fixes typos
    rather than soundslike errors.

    See my 'tell' program if you want soundex, or wait a few weeks
    for my new proper phonetic algorithm.

    It is a design decision that some of the functions will return the word
    presented if it is in fact correct.  You can call this a bug if you
    prefer.

    (The nice thing about the dawg structure is that it makes utilities
     like this easy to write; none of these small programs has taken
     more than about an hour.)

*/


/* Manadatory header files */
#include <stdio.h>
#include "dawg.h"
#include "grope.h"
#include "utils.c"

/* Headers here as needed on per-program basis */

/* Spelling library utilities */
#include "init.c"      /* Loading dicts */
#include "check.c"     /* Simple word-check */

#ifdef SYS_MAC
/* To compile with THINK C 4.0, place all the relevant .h and .c
   files in a folder.  Then create a project which contains this main.c
   and the libraries unix and ANSI.
*/
#include <unix.h>
#include <stdlib.h>
#include <console.h>
#endif

/* This one just gets wrong letters */
int
#ifdef PROTOTYPES
fix_typos(
  NODE PCCRAP *dawg, INDEX i,
  char *word, char *res,
  int len, int errs_allowed, int *found)
#else
fix_typos(dawg, i, word, res, len, errs_allowed, found)
NODE PCCRAP *dawg;
INDEX i;
char *word;
char *res;
int len;
int errs_allowed;
int *found;
#endif
{
int endsword, last, ch;
NODE node;
INDEX link;

  for (;;) {
    node = dawg[i++];
    ch = (int)((node >> V_LETTER) & M_LETTER);
    last = ((node & (INDEX)M_END_OF_NODE) != 0);
    endsword = ((node & M_END_OF_WORD) != 0);
    link = node & M_NODE_POINTER;

    res[len] = ch; res[len+1] = '\0';

    if (ch != 0) {
    if (ch == ((int)*word)&255) {
      if (endsword && *(word+1) == '\0') {
        fprintf(stdout, "word: %s\n", res); (*found)++;
      }
      if (*(word+1) != '\0' && link != 0)
        (void) fix_typos(dawg, link, word+1, res, len+1, errs_allowed, found);
    } else {
      /* Try a different letter here instead? */
      if (errs_allowed > 0) {
        if (endsword && *(word+1) == '\0') {
          fprintf(stdout, "word: %s\n", res); (*found)++;
        }
        if (*(word+1) != '\0' && link != 0)
          (void) fix_typos(
            dawg, link, word+1, res, len+1, errs_allowed-1, found);
      }
    }
    }
    if (last) break;
  }
  return(0==0);
}


/* And this one corrects omitted letters by inserting one. */

int
#ifdef PROTOTYPES
fix_insert(
  NODE PCCRAP *dawg, INDEX i,
  char *word, char *res,
  int len, int errs_allowed, int *found)
#else
fix_insert(dawg, i, word, res, len, errs_allowed, found)
NODE PCCRAP *dawg;
INDEX i;
char *word;
char *res;
int len;
int errs_allowed;
int *found;
#endif
{
int endsword, last, ch;
NODE node;
INDEX link;

  for (;;) {
    node = dawg[i++];
    ch = (int)((node >> V_LETTER) & M_LETTER);
    endsword = ((node & M_END_OF_WORD) != 0);
    last = ((node & M_END_OF_NODE) != 0);

    link = node & M_NODE_POINTER;

    res[len] = ch; res[len+1] = '\0';

    if (ch != 0) {

    if (endsword && *word == '\0' && errs_allowed > 0) {
      fprintf(stdout, "word: %s\n", res); (*found)++;
    }

    if (ch == ((int)*word)&255) {
      if (endsword && *(word+1) == '\0') {
        fprintf(stdout, "word: %s\n", res); (*found)++;
        /*return(0==0);*/
      }
      if (*word == '\0') {
        if (errs_allowed > 0)
          (void) fix_insert(
            dawg, link, word+1, res, len+1, errs_allowed, found);
      } else {
        if (link != 0)
          (void) fix_insert(
            dawg, link, word+1, res, len+1, errs_allowed, found);
      }
    }
    /* Insert this letter (len+1) and see if rest matches */
    if (errs_allowed > 0) {
      if (link != 0)
        (void) fix_insert(dawg, link, word, res, len+1, errs_allowed-1, found);
    }
    }
    if (last) break;
  }
  return(0==0);
}


/* And finally catch inserted letters by deleting one */

int
#ifdef PROTOTYPES
fix_delete(
  NODE PCCRAP *dawg, INDEX i,
  char *word, char *res,
  int len, int errs_allowed, int *found)
#else
fix_delete(dawg, i, word, res, len, errs_allowed, found)
NODE PCCRAP *dawg;
INDEX i;
char *word;
char *res;
int len;
int errs_allowed;
int *found;
#endif
{
int endsword, last, ch;
NODE node;
INDEX link;

  if (errs_allowed > 0) {
    if (*(word+1) != '\0') {
      (void) fix_delete(dawg, i, word+1, res, len, errs_allowed-1, found);
    } else {
      /* Shouldn't get this far, but does :-( */
      return(0==0);
    }
  }
  for (;;) {
    node = dawg[i++];
    ch = (int)((node >> V_LETTER) & M_LETTER);
    endsword = ((node & M_END_OF_WORD) != 0);
    last = ((node & M_END_OF_NODE) != 0);

    link = node & M_NODE_POINTER;

    res[len] = ch; res[len+1] = '\0';

    if (ch != 0) {
    if (ch == ((int)*word)&255) {

      if (errs_allowed > 0 &&
          endsword &&
          *(word+1) != '\0' &&
          *(word+2) == '\0') {
        fprintf(stdout, "word: %s\n", res); (*found)++;
      }

      if (endsword && *(word+1) == '\0') {
        fprintf(stdout, "word: %s\n", res); (*found)++;
        return(0==0);
      }
      if (*(word+1) != '\0' && link != 0)
        (void) fix_delete(dawg, link, word+1, res, len+1, errs_allowed, found);
    }
    }

    if (last) break;
  }
  return(0==0);
}


int
#ifdef PROTOTYPES
dawg_typo(NODE PCCRAP *dawg, char *word)
#else
dawg_typo(dawg, word)
NODE PCCRAP *dawg;
char *word;
#endif
{
char result[MAX_WORD_LEN];
int i = 0;
  (void)fix_typos(dawg, (INDEX)ROOT_NODE, word, result, 0, 1, &i);
  return(i);
}

int
#ifdef PROTOTYPES
dawg_insert(NODE PCCRAP *dawg, char *word)
#else
dawg_insert(dawg, word)
NODE PCCRAP *dawg;
char *word;
#endif
{
char result[MAX_WORD_LEN];
int i = 0;
  (void)fix_insert(dawg, (INDEX)ROOT_NODE, word, result, 0, 1, &i);
  return(i);
}

int
#ifdef PROTOTYPES
dawg_delete(NODE PCCRAP *dawg, char *word)
#else
dawg_delete(dawg, word)
NODE PCCRAP *dawg;
char *word;
#endif
{
char result[MAX_WORD_LEN];
int i = 0;
  (void)fix_delete(dawg, (INDEX)ROOT_NODE, word, result, 0, 1, &i);
  return(i);
}

int
#ifdef PROTOTYPES
dawg_transpose(NODE PCCRAP *dawg, char *word)
#else
dawg_transpose(dawg, word)
NODE PCCRAP *dawg;
char *word;
#endif
{
int i = 0, l, c;
  for (l = 0; word[l+1] != '\0'; l++) {
    c = word[l]; word[l] = word[l+1]; word[l+1] = c;
    if (dawg_check(dawg, word)) {
      i++;
      fprintf(stdout, "word: %s\n", word);
    }
    c = word[l]; word[l] = word[l+1]; word[l+1] = c;
  }
  return(i /* != 0 */);
}


int
#ifdef PROTOTYPES
main(int argc, char **argv)
#else
main(argc, argv)
int argc;
char **argv;
#endif
{
NODE PCCRAP *dawg;
INDEX edges;
int each;

#ifdef SYS_MAC
  argc = ccommand(&argv);
#endif

  /* Your program goes here... */
  if (argc == 1) {
    fprintf(stderr, "usage: %s mispeled wurdz\n", argv[0]);
    exit(EXIT_ERROR);
  }
  if (!dawg_init("", &dawg, &edges)) exit(EXIT_ERROR);
  for (each = 1; each < argc; each++) {
    fprintf(stderr, "* Wrong char:\n");
    if (!dawg_typo(dawg, argv[each])) {
      fprintf(stderr, "(none found)\n");
    }
    fprintf(stderr, "* Omitted char\n");
    if (!dawg_insert(dawg, argv[each])) {
      fprintf(stderr, "(none found)\n");
    }
    fprintf(stderr, "* Inserted char:\n");
    if (!dawg_delete(dawg, argv[each])) {
      fprintf(stderr, "(none found)\n");
    }
    fprintf(stderr, "* Transposed char:\n");
    if (!dawg_transpose(dawg, argv[each])) {
      fprintf(stderr, "(none found)\n");
    }
    if (each+1 != argc) fprintf(stderr, "\n");
  }

  exit(EXIT_OK);
}
SHAR_EOF