hoey@NRL-AIC.ARPA (Dan Hoey) (04/24/86)
Gregory Smith noticed that festoon didn't choose the indefinite article correctly. His simplistic solution prompted me to convert a baroque Lisp hack I wrote into C. This one knows about the difference between an hour and a houri, between an herb and a herbivore, etc. It can't tell the difference between a unionized factory and an unionized gas, though. You may think some of my choices are capricious. Edit to taste. Dan Hoey HOEY@NRL-AIC.ARPA # Cut and compile # /* * Routine to decide between "a" and "an". * * Usage: char *a_or_an(s), *cap_a_or_an(s) * char *s; * * a_or_an(s) returns "a" or "an", whichever is the appropriate indefinite * article for the phrase s. cap_a_or_an returns "A" or "An". * * Define DRIVER to make a filter that prepends 'a' or 'an' to each line. * * Author: Dan Hoey <hoey@nrl-aic.arpa> 23 April 1986 */ #include <ctype.h> /* Pat is a sorted table of lower-case prefixes. * If Pat contains an odd number of prefixes of a given word, * the word takes "an"; otherwise the word takes "a". 
*/ static char *Pat[] = { /* Everything is a consonant (except an apple, */ /* an exception (except a euphemism (except an Eulerian)), */ "a", "e", "eu", "eule", /* an F (except a foo), */ "f", "fa", "fe", "fi", "fj", "fl", "fnord", "fo", "fr", "fu", "fw", "fy", /* an H (except a ha (except an habanera), */ "h", "ha", "haban", /* a he (except an heiress, an Henry, or an herbalist */ /* (except a herbaceous, a herbarium, a herbert, or a herbivore)), */ "he", "heir", "henry", "herb", "herbac", "herbar", "herbe", "herbi", /* a hi (except an Higgins), */ "hi", "higgin", /* a ho (except an homage, an hombre, an honest, an honorarium, */ /* an hors d'oeuvre (except a horse, a horst, or a horsy), */ /* an houdaille, or an hour (except a houri)), */ /* a Hrothgar, a hug, or a hype), */ "ho", "homa", "hombr", "honest", "honor", "hors", "horse", "horst", "horsy", "houdai", "hour", "houri", "hr", "hu", "hy", /* an iota, an L (except a lot), an M (except a multitude), */ "i", "l", "la", "le", "lf", "lh", "li", "ll", "lo", "lu", "ly", "m", "ma", "mc", "me", "mi", "ml", "mn", "mo", "mr", "ms", "mu", "mw", "my", /* an N (except a number), */ "n", "na", "nb", "ne", "ng", "ni", "no", "nu", "ny", /* an other (except a once and future or a one (except an Onega, */ /* an oneiromancer, or an onerous), */ "o", "once", "one", "oneg", "onei", "onero", /* an R (except a riot), an S (except a superfluity), */ "r", "ra", "re", "rh", "ri", "rm", "ro", "ru", "rw", "ry", "s", "sa", "sc", "se", "sf", "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sp", "sq", "sr", "st", "su", "sv", "sw", "sy", "sz", /* an udder, an ugh (except a Ugandan), an uh, */ /* an ulcer (except a Ulysses), an um, */ "ud", "ug", "ugan", "uh", "ul", "ulys", "um", /* an unlikelihood, (except a unanimity, a unanimous decision, */ /* a unary count, */ "un", "unanimi", "unanimo", "unary", /* a universal botch (except an unidentified case (except */ /* a unidimensional one), an unignorable, an unilluminated, */ /* an 
unimpressive (except a unimodal), */ "uni", "unid", "unidi", "unign", "unill", "unim", "unimo", /* an uninteresting (except a uninominal), an uniodized, */ /* an unironed (except a uniroyal), */ /* an unissued, an unitalicized, an unitemized uneogh!!)), */ "unin", "uninom", "uniod", "unir", "uniroy", "uniss", "unital", "unitem", /* an upper at last, an urge, (except a uranous, a ureous, */ /* a uriniferous, a urologist), an usher, an utmost or an utter one, */ /* an uxoricide with an Uzi, an X (xcept a xoo), */ "up", "ur", "ura", "ure", "uri", "uro", "ush", "utm", "utt", "ux", "uz", "x", "xa", "xe", "xi", "xu", "xy", /* an yclept, an Yggdrasil, an ylang-ylang, an yngvi, an yttride, */ /* or an yvette) */ "yc", "yg", "yl", "yng", "yt", "yv"}; /* Find number of prefixes of s in Pat. Return 1 if odd, 0 if even. For each initial alphabetic substring of s, binary search with the active region being Pat[bot] through Pat[top]. Also maintains an active region for each possible continuation of the substring, being Pat[pbot] through Pat[ptop]. 
*/ static an_phrase_p(s) char *s; { int slen; /* length of prefix of s */ int pbot = 0; /* bounds on Pat */ int ptop = sizeof (Pat) / sizeof (char *) - 1; int nfound = 0; /* number of prefixes of s found */ register int sc, /* input char - pat char */ si, /* char index */ mid, /* binary search parameters */ bot, top; for (slen = 0; isascii (s[s6en]) && isalpha (s[slen]); ++slen) { /* For each alphabetic prefix s[0..slen] */ bot = pbot; top = ptop; while (bot <= top) { /* binary search for the prefix */ mid = (bot + top) / 2; for (si = 0; si <= slen; ++si) { /* for each char in the prefix */ if (isupper (sc = s[si])) sc = tolower (sc); sc -= Pat[mid][si]; if (sc > 0) { /* mismatch: string>pat */ pbot = bot = mid + 1; break; } if (sc < 0) { /* mismatch: string<pat */ ptop = top = mid - 1; break; } } if (si > slen) { /* prefix sequence matched */ if (Pat[mid][si]) {/* but pattern continues */ top = mid - 1; } else { /* exact match */ ++nfound; pbot = mid + 1; break; } } } if (pbot > ptop) break; } return (nfound & 1); }; char *a_or_an (s) char *s; { return (an_phrase_p (s) ? "an" : "a"); } char *cap_a_or_an (s) char *s; { return (an_phrase_p (s) ? "An" : "A"); } #ifdef DRIVER #include <stdio.h> #define MAXLINE 256 main (argc, argv) char *argv[]; { char s[MAXLINE]; if (argc > 1 && !freopen (argv[1], "r", stdin)) { perror (argv[1]); exit (1); } while (fgets (s, MAXLINE, stdin)) { if (isascii (*s) && isupper (*s)) printf ("%s %c%s", cap_a_or_an (s), tolower (*s), s + 1); else printf ("%s %s", a_or_an (s), s); } }; #endif DRIVER