[net.sources] A or An

hoey@NRL-AIC.ARPA (Dan Hoey) (04/24/86)

Gregory Smith noticed that festoon didn't choose the indefinite article
correctly.  His simplistic solution prompted me to convert a baroque
Lisp hack I wrote into C.  This one knows about the difference between
an hour and a houri, between an herb and a herbivore, etc.  It can't
tell the difference between a unionized factory and an unionized gas,
though.

You may think some of my choices are capricious.  Edit to taste.

Dan Hoey
HOEY@NRL-AIC.ARPA

# Cut and compile #
/*
 * Routine to decide between "a" and "an".
 *
 * Usage: char *a_or_an(s), *cap_a_or_an(s)
 *	  char *s;
 *
 * a_or_an(s) returns "a" or "an", whichever is the appropriate indefinite
 * article for the phrase s.  cap_a_or_an returns "A" or "An".
 *
 * Define DRIVER to make a filter that prepends 'a' or 'an' to each line.
 *
 * Author: Dan Hoey <hoey@nrl-aic.arpa> 23 April 1986
 */
#include <ctype.h>

/* Pat is a sorted table of lower-case prefixes.
 * If Pat contains an odd number of prefixes of a given word,
 * the word takes "an"; otherwise (an even number, including zero)
 * the word takes "a".  Each nested "except" in the comments below adds
 * one more matching prefix and therefore flips the answer.
 *
 * The entries MUST remain in strictly ascending strcmp() order, because
 * an_phrase_p() binary-searches this table.
 */
static const char *const Pat[] = {
    /* Everything is a consonant (except an apple, */
    /* an exception (except a euphemism (except an Eulerian)), */
    "a", "e", "eu", "eule",
    /* an F (except a foo), */
    "f", "fa", "fe", "fi", "fj", "fl", "fnord", "fo", "fr", "fu", "fw", "fy",
    /* an H (except a ha (except an habanera), */
    "h", "ha", "haban",
    /* a he (except an heiress, an Henry, or an herbalist */
    /* (except a herbaceous, a herbarium, a herbert, or a herbivore)), */
    "he", "heir", "henry", "herb", "herbac", "herbar", "herbe", "herbi",
    /* a hi (except an Higgins), */
    "hi", "higgin",
    /* a ho (except an homage, an hombre, an honest, an honorarium, */
    /* an hors d'oeuvre (except a horse, a horst, or a horsy), */
    /* an houdaille, or an hour (except a houri)), */
    /* a Hrothgar, a hug, or a hype), */
    "ho", "homa", "hombr", "honest", "honor", "hors", "horse", "horst",
    "horsy", "houdai", "hour", "houri", "hr", "hu", "hy",
    /* an iota, an L (except a lot), an M (except a multitude), */
    "i", "l", "la", "le", "lf", "lh", "li", "ll", "lo", "lu", "ly", "m",
    "ma", "mc", "me", "mi", "ml", "mn", "mo", "mr", "ms", "mu", "mw", "my",
    /* an N (except a number), */
    "n", "na", "nb", "ne", "ng", "ni", "no", "nu", "ny",
    /* an other (except a once and future or a one (except an Onega, */
    /* an oneiromancer, or an onerous), */
    "o", "once", "one", "oneg", "onei", "onero",
    /* an R (except a riot), an S (except a superfluity), */
    "r", "ra", "re", "rh", "ri", "rm", "ro", "ru", "rw", "ry",
    "s", "sa", "sc", "se", "sf", "sh", "si", "sj", "sk", "sl", "sm", "sn",
    "so", "sp", "sq", "sr", "st", "su", "sv", "sw", "sy", "sz",
    /* an udder, an ugh (except a Ugandan), an uh, */
    /* an ulcer (except a Ulysses), an um, */
    "ud", "ug", "ugan", "uh", "ul", "ulys", "um",
    /* an unlikelihood, (except a unanimity, a unanimous decision, */
    /* a unary count, */
    "un", "unanimi", "unanimo", "unary",
    /* a universal botch (except an unidentified case (except */
    /* a unidimensional one), an unignorable,  an unilluminated, */
    /* an unimpressive (except a unimodal), */
    "uni", "unid", "unidi", "unign", "unill", "unim", "unimo",
    /* an uninteresting (except a uninominal), an uniodized, */
    /* an unironed (except a uniroyal), */
    /* an unissued, an unitalicized, an unitemized uneogh!!)), */
    "unin", "uninom", "uniod", "unir", "uniroy", "uniss", "unital", "unitem",
    /* an upper at last, an urge, (except a uranous, a ureous, */
    /* a uriniferous, a urologist), an usher, an utmost or an utter one, */
    /* an uxoricide with an Uzi, an X (xcept a xoo), */
    "up", "ur", "ura", "ure", "uri", "uro", "ush", "utm", "utt", "ux", "uz",
    "x", "xa", "xe", "xi", "xu", "xy",
    /* an yclept, an Yggdrasil, an ylang-ylang, an yngvi, an yttride, */
    /* or an yvette) */
    "yc", "yg", "yl", "yng", "yt", "yv"
};

/*
 * an_phrase_p -- decide whether the phrase s takes "an".
 *
 * Counts how many entries of Pat are prefixes of the leading run of
 * ASCII letters in s (compared case-insensitively) and returns 1 if
 * that count is odd, 0 if it is even.  A null pointer, an empty string
 * or a string that does not start with an ASCII letter yields 0.
 *
 * Method: for each successively longer prefix s[0..slen], binary-search
 * Pat[bot..top] for an exact match.  Pat[pbot..ptop] is the region that
 * can still contain a match for the current prefix or any longer one;
 * it only ever shrinks, so each search starts from the region left
 * over by the previous, shorter prefix.
 */
static int
an_phrase_p(const char *s)
{
    int slen;                   /* index of last char of current prefix */
    int pbot = 0;               /* persistent bounds on Pat */
    int ptop = (int) (sizeof Pat / sizeof Pat[0]) - 1;
    int nfound = 0;             /* number of prefixes of s found */

    if (!s)
        return 0;

    for (slen = 0; ; ++slen) {
        /* ctype functions need an unsigned char value; the explicit
         * range test stands in for the non-ISO isascii(). */
        unsigned char c = (unsigned char) s[slen];
        int bot, top;

        if (c > 0x7f || !isalpha(c))
            break;              /* end of the leading alphabetic run */

        bot = pbot;
        top = ptop;
        while (bot <= top) {    /* binary search for s[0..slen] */
            int mid = bot + (top - bot) / 2;
            int si;
            int diff = 0;       /* input char - pattern char */

            /* A pattern shorter than the prefix compares as "less":
             * its NUL is below every letter, so we stop at the NUL and
             * never read beyond the end of the pattern. */
            for (si = 0; si <= slen; ++si) {
                diff = tolower((unsigned char) s[si])
                     - (unsigned char) Pat[mid][si];
                if (diff != 0)
                    break;
            }

            if (diff > 0) {
                /* Pat[mid] sorts below the prefix, hence below every
                 * longer prefix as well: discard it for good. */
                pbot = bot = mid + 1;
            } else if (diff < 0) {
                /* Pat[mid] sorts above every extension of the prefix. */
                ptop = top = mid - 1;
            } else if (Pat[mid][si] != '\0') {
                /* Pattern extends the prefix: it may match a longer
                 * prefix later, so narrow only this search. */
                top = mid - 1;
            } else {
                /* Exact match; all longer matches lie after it. */
                ++nfound;
                pbot = mid + 1;
                break;
            }
        }
        if (pbot > ptop)
            break;              /* nothing left that could match */
    }
    return nfound & 1;
}

/*
 * a_or_an -- pick the lower-case indefinite article for the phrase s.
 *
 * s is a NUL-terminated phrase; the decision is delegated to
 * an_phrase_p().  Returns a pointer to the string literal "an" or "a";
 * the caller must neither modify nor free it.
 */
char *
a_or_an(const char *s)
{
    return an_phrase_p(s) ? "an" : "a";
}

/*
 * cap_a_or_an -- pick the capitalized indefinite article for the phrase s.
 *
 * s is a NUL-terminated phrase; the decision is delegated to
 * an_phrase_p().  Returns a pointer to the string literal "An" or "A";
 * the caller must neither modify nor free it.
 */
char *
cap_a_or_an(const char *s)
{
    return an_phrase_p(s) ? "An" : "A";
}

#ifdef DRIVER
#include <stdio.h>
#include <string.h>
#define MAXLINE 256

/*
 * Filter driver: copy each input line to stdout with "a"/"an" prepended.
 *
 * Input comes from the file named by argv[1] if given, otherwise from
 * stdin.  When a line starts with an ASCII capital, the article is
 * capitalized instead and that first letter is lowered
 * ("Apple" -> "An apple").  Returns 0 on success, 1 if the input file
 * cannot be opened.
 */
int
main(int argc, char *argv[])
{
    char s[MAXLINE];
    int at_line_start = 1;      /* previous chunk ended with '\n' */

    if (argc > 1 && !freopen(argv[1], "r", stdin)) {
        perror(argv[1]);
        return 1;
    }
    while (fgets(s, MAXLINE, stdin)) {
        if (!at_line_start) {
            /* Tail of a line longer than MAXLINE-1: copy it through
             * without inserting another article in mid-line. */
            fputs(s, stdout);
        } else {
            /* The range test stands in for the non-ISO isascii(). */
            unsigned char first = (unsigned char) s[0];

            if (first < 0x80 && isupper(first))
                printf("%s %c%s", cap_a_or_an(s), tolower(first), s + 1);
            else
                printf("%s %s", a_or_an(s), s);
        }
        at_line_start = (strchr(s, '\n') != NULL);
    }
    return 0;
}
#endif /* DRIVER */