[comp.sources.wanted] SOUNDEX algorithm

tif@cpe.UUCP (09/17/88)

Written  5:52 pm  Sep 15, 1988 by pyramid.UUCP!dhaile in cpe:comp.sources.w
>Hi y'all!  Had a need all of a sudden for a SOUNDEX or pseudo-SOUNDEX

I'll post this since it's small and he didn't say "I don't usually
read this group"  :-)

It comes from a bigger package which is a spelling aid (although
the spelling aid could not meet my requirements).  Note that
SOUNDEX is a pretty crude algorithm, nothing like phonemes.
(Somebody asked for that too, wish I had it to give.)

			Paul Chamberlain
			Computer Product Engineering, Tandy Corp.
			{convex,killer}!ninja!cpe!tif
#! /bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #! /bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create:
#	calcsoundex.c
# This archive created: Fri Sep 16 22:54:34 1988
# Search /bin and /usr/bin ahead of the caller's PATH for the commands
# this script runs.
export PATH; PATH=/bin:/usr/bin:$PATH
# Unpack calcsoundex.c unless it already exists, then check its size.
# The byte count (3109) is that of the text between the sed line and
# SHAR_EOF once the leading tab-X prefix has been stripped from each line.
echo shar: "extracting 'calcsoundex.c'" '(3109 characters)'
if test -f 'calcsoundex.c'
then
	echo shar: "will not over-write existing file 'calcsoundex.c'"
else
sed 's/^	X//' << \SHAR_EOF > 'calcsoundex.c'
	X/* vi: set tabstop=4 : */
	X
	X/*
	X * calcsoundex - calculate soundex codes
	X *
	X * Permission is given to copy or distribute this program provided you
	X * do not remove this header or make money off of the program.
	X *
	X * Please send comments and suggestions to:
	X * Barry Brachman
	X * Dept. of Computer Science
	X * Univ. of British Columbia
	X * Vancouver, B.C. V6T 1W5
	X *
	X * .. {ihnp4!alberta, uw-beaver}!ubc-vision!ubc-cs!brachman
	X * brachman@cs.ubc.cdn
	X * brachman%ubc.csnet@csnet-relay.arpa
	X * brachman@ubc.csnet
	X */
	X
	X#include <stdio.h>
	X#include <ctype.h>
	X#include <stdlib.h>
	X#include <string.h>
	X
	X#include "sp.h"
	X
	X/* Input buffer: room for MAXWORDLEN characters, '\n' and '\0'. */
	Xchar word[MAXWORDLEN + 2];
	X
	X/* Soundex digit for each letter 'a'..'z'; 0 means the letter is not coded. */
	Xchar soundex_code_map[26] = {
	X/***	 A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P	***/
	X		 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0, 1,
	X
	X/***	 Q  R  S  T  U  V  W  X  Y  Z			***/
	X		 2, 6, 2, 3, 0, 1, 0, 2, 0, 2
	X};
	X
	X/*
	X * Read one word per line from stdin and print its soundex code: the
	X * lowercased first letter followed by three digits. With -v the
	X * lowercased word is printed after the code. Words that do not start
	X * with a letter are skipped; over-long lines are reported on stderr.
	X * A blank line is written after the last code.
	X */
	Xint main(int argc, char **argv)
	X{
	X	int c, i, soundex_length, digit_part, previous_code;
	X	int ch, len, vflag;
	X
	X	vflag = 0;
	X	if (argc > 2 || (argc == 2 && strcmp(argv[1], "-v"))) {
	X		fprintf(stderr, "Usage: calcsoundex [-v]\n");
	X		exit(1);
	X	}
	X	if (argc > 1)
	X		vflag = 1;
	X
	X	while (fgets(word, sizeof(word), stdin) != (char *) NULL) {
	X		len = strlen(word);
	X		if (len > 0 && word[len - 1] == '\n')
	X			word[--len] = '\0';
	X		else if (len > MAXWORDLEN) {
	X			/* Buffer filled up before the newline: the line is too long. */
	X			fprintf(stderr, "calcsoundex: Word too long: %s", word);
	X			/* Echo and discard the rest of the line; stop at EOF too. */
	X			while ((ch = getchar()) != '\n' && ch != EOF)
	X				putc(ch, stderr);
	X			putc('\n', stderr);
	X			continue;
	X		}
	X
	X		/* Casts keep the <ctype.h> arguments non-negative for 8-bit input. */
	X		for (i = 0; word[i] != '\0'; i++) {
	X			if (isupper((unsigned char) word[i]))
	X				word[i] = tolower((unsigned char) word[i]);
	X		}
	X		if (!isalpha((unsigned char) word[0]))
	X			continue;
	X
	X		digit_part = 0;
	X		soundex_length = 0;
	X		previous_code = soundex_code_map[word[0] - 'a'];
	X		for (i = 1; word[i] != '\0' && soundex_length < 3; i++) {
	X			if (!isalpha((unsigned char) word[i]))
	X				continue;
	X			c = soundex_code_map[word[i] - 'a'];
	X			/* Skip uncoded letters and repeats of the previous code. */
	X			if (c == 0 || previous_code == c) {
	X				previous_code = c;
	X				continue;
	X			}
	X			digit_part = digit_part * 10 + c;
	X			previous_code = c;
	X			soundex_length++;
	X		}
	X		/* Pad short codes with trailing zeros. */
	X		while (soundex_length++ < 3)
	X			digit_part *= 10;
	X		/* digit_part is 0 or 100..666 here, so %03d prints three digits. */
	X		printf("%c%03d", word[0], digit_part);
	X		if (vflag)
	X			printf(" %s", word);
	X		putchar('\n');
	X	}
	X	putchar('\n');
	X	exit(0);
	X}
	X
SHAR_EOF
if test 3109 -ne "`wc -c < 'calcsoundex.c'`"
then
	echo shar: "error transmitting 'calcsoundex.c'" '(should have been 3109 characters)'
fi
fi
# Terminate here so the shell never reads anything that follows the archive.
exit 0
#	End of shell archive