[comp.sources.misc] string compare for 8-bit non-English characters

root@hobbes.UUCP (John Plocher) (08/17/87)
In response to the discussion in comp.std.internat and the amiga group
about how programmers don't take into account non-English character sets
and their properties, I submit the following:  A strcmp() replacement
which correctly handles accented characters and non-ASCII collating.

	-John Plocher

/* This is NOT a shar */

/****************************************************************************
 *
 *	stracmp.c	string compare with accented characters and
 *			non-ASCII collating sequences
 *
 *	Copyright 1985, 1987, 1987 by John Plocher    (plocher@hobbes.UUCP)
 *	May be used in any product as long as this notice is retained and
 *	credit is given.
 *
 ****************************************************************************
 *
 *	Revision Control Information
 *
 *	By:		$Author: plocher $
 *			$Revision: 1.3 $
 *	Last modified:	$Date: 87/08/15 01:32:36 $
 *	Source is in:	$Source: /usr/src/local/biblos/RCS/stracmp.c,v $
 *	Release state:	$State: Usenet $
 *
 *	Library module
 *
 *	Modification Log
 *	----------------
 *
 *	$Log:	stracmp.c,v $
 *	Revision 1.3  87/08/15  01:32:36  plocher
 *	fixed crt-independent 8-bit character output
 *	
 *	Revision 1.2  87/08/15  01:17:28  plocher
 *	passes lint with no complaints
 *
 *	Revision 1.1  86/04/12
 *	Revision 1.0  85/05/27
 *	
 *
 ****************************************************************************
 *
 *	Compile with
 *
 *		cc -c stracmp.c			# for a library object file
 *		  - or -
 *              cc -o stracmp -DMAIN stracmp.c	# for a standalone testbed
 *
 ****************************************************************************
 *
 *	stracmp() implements a string compare which correctly handles
 *	accented (non English) characters which have been encoded using
 *	8-bit characters.  It uses character lookup tables for doing 
 *	string compares when accented characters are present and/or a
 *	non-ASCII collating sequence is desired.
 *
 *	Also, because this is used in bibliographic lookups, this routine
 *	supports the concept of comments within a string.  Everything
 *	between [ and ] (inclusive) is ignored for all comparisons.
 *	Comments may NOT be nested.  Comments are also delimited by
 *	an end of string ('\0'), but that is not the "correct" way.
 *
 *	Reference:
 *
 *	Gibaldi, and Walter S. Achtert.  _MLA_Handbook_for_Writers_of_Research_
 *	    Papers_.  New York: Modern Language Association of America, 1984.
 *	    Page 76.
 *
 ****************************************************************************
 *
 *	Theory:
 *	  The correct way of sorting (or comparing) strings which contain
 *	accented characters is to first compare the strings with all accents
 *	stripped. If the two strings are the same, then and only then are the
 *	accents used.  This second comparison involves only the accents.
 *	You can think of this as comparing the two strings with all the letters
 *	stripped.
 *
 *	  Also, there are times when the "normal" ASCII collating sequence is
 *	not appropriate for lexical ordering.  (e.g.  A <AE> B C <CEDILLA> D ...)
 *
 ****************************************************************************
 *	Examples:
 *****
 *			     ,  :
 *	Comparing Junta and Junta	(the second word has diacritical
 *					 marks over the two vowels)
 *
 *	    first we compare("Junta", "Junta")	which shows them EQUAL
 *	then we must compare("     ", " '  :")
 *
 *				  ,  :
 *	Thus, Junta comes before Junta in the lexical ordering of the two words.
 *
 *****
 *		   ,          ,
 *	Comparing Junta  and Junto	(both words have accented 'u's)
 *
 *	    first we compare("Junta", "Junto"); since they are
 *	different  we do not need to do anything more with the accents:
 *	  ,                    ,
 *	"Junta" is less than "Junto".
 *
 ****************************************************************************
 *
 *	Implementation:
 *
 *	The accented string is broken into two strings:
 *		1) a string of letter values with accents stripped, and
 *		2) a string of accent values with letters stripped.
 *
 *	The comparison is table based in order to speed things up and
 *	allow arbitrary collating sequences.
 *
 *	For a given character x, translate[x] is the "value"
 *	used for sorting with strcmp(), and accent[x]
 *	tells whether the character carries an accent, should 
 *	be ignored, or is a normal character.  If accent[] indicates
 *	that the character carries a diacritical, the value of accent[]
 *	is used to rank the accented character against the same letter
 *	but different diacritics:
 *			   ,              .
 *	    ie. The letter a differs from a; which is less depends on the
 *	values of accent[].  If the values in accent[] for these two letters
 *      are the same, the accented letters are considered identical.
 *
 *	The stracmp() routine is fully protected against NULL pointers
 *	being passed as parameters.
 *	All internal space needed is taken from the heap with a single malloc()
 *	and free()'d on exit.  The heap space needed is
 *		2 * ( strlen(s1) + strlen(s2) ) + 8
 *	The stack space needed is 3 ints and 4 pointers.
 *	There are two static 256 element arrays of unsigned chars used for
 *	defining the accents and collation sequence.
 *
 *	The run time is
 *			       TIME( strlen(s1) )
 *			     + TIME( strcpy(x,s1) ) * K
 *			     + TIME( strlen(s2) )
 *			     + TIME( strcpy(x,s2) ) * K
 *			     + TIME( strcmp(t1,t2) )
 *			     +[TIME( strcmp(a1,a2) )] (* iff needed *)
 *			     + TIME( malloc() )
 *			     + TIME( free() )
 *     where 1 < K < 2
 *
 ****************************************************************************
 */

/* Version string printed by the MAIN test program. */
#define VERSION		"$Revision: 1.3 $"
/* #define MAIN */		/* compile as a test program, not a library */
/* #define ON_IBMPC */		/* only meaningful when MAIN is defined: does */
				/* your crt show the IBM character set?   */
#define BRACKET_COMMENTS	/* if defined, stuff within [ ]'s is ignored */

/* Character-set selection: exactly one of the two must be defined. */
#define IBMPC_ROM		/* Tables match the IBM PC ROM tables */
/* #define ISO_LATIN_1 */	/* Tables for ISO LATIN-1 (ISO 8859-1) */

/***************************************************************************/

/*
 * Configuration sanity check: exactly one of IBMPC_ROM and ISO_LATIN_1 must
 * be defined.  When the count is not 1, the plain-text line below is left
 * in the translation unit on purpose, so that the compiler rejects the file.
 */
#if defined(IBMPC_ROM) + defined(ISO_LATIN_1) != 1

   One and only one of these may be defined.

#endif


#ifdef MAIN
#  include <stdio.h>		/* For confidence test */
/*
 * PRINT(s) displays one test string in the self-test.  With ON_IBMPC the
 * string is handed straight to printf(); otherwise crtaccents() is used,
 * which shows non-ASCII bytes as octal escapes.
 */
#  ifdef ON_IBMPC
#    define PRINT printf
#  else
#    define PRINT crtaccents	/* must match the function defined below */
#  endif
#endif

#ifndef lint
/* RCS header string for this revision; left out when the file is linted. */
static char rcsid[] = "$Header: stracmp.c,v 1.3 87/08/15 01:32:36 plocher Usenet $";
#endif

#include <stdio.h>		/* NULL, printf(), fprintf(), stderr */
#include <string.h>		/* strlen(), strcmp(), size_t */

extern char *malloc();
extern void exit();
extern void free();


#ifdef IBMPC_ROM

/* IBM-PC ROM based character set */

/* The translate table maps from a printable character to a "value".   This
 * "value" is used to determine sorting order ( a smaller "value" is less
 * than a larger "value" ).  
 *
 *  Note that in the following table, the letters 'C' and <Cedilla> are
 * both given the same "value".  This is because these two letters are
 * "the same" WHEN ACCENT MARKS ARE IGNORED.  (Same for all other accented
 * characters - they share the same value with the underlying character.)
 *
 *  The table following this has the entry for <Cedilla> flagged as an
 * accent, the entry for 'C' does not.  Therefore, when sorting, a 
 * <Cedilla> will sort with, but following, the entries beginning with 'C'.
 *
 *  The accent table is used solely to differentiate between letters which
 * have the same value in the translate table.  The reasons for two tables
 * instead of one table of shorts are that strcmp() works with char*'s, not
 * short*'s, and that the tables are easier to understand this way.
 *
 * One could also increment the values for 'D'..'~' by 1 and give the value of
 * <Cedilla> as value('C') + 1.  In this case the accent table would not be
 * needed to distinguish between the two.
 */

/*
 * translate[c] is the collating value of character code c.  Each initializer
 * row covers 16 codes: the row label is the high hex digit of the code and
 * the column header is the low hex digit.
 *
 * Codes 0x00-0x7F and 0xB0-0xFF map to themselves.  In rows 8, 9 and A the
 * entries written as character constants give the plain ASCII character the
 * code collates with; the entries written as numbers map to themselves.
 */
static unsigned char translate[256] = {
/*     0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F  */
/*     -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -  */
/*0*/  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
/*1*/ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/*2*/ ' ','!',34, '#','$','%','&',39, '(',')','*','+',',','-','.','/',
/*3*/ '0','1','2','3','4','5','6','7','8','9',':',';','<','=','>','?',
/*4*/ '@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
/*5*/ 'P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_',
/*6*/ '`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
/*7*/ 'p','q','r','s','t','u','v','w','x','y','z','{','|','}','~',127,

/*8*/ 'C','u','e','a','a','a','a','c','e','e','e','i','i','i','A','A',
/*9*/ 'E',145,146,'o','o','o','u','u','y','O','U',155,156,157,158,159,
/*A*/ 'a','i','o','u','n','N','a','o','?',169,170,171,172,'!',174,175,
/*B*/ 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
/*C*/ 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
/*D*/ 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
/*E*/ 224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
/*F*/ 240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};


/*
 * for a given character x, accent[x] determines if the
 * character should be ignored (0), or used as given by translate[x] but
 * marked as an accent (1..n).
 *
 * Accents have a sorting order given by the value stored in this table.
 *  This feature is currently used in the following way:  Accent value =
 *		0	Character is totally ignored in all sorting operations
 *		1	Normal unaccented character (ASCII)
 *		2..n	accents from the extended IBM charset
 */
 
/*
 * accent[c] classifies character code c (row label = high hex digit,
 * column header = low hex digit):
 *	0  the character is dropped before any comparison
 *	1  plain character (0x20-0x7E)
 *	2  accented character; ranks after the plain character that has the
 *	   same translate[] value
 * Every code marked 0 here never reaches the comparison, whatever its
 * translate[] value is.
 * NOTE(review): 0xA6 and 0xA7 are given letter values in translate[] but
 * are marked 0 (ignored) here -- confirm that this is intended.
 */
static unsigned char accent[256] = {
/*     0  1  2  3   4  5  6  7      8  9  A  B   C  D  E  F  */
/*     -  -  -  -   -  -  -  -      -  -  -  -   -  -  -  -  */
/*0*/  0, 0, 0, 0,  0, 0, 0, 0,     0, 0, 0, 0,  0, 0, 0, 0, /* control */
/*1*/  0, 0, 0, 0,  0, 0, 0, 0,     0, 0, 0, 0,  0, 0, 0, 0, /* chars   */
/*2*/  1, 1, 1, 1,  1, 1, 1, 1,     1, 1, 1, 1,  1, 1, 1, 1, /* alphanumerics */
/*3*/  1, 1, 1, 1,  1, 1, 1, 1,     1, 1, 1, 1,  1, 1, 1, 1,
/*4*/  1, 1, 1, 1,  1, 1, 1, 1,     1, 1, 1, 1,  1, 1, 1, 1,
/*5*/  1, 1, 1, 1,  1, 1, 1, 1,     1, 1, 1, 1,  1, 1, 1, 1,
/*6*/  1, 1, 1, 1,  1, 1, 1, 1,     1, 1, 1, 1,  1, 1, 1, 1,
/*7*/  1, 1, 1, 1,  1, 1, 1, 1,     1, 1, 1, 1,  1, 1, 1, 0, /* DEL */

/*8*/  2, 2, 2, 2,  2, 2, 2, 2,     2, 2, 2, 2,  2, 2, 2, 2, /* accented chars*/
/*9*/  2, 0, 0, 2,  2, 2, 2, 2,     2, 2, 2, 0,  0, 0, 0, 0,
/*A*/  2, 2, 2, 2,  2, 2, 0, 0,     2, 0, 0, 0,  0, 2, 0, 0, /* aeiou ? ! */
/*B*/  0, 0, 0, 0,  0, 0, 0, 0,     0, 0, 0, 0,  0, 0, 0, 0, /* line graphics */
/*C*/  0, 0, 0, 0,  0, 0, 0, 0,     0, 0, 0, 0,  0, 0, 0, 0, /* line graphics */
/*D*/  0, 0, 0, 0,  0, 0, 0, 0,     0, 0, 0, 0,  0, 0, 0, 0, /* line graphics */
/*E*/  0, 0, 0, 0,  0, 0, 0, 0,     0, 0, 0, 0,  0, 0, 0, 0, /* greek */
/*F*/  0, 0, 0, 0,  0, 0, 0, 0,     0, 0, 0, 0,  0, 0, 0, 0  /* math */
};

#endif

#ifdef ISO_LATIN_1

/* ISO Latin-1 character set */

/* 
 * Attached is the repertoire of ISO Latin Alphabet Nr 1 (IS 8859-1). I have
 * indicated an alternate name where there might be confusion in the U.S..
 *
 *  List is from  Tim Lasko  Digital Equipment Corporation  Maynard, MA
 * 
 * R/C - row/column of code table
 * Dec - Decimal
 * Oct - Octal
 *
 * R/C  Dec Oct Symbol Name 
 *
 * 02/00 032 040   SP   SPACE
 *  ... same as USASCII ...
 * 07/14 126 176   ~    TILDE
 *  
 * 10/00 160 240  NBSP  NO-BREAK SPACE 
 * 10/01 161 241        INVERTED EXCLAMATION MARK
 * 10/02 162 242        CENT SIGN
 * 10/03 163 243        POUND SIGN
 * 10/04 164 244        CURRENCY SIGN                                
 * 10/05 165 245        YEN SIGN
 * 10/06 166 246        BROKEN BAR                                   
 * 10/07 167 247        PARAGRAPH SIGN, (U.S.) SECTION SIGN 
 * 10/08 168 250        DIERESIS                                    
 * 10/09 169 251        COPYRIGHT SIGN
 * 10/10 170 252        FEMININE ORDINAL INDICATOR
 * 10/11 171 253        LEFT ANGLE QUOTATION MARK
 * 10/12 172 254        NOT SIGN                                     
 * 10/13 173 255   SHY  SOFT HYPHEN                               
 * 10/14 174 256        REGISTERED TRADEMARK SIGN                   
 * 10/15 175 257        MACRON                                       
 *  
 * 11/00 176 260        RING ABOVE, DEGREE SIGN
 * 11/01 177 261        PLUS-MINUS SIGN
 * 11/02 178 262        SUPERSCRIPT TWO
 * 11/03 179 263        SUPERSCRIPT THREE
 * 11/04 180 264        ACUTE ACCENT                                 
 * 11/05 181 265        MICRO SIGN
 * 11/06 182 266        PILCROW SIGN, (U.S.) PARAGRAPH
 * 11/07 183 267        MIDDLE DOT                      
 * 11/08 184 270        CEDILLA
 * 11/09 185 271        SUPERSCRIPT ONE
 * 11/10 186 272        MASCULINE ORDINAL INDICATOR
 * 11/11 187 273        RIGHT ANGLE QUOTATION MARK
 * 11/12 188 274        VULGAR FRACTION ONE QUARTER
 * 11/13 189 275        VULGAR FRACTION ONE HALF
 * 11/14 190 276        VULGAR FRACTION THREE QUARTERS               
 * 11/15 191 277        INVERTED QUESTION MARK
 *  
 * 12/00 192 300        LATIN CAPITAL LETTER A WITH GRAVE ACCENT
 * 12/01 193 301        LATIN CAPITAL LETTER A WITH ACUTE ACCENT
 * 12/02 194 302        LATIN CAPITAL LETTER A WITH CIRCUMFLEX ACCENT
 * 12/03 195 303        LATIN CAPITAL LETTER A WITH TILDE
 * 12/04 196 304        LATIN CAPITAL LETTER A WITH DIAERESIS
 * 12/05 197 305        LATIN CAPITAL LETTER A WITH RING ABOVE
 * 12/06 198 306        CAPITAL DIPHTHONG AE
 * 12/07 199 307        LATIN CAPITAL LETTER C WITH CEDILLA
 * 12/08 200 310        LATIN CAPITAL LETTER E WITH GRAVE ACCENT 
 * 12/09 201 311        LATIN CAPITAL LETTER E WITH ACUTE ACCENT 
 * 12/10 202 312        LATIN CAPITAL LETTER E WITH CIRCUMFLEX ACCENT
 * 12/11 203 313        LATIN CAPITAL LETTER E WITH DIAERESIS
 * 12/12 204 314        LATIN CAPITAL LETTER I WITH GRAVE ACCENT 
 * 12/13 205 315        LATIN CAPITAL LETTER I WITH ACUTE ACCENT 
 * 12/14 206 316        LATIN CAPITAL LETTER I WITH CIRCUMFLEX ACCENT
 * 12/15 207 317        LATIN CAPITAL LETTER I WITH DIAERESIS
 *  
 * 13/00 208 320        CAPITAL ICELANDIC LETTER ETH                 
 * 13/01 209 321        LATIN CAPITAL LETTER N WITH TILDE
 * 13/02 210 322        LATIN CAPITAL LETTER O WITH GRAVE ACCENT 
 * 13/03 211 323        LATIN CAPITAL LETTER O WITH ACUTE ACCENT 
 * 13/04 212 324        LATIN CAPITAL LETTER O WITH CIRCUMFLEX ACCENT
 * 13/05 213 325        LATIN CAPITAL LETTER O WITH TILDE
 * 13/06 214 326        LATIN CAPITAL LETTER O WITH DIAERESIS
 * 13/07 215 327        MULTIPLICATION SIGN                          
 * 13/08 216 330        LATIN CAPITAL LETTER O WITH OBLIQUE STROKE
 * 13/09 217 331        LATIN CAPITAL LETTER U WITH GRAVE ACCENT 
 * 13/10 218 332        LATIN CAPITAL LETTER U WITH ACUTE ACCENT 
 * 13/11 219 333        LATIN CAPITAL LETTER U WITH CIRCUMFLEX
 * 13/12 220 334        LATIN CAPITAL LETTER U WITH DIAERESIS
 * 13/13 221 335        LATIN CAPITAL LETTER Y WITH ACUTE ACCENT  
 * 13/14 222 336        CAPITAL ICELANDIC LETTER THORN               
 * 13/15 223 337        SMALL GERMAN LETTER SHARP s
 *  
 * 14/00 224 340        LATIN SMALL LETTER a WITH GRAVE ACCENT
 * 14/01 225 341        LATIN SMALL LETTER a WITH ACUTE ACCENT
 * 14/02 226 342        LATIN SMALL LETTER a WITH CIRCUMFLEX ACCENT
 * 14/03 227 343        LATIN SMALL LETTER a WITH TILDE
 * 14/04 228 344        LATIN SMALL LETTER a WITH DIAERESIS
 * 14/05 229 345        LATIN SMALL LETTER a WITH RING ABOVE
 * 14/06 230 346        SMALL DIPHTHONG ae
 * 14/07 231 347        LATIN SMALL LETTER c WITH CEDILLA
 * 14/08 232 350        LATIN SMALL LETTER e WITH GRAVE ACCENT
 * 14/09 233 351        LATIN SMALL LETTER e WITH ACUTE ACCENT
 * 14/10 234 352        LATIN SMALL LETTER e WITH CIRCUMFLEX ACCENT
 * 14/11 235 353        LATIN SMALL LETTER e WITH DIAERESIS
 * 14/12 236 354        LATIN SMALL LETTER i WITH GRAVE ACCENT
 * 14/13 237 355        LATIN SMALL LETTER i WITH ACUTE ACCENT
 * 14/14 238 356        LATIN SMALL LETTER i WITH CIRCUMFLEX ACCENT
 * 14/15 239 357        LATIN SMALL LETTER i WITH DIAERESIS
 *  
 * 15/00 240 360        SMALL ICELANDIC LETTER ETH                   
 * 15/01 241 361        LATIN SMALL LETTER n WITH TILDE
 * 15/02 242 362        LATIN SMALL LETTER o WITH GRAVE ACCENT
 * 15/03 243 363        LATIN SMALL LETTER o WITH ACUTE ACCENT
 * 15/04 244 364        LATIN SMALL LETTER o WITH CIRCUMFLEX ACCENT
 * 15/05 245 365        LATIN SMALL LETTER o WITH TILDE
 * 15/06 246 366        LATIN SMALL LETTER o WITH DIAERESIS
 * 15/07 247 367        DIVISION SIGN                                
 * 15/08 248 370        LATIN SMALL LETTER o WITH OBLIQUE STROKE
 * 15/09 249 371        LATIN SMALL LETTER u WITH GRAVE ACCENT
 * 15/10 250 372        LATIN SMALL LETTER u WITH ACUTE ACCENT
 * 15/11 251 373        LATIN SMALL LETTER u WITH CIRCUMFLEX ACCENT
 * 15/12 252 374        LATIN SMALL LETTER u WITH DIAERESIS
 * 15/13 253 375        LATIN SMALL LETTER y WITH ACUTE ACCENT       
 * 15/14 254 376        SMALL ICELANDIC LETTER THORN                 
 * 15/15 255 377        LATIN SMALL LETTER y WITH DIAERESIS          
 */


/*
 * translate[c] is the collating value of ISO Latin-1 character code c (row
 * label = high hex digit, column header = low hex digit).  Accented letters
 * in rows C-F carry the value of their unaccented ASCII letter; entries
 * written as numbers map to themselves.
 *
 * The table is private to this file (static), like the accent[] tables and
 * the IBM PC variant of translate[].
 */
static unsigned char translate[256] = {
/*     0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F  */
/*     -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -  */
/*0*/  0 ,' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
/*1*/ ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
/*2*/ ' ','!',34, '#','$','%','&',39, '(',')','*','+',',','-','.','/',
/*3*/ '0','1','2','3','4','5','6','7','8','9',':',';','<','=','>','?',
/*4*/ '@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
/*5*/ 'P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_',
/*6*/ '`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
/*7*/ 'p','q','r','s','t','u','v','w','x','y','z','{','|','}','~',127,

/*8*/ ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
/*9*/ ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
/*A*/ 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
/*B*/ 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
/*C*/ 'A','A','A','A','A','A',198,'C','E','E','E','E','I','I','I','I',
/*D*/ 208,'N','O','O','O','O','O',215,'O','U','U','U','U','Y',222,'s',
/*E*/ 'a','a','a','a','a','a',230,'c','e','e','e','e','i','i','i','i',
/*F*/ 240,'n','o','o','o','o','o',247,'o','u','u','u','u','y',254,'y'
};

/*
 * accent[c] classifies ISO Latin-1 character code c (row label = high hex
 * digit, column header = low hex digit):
 *	0  the character is dropped before any comparison
 *	   (0x00-0x1F, 0x7F-0xBF, and 0xD7, 0xDE, 0xF7, 0xFE)
 *	1  plain character (0x20-0x7E)
 *	2  accented letter; ranks after the plain letter that has the same
 *	   translate[] value
 *	3  used only for 0xC6 and 0xE6 (the AE diphthongs)
 */
static unsigned char accent[256] = {
/*     0  1  2  3   4  5  6  7      8  9  A  B   C  D  E  F  */
/*     -  -  -  -   -  -  -  -      -  -  -  -   -  -  -  -  */
/*0*/  0, 0, 0, 0,  0, 0, 0, 0,     0, 0, 0, 0,  0, 0, 0, 0, /* control */
/*1*/  0, 0, 0, 0,  0, 0, 0, 0,     0, 0, 0, 0,  0, 0, 0, 0, /* chars   */
/*2*/  1, 1, 1, 1,  1, 1, 1, 1,     1, 1, 1, 1,  1, 1, 1, 1, /* alphanumerics */
/*3*/  1, 1, 1, 1,  1, 1, 1, 1,     1, 1, 1, 1,  1, 1, 1, 1,
/*4*/  1, 1, 1, 1,  1, 1, 1, 1,     1, 1, 1, 1,  1, 1, 1, 1,
/*5*/  1, 1, 1, 1,  1, 1, 1, 1,     1, 1, 1, 1,  1, 1, 1, 1,
/*6*/  1, 1, 1, 1,  1, 1, 1, 1,     1, 1, 1, 1,  1, 1, 1, 1,
/*7*/  1, 1, 1, 1,  1, 1, 1, 1,     1, 1, 1, 1,  1, 1, 1, 0, /* DEL */

/*8*/  0, 0, 0, 0,  0, 0, 0, 0,     0, 0, 0, 0,  0, 0, 0, 0, /* control */
/*9*/  0, 0, 0, 0,  0, 0, 0, 0,     0, 0, 0, 0,  0, 0, 0, 0, /* chars   */
/*A*/  0, 0, 0, 0,  0, 0, 0, 0,     0, 0, 0, 0,  0, 0, 0, 0, /* punctuation */
/*B*/  0, 0, 0, 0,  0, 0, 0, 0,     0, 0, 0, 0,  0, 0, 0, 0, /* punctuation */
/*C*/  2, 2, 2, 2,  2, 2, 3, 2,     2, 2, 2, 2,  2, 2, 2, 2, /* A C, E I */
/*D*/  2, 2, 2, 2,  2, 2, 2, 0,     2, 2, 2, 2,  2, 2, 0, 2, /* O N U Y */
/*E*/  2, 2, 2, 2,  2, 2, 3, 2,     2, 2, 2, 2,  2, 2, 2, 2, /* a c, e i */
/*F*/  2, 2, 2, 2,  2, 2, 2, 0,     2, 2, 2, 2,  2, 2, 0, 2  /* o n u y */
};

#endif

/*
 * REDUCE(ORIG, ACCENTS, ASCII)
 *
 *	Split the NUL-terminated string at ORIG into two NUL-terminated
 *	strings of equal length:
 *	    ASCII   receives translate[c] for every character c that is kept,
 *	    ACCENTS receives accent[c] for the same characters.
 *	Characters whose accent[] entry is 0 are dropped.  When
 *	BRACKET_COMMENTS is defined, everything from a '[' up to and including
 *	the next ']' (or up to the end of the string) is dropped as well.
 *
 *	ORIG must be a modifiable pointer; it is advanced to the terminating
 *	NUL.  ACCENTS and ASCII must each have room for strlen(ORIG) + 1 bytes.
 *
 *	Every byte is converted to unsigned char before it is used as a table
 *	index, so the macro is safe for plain (possibly signed) char strings.
 *	The do { } while (0) wrapper makes an invocation behave as a single
 *	statement, e.g. between "if" and "else".
 */
#ifdef BRACKET_COMMENTS
#define REDUCE(ORIG, ACCENTS, ASCII)	do {				\
    char *rd_acc = (ACCENTS);						\
    char *rd_txt = (ASCII);						\
    unsigned char rd_ch;						\
    while ((rd_ch = (unsigned char)*(ORIG)) != 0) {			\
	if (rd_ch == '[') {						\
	    while (*(ORIG) && *(ORIG) != ']')				\
		(ORIG)++;	/* ignore anything within []'s */	\
	    if (*(ORIG))						\
		(ORIG)++;	/* skip trailing ] */			\
	    continue;							\
	}								\
	if (accent[rd_ch]) {						\
	    *rd_acc++ = (char)accent[rd_ch];				\
	    *rd_txt++ = (char)translate[rd_ch]; /* set collating seq */	\
	}								\
	(ORIG)++;							\
    }									\
    *rd_acc = *rd_txt = '\0';						\
} while (0)
#else
#define REDUCE(ORIG, ACCENTS, ASCII)	do {				\
    char *rd_acc = (ACCENTS);						\
    char *rd_txt = (ASCII);						\
    unsigned char rd_ch;						\
    while ((rd_ch = (unsigned char)*(ORIG)) != 0) {			\
	if (accent[rd_ch]) {						\
	    *rd_acc++ = (char)accent[rd_ch];				\
	    *rd_txt++ = (char)translate[rd_ch]; /* set collating seq */	\
	}								\
	(ORIG)++;							\
    }									\
    *rd_acc = *rd_txt = '\0';						\
} while (0)
#endif

/*
 * MALLOC(pointer, type, size)
 *	Allocate `size' bytes and store the result, cast to (type *), in
 *	`pointer'.  If the allocation fails, a message naming the pointer and
 *	the size expression is written to stderr and the program is terminated
 *	with a non-zero exit status.
 *
 * FREE(pointer)
 *	Release storage obtained with MALLOC.
 *
 * Both macros expand to a single statement without a trailing semicolon, so
 * "MALLOC(...);" and "FREE(...);" may be used anywhere a statement may.
 */
#define MALLOC( pointer, type, size )	do {				\
    (pointer) = ( type *)malloc((size_t)(size));			\
    if ((pointer) == (type *)NULL) {					\
	(void)fprintf(stderr, "\n MALLOC returned NULL:  %s (%s)\n",	\
		      #pointer, #size);					\
	exit(1);							\
    }									\
} while (0)

#define FREE( pointer )		\
	free( (char *)(pointer) )

/*
 * stracmp - compare two strings, taking accents into account only to
 *	     break ties
 *
 *	s1, s2	NUL-terminated 8-bit strings; either may be a NULL pointer.
 *
 * Returns -1, 0 or 1 when s1 sorts before, equal to or after s2.  A NULL
 * pointer equals another NULL pointer and sorts before any string.
 *
 * Both strings are first reduced with REDUCE() to a "letters" string
 * (translate[] values) and an "accents" string (accent[] values).  The
 * letters are compared first; the accents are consulted only when the
 * letters are identical.
 *
 * One scratch buffer is obtained with MALLOC() and released before
 * returning; if that allocation fails, MALLOC() terminates the program.
 */
int stracmp(s1,s2)
unsigned char *s1, *s2;
{
    int value;
    size_t i1, i2;		/* length of given strings, including NUL */
    char *as1, *as2;		/* accent strings */
    char *ts1, *ts2;		/* strings with accent marks stripped */

    if (s1 == NULL || s2 == NULL) {
	if (s1 == s2)
	    return 0;		/* NULL == NULL :-) */
	if (s1 == NULL)
	    return -1;		/* NULL < "anything" */
	return 1;		/* "anything" > NULL */
    }

    i1 = strlen((char *)s1) + 1;
    i2 = strlen((char *)s2) + 1;

    /*
     * A single allocation holds all four reduced strings, laid out as
     *	as1 | ts1 | as2 | ts2
     * with one spare byte after each of the first three.
     */
    MALLOC(as1, char, 2 * (i1 + i2) + 4);
    ts1 = as1 + i1 + 1;
    as2 = ts1 + i1 + 1;
    ts2 = as2 + i2 + 1;

    REDUCE( s1, as1, ts1);	/* NB: advances s1 to its terminating NUL */
    REDUCE( s2, as2, ts2);	/* NB: advances s2 to its terminating NUL */

    value = strcmp(ts1, ts2);
    if (value == 0) {
	/*
	 *  The letters are identical, so the accents (if any) have to
	 *  break the tie.
	 */
	value = strcmp(as1, as2);
    }
    FREE(as1);

    /*
     * strcmp() only promises the sign of its result; reduce it to exactly
     * -1, 0 or 1 so that every return path of this routine agrees.
     */
    return (value > 0) - (value < 0);
}


#ifdef MAIN

#ifndef ON_IBMPC
#include <ctype.h>

/*
 * crtaccents - write a string to stdout in a terminal-independent form
 *
 *	s	NUL-terminated 8-bit string; a NULL pointer prints nothing.
 *
 * Carriage return, backspace, tab and form feed are written as the escapes
 * \r, \b, \t and \f.  Any other byte below 0x80 is written unchanged, and
 * every byte from 0x80 up is written as a backslash followed by three octal
 * digits.
 */
void crtaccents(s)
unsigned char *s;
{
    if (!s)
	return;

    for ( ; *s; s++) {
	switch( *s ) {
	    case '\r':	(void)printf("\\r");		break;
	    case '\b':	(void)printf("\\b");		break;
	    case '\t':	(void)printf("\\t");		break;
	    case '\f':	(void)printf("\\f");		break;
	    default  :
		/* plain range test instead of the non-ISO isascii() */
		if (*s < 0x80)
		    (void)putchar(*s);
		else
		    (void)printf("\\%03o",*s);
		break;
	}
    }
}
#endif

/*
 * COMPARE(check, s1, s2)
 *	Self-test helper: print s1, the relation stracmp() found, and s2,
 *	followed by "OK" when the sign of the result matches `check'
 *	(-1, 0 or 1) and by "WRONG!!!" otherwise.  Only the sign is compared,
 *	so the verdict does not depend on the magnitude of the value returned.
 *
 *	Needs an int variable named `result' in the enclosing scope.
 */
#define COMPARE( check, s1, s2 )	do {				\
    (void)PRINT(s1) ;							\
    result = stracmp( (unsigned char *)(s1), (unsigned char *)(s2) );	\
    if (result < 0) (void)printf(" < ");				\
    else if (result > 0) (void)printf(" > ");				\
    else (void)printf(" = ");						\
    (void)PRINT(s2); (void)putchar('\t');				\
    if (((result > 0) - (result < 0)) == (check)) (void)printf("OK\n");	\
    else (void)printf("WRONG!!!\n");					\
} while (0)

/*
 * Confidence test: run a fixed set of comparisons through stracmp() and
 * print each pair with the relation found and an OK / WRONG!!! verdict.
 * Always returns 0.
 */
int main() {
    int  result;		/* written by every COMPARE() */
    
    (void)printf("stracmp demo - version %s\n", VERSION);
    
    /* These tests assume IBM ROM tables */

    COMPARE( 0,"John Plocher", "John Plocher");			/* = */
    COMPARE( 0,"John[ Michael] Plocher[@hobbes.UUCP]", "John Plocher");	/* = */
    COMPARE( 0,"John Plocher", "John[ Michael] Plocher");	/* = */
    COMPARE(-1,"John Plocher", "J\242hn Plocher");		/* < */
    COMPARE( 1,"J\242hn Pl\242cher", "J\242hn Plocher");	/* > */
    COMPARE( 1,"J\242hn P\242lcher", "J\242hn Pl\242cher");	/* > */
    COMPARE( 0,"J\242hn Pl\242cher", "J\242hn Pl\242cher");	/* = */
    COMPARE( 1,"J\242\242n Pl\242cher", "J\242hn Pl\242cher");	/* > */
    return(0);
}

#endif

-- 
John Plocher uwvax!geowhiz!uwspan!plocher  plocher%uwspan.UUCP@uwvax.CS.WISC.EDU