[comp.text.tex] new version of the ascii-german -> diacritics conversion program

NEUMANN@awiwuw11.wu-wien.ac.at (Gustaf Neumann) (11/19/90)

Below you will find a new version of the ascii-german to diacritical
german conversion program. It is still not perfect, but I think it is
pretty good by now. I was able to translate all German words in the book

\bibitem[{Neu88}]{neumann88}
        G.~Neumann: \T{Metaprogrammierung und Prolog},
        Addison--Wesley, Bonn 1988.

correctly from diacritics -> ascii german -> diacritics.

There are several known problems such as "Masse" ("im hohen Masse" vs.
"Gesteinsmasse" ) and "Busse" ("Autobusse" vs. "tuet Busse"). In both
cases the first variant is assumed to be correct. You can achieve the
other alternatives by writing "Gesteinsmas{}se" and "tuet Bu{}sse"
resp. Reports of other mis-translated words are welcome.

-Gustaf Neumann
-------------------------------------------------------------------
Gustaf Neumann       neumann@dec4.wu-wien.ac.at, neumann@awiwuw11.bitnet
Vienna University of Economics and Business Administration
Augasse 2-6,  A-1090 Vienna, Austria
Tel: +43 (222) 31-336 x4533     Fax 347-555

------------------------------------- cut here -----diac.shar-----------
# This is a shell archive.  Remove anything before this line,
# then unpack it by saving it in a file and typing "sh file".
#
# Wrapped by neumann on Sun Nov 18 23:20:06 1990
#
# This archive contains:
#	diac.l		Makefile	diacaux.c	diacaux.h	
#

LANG=""; export LANG

echo x - diac.l
cat >diac.l <<'@EOF'
%{

/* diac.l
 * lex file for converting Ascii German into diacritical German
 * Version 1.0 written by
 *  Dorai Sitaram, Rice University, 1990   dorai@titan.rice.edu
 *
 * Version 1.1:
 * General rewrite, using some Material from
 *    H.Kaeslin, Behandlung der Umlaute bei der Verarbeitung deutscher
 *    Texte unter Unix, in: it, Vol 1, 1988
 * and Duden - die Rechtschreibung.
 *
 * Gustaf Neumann, Wirtschaftsuniversitaet Wien, October 1990
 * neumann@dec4.wu-wien.ac.at         neumann@awiwuw11.bitnet
 *
 * The resulting LaTeX file uses german.sty!
 * Representation of umlaut characters:    \"a \"A \"o \"O \"u \"U {\ss}
 * The style file german.sty would allow    "a  "A  "o  "O  "u  "U "s
 * as well, but the latter representation makes it impossible
 * to distinguish between umlaut characters and quoted text. This distinction
 * is necessary in cases where quotes should be changed into opening and
 * closing german quotes (\glqq and \grqq) in an automated way (another
 * lex program).
 *
 * If you do NOT want to use GERMAN.STY, replace underneath the ruleset
 * for \documentstyle with the following rule:

\\documentstyle[^\}]*\}	{ printf("%s\n", yytext);
			printf("\\newskip\\zeeskip\n");
			printf("\\zeeskip=0pt plus0pt minus0pt\n");
			printf("\\def\\1{\\nobreak\\hskip\\zeeskip}\n");
			printf("\\let\\umlaut\\\"\n");
			printf("\\def\\\"#1{\\1\\umlaut#1\\1}\n");
			printf("\\let\\oldss\\ss\n");
			printf("\\def\\ss{\\1\\oldss\\1}\n"); }
 *
 *
 * To prevent the conversion from Ascii German into diacritical German,
 * it is necessary to insert empty groups into the words (e.g. Ka{}eslin).
 */

#include "diacaux.h"
int i;	/* scratch index into yytext, used by the \documentstyle[...] rule */
%}

 /* Table-size declarations; presumably raised so that lex can hold the
    many rules below -- see the lex documentation for each letter. */
%p 6500
%n 1000
%e 2500
%a 4000
%k 2500
%o 3500


 /* Named character classes used in the rules:
      V  vowels, upper and lower case
      C  every other letter (consonants), upper and lower case
      W  word delimiters: blank, quotes, tab, newline, punctuation
      b  blank, tab or newline */
V	[AEIOUaeiou]
C	[B-DF-HJ-NP-TV-Zb-df-hj-np-tv-z]
W	[ "'\t\n,;!?().]
b       [ \t\n]

%%

\\documentstyle{b}*\{   printf("\\documentstyle[german]{");
\\documentstyle{b}*\[.*german.*\]{b}*\{   ECHO;
\\documentstyle{b}*\[.*\]{b}*\{ {
                          /* An option list without "german": insert it as
                             the first option.  "\documentstyle" occupies
                             yytext[0..13]; skip any blanks after it until
                             the '[' that the pattern guarantees, then
                             print everything following that bracket. */
                          for(i=14;yytext[i]!='[';i++);
                          printf("\\documentstyle[german,%s",&yytext[i+1]);}



\\input{b}*\{[^\}]+\}	{ /* \input{file}: "\input" is 6 characters, so
			   &yytext[6] starts at the optional blanks and
			   the opening brace.  The named file is converted
			   into a temporary copy and the \input is
			   redirected to that copy. */
			texfile = getfilenamebrack(&yytext[6]);
			tempfile = maketempfilename(texfile);
			printf("\\input{%s}", tempfile);
			dosubdiac(texfile, tempfile); }

\\input{b}*[^ \t\n\{][^ \t\n]*	{ /* \input file (no braces).  The name must not
			   start with '{': otherwise this rule would also
			   match "\input{a}\input{b}" and, being the longer
			   match, would win over the braced rule with a
			   bogus file name. */
			texfile = getfilename(&yytext[6]);
			tempfile = maketempfilename(texfile);
			printf("\\input %s", tempfile);
			dosubdiac(texfile, tempfile); }

\\begin\{[^\}\n]+\}	ECHO; /* environment name only: stop at the first '}' so text after it on the line is still converted */
\\end\{[^\}\n]+\}	ECHO; /* same for \end{...} */
\\[A-Za-z]+	ECHO; /* control sequences are copied unchanged */


%{ /* ue: exceptions to the generic two-character rule [AaOoUu]e near the
      end of the rule set, which turns "ue" into \"u.  Rules whose action
      is ECHO copy the matched text unchanged; the printf rules write out
      a hand-made conversion.  Each pattern here matches more characters
      than the generic rule, and lex prefers the longest match. */
%}

[Rr]euessier 	printf("%ce\\\"ussier", yytext[0]);
[^igGbB][Ee]ue 	ECHO;
[QqAa]ue 	ECHO;
[Uu]e[iu] 	ECHO;
[Gg]etue{W}	ECHO;
[a-rt-z]tuend	ECHO;
{W}tuet{W}	ECHO;
[Nn]ichtstuend	ECHO;
[Nn]ichtstuer	ECHO;
Tuerei{W}	ECHO;
[a-z]tuerei	ECHO;
[a-z]tuerisch	ECHO;
[Aa]bzue[b-z][a-z]*[elr]n	ECHO;
[Aa]nzue[b-z][a-z]*[elr]n	ECHO;
[Aa]u[fs]zue[b-z][a-z]*[elr]n	ECHO;
[Ee]inzue[b-z][a-z]*[elr]n	ECHO;
[Hh]inzue[b-z][a-z]*[elr]n	ECHO;
[Mm]itzue[b-z][a-z]*[elr]n	ECHO;
[Nn]achzue[b-z][a-z]*[elr]n	ECHO;
[Vv]orzue[b-z][a-z]*[elr]n	ECHO;
[Ww]iederzue[b-z][a-z]*[elr]n	ECHO;
[Zz]ue[b-z][a-z]*[elr]n		ECHO;
[Zz]urueckzue[b-z][a-z]*[elr]n	printf("%cur\\\"uckzu%s",yytext[0],&yytext[9]);
tuendere	ECHO;
[Aa]biguen	ECHO;
[Aa]ffluen	ECHO;
[Bb]u[ea]nos	ECHO;
[Dd]uett	ECHO;
[Dd]uell	ECHO;
entuell 	ECHO;
[Gg]raduell	ECHO;
[Gg]uerill	ECHO;
[Ii]ndividuen	ECHO;
[Ii]nfluen	ECHO;
Lueger	        ECHO;
[krx]tuell	ECHO;
[Kk]ongruen	ECHO;
[Kk]onstituen	ECHO;
[Mm]enuett	ECHO;
[Mm]anuell	ECHO;
[Mm]igue[tl]	ECHO;
[Pp]irouett	ECHO;
[Pp]uerto	ECHO;
[Rr]esiduen	ECHO;
[Ss]tatue	ECHO;
[Ss]exuell	ECHO;
[Ss]uez		ECHO;
[Vv]enezuel	ECHO;
[Vv]isuell	ECHO;
[Zz]uerkannt	ECHO;
[Zz]uerteil	ECHO;
[Zz]uerst	ECHO;


%{ /* ae: words in which "ae" is copied unchanged instead of becoming \"a
      (the list also holds a few "ue" words: Samuel, Value, True).
      The one printf rule rewrites "aeuel" as \"auel so that only the
      "ae" is converted and the following "ue" is left alone. */
%}

[Aa]ero		ECHO;
[Dd]odekae	ECHO;
[Hh]exae	ECHO;
[Ii]kosae	ECHO;
[Ii]srael	ECHO;
[Kk]afkaesk	ECHO;
aeuel           printf("\\\"auel");
[Mm]ichael	ECHO;
[Mm]etae	ECHO;
[Oo]ctae	ECHO;
[Pp]entae	ECHO;
[Pp]harmae	ECHO;
[Rr]affael	ECHO;
[Rr]afael	ECHO;
[Rr]aphael	ECHO;
[Tt]etrae	ECHO;
[Tt]hemae	ECHO;
[Ss]chemae	ECHO;
[Ss]amuel	ECHO;
[Vv]alue{W}	ECHO;
[Tt]rue{W}	ECHO;


%{ /* oe: words in which "oe" is copied unchanged instead of becoming \"o.
      Several patterns stop at the bare "o" (e.g. [Pp]hoto): echoing the
      prefix consumes that "o", so it can no longer pair with an "e"
      that follows it. */
%}

[Aa]utoe	ECHO;
[Bb]enzoe	ECHO;
[Cc]hemoe	ECHO;
[Dd]iarrhoea	ECHO;
[Ee]lektroe	ECHO;
[Gg]oethe	ECHO;
[Hh]eroen 	ECHO;
[Hh]o[ml]oe 	ECHO;
[Hh]ydroe 	ECHO;
[Ii]ndoeuro	ECHO;
Joel	        ECHO;
[Kk]inoe 	ECHO;
[Kk]oedukat 	ECHO;
[Kk]oeffizi 	ECHO;
[Kk]oerzi 	ECHO;
[Kk]oexist 	ECHO;
[Cc]oexist 	ECHO;
[Kk]oenzym 	ECHO;
[Kk]ontoe 	ECHO;
[Ss]oeben 	ECHO;
Soest   	ECHO;
[Mm]etazoe 	ECHO;
[Mm][ai][ck]roe ECHO;
[Mm]onoe 	ECHO;
[Nn]euroe 	ECHO;
[Oo]boe 	ECHO;
[Oo]erlikon 	ECHO;
[Oo]ldesloe	ECHO;
[Oo]kto 	ECHO;
[Oo]pto 	ECHO;
[Pp]oesie 	ECHO;
[Pp]oebene 	ECHO;
[Pp]iezo 	ECHO;
[Pp]hoto 	ECHO;
[Pp]hysioe 	ECHO;
[Pp]oe[mt]i 	ECHO;
[Pp]oe[mt][^a-z]	ECHO;
[Pp]orto 	ECHO;
[Pp]roenzy	ECHO;
[Pp]roto	ECHO;
[Pp]rotozoe 	ECHO;
[Pp]seudo 	ECHO;
[Pp]sycho 	ECHO;
[Pp]yro 	ECHO;
[Rr]adio 	ECHO;
[Tt]otoer	ECHO;
[Tt]urbo	ECHO;
[Vv]ideo	ECHO;


%{ /* ss: rules that decide between keeping "ss" and writing {\ss}.
      ECHO rules keep the matched text; printf rules emit the converted
      form, re-inserting the characters of yytext that surround the
      replaced part.  The section ends with the two generic fall-back
      rules of the whole file: [AaOoUu]e -> umlaut and ss -> {\ss}. */
%}

{V}sss		printf("%c{\\ss}s",yytext[0]);
[EeAu][iu]ss	printf("%c%c{\\ss}", yytext[0],yytext[1]);
{C}{V}sser{W}	ECHO;
{C}{V}sser{V}	ECHO;
{C}{V}ssen	ECHO;
[^r]uesse[ln] 	printf("%c\\\"usse%c",yytext[0],yytext[6]);
luesse 		printf("l\\\"usse");
iess		printf("ie{\\ss}");
ssung 		ECHO;
ssel 		ECHO;
ssoren 		ECHO;
ssiez 		ECHO;
ccess 		ECHO;
ssidy 		ECHO;
chss 		ECHO;
ssch 		ECHO;
sspr 		ECHO;
ssier 		ECHO;
nisse		ECHO;
lss 		ECHO;
ss' 		ECHO;
tionss		ECHO;
tss		ECHO;
ussisch		ECHO;
ungss		ECHO;
usserl{W}	ECHO;
[Aa]ssoz	ECHO;
[Aa]ssist	ECHO;
[Aa]ssemb	ECHO;
[Aa]uss[^e]	ECHO;
[Aa]usse[^rn]	ECHO;
[Aa]ussende	ECHO;
[Ee]sse		ECHO;
[Bb]isschen	printf("%ci{\\ss}chen", yytext[0]);
[Bb]usiness	ECHO;
[Bb]usse	ECHO;
[Bb]ussard	ECHO;
triebss		ECHO;
beitss		ECHO;
[Dd]iskussion	ECHO;
[Dd]issert	ECHO;
[Dd]asselb	ECHO;
[Ee]ssi		ECHO;
[Ff]lusse	ECHO;
[Ff]luess[ie]	printf("%cl\\\"uss%c", yytext[0],yytext[6]);
Grass		ECHO;
[Gg]enosse	ECHO;
[Gg]rosse       printf("%cro{\\ss}e",yytext[0]);
[Ii]nteress	ECHO;
[Kk]lass[ie]	ECHO;
[Kk]assette	ECHO;
[Ll]asse	ECHO;
[Ll]aessig	printf("%c\\\"assig", yytext[0]);
[Mm]assa[^nr]	ECHO;
[Mm]asseu	ECHO;
[Mm]isser{C}	printf("%ci{\\ss}er%c", yytext[0],yytext[6]);
[Mm]iss[ei]	ECHO;
[Ee]rmassen	printf("%crma{\\ss}en", yytext[0]);
[Mm]assi	ECHO;
[Pp]rivatissi	ECHO;
[Pp]assiv	ECHO;
[Pp]rozessor	ECHO;
[Ss]tossen	printf("%cto{\\ss}en", yytext[0]);
[Rr]essource	ECHO;
[Ww][ia]sse	ECHO;

{C}ss{C}	ECHO;

[AaOoUu]e	printf("\\\"%c", yytext[0]);
ss		printf("{\\ss}");

@EOF

chmod 644 diac.l

echo x - Makefile
cat >Makefile <<'@EOF'
#
# if you do not have flex available, deactivate the definitions of
# LEX and LEXLIB; The program compiled with flex works also with the
# standard lex library (-ll).
#
# Scanner generator and the support library it needs at link time.
LEX=flex
LEXLIB=-lfl
PROGS= diac

all: ${PROGS}

# Generate the scanner, compile and link it with the helpers, then remove
# the intermediates.  The rm uses -f because compilers that compile and
# link in a single step may not leave any .o files behind; a plain rm
# would then fail and make would report an error for a successful build.
diac: diac.l diacaux.h diacaux.c
	${LEX} ${LFLAGS} diac.l
	cc -O ${DEFINES} -o $@ diacaux.c lex.yy.c ${LEXLIB}
	strip $@
	rm -f lex.yy.c lex.yy.o diacaux.o

# An unprotected '#' at the start of a word begins a comment, which used
# to hide both the "#*" pattern and "core" from rm.  Writing the pattern
# as ./\#* keeps the '#' literal whether or not make strips the backslash.
clean:
	rm -f ${PROGS} *.o *~ ./\#* core


shar:
	shar diac.l Makefile diacaux.c diacaux.h > diac.shar
@EOF

chmod 644 Makefile

echo x - diacaux.c
cat >diacaux.c <<'@EOF'
/* diacaux.c
 * to be linked with lex.yy.c from diac.l
 * written by Dorai Sitaram, Rice University, 1990
 */

#include <stdlib.h>
#include <string.h>

#include "diacaux.h"

/*
 * slen - return the number of characters in s before the terminating NUL.
 *
 * The scan starts at index 0 so that the empty string yields 0; starting
 * at index 1 would step over the terminator of "" and read past the end
 * of the array.
 */
int slen(s)
char *s;
{
  int i;

  for (i = 0; s[i] != '\0'; i++)
	;

  return i;
}

/*
 * strap - return a newly allocated string holding s followed by t.
 *
 * The buffer is sized for both strings plus the terminating NUL.
 * Returns NULL when either argument is NULL or when the allocation
 * fails, so that a failure propagates through nested calls.
 * The caller owns the result and releases it with free().
 */
char *strap(s,t)
char *s,*t;
{
  size_t ns, nt;
  char *r;

  if (s == NULL || t == NULL)
	return NULL;

  ns = strlen(s);
  nt = strlen(t);

  r = (char *) malloc(ns + nt + 1);
  if (r == NULL)
	return NULL;

  memcpy(r, s, ns);
  memcpy(r + ns, t, nt + 1);	/* nt + 1 also copies t's terminating NUL */

  return r;
}

/*
 * getfilename - extract the file name from the text following an
 * unbraced \input.
 *
 * Leading blanks, tabs and newlines are skipped; the rest of s is
 * returned as a newly allocated string (the caller frees it).  The buffer
 * is sized from the text that is actually copied plus its NUL, so a name
 * without leading whitespace no longer overruns the allocation.
 * Returns NULL when s is NULL or the allocation fails.
 */
char *getfilename(s)
char *s;
{
  char *r;

  if (s == NULL)
	return NULL;

  while (*s == ' ' || *s == '\t' || *s == '\n')
	s++;

  r = (char *) malloc(strlen(s) + 1);
  if (r == NULL)
	return NULL;

  strcpy(r, s);

  return r;
}
			
/*
 * getfilenamebrack - extract the file name from the text following a
 * braced \input, e.g. " {chapter1}".
 *
 * Leading blanks, tabs, newlines and opening braces are skipped; the
 * characters up to (not including) the next '}' are returned as a newly
 * allocated string (the caller frees it).  If no '}' is present the copy
 * stops at the end of s instead of running past the terminator.
 * Returns NULL when s is NULL or the allocation fails.
 */
char *getfilenamebrack(s)
char *s;
{
  char *r;
  size_t n;

  if (s == NULL)
	return NULL;

  while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '{')
	s++;

  n = strcspn(s, "}");		/* length up to '}' or end of string */

  r = (char *) malloc(n + 1);
  if (r == NULL)
	return NULL;

  memcpy(r, s, n);
  r[n] = '\0';

  return r;
}

/*
 * maketempfilename - build the path of the temporary copy for file s.
 *
 * The result is "/tmp/" followed by s with every '/' replaced by '_',
 * returned as a newly allocated string (the caller frees it).  It is
 * built in a single buffer sized for prefix, name and NUL, so nothing
 * is overrun and no intermediate copy is leaked.
 * Returns NULL when s is NULL or the allocation fails.
 *
 * NOTE(review): the resulting name in /tmp is predictable; confirm this
 * is acceptable where the program is used.
 */
char *maketempfilename(s)
char *s;
{
  static char prefix[] = "/tmp/";
  size_t plen = sizeof prefix - 1;
  size_t n, i;
  char *r;

  if (s == NULL)
	return NULL;

  n = strlen(s);

  r = (char *) malloc(plen + n + 1);
  if (r == NULL)
	return NULL;

  memcpy(r, prefix, plen);

  for (i = 0; i < n; i++)
	r[plen + i] = (s[i] == '/') ? '_' : s[i];

  r[plen + n] = '\0';

  return r;
}

/*
 * dosubdiac - convert file s into file t by running "diac <s > t"
 * through system().
 *
 * The command is built from the parameters (not from the texfile and
 * tempfile globals) in one buffer that is freed after use.  Nothing is
 * run when either name is NULL or the buffer cannot be allocated.
 * The exit status of the command is not examined.
 *
 * NOTE(review): the names are pasted into a shell command without
 * quoting, and they come from the document being converted -- a name
 * containing shell metacharacters will be interpreted by the shell.
 */
void dosubdiac(s,t)
char *s,*t;
{
  static char prog[] = "diac <";
  static char redir[] = " > ";
  char *cmd;

  if (s == NULL || t == NULL)
	return;

  /* both literals without their NULs, both names, one final NUL */
  cmd = (char *) malloc((sizeof prog - 1) + strlen(s) +
			(sizeof redir - 1) + strlen(t) + 1);
  if (cmd == NULL)
	return;

  strcpy(cmd, prog);
  strcat(cmd, s);
  strcat(cmd, redir);
  strcat(cmd, t);

  system(cmd);

  free(cmd);
}
@EOF

chmod 644 diacaux.c

echo x - diacaux.h
cat >diacaux.h <<'@EOF'
/* diacaux.h
 * to be included in diac.l and diacaux.c
 * written by Dorai Sitaram, Rice University, 1990
 */

char *texfile;
char *tempfile;
int slen();
char *strap();
char *getfilename();
char *getfilenamebrack();
char *maketempfilename();
void dosubdiac();
@EOF

chmod 644 diacaux.h

exit 0