NEUMANN@awiwuw11.wu-wien.ac.at (Gustaf Neumann) (11/19/90)
Below you will find a new version of the ascii-german to diacritical german conversion program. It is still not perfect, but I think it is pretty good by now. I was able to translate all German words in the book \bibitem[{Neu88}]{neumann88} G.~Neumann: \T{Metaprogrammierung und Prolog}, Addison--Wesley, Bonn 1988. correctly from diacritics -> ascii german -> diacritics. There are several known problems such as "Masse" ("im hohen Masse" vs. "Gesteinsmasse") and "Busse" ("Autobusse" vs. "tuet Busse"). In both cases the first variant is assumed to be correct. You can achieve the other alternatives by writing "Gesteinsmas{}se" and "tuet Bu{}sse" resp. Reports of other mis-translated words are welcome. -Gustaf Neumann ------------------------------------------------------------------- Gustaf Neumann neumann@dec4.wu-wien.ac.at, neumann@awiwuw11.bitnet Vienna University of Economics and Business Administration Augasse 2-6, A-1090 Vienna, Austria Tel: +43 (222) 31-336 x4533 Fax 347-555 ------------------------------------- cut here -----diac.shar----------- # This is a shell archive. Remove anything before this line, # then unpack it by saving it in a file and typing "sh file". # # Wrapped by neumann on Sun Nov 18 23:20:06 1990 # # This archive contains: # diac.l Makefile diacaux.c diacaux.h # LANG=""; export LANG echo x - diac.l cat >diac.l <<'@EOF' %{ /* diac.l * lex file for converting Ascii German into diacritical German * Version 1.0 written by * Dorai Sitaram, Rice University, 1990 dorai@titan.rice.edu * * Version 1.1: * General rewrite, using some Material from * H.Kaeslin, Behandlung der Umlaute bei der Verarbeitung deutscher * Texte unter Unix, in: it, Vol 1, 1988 * and Duden - die Rechtschreibung. * * Gustaf Neumann, Wirtschaftsuniversitaet Wien, October 1990 * neumann@dec4.wu-wien.ac.at neumann@awiwuw11.bitnet * * The resulting LaTeX file uses german.sty! 
* Representation of umlaut characters: \"a \"A \"o \"O \"u \"U {\ss} * The style file german.sty would allow "a "A "o "O "u "U "s * as well, but the latter representation makes it impossible to * distinguish between umlaut characters and quoted text. This distinction * is necessary in cases where quotes should be changed into opening and * closing german quotes (\glqq and \grqq) in an automated way (another * lex program). * * If you do NOT want to use GERMAN.STY, replace the ruleset * for \documentstyle below with the following rule: \\documentstyle[^\}]*\} { printf("%s\n", yytext); printf("\\newskip\\zeeskip\n"); printf("\\zeeskip=0pt plus0pt minus0pt\n"); printf("\\def\\1{\\nobreak\\hskip\\zeeskip}\n"); printf("\\let\\umlaut\\\"\n"); printf("\\def\\\"#1{\\1\\umlaut#1\\1}\n"); printf("\\let\\oldss\\ss\n"); printf("\\def\\ss{\\1\\oldss\\1}\n"); } * * * To prevent the conversion from Ascii German into diacritical German, * it is necessary to insert empty groups into the words (e.g. Ka{}eslin). */ #include "diacaux.h" int i; %} %p 6500 %n 1000 %e 2500 %a 4000 %k 2500 %o 3500 V [AEIOUaeiou] C [B-DF-HJ-NP-TV-Zb-df-hj-np-tv-z] W [ "'\t\n,;!?().] 
b [ \t\n]
%%
%{
/* Rule selection relies on lex semantics: the longest match wins and,
 * for matches of equal length, the rule listed first wins.  The many
 * ECHO rules below therefore protect words (or word fragments) from the
 * generic conversion rules at the very end of the file.
 *
 * LaTeX commands:
 *  - \documentstyle gets "german" added to its option list unless the
 *    option list already mentions it;
 *  - \input files are converted into a copy under /tmp by running diac
 *    on them, and the \input is redirected to that copy;
 *  - \begin{...}, \end{...} and all other control words are copied
 *    unchanged.
 */
%}
\\documentstyle{b}*\{                       printf("\\documentstyle[german]{");
\\documentstyle{b}*\[.*german.*\]{b}*\{     ECHO;
\\documentstyle{b}*\[.*\]{b}*\{             { /* The pattern permits white space before '[', so
                                               * search for the bracket instead of assuming that
                                               * it directly follows "\documentstyle" (14 chars). */
                                              for (i = 14; yytext[i] != '['; i++)
                                                  ;
                                              printf("\\documentstyle[german,%s", &yytext[i+1]); }
\\input{b}*\{[^\}]+\}                       { texfile = getfilenamebrack(&yytext[6]);
                                              tempfile = maketempfilename(texfile);
                                              printf("\\input{%s}", tempfile);
                                              dosubdiac(texfile, tempfile); }
\\input{b}*[^ \t\n]+                        { texfile = getfilename(&yytext[6]);
                                              tempfile = maketempfilename(texfile);
                                              printf("\\input %s", tempfile);
                                              dosubdiac(texfile, tempfile); }
\\begin\{.+\}                               ECHO;
\\end\{.+\}                                 ECHO;
\\[A-Za-z]+                                 ECHO;
%{
/* ue: fragments in which "ue" must not (or only partly) become \"u */
%}
[Rr]euessier                    printf("%ce\\\"ussier", yytext[0]);
[^igGbB][Ee]ue                  ECHO;
[QqAa]ue                        ECHO;
[Uu]e[iu]                       ECHO;
[Gg]etue{W}                     ECHO;
[a-rt-z]tuend                   ECHO;
{W}tuet{W}                      ECHO;
[Nn]ichtstuend                  ECHO;
[Nn]ichtstuer                   ECHO;
Tuerei{W}                       ECHO;
[a-z]tuerei                     ECHO;
[a-z]tuerisch                   ECHO;
[Aa]bzue[b-z][a-z]*[elr]n       ECHO;
[Aa]nzue[b-z][a-z]*[elr]n       ECHO;
[Aa]u[fs]zue[b-z][a-z]*[elr]n   ECHO;
[Ee]inzue[b-z][a-z]*[elr]n      ECHO;
[Hh]inzue[b-z][a-z]*[elr]n      ECHO;
[Mm]itzue[b-z][a-z]*[elr]n      ECHO;
[Nn]achzue[b-z][a-z]*[elr]n     ECHO;
[Vv]orzue[b-z][a-z]*[elr]n      ECHO;
[Ww]iederzue[b-z][a-z]*[elr]n   ECHO;
[Zz]ue[b-z][a-z]*[elr]n         ECHO;
[Zz]urueckzue[b-z][a-z]*[elr]n  printf("%cur\\\"uckzu%s",yytext[0],&yytext[9]);
tuendere                        ECHO;
[Aa]biguen                      ECHO;
[Aa]ffluen                      ECHO;
[Bb]u[ea]nos                    ECHO;
[Dd]uett                        ECHO;
[Dd]uell                        ECHO;
entuell                         ECHO;
[Gg]raduell                     ECHO;
[Gg]uerill                      ECHO;
[Ii]ndividuen                   ECHO;
[Ii]nfluen                      ECHO;
Lueger                          ECHO;
[krx]tuell                      ECHO;
[Kk]ongruen                     ECHO;
[Kk]onstituen                   ECHO;
[Mm]enuett                      ECHO;
[Mm]anuell                      ECHO;
[Mm]igue[tl]                    ECHO;
[Pp]irouett                     ECHO;
[Pp]uerto                       ECHO;
[Rr]esiduen                     ECHO;
[Ss]tatue                       ECHO;
[Ss]exuell                      ECHO;
[Ss]uez                         ECHO;
[Vv]enezuel                     ECHO;
[Vv]isuell                      ECHO;
[Zz]uerkannt                    ECHO;
[Zz]uerteil                     ECHO;
[Zz]uerst                       ECHO;
%{
/* ae: fragments in which "ae" (or a neighbouring "ue") is not an umlaut */
%}
[Aa]ero                         ECHO;
[Dd]odekae                      ECHO;
[Hh]exae                        ECHO;
[Ii]kosae                       ECHO;
[Ii]srael                       ECHO;
[Kk]afkaesk                     ECHO;
aeuel                           printf("\\\"auel");
[Mm]ichael                      ECHO;
[Mm]etae                        ECHO;
[Oo]ctae                        ECHO;
[Pp]entae                       ECHO;
[Pp]harmae                      ECHO;
[Rr]affael                      ECHO;
[Rr]afael                       ECHO;
[Rr]aphael                      ECHO;
[Tt]etrae                       ECHO;
[Tt]hemae                       ECHO;
[Ss]chemae                      ECHO;
[Ss]amuel                       ECHO;
[Vv]alue{W}                     ECHO;
[Tt]rue{W}                      ECHO;
%{
/* oe: fragments in which "oe" is not an umlaut; prefixes ending in "o"
 * are consumed as a whole so that a following "e" is not joined to them */
%}
[Aa]utoe                        ECHO;
[Bb]enzoe                       ECHO;
[Cc]hemoe                       ECHO;
[Dd]iarrhoea                    ECHO;
[Ee]lektroe                     ECHO;
[Gg]oethe                       ECHO;
[Hh]eroen                       ECHO;
[Hh]o[ml]oe                     ECHO;
[Hh]ydroe                       ECHO;
[Ii]ndoeuro                     ECHO;
Joel                            ECHO;
[Kk]inoe                        ECHO;
[Kk]oedukat                     ECHO;
[Kk]oeffizi                     ECHO;
[Kk]oerzi                       ECHO;
[Kk]oexist                      ECHO;
[Cc]oexist                      ECHO;
[Kk]oenzym                      ECHO;
[Kk]ontoe                       ECHO;
[Ss]oeben                       ECHO;
Soest                           ECHO;
[Mm]etazoe                      ECHO;
[Mm][ai][ck]roe                 ECHO;
[Mm]onoe                        ECHO;
[Nn]euroe                       ECHO;
[Oo]boe                         ECHO;
[Oo]erlikon                     ECHO;
[Oo]ldesloe                     ECHO;
[Oo]kto                         ECHO;
[Oo]pto                         ECHO;
[Pp]oesie                       ECHO;
[Pp]oebene                      ECHO;
[Pp]iezo                        ECHO;
[Pp]hoto                        ECHO;
[Pp]hysioe                      ECHO;
[Pp]oe[mt]i                     ECHO;
[Pp]oe[mt][^a-z]                ECHO;
[Pp]orto                        ECHO;
[Pp]roenzy                      ECHO;
[Pp]roto                        ECHO;
[Pp]rotozoe                     ECHO;
[Pp]seudo                       ECHO;
[Pp]sycho                       ECHO;
[Pp]yro                         ECHO;
[Rr]adio                        ECHO;
[Tt]otoer                       ECHO;
[Tt]urbo                        ECHO;
[Vv]ideo                        ECHO;
%{
/* ss: fragments in which "ss" stays "ss", plus special cases that need
 * a hand-written mix of {\ss} and umlaut conversion */
%}
{V}sss                          printf("%c{\\ss}s",yytext[0]);
[EeAu][iu]ss                    printf("%c%c{\\ss}", yytext[0],yytext[1]);
{C}{V}sser{W}                   ECHO;
{C}{V}sser{V}                   ECHO;
{C}{V}ssen                      ECHO;
[^r]uesse[ln]                   printf("%c\\\"usse%c",yytext[0],yytext[6]);
luesse                          printf("l\\\"usse");
iess                            printf("ie{\\ss}");
ssung                           ECHO;
ssel                            ECHO;
ssoren                          ECHO;
ssiez                           ECHO;
ccess                           ECHO;
ssidy                           ECHO;
chss                            ECHO;
ssch                            ECHO;
sspr                            ECHO;
ssier                           ECHO;
nisse                           ECHO;
lss                             ECHO;
ss'                             ECHO;
tionss                          ECHO;
tss                             ECHO;
ussisch                         ECHO;
ungss                           ECHO;
usserl{W}                       ECHO;
[Aa]ssoz                        ECHO;
[Aa]ssist                       ECHO;
[Aa]ssemb                       ECHO;
[Aa]uss[^e]                     ECHO;
[Aa]usse[^rn]                   ECHO;
[Aa]ussende                     ECHO;
[Ee]sse                         ECHO;
[Bb]isschen                     printf("%ci{\\ss}chen", yytext[0]);
[Bb]usiness                     ECHO;
[Bb]usse                        ECHO;
[Bb]ussard                      ECHO;
triebss                         ECHO;
beitss                          ECHO;
[Dd]iskussion                   ECHO;
[Dd]issert                      ECHO;
[Dd]asselb                      ECHO;
[Ee]ssi                         ECHO;
[Ff]lusse                       ECHO;
[Ff]luess[ie]                   printf("%cl\\\"uss%c", yytext[0],yytext[6]);
Grass                           ECHO;
[Gg]enosse                      ECHO;
[Gg]rosse                       printf("%cro{\\ss}e",yytext[0]);
[Ii]nteress                     ECHO;
[Kk]lass[ie]                    ECHO;
[Kk]assette                     ECHO;
[Ll]asse                        ECHO;
[Ll]aessig                      printf("%c\\\"assig", yytext[0]);
[Mm]assa[^nr]                   ECHO;
[Mm]asseu                       ECHO;
[Mm]isser{C}                    printf("%ci{\\ss}er%c", yytext[0],yytext[6]);
[Mm]iss[ei]                     ECHO;
[Ee]rmassen                     printf("%crma{\\ss}en", yytext[0]);
[Mm]assi                        ECHO;
[Pp]rivatissi                   ECHO;
[Pp]assiv                       ECHO;
[Pp]rozessor                    ECHO;
[Ss]tossen                      printf("%cto{\\ss}en", yytext[0]);
[Rr]essource                    ECHO;
[Ww][ia]sse                     ECHO;
{C}ss{C}                        ECHO;
%{
/* generic conversions: whatever was not matched by a longer rule above */
%}
[AaOoUu]e                       printf("\\\"%c", yytext[0]);
ss                              printf("{\\ss}");
@EOF
chmod 644 diac.l
echo x - Makefile
cat >Makefile <<'@EOF'
#
# if you do not have flex available, deactivate the definitions of
# LEX and LEXLIB; The program compiled with flex works also with the
# standard lex library (-ll).
#
LEX=flex
LEXLIB=-lfl

PROGS= diac

all: ${PROGS}

diac: diac.l diacaux.h diacaux.c
	${LEX} ${LFLAGS} diac.l
	cc -O ${DEFINES} -o $@ diacaux.c lex.yy.c ${LEXLIB}
	strip $@
	# -f: not every compiler leaves object files behind when it
	# compiles and links in one step
	rm -f lex.yy.c lex.yy.o diacaux.o

clean:
	# '#'* is quoted; an unquoted # would start a shell comment and
	# hide the remaining file names from rm
	rm -f ${PROGS} lex.yy.c *.o *~ '#'* core

shar:
	shar diac.l Makefile diacaux.c diacaux.h > diac.shar
@EOF
chmod 644 Makefile
echo x - diacaux.c
cat >diacaux.c <<'@EOF'
/* diacaux.c
 * to be linked with lex.yy.c from diac.l
 * written by Dorai Sitaram, Rice University, 1990
 *
 * String helpers used by the scanner actions for \input handling.
 * Every function that returns a string returns freshly allocated
 * memory that the caller owns.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "diacaux.h"

/* Name of the file named by the current \input and of its converted
 * copy; assigned by the \input rules in diac.l. */
char *texfile;
char *tempfile;

/* Allocate n bytes; print a message and exit if memory is exhausted. */
static char *xalloc(size_t n)
{
    char *p = malloc(n);

    if (p == NULL) {
        fprintf(stderr, "diac: out of memory\n");
        exit(1);
    }
    return p;
}

/* Return 1 for the white-space characters skipped in front of a file name. */
static int isblankchar(int c)
{
    return c == ' ' || c == '\t' || c == '\n';
}

/* Length of s, not counting the terminating NUL (0 for the empty string). */
int slen(const char *s)
{
    return (int) strlen(s);
}

/* Return a newly allocated string holding s followed by t. */
char *strap(const char *s, const char *t)
{
    size_t ls = strlen(s);
    size_t lt = strlen(t);
    char *r = xalloc(ls + lt + 1);      /* +1 for the terminating NUL */

    memcpy(r, s, ls);
    memcpy(r + ls, t, lt + 1);          /* copies t's NUL as well */
    return r;
}

/* Return a copy of s with leading blanks, tabs and newlines removed. */
char *getfilename(const char *s)
{
    char *r;

    while (isblankchar(*s))
        s++;
    r = xalloc(strlen(s) + 1);
    strcpy(r, s);
    return r;
}

/* Return a copy of the text between the leading "{" (optionally preceded
 * by white space) and the next "}" of s.  If s contains no "}", the copy
 * extends to the end of s. */
char *getfilenamebrack(const char *s)
{
    size_t n = 0;
    char *r;

    while (isblankchar(*s) || *s == '{')
        s++;
    while (s[n] != '}' && s[n] != '\0')
        n++;
    r = xalloc(n + 1);
    memcpy(r, s, n);
    r[n] = '\0';
    return r;
}

/* Return "/tmp/" followed by s with every '/' replaced by '_'. */
char *maketempfilename(const char *s)
{
    char *flat = xalloc(strlen(s) + 1);
    char *r;
    size_t i;

    for (i = 0; s[i] != '\0'; i++)
        flat[i] = (s[i] == '/') ? '_' : s[i];
    flat[i] = '\0';

    r = strap("/tmp/", flat);
    free(flat);
    return r;
}

/* Run "diac <s > t" through the shell, i.e. convert file s into file t.
 * A failure is reported on stderr; scanning of the current file goes on.
 *
 * NOTE(review): s and t are pasted into a shell command line unquoted,
 * so a file name containing shell metacharacters is interpreted by the
 * shell.  Only run diac on trusted input. */
void dosubdiac(const char *s, const char *t)
{
    char *redirect = strap(" > ", t);
    char *args = strap(s, redirect);
    char *cmd = strap("diac <", args);

    if (system(cmd) != 0)
        fprintf(stderr, "diac: could not convert %s\n", s);

    free(cmd);
    free(args);
    free(redirect);
}
@EOF
chmod 644 diacaux.c
echo x - diacaux.h
cat >diacaux.h <<'@EOF'
/* diacaux.h
 * to be included in diac.l and diacaux.c
 * written by Dorai Sitaram, Rice University, 1990
 */

#ifndef DIACAUX_H
#define DIACAUX_H

/* defined in diacaux.c, assigned by the \input rules in diac.l */
extern char *texfile;
extern char *tempfile;

int slen(const char *s);
char *strap(const char *s, const char *t);
char *getfilename(const char *s);
char *getfilenamebrack(const char *s);
char *maketempfilename(const char *s);
void dosubdiac(const char *s, const char *t);

#endif /* DIACAUX_H */
@EOF
chmod 644 diacaux.h
exit 0