[comp.lang.perl] Converting formatted text files to TeX

CFGROB@weizmann.bitnet (01/05/91)

I wrote this script to be able to print some bulletins and other
running formatted texts in TeX. I thought it was a good example of how
powerful the text manipulation facilities of Perl are, and I thought
there might be general interest in the script.

#########################################################################
##  The program takes a formatted text file and translates it into TeX
##  format. It will try to interpret the intentions of the format as
##  much as possible and translate these into the corresponding TeX
##  sequences.
##
##  There are still several formatting items that could be added,
##  e.g. recognizing columns and removing page numbering.
##
##  Dov Grobgeld
##  Department of Chemical Physics
##  The Weizmann Institute of Science
##  Rehovot Israel
##  Bitnet: CFGROB@WEIZMANN
##
##  Version 0.1
##  3 January 1991
##
##  This program is donated to the public domain. If modified I would
##  be pleased to receive information thereof, so I can include new
##  features in my next version.
###########################################################################

use strict;
use warnings;

# Convert a formatted plain-text file into plain TeX.
#
# Usage:   perl <this script> textfile > texfile
# Input:   the file named by the first command-line argument.
# Output:  TeX source on standard output, terminated by \end.
#
# Every input line is classified as an empty line, a horizontal rule, a
# centered line, a short ("index") line, or the start of a running
# paragraph, and the matching TeX sequence is written out.

# Tab stops are assumed at every $tab_width-th column.
my $tab_width = 8;

# TeX replacement for each input character that plain TeX treats specially.
# All of them are applied in ONE pass (see tex_protect) so that the output
# of one replacement is never re-escaped by another.
my %tex_for = (
    '\\' => '$\backslash$',
    '{'  => '$\{$',
    '}'  => '$\}$',
    '^'  => '\^{}',
    '#'  => '\#',
    '&'  => '\&',
    '%'  => '\%',
    '$'  => '\$',
    '_'  => '\_',
    '~'  => '$\ast$',
);
my $tex_special = qr/([\\{}^#&%\$_~])/;

# expand_tabs($text)
# Returns $text with every tab replaced by the number of spaces needed to
# reach the next tab stop.
sub expand_tabs {
    my ($text) = @_;

    # $-[0] is the offset (column) at which the tab was matched.
    1 while $text =~ s/\t/' ' x ($tab_width - ($-[0] % $tab_width))/e;
    return $text;
}

# parse_line($raw)
# Normalizes one input line and returns a hash reference with
#   text   - the line without leading/trailing spaces, always ending in "\n"
#   indent - number of leading spaces that were removed
#   len    - length of the tab-expanded line (incl. newline) before stripping
sub parse_line {
    my ($raw) = @_;

    # Drop the line terminator (LF or CRLF); the last line of a file may
    # have none at all.  A single "\n" is put back so that every line has
    # the same shape.
    $raw =~ s/\r?\n?\z//;
    my $text = expand_tabs($raw) . "\n";

    my $len    = length $text;
    my $indent = 0;
    if ($text =~ s/^( +)//) {
        $indent = length $1;
    }
    $text =~ s/ +\n/\n/;    # strip trailing spaces

    return { text => $text, indent => $indent, len => $len };
}

# is_rule($text)
# True when the line consists of one character repeated up to the newline.
# A very naive test, could be made much more flexible...
sub is_rule {
    my ($text) = @_;
    return $text eq substr($text, 0, 1) x (length($text) - 1) . "\n";
}

# tex_protect($text)
# Returns $text with TeX special characters protected, runs of four spaces
# turned into \quad, "..." turned into \dots and "double quotes" turned
# into ``TeX quotes''.  Must be applied to input text only, never to TeX
# commands generated by this script.
sub tex_protect {
    my ($text) = @_;

    $text =~ s/$tex_special/$tex_for{$1}/g;

    # Try to compensate for multiple spaces.  Really should try to figure
    # out if the text file has columns...
    # The empty group keeps \quad from merging with a following letter.
    $text =~ s/    /\\quad{}/g;

    $text =~ s/\.\.\./{\\dots}/g;

    # Replace "..." with ``...'': quotes are consumed pairwise, an unpaired
    # final quote becomes an opening quote.
    1 while $text =~ s/"/``/ && $text =~ s/"/''/;

    return $text;
}

# collect_paragraph(\@lines, $start)
# Gathers the running paragraph that begins at index $start.
# Returns ($tex, $next): the TeX for the paragraph and the index of the
# first line that has not been consumed.
sub collect_paragraph {
    my ($lines, $start) = @_;

    my $first = $lines->[$start];
    my $line  = $first->{text};

    # Check if the paragraph has the form of an item
    my $label;
    if (   $line =~ s/^\s*(\S+)\s\s//     # first word followed by two spaces
        || $line =~ s/^\s*(\w+\.)//       # first word followed by a full stop
        || $line =~ s/^\s*(\w+\))// )     # first word followed by a ")"
    {
        $label = $1;
    }

    my $body    = '';
    my $count   = 0;                # lines accepted into the paragraph
    my $longest = $first->{len};    # longest line seen, first line included
    my $prev    = $first;           # last line accepted into the paragraph
    my $idx     = $start;

    while (1) {
        $body .= $line;
        $count++;
        $idx++;

        # The end of the input ends the paragraph.
        last if $idx >= @$lines;

        my $cand = $lines->[$idx];
        $longest = $cand->{len} if $cand->{len} > $longest;

        # The candidate line is empty.
        last if length($cand->{text}) <= 1;

        # The paragraph already has more than one line and the candidate is
        # indented differently from the last line of the paragraph (the
        # second line may differ from the first: paragraph indentation).
        last if $count > 1 && $prev->{indent} != $cand->{indent};

        # The last line of the paragraph is shorter than half of the
        # longest line seen.
        last if $prev->{len} < $longest / 2;

        # The candidate is composed of only one repeated character.
        last if is_rule($cand->{text});

        $prev = $cand;
        $line = $cand->{text};
    }

    # An empty line that follows the paragraph is consumed here; the blank
    # line appended below already separates the paragraphs in TeX.
    $idx++ if $idx < @$lines && length($lines->[$idx]{text}) <= 1;

    my $tex = defined $label ? '\item{' . tex_protect($label) . '}' : '';
    $tex .= tex_protect($body) . "\n";

    return ($tex, $idx);
}

# 1. Read the file and find the longest line.
#    The information is used for identifying centered and short lines.
my $infile = $ARGV[0];
defined $infile or die "Usage: $0 textfile > texfile\n";
open(my $in, '<', $infile) or die "Cannot open '$infile': $!\n";
my @lines = map { parse_line($_) } <$in>;
close($in);

my $maxline = 0;
for my $line (@lines) {
    $maxline = $line->{len} if $line->{len} > $maxline;
}

# 2. Print some TeX commands in the output file

# Redefine the paragraph skip
print '\parskip=0pt plus 2pt', "\n";

# Define some macros
print '\def\emptylineskip{\vskip10pt plus 2pt minus 2pt}', "\n";
print '\def\horizontalline{\vskip2mm\hrule\vskip2mm}', "\n";
print "\n";

# 3. Go through the lines and try to interpret the format of the text author
my $i = 0;
while ($i < @lines) {
    my $cur    = $lines[$i];
    my $text   = $cur->{text};
    my $indent = $cur->{indent};
    my $len    = $cur->{len};
    my $par;

    # An empty line, jump an emptylineskip
    if (length($text) == 1) {
        $par = "\\emptylineskip\n";
        $i++;
    }

    # A repeated character during the whole line is converted to a
    # horizontal line.
    elsif (is_rule($text)) {
        $par = "\\horizontalline\n";
        $i++;
    }

    # A centered line: indented by at least 8 columns, and the right margin
    # exceeds the indentation by less than a third of the longest line.
    elsif ($indent >= 8 && ($maxline - $len) - $indent < $maxline / 3) {
        chomp(my $body = $text);    # take off the line feed
        $par = '\centerline{' . tex_protect($body) . "}\n";
        $i++;
    }

    # An index line: shorter than half of the longest line
    elsif ($len < $maxline / 2) {
        $par = $indent == 0 ? '\noindent ' : '';

        # Add an extra line feed to make the line a paragraph of its own
        $par .= tex_protect($text) . "\n";
        $i++;
    }

    # A paragraph
    else {
        ($par, $i) = collect_paragraph(\@lines, $i);
    }

    print $par;
}
print "\n\\end"