[comp.lang.icon] gettext text-base routines

goer%sophist@GARGOYLE.UCHICAGO.EDU (Richard Goerwitz) (02/08/91)

I posted a set of text-base utilities a while back.  They have
been somewhat improved since then, and although I really haven't
had as much feedback as I'd like since the initial posting, I
believe the changes are enough to warrant a repost.

Please, if you use these programs - and especially if you find
bugs - drop me a line.

-Richard (goer@sophist.uchicago.edu)

---- Cut Here and feed the following to sh ----
#!/bin/sh
# This is a shell archive (produced by shar 3.49)
# To extract the files from this archive, save it to a file, remove
# everything above the "!/bin/sh" line above, and type "sh file_name".
#
# made 02/08/1991 06:07 UTC by goer@sophist.uchicago.edu
# Source directory /u/richard/Gettext
#
# existing files will NOT be overwritten unless -c is specified
# This format requires very little intelligence at unshar time.
# "if test", "cat", "rm", "echo", "true", and "sed" may be needed.
#
#                                                                          
#                                                                          
#
# This shar contains:
# length  mode       name
# ------ ---------- ------------------------------------------
#   2799 -r--r--r-- idxtext.icn
#   6225 -r--r--r-- gettext.icn
#   1310 -r--r--r-- adjuncts.icn
#   2230 -rw-r--r-- README
#    600 -rw-r--r-- Makefile.dist
#
# Sequence guard: if _shar_seq_.tmp is readable, refuse to unpack and
# print its contents as the number of the part to unpack next
# (presumably the file is left behind by an earlier part of a
# multi-part archive -- nothing in this archive creates it).
if test -r _shar_seq_.tmp; then
	echo 'Must unpack archives in sequence!'
	echo Please unpack part `cat _shar_seq_.tmp` next
	exit 1
fi
# ============= idxtext.icn ==============
if test -f 'idxtext.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping idxtext.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting idxtext.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'idxtext.icn' &&
X############################################################################
X#
X#	Name:	 idxtext.icn
X#
X#	Title:	 idxtext (index text-base for gettext() routine)
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.9
X#
X############################################################################
X#
X#      Idxtext turns a file associated with the gettext() routine into
X#  an indexed text-base.  Though gettext() will work fine with files
X#  that haven't been indexed via idxtext, access is faster with an
X#  index once the file is, say, over 10k (on my system the crossover
X#  point is actually about 5k).
X#
X#      Usage is simply "idxtext file1 [file2 [...]]," where file1,
X#  file2, etc are the names of gettext-format files that are to be
X#  (re-)indexed.
X#
X#      Indexed files have a very simple format: keyname tab offset
X#  [tab offset [etc.]]\n.  The first line of the index file is a
X#  pointer to the last indexed byte of the text-base file it indexes.
X#
X#  BUGS:  Index files are too large.
X#
X############################################################################
X#
X#  Links: ./adjuncts.icn
X#
X#  Requires: UNIX or MS-DOS
X#
X#  See also: gettext.icn
X#
X############################################################################
X
X
Xglobal _slash, _baselen
X
Xprocedure main(a)
X
X    # Build (or rebuild) the index file for every text-base file named
X    # on the command line.  a is the list of command-line arguments.
X
X    local src_name, idx_name, src, idx
X
X    # One-time, OS-dependent setup of the globals used by the routines
X    # in adjuncts.icn: the path separator (_slash) and the number of
X    # basename characters kept in an index file name (_baselen).
X    initial {
X	if find("UNIX", &features) then {
X	    _baselen := 10
X	    _slash := "/"
X	}
X	else {
X	    find("MS-DOS", &features) | stop("idxtext:  OS not supported")
X	    _baselen := 8
X	    _slash := "\\"
X	}
X    }
X
X    # No file names given: print a usage message and quit.
X    if *a = 0 then
X	stop("usage:  idxtext file1 [file2 [...]]")
X
X    # Index each named file in turn.
X    every src_name := !a do {
X
X	src := open(src_name) | stop("idxtext:  ",src_name," not found")
X
X	# The index file goes in the same directory as the text-base
X	# file; its name is derived from the text-base file's basename.
X	idx_name := Pathname(src_name) || getidxname(src_name)
X	idx := open(idx_name, "w") | stop("idxtext:  ",idx_name," not found")
X
X	write_index(src, idx)
X
X	close(src)
X	close(idx)
X
X    }
X
Xend
X
X
Xprocedure write_index(in, out)
X
X    # Scan text-base file "in" for key lines (lines starting with "::")
X    # and write an index to "out": first the position reached at the
X    # end of the scan, then one "KEY<tab>offset" line per key, sorted
X    # by key.  Stops with an error message on a duplicate key.
X
X    local entries, start, text, name, pair
X
X    entries := table()
X
X    repeat {
X	# Note where this line begins before reading it.
X	start := where(in)
X	text := read(in) | break
X
X	if match("::", text) then {
X	    # The key is whatever follows the "::", minus trailing blanks.
X	    name := trim(text[3:0])
X	    if \entries[name] then
X		stop("idxtext:  duplicate key, ",name)
X	    entries[name] := name || "\t" || start
X	}
X    }
X
X    # Header line: the position reached in "in" after the scan, so
X    # that text appended after indexing can still be searched.
X    write(out, where(in))
X
X    # sort() yields [key, value] pairs ordered by key; each value is
X    # the ready-made "KEY<tab>offset" line.
X    every pair := !sort(entries) do
X	write(out, pair[2])
X
X    return
X
Xend
SHAR_EOF
true || echo 'restore of idxtext.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= gettext.icn ==============
if test -f 'gettext.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping gettext.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting gettext.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'gettext.icn' &&
X############################################################################
X#
X#	Name:	 gettext.icn
X#
X#	Title:	 gettext (simple text-base routines)
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.14
X#
X############################################################################
X#
X#  Gettext() and associated routines allow the user to maintain a file
X#  of KEY/value combinations such that a call to gettext(KEY, FNAME)
X#  will produce value.  Gettext() fails if no such KEY exists.
X#  Returns an empty string if the key exists, but has no associated
X#  value in the file, FNAME.
X#
X#  The file format is simple.  Keys belong on separate lines, marked
X#  as such by an initial colon+colon (::).  Values begin on the line
X#  following their respective keys, and extend up to the next
X#  colon+colon-initial line or EOF.  E.g.
X#
X#    ::sample.1
X#    Notice how the key above, sample.1, has :: prepended to mark it
X#    out as a key.  The text you are now reading represents that key's
X#    value.  To retrieve this text, you would call gettext() with the
X#    name of the key passed as its first argument, and the name of the
X#    file in which this text is stored as its second argument (as in
X#    gettext("sample.1","tmp.idx")).
X#    ::next.key
X#    etc...
X#
X#  For faster access, an indexing utility is included, idxtext.  Idxtext
X#  creates a separate index for a given text-base file.  If an index file
X#  exists in the same directory as FNAME, gettext() will make use of it.
X#  The index becomes worthwhile (at least on my system) after the text-
X#  base file becomes longer than 5 kilobytes.
X#
X#  Don'ts:
X#      1) Don't nest gettext text-base files.
X#      2) Don't use spaces and/or tabs in key names.
X#      3) Don't modify indexed files in any way other than to append
X#         additional keys/values (unless you want to re-index).
X#
X#  This program is intended for situations where keys tend to have
X#  very large values, and use of an Icon table structure would be
X#  unwieldy.
X#
X#  BUGS:  Gettext() relies on the Icon runtime system and the OS to
X#  make sure the last text/index file it opens gets closed.
X#
X#  Note:  This program is NOT YET TESTED UNDER DOS.  In particular,
X#  I have no idea whether the indexing mechanism will work, due to
X#  translation that has to be done on MS-DOS text files.
X#
X############################################################################
X#
X#  Links: ./adjuncts.icn
X#
X#  Requires: UNIX (maybe MS-DOS; untested)
X#
X############################################################################
X
X
Xglobal _slash, _baselen
X
Xprocedure gettext(KEY,FNAME)
X
X    #
X    # Return the value stored under KEY in text-base file FNAME.
X    #
X    #   KEY    - key name, without the leading "::"
X    #   FNAME  - name of the text-base file
X    #
X    # Returns the lines between the "::KEY" line and the next line
X    # that starts with "::" (or EOF), joined with newlines, with
X    # trailing newlines removed; a key with no value yields "".
X    # Fails if KEY is not found.  Stops with an error message if
X    # either argument is null, if FNAME cannot be opened, or if the
X    # OS is not supported.
X    #
X    # The text-base file and its index (if one can be opened) are kept
X    # open in static variables between calls, so that repeated lookups
X    # in the same file do not reopen anything.
X    #
X
X    local line, value
X    static last_FNAME, intext, inidx
X    initial {
X	# OS-dependent globals used by the routines in adjuncts.icn.
X	if find("UNIX", &features) then {
X	    _slash := "/"
X	    _baselen := 10
X	}
X	else if find("MS-DOS", &features) then {
X	    _slash := "\\"
X	    _baselen := 8
X	}
X	else stop("gettext:  OS not supported")
X    }
X
X    (/KEY | /FNAME) & stop("error (gettext):  null argument")
X
X    if FNAME == \last_FNAME then {
X	# Same text-base file as on the previous call.  Reuse the
X	# files that are already open; just rewind them.  (On the
X	# very first call last_FNAME is null, so we fall through to
X	# the else branch and open the files.)
X	seek(intext, 1)
X	seek(\inidx, 1)
X    }
X    else {
X	# We've got a new text-base file.  Close the old one.
X	every close(\intext | \inidx)
X        # Try to open named text-base file.
X	intext := open(FNAME) | stop("gettext:  ",FNAME," not found")
X        # Try to open index file.  If this fails, inidx is left as
X        # it was; see the BUGS note at the top of this file.
X	inidx := open(Pathname(FNAME) || getidxname(FNAME))
X    }
X    last_FNAME := FNAME
X
X    # Find the offset for key KEY in the index file.  If inidx (the
X    # index file) is null (which happens when none was found),
X    # get_offsets() returns 1.  Otherwise it returns the offset
X    # recorded for KEY or, if KEY is not in the index, the offset of
X    # the last indexed byte of the text-base file.  Getting the last
X    # indexed byte lets us seek there and do a sequential search of
X    # any key/value entries that have been added since the last time
X    # idxtext was run.
X
X    seek(intext, get_offsets(KEY, inidx))
X
X    # Find key.  Should be right there, unless the user has appended
X    # key/value pairs to the end without re-indexing, or else has not
X    # bothered to index in the first place.  In this case we're
X    # supposed to start a sequential search for KEY up to EOF.
X    # Hitting EOF without finding the key makes gettext() fail.
X
X    while line := (read(intext) | fail) do {
X	line ? {
X	    # The whole line must be exactly "::" followed by KEY.
X	    if (="::", =KEY, pos(0))
X	    then break
X	}
X    }
X
X    # Collect all text up to the next colon+colon-initial line (::)
X    # or EOF.
X    value := ""
X    while line := read(intext) do {
X	match("::",line) & break
X	value ||:= line || "\n"
X    }
X
X    # Note that a key with an empty value returns an empty string.
X    return trim(value, '\n')
X
Xend
X
X
X
Xprocedure get_offsets(KEY, inidx)
X
X    #
X    # Look KEY up in the open index file inidx and return the offset
X    # at which the caller should start reading the text-base file:
X    #
X    #   - 1 if inidx is null (there is no index file);
X    #   - the offset recorded for KEY, if the index has a line for it;
X    #   - otherwise the first line of the index file.
X    #
X    # Stops with an error message if the index file looks corrupt.
X    #
X
X    # incr is declared local explicitly.  Left undeclared, it would
X    # refer to (and overwrite) a global or procedure named incr in
X    # any program this file is linked into.
X    local bottom, top, loc, firstpart, incr
X    # Use these to store values likely to be reused.
X    static old_inidx, firstline, SOF, EOF
X
X    # If there's no index file, then just return an offset of 1.
X    if /inidx then
X	return 1
X
X    # First line contains offset of last indexed byte in the main
X    # text file.  We need this later.  Save it.  Start the binary
X    # search routine at the next byte after this line.
X    seek(inidx, 1)
X    if not (inidx === \old_inidx) then {
X
X	# A different index file from last time (or the first call):
X	# recompute the cached values.
X
X	# Get first line.
X	firstline := !inidx
X	# Set "bottom."  SOF is the position of the newline that ends
X	# the first line; a value of 1 means the first line is empty.
X	1 = (SOF := where(inidx)-1) &
X	    stop("get_offsets:  corrupt .IDX file; reindex")
X	# How big is this file?
X	seek(inidx, 0)
X	EOF := where(inidx)
X
X	old_inidx := inidx
X    }
X    # SOF, EOF constant for a given inidx file.
X    bottom := SOF; top := EOF
X
X    # Binary search over byte positions.  If bottom gets bigger than
X    # top, there's no such key.
X    until bottom > top do {
X
X	loc := (top+bottom) / 2
X	seek(inidx, loc)
X
X	# loc probably falls in the middle of a line.  Move past the
X	# next newline, counting the characters consumed in incr, so
X	# that loc+incr is the start of the following line.  If that
X	# is EOF, there is no line to examine; search lower.
X	incr := 1
X	until reads(inidx) == "\n" do
X	    incr +:= 1
X	if loc+incr = EOF then {
X	    top := loc-1
X	    next
X	}
X
X	# Check to see if the current line contains KEY.
X	read(inidx) ? {
X
X	    # .IDX file line format is KEY\toffset
X	    firstpart := tab(find("\t"))
X	    if KEY == firstpart then {
X		# return offset (everything after the tab)
X		return (move(1), tab(0))
X	    }
X	    # Ah, this is what all binary searches do.
X	    else {
X		if KEY << firstpart
X		then top := loc-1
X		# Skip to the end of the line just examined.
X		else bottom := loc + incr + *&subject
X	    }
X	}
X    }
X
X    # First line of the index file contains offset of last indexed
X    # byte + 1.  Might be the only line in the file (if it had no
X    # keys when it was indexed).
X    return firstline
X
Xend
SHAR_EOF
true || echo 'restore of gettext.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= adjuncts.icn ==============
if test -f 'adjuncts.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping adjuncts.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting adjuncts.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'adjuncts.icn' &&
X############################################################################
X#
X#	Name:	 adjuncts.icn
X#
X#	Title:	 adjuncts (adjunct utilities for gettext and idxtext)
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.1
X#
X############################################################################
X#  
X#  Pretty mundane stuff.  Basename(), Pathname(), Strip(), and a utility
X#  for creating index filenames.
X#
X############################################################################
X#
X#  Links: none
X#
X#  See also: gettext.icn, idxtext.icn
X#
X############################################################################
X
X
Xprocedure Basename(s)
X
X    # Return the part of s that follows the last occurrence of the
X    # path separator, or all of s if it contains none.
X    # Uses global _slash.
X
X    local start
X
X    # find() generates every separator position from left to right,
X    # so start ends up just past the last one.
X    start := 1
X    every start := find(_slash, s) + 1
X    return s[start:0]
X
Xend
X
X
Xprocedure Pathname(s)
X
X    # Return the leading part of s up to and including the last
X    # occurrence of the path separator, or "" if s contains none.
X
X    # s2 is declared local explicitly.  Left undeclared, it would
X    # refer to (and overwrite) a global named s2 in any program this
X    # file is linked into.
X    local s2
X    # global _slash
X    s2 := ""
X    s ? {
X	# Each step consumes the text through the next separator.
X	while s2 ||:= tab(find(_slash)+1)
X	return s2
X    }
X
Xend
X
X
Xprocedure getidxname(FNAME)
X
X    # Build the name of the index file that goes with text-base file
X    # FNAME.  The path component is discarded, every "." is removed
X    # from the basename, and the result is forced to exactly _baselen
X    # characters (padded on the left with "x", or cut down to its
X    # rightmost characters) so that the OS can cope with the added
X    # ".IDX" extension.
X    # Uses globals _slash and _baselen.
X
X    local stem
X
X    stem := Strip(Basename(FNAME,_slash), '.')
X    return right(stem, _baselen, "x") || ".IDX"
X
Xend
X
X
Xprocedure Strip(s,c)
X
X    # Return a copy of s with every character that belongs to
X    # cset c removed.
X
X    local kept
X
X    kept := ""
X    s ? {
X	repeat {
X	    # Keep the text before the next unwanted character; leave
X	    # the loop when there are no more unwanted characters.
X	    (kept ||:= tab(upto(c))) | break
X	    # Skip the whole run of unwanted characters.
X	    tab(many(c))
X	}
X	# Whatever is left contains nothing from c.
X	kept ||:= tab(0)
X    }
X    return kept
X
Xend
SHAR_EOF
true || echo 'restore of adjuncts.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= README ==============
if test -f 'README' -a X"$1" != X"-c"; then
	echo 'x - skipping README (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting README (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'README' &&
X
XThis archive contains gettext() and associated routines.  These allow
Xthe user to maintain a file of key/value combinations such that a call
Xto gettext(key, FNAME) will produce value.  Fails if no such key
Xexists.  Returns an empty string if the key exists, but has no
Xassociated value in the file named in arg 2 (FNAME).  Gettext() is
Xintended for use in situations where keys need to be associated with
Xvery large strings (i.e. where hand-inserting these values into hash
Xtables would be unwieldy, and would take up a sizable chunk of
Xmemory).
X
XThe file format is simple.  Keys belong on separate lines, marked
Xas such by an initial colon+colon (::).  Values begin on the line
Xfollowing their respective keys, and extend up to the next
Xcolon+colon-initial line or EOF.  E.g.
X
X   ::sample.1
X   Notice how the key above, sample.1, has :: prepended to mark it
X   out as a key.  The text you are now reading represents that key's
X   value.  To retrieve this text, you would call gettext() with the
X   name of the key passed as its first argument, and the name of the
X   file in which this text is stored as its second argument (as in
X   gettext("sample.1","tmp.idx")).
X   ::next.key
X   etc...
X
XFor faster access, an indexing utility is included, idxtext.  Idxtext
Xcreates a separate index for a given text-base file.  If an index file
Xexists in the same directory as FNAME, gettext() will make use of it.
XOtherwise, it just does a sequential search of the entire file (this
Xworks fine for smaller files).  Please don't change a file, once you've
Xrun idxtext on it, except to append key/value entries to it.  If you
Xalter the indexed portion of the file in any way, you must reindex.
X
XBoth idxtext.icn and gettext.icn need to be linked with a common set
Xof utilities, ./adjuncts.icn.
X
XFor a list of dos and don'ts, see the comments prepended to gettext.icn.
XNote that these routines are, thus far, tested only under UNIX, and
Xhave not yet been used on a system for which seek() gets skewed by
Xline-end translations done on text files (e.g. MS-DOS).  I would ex-
Xpect them to work under DOS, though I cannot say how robust they
Xwould prove to be.
X
XSend bug reports/fixes, comments, etc. to -
X
XRichard Goerwitz (goer@sophist.uchicago.edu)
SHAR_EOF
true || echo 'restore of README failed'
rm -f _shar_wnt_.tmp
fi
# ============= Makefile.dist ==============
if test -f 'Makefile.dist' -a X"$1" != X"-c"; then
	echo 'x - skipping Makefile.dist (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting Makefile.dist (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'Makefile.dist' &&
X# Please edit these to reflect your local file structure & conventions.
XDESTDIR = /usr/local/bin
XOWNER = bin
XGROUP = bin
X
X# Build the idxtext executable.  The sources are listed as
X# prerequisites so that make rebuilds it whenever one of them changes.
Xidxtext: idxtext.icn adjuncts.icn
X	icont idxtext.icn adjuncts.icn
X
X# Pessimistic assumptions regarding the environment (in particular,
X# I don't assume you have the BSD "install" shell script).
Xinstall: idxtext
X	@echo "\nInstalling idxtext in $(DESTDIR).\n"
X	@sh -c "test -d $(DESTDIR) || (mkdir $(DESTDIR) && chmod 755 $(DESTDIR))"
X	cp idxtext $(DESTDIR)/
X	chgrp $(GROUP) $(DESTDIR)/idxtext
X	chown $(OWNER) $(DESTDIR)/idxtext
X	@echo "\nInstallation done.\n"
X
X# Remove build leftovers: files matching *u? (presumably icont's
X# intermediate files), editor backups, and the built programs.
Xclean:
X	-rm -f *u? *~
X	-rm -f idxtext test
SHAR_EOF
true || echo 'restore of Makefile.dist failed'
rm -f _shar_wnt_.tmp
fi
exit 0