[comp.lang.icon] simple text-base utility

goer@quads.uchicago.edu (Richard L. Goerwitz) (01/24/91)

Ever want to work with tables, in order to retrieve entries on the
basis of a key, but have entries so big that the whole process gets
terribly unwieldy?  If you have, here's a little utility that lets
you dump all your text into a file, yet still access it as if it
were in a table.

I wrote this for a project I was doing the other day, but I think
it would be of general interest.  Note that it's *not* been extensively
tested.

-Richard


---- Cut Here and feed the following to sh ----
#!/bin/sh
# This is a shell archive (produced by shar 3.49)
# To extract the files from this archive, save it to a file, remove
# everything above the "!/bin/sh" line above, and type "sh file_name".
#
# made 01/22/1991 16:13 UTC by goer@sophist.uchicago.edu
# Source directory /u/richard/Gettext
#
# existing files will NOT be overwritten unless -c is specified
# This format requires very little intelligence at unshar time.
# "if test", "cat", "rm", "echo", "true", and "sed" may be needed.
#
#                                                                          
#                                                                          
#
# This shar contains:
# length  mode       name
# ------ ---------- ------------------------------------------
#   1943 -r--r--r-- idxtext.icn
#   4681 -r--r--r-- gettext.icn
#   1310 -r--r--r-- adjuncts.icn
#   2222 -rw-r--r-- README
#    611 -rw-r--r-- Makefile.dist
#
# Refuse to unpack when a readable _shar_seq_.tmp is present: report the
# part number stored in that file and exit with status 1.
if test -r _shar_seq_.tmp; then
	echo 'Must unpack archives in sequence!'
	echo Please unpack part `cat _shar_seq_.tmp` next
	exit 1
fi
# ============= idxtext.icn ==============
if test -f 'idxtext.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping idxtext.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting idxtext.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'idxtext.icn' &&
X############################################################################
X#
X#	Name:	 idxtext.icn
X#
X#	Title:	 idxtext (index text-base for gettext() routine)
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.2
X#
X############################################################################
X#  
X#  Turns a file associated with gettext() routine into an indexed text-
X#  base.  Though gettext() will work fine with files that haven't been
X#  indexed via idxtext(), access is much faster if the indexing is done.
X#
X#  Usage is simply:
X#
X#      idxtext file1 [file2 [...]]
X#
X#  where file1, file2, etc are the names of gettext-format files that
X#  are to be (re-)indexed.
X#
X############################################################################
X#
X#  Links: ./adjuncts.icn
X#
X#  Requires: UNIX or MS-DOS
X#
X#  See also: gettext.icn
X#
X############################################################################
X
X
Xglobal _slash, _baselen
X
Xprocedure main(a)
X
X    #
X    # Build an index file for every gettext-format text-base named on
X    # the command line.
X    #
X    #   a - list of command-line arguments (text-base file names)
X    #
X    # For each name, the index is written to a file in the same
X    # directory as the text-base; its name is produced by getidxname().
X    # Stops with a diagnostic if the OS is not supported, if no file
X    # names are given, or if a file cannot be opened.
X    #
X
X    local idx_name, fname, infile, outfile
X    initial {
X        # _slash and _baselen are globals read by the path-handling
X        # routines (Basename, Pathname, getidxname).
X        if find("UNIX", &features) then {
X            _slash := "/"
X            _baselen := 10
X        }
X        else if find("MS-DOS", &features) then {
X            _slash := "\\"
X            _baselen := 8
X        }
X        else stop("idxtext:  OS not supported")
X    }
X
X    # Check to see if we have any arguments.
X    *a = 0 & stop("usage:  idxtext file1 [file2 [...]]")
X
X    # Start popping filenames off of the argument list.
X    while fname := pop(a) do {
X
X        # Open input file.
X        infile := open(fname) | stop("idxtext:  ",fname," not found")
X
X        # Create (or truncate) the index file.  A failure here means the
X        # file could not be opened for writing, not that it is missing.
X        idx_name := Pathname(fname) || getidxname(fname)
X        outfile := open(idx_name, "w") |
X            stop("idxtext:  cannot open ",idx_name," for writing")
X
X        # Write the index.
X        write_index(infile, outfile)
X
X        every close(infile | outfile)
X
X    }
X
Xend
X
X
Xprocedure write_index(in, out)
X
X    #
X    # Read the text-base open on "in" and write its index to "out".
X    #
X    #   in  - text-base file, open for reading
X    #   out - index file, open for writing
X    #
X    # One line is written per key line (a line beginning with "::"):
X    #
X    #     KEY <tab> OFFSET
X    #
X    # where KEY is the rest of the key line with trailing blanks
X    # trimmed, and OFFSET is the where() position at which the key line
X    # starts.  A final line holding only the position just past the
X    # last byte read is then appended; get_offsets() hands that value
X    # back last, so that gettext() can go on to scan any keys appended
X    # to the text-base after it was indexed.
X    #
X
X    local w, line
X
X    while (w := where(in), line := read(in)) do {
X        line ? {
X            if ="::" then
X                write(out, trim(tab(0)), "\t", w)
X        }
X    }
X
X    # End of input reached: record offset of last indexed byte + 1.
X    write(out, where(in))
X
X    return
X
Xend
SHAR_EOF
true || echo 'restore of idxtext.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= gettext.icn ==============
if test -f 'gettext.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping gettext.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting gettext.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'gettext.icn' &&
X############################################################################
X#
X#	Name:	 gettext.icn
X#
X#	Title:	 gettext (simple text-base routines)
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.4
X#
X############################################################################
X#
X#  Gettext() and associated routines allow the user to maintain a file
X#  of KEY/value combinations such that a call to gettext(KEY, FNAME)
X#  will produce value.  Fails if no such KEY exists.  Returns an empty
X#  string if the key exists, but has no associated value in the file,
X#  FNAME.
X#
X#  The file format is simple.  Keys belong on separate lines, marked
X#  as such by an initial colon+colon (::).  Values begin on the line
X#  following their respective keys, and extend up to the next
X#  colon+colon-initial line or EOF.  E.g.
X#
X#    ::sample.1
X#    Notice how the key above, sample.1, has :: prepended to mark it
X#    out as a key.  The text you are now reading represents that key's
X#    value.  To retrieve this text, you would call gettext() with the
X#    name of the key passed as its first argument, and the name of the
X#    file in which this text is stored as its second argument (as in
X#    gettext("sample.1","tmp.idx")).
X#    ::next.key
X#    etc...
X#
X#  For faster access, an indexing utility is included, idxtext.  Idxtext
X#  creates a separate index for a given text-base file.  If an index file
X#  exists in the same directory as FNAME, gettext() will use it.
X#
X#  Donts:
X#      1) Don't nest gettext text-base files.
X#      2) Don't use spaces and/or tabs in key names.
X#      3) Don't modify indexed files in any way other than to append
X#         additional keys/values (unless you want to re-index).
X#
X#  This program is intended for situations where keys tend to have
X#  very large values, and use of an Icon table structure would be
X#  unwieldy.
X#
X#  BUGS:  Fairly slow.  Could be modified to use the library routine
X#  findre.icn, and do regexp pattern matches on keys.  Wouldn't that
X#  be nice?  Should be modified to alphabetize indices, and then do a
X#  real binary search of the index.  This would preclude easy regexp
X#  pattern matches, but would be worth it for larger databases.
X#
X#  Note:  This program is NOT YET TESTED UNDER DOS.  In particular,
X#  I have no idea whether the indexing mechanism will work, due to
X#  translation that has to be done on MS-DOS text files.
X#
X############################################################################
X#
X#  Links: ./adjuncts.icn
X#
X#  Requires: UNIX (maybe MS-DOS; untested)
X#
X############################################################################
X
X
Xglobal _slash, _baselen
X
Xprocedure gettext(KEY,FNAME)
X
X    #
X    # Generate the value(s) stored under KEY in the text-base FNAME.
X    #
X    #   KEY   - key to look up (without the leading "::")
X    #   FNAME - name of the text-base file
X    #
X    # Suspends one string per matching key found; each line of the
X    # value is followed by a newline, and a key with no value yields
X    # the empty string.  Fails when no (further) matching key exists.
X    # Stops with a diagnostic on a null argument, an unsupported OS, or
X    # if FNAME cannot be opened.
X    #
X    # The text-base and its index (if any) are kept open between calls
X    # and are simply rewound when the same FNAME is asked for again.
X    #
X
X    local line, value
X    static last_FNAME, intext, inidx
X    initial {
X        # _slash and _baselen are globals read by the path-handling
X        # routines (Basename, Pathname, getidxname).
X        if find("UNIX", &features) then {
X            _slash := "/"
X            _baselen := 10
X        }
X        else if find("MS-DOS", &features) then {
X            _slash := "\\"
X            _baselen := 8
X        }
X        else stop("gettext:  OS not supported")
X    }
X
X    (/KEY | /FNAME) & stop("error (gettext):  null argument")
X
X    if FNAME == \last_FNAME then {
X        # Same text-base as on the previous call; just rewind.  There
X        # may be no index file, in which case inidx is null.
X        seek(intext, 1)
X        seek(\inidx, 1)
X    }
X    else {
X        # We've got a new text-base file.  Close the old one.
X        every close(\intext | \inidx)
X        # Try to open named text-base file.
X        intext := open(FNAME) | stop("gettext:  ",FNAME," not found")
X        # Try to open index file.  Reset inidx to null if there is
X        # none, so that a stale (closed) index is never reused.
X        inidx := (open(Pathname(FNAME) || getidxname(FNAME)) | &null)
X    }
X    last_FNAME := FNAME
X
X    # Find offsets for key KEY in index file.  Defaults to 1.
X    every seek(intext, get_offsets(KEY, inidx)) do {
X
X        # Find key.  Should be right there, unless the user has
X        # appended key/value pairs to the end without re-indexing, or
X        # else has not bothered to index in the first place.
X        while line := (read(intext) | fail) do {
X            line ? {
X                if (="::", =KEY, pos(0))
X                then break
X            }
X        }
X
X        # Collect all text up to the next colon+colon-initial line (::)
X        # or EOF.
X        value := ""
X        while line := read(intext) do {
X            match("::",line) & break
X            value ||:= line || "\n"
X        }
X
X        # Note that a key with an empty value returns an empty string.
X        suspend value
X
X    }
X
Xend
X
X
Xprocedure get_offsets(KEY, inidx)
X
X    #
X    # Generate the text-base offsets recorded for KEY in an index file.
X    #
X    #   KEY   - key to look up
X    #   inidx - index file open for reading, or null if there is none
X    #
X    # With no index file, the single offset 1 is returned.  Otherwise
X    # the offset from every index line of the form KEY <tab> OFFSET is
X    # suspended in turn.  Once the index is exhausted, the last line
X    # read is returned as a final offset if it is a bare integer; if it
X    # is not (or the index is empty) the procedure fails.
X    #
X
X    # Declared local so that a global named "line" in the program this
X    # library is linked into cannot be clobbered.
X    local line
X
X    # If there's no index file, then just return an offset of 1.
X    if /inidx then
X        return 1
X
X    # I guess we could insert some sort of binary search routine
X    # here, but I'm feeling lazy.
X
X    # Find every instance of this key, KEY, in the index file.
X    while line := read(inidx) do {
X        line ? {
X            # Tab separates key from offset.
X            (=KEY, ="\t") | next
X            # Rest of the line contains the offset.
X            suspend integer(tab(0))
X        }
X    }
X
X    # Last line of the index file contains offset of last indexed
X    # byte + 1.  (The failed read() above leaves line unchanged.)
X    return integer(line)
X
Xend
SHAR_EOF
true || echo 'restore of gettext.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= adjuncts.icn ==============
if test -f 'adjuncts.icn' -a X"$1" != X"-c"; then
	echo 'x - skipping adjuncts.icn (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting adjuncts.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'adjuncts.icn' &&
X############################################################################
X#
X#	Name:	 adjuncts.icn
X#
X#	Title:	 adjuncts (adjunct utilities for gettext and idxtext)
X#
X#	Author:	 Richard L. Goerwitz
X#
X#	Version: 1.1
X#
X############################################################################
X#  
X#  Pretty mundane stuff.  Basename(), Pathname(), Strip(), and a utility
X#  for creating index filenames.
X#
X############################################################################
X#
X#  Links: none
X#
X#  See also: gettext.icn, idxtext.icn
X#
X############################################################################
X
X
Xprocedure Basename(s)
X
X    #
X    # Produce whatever follows the last path separator in s; s is
X    # produced whole when it contains no separator.  The separator is
X    # taken from the global _slash.
X    #
X
X    return s ? {
X        # Keep stepping past separators until none is left.
X        repeat {
X            tab(find(_slash) + 1) | break
X        }
X        tab(0)
X    }
X
Xend
X
X
Xprocedure Pathname(s)
X
X    #
X    # Return the directory part of s: everything up to and including
X    # the last path separator, or the empty string if s contains no
X    # separator.  The separator is taken from the global _slash.
X    #
X
X    # Declared local so that a global named "s2" in the program this
X    # library is linked into cannot be clobbered.
X    local s2
X
X    s2 := ""
X    s ? {
X        # Accumulate one separator-terminated component at a time.
X        while s2 ||:= tab(find(_slash)+1)
X        return s2
X    }
X
Xend
X
X
Xprocedure getidxname(FNAME)
X
X    #
X    # Build the index-file name that goes with text-base FNAME.  The
X    # path component is dropped, every "." is removed from what is
X    # left, and the result is forced to exactly _baselen characters
X    # (the rightmost ones are kept; short names are padded on the left
X    # with "x") so that the OS can cope with the added extension
X    # ".IDX".  Reads the globals _slash (via Basename) and _baselen.
X    #
X
X    local stem
X
X    stem := Strip(Basename(FNAME), '.')
X    return right(stem, _baselen, "x") || ".IDX"
X
Xend
X
X
Xprocedure Strip(s,c)
X
X    #
X    # Return a copy of s from which every character in c has been
X    # removed.
X    #
X
X    local kept
X
X    kept := ""
X    s ? until pos(0) do {
X        # Take the run up to the next unwanted character (or all that
X        # remains, if there is none), then skip the unwanted run.
X        kept ||:= tab(upto(c) | 0)
X        tab(many(c))
X    }
X    return kept
X
Xend
SHAR_EOF
true || echo 'restore of adjuncts.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= README ==============
if test -f 'README' -a X"$1" != X"-c"; then
	echo 'x - skipping README (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting README (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'README' &&
X
XThis archive contains gettext() and associated routines.  These allow
Xthe user to maintain a file of key/value combinations such that a call
Xto gettext(key, FNAME) will produce value.  Fails if no such key
Xexists.  Returns an empty string if the key exists, but has no
Xassociated value in the file named in arg 2 (FNAME).  Gettext() is
Xintended for use in situations where keys need to be associated with
Xvery large strings (i.e. where hand-inserting these values into hash
Xtables would be unwieldy, and would take up a sizable chunk of
Xmemory).
X
XThe file format is simple.  Keys belong on separate lines, marked
Xas such by an initial colon+colon (::).  Values begin on the line
Xfollowing their respective keys, and extend up to the next
Xcolon+colon-initial line or EOF.  E.g.
X
X   ::sample.1
X   Notice how the key above, sample.1, has :: prepended to mark it
X   out as a key.  The text you are now reading represents that key's
X   value.  To retrieve this text, you would call gettext() with the
X   name of the key passed as its first argument, and the name of the
X   file in which this text is stored as its second argument (as in
X   gettext("sample.1","tmp.idx")).
X   ::next.key
X   etc...
X
XFor faster access, an indexing utility is included, idxtext.  Idxtext
Xcreates a separate index for a given text-base file.  If an index file
Xexists in the same directory as FNAME, gettext() will use it.
X
XBoth idxtext.icn and gettext.icn need to be linked with a common set
Xof utilities, ./adjuncts.icn.
X
XThere are lots of things that might be done to gettext/idxtext, such
Xas implement a binary search mechanism in the routine get_offsets(),
Xand compress or pack portions of the index file.  Might also be
Xsensible to offer regex patterns as an option for key/value retrievals.
XThis is a pretty rough version, as it stands, and I hope someone will
Xsee fit to modify it some time.  Note that it's untested under MS-DOS,
Xbut I suppose it could be made to work there.  The only big problem
XI foresee is with translation of CR+LF sequences and the consequent
Xdisruption of where() and seek().
X
XFor a list of do's/dont's, see the comments prepended to gettext.icn.
XSend bug reports/fixes, comments, etc. to -
X
XRichard Goerwitz (goer@sophist.uchicago.edu)
SHAR_EOF
true || echo 'restore of README failed'
rm -f _shar_wnt_.tmp
fi
# ============= Makefile.dist ==============
if test -f 'Makefile.dist' -a X"$1" != X"-c"; then
	echo 'x - skipping Makefile.dist (File already exists)'
	rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting Makefile.dist (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'Makefile.dist' &&
X# Please edit these to reflect your local file structure & conventions.
XDESTDIR = /usr/local/bin
XOWNER = bin
XGROUP = bin
X
X# Build the idxtext executable.  It must be linked with the utilities
X# in adjuncts.icn (Basename, Pathname, getidxname, Strip).
Xidxtext: idxtext.icn adjuncts.icn
X	icont idxtext.icn adjuncts.icn
X
X# Pessimistic assumptions regarding the environment (in particular,
X# I don't assume you have the BSD "install" shell script).
X#
X# Creates $(DESTDIR) with mode 755 if it does not already exist, copies
X# idxtext into it, then sets the group and owner of the installed copy
X# to $(GROUP) and $(OWNER).
X# NOTE(review): the echo lines presumably rely on an echo that expands
X# "\n" - confirm on the target system.
Xinstall: idxtext
X	@echo "\nInstalling idxtext in $(DESTDIR).\n"
X	@sh -c "test -d $(DESTDIR) || (mkdir $(DESTDIR) && chmod 755 $(DESTDIR))"
X	cp idxtext $(DESTDIR)/
X	chgrp $(GROUP) $(DESTDIR)/idxtext
X	chown $(OWNER) $(DESTDIR)/idxtext
X	@echo "\nInstallation done.\n"
X
X# Remove files matching *u? and *~ (presumably Icon ucode files and
X# editor backups - confirm), plus the files named idxtext and test.
X# The leading "-" makes make ignore errors from rm.
Xclean:
X	-rm -f *u? *~
X	-rm -f idxtext test
SHAR_EOF
true || echo 'restore of Makefile.dist failed'
rm -f _shar_wnt_.tmp
fi
exit 0