[alt.sources] lq-text Full Text Retrieval Database Part 02/13

lee@sq.sq.com (Liam R. E. Quin) (03/04/91)
: cut here --- cut here --
: To unbundle, sh this file
#! /bin/sh
: part 02
echo x - lq-text/src/filters/Makefile 1>&2
sed 's/^X//' >lq-text/src/filters/Makefile <<'@@@End of lq-text/src/filters/Makefile'
X# filters/Makefile -- Copyright 1990 Liam R. Quin.  All Rights Reserved.
X# This code is NOT in the public domain.
X# See the file ../COPYRIGHT for full details.
X
X# This Makefile belongs in the "src/filters" directory.
X#
X# Note that most of the actual configuration is done in ../Makefile and
X# in ../h/globals.h, and not here.
X
X# $Id: Makefile,v 1.4 90/10/06 00:57:26 lee Rel $
X
X
X# This is what gets made:
XTARGETS = MailFilter NewsFilter
XLIBFILES=$(TARGETS)
X# NOTE(review): EXTRA is assigned again further down ("EXTRA=-I../h"), and
X# make uses the last assignment of a variable, so the -D flags set here
X# appear to be discarded; this line also refers to $(EXTRA) inside its own
X# definition.  Presumably the real flags arrive from ../Makefile -- confirm.
XEXTRA=-DMAILFILTER='$(MAILFILTER)' -DNEWSFILTER='$(NEWSFILTER)' $(EXTRA)
X
XSRCS = FilterMain.c FilterType.c MailFilter.c NewsFilter.c
XOBJS = FilterMain.o FilterType.o MailFilter.o NewsFilter.o
X
XPWD=filters
X
XDESTDIR=../lib
XLQ=../lib/liblq.a
XMODE=755
X
X# for compiling:
XEXTRA=-I../h
XRANLIB=echo
X
Xall: $(TARGETS)
X
Xsaber_src:
X	echo $(PWD)
X	#cd $(PWD)
X	#load $(CFLAGS) $(SRCS)
X	#cd ..
X
Xsaber_obj:
X	#cd $(PWD)
X	#load $(CFLAGS) $(SRCS)
X	#cd ..
X
Xinstall: all
X	for i in $(LIBFILES); do cp "$$i" $(DESTDIR); \
X	strip "$(DESTDIR)/$$i" ; \
X	chmod $(MODE) "$(DESTDIR)/$$i" ; \
X	done
X
Xtidy:
X	/bin/rm -f *.o core m.log tags
X
Xclean: tidy
X	/bin/rm -f $(TARGETS) $(TEST)
X
Xdepend:
X	mkdep $(CFLAGS) *.c
X
XCFilter: FilterMain.o CFilter.o
X	$(CC) $(CFLAGS) -o CFilter FilterMain.o CFilter.o $(MALLOC) $(LQ)
X
XNewsFilter: FilterMain.o NewsFilter.o
X	$(CC) $(CFLAGS) -o NewsFilter FilterMain.o NewsFilter.o $(MALLOC) $(LQ)
X
XMailFilter: FilterMain.o MailFilter.o
X	$(CC) $(CFLAGS) -o MailFilter FilterMain.o MailFilter.o $(MALLOC) $(LQ)
X
XCDMSFilter: FilterMain.o CDMSFilter.o
X	$(CC) $(CFLAGS) -o CDMSFilter FilterMain.o CDMSFilter.o $(MALLOC) $(LQ)
X
X#
X# $Log:	Makefile,v $
X# Revision 1.4  90/10/06  00:57:26  lee
X# Prepared for first beta release.
X# 
X# Revision 1.3  90/10/03  21:14:45  lee
X# Added MAILFILTER stuff.
X# 
X# Revision 1.2  90/09/28  21:54:43  lee
X# No longer uses OWNER.
X# 
X# Revision 1.1  90/08/09  19:17:58  lee
X# Initial revision
X 
X# DO NOT PUT ANYTHING AFTER THIS LINE
X# DO NOT DELETE THIS LINE -- mkdep uses it.
X# DO NOT PUT ANYTHING AFTER THIS LINE, IT WILL GO AWAY.
X
XFilterMain.o: FilterMain.c
XMailFilter.o: MailFilter.c /usr/include/malloc.h
XMailFilter.o: ../h/wordrules.h ../h/emalloc.h
XNewsFilter.o: NewsFilter.c 
XNewsFilter.o: ../h/wordrules.h ../h/emalloc.h
XTroffFilter.o: TroffFilter.c 
XTroffFilter.o: ../h/wordrules.h ../h/emalloc.h
X
X# IF YOU PUT ANYTHING HERE IT WILL GO AWAY
@@@End of lq-text/src/filters/Makefile
echo x - lq-text/src/filters/NewsFilter.c 1>&2
sed 's/^X//' >lq-text/src/filters/NewsFilter.c <<'@@@End of lq-text/src/filters/NewsFilter.c'
X/* NewsFilter.c -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* $Id: NewsFilter.c,v 1.5 90/10/06 00:57:27 lee Rel1-10 $
X */
X
X/* Filter for usenet articles.
X * Throw away all of the header except
X * Subject
X * From
X * Organi[sz]ation
X * Keywords
X * Summary
X *
X * Probably ought to keep Message-ID, but I can't store it anyway!
X *
X * See FilterMain and wordrules.h for more info.
X *
X */
X
X#ifdef SYSV
X extern int _filbuf(), _flsbuf(); /* for lint! */
X#endif
X#include <stdio.h>
X#include <malloc.h>
X#include <ctype.h>
X#include "wordrules.h"
X
X#include "emalloc.h"
X
X#define STREQ(boy, girl) ((*(boy) == *(girl)) && !strcmp(boy, girl))
X
X/** C Library functions that need to be declared: **/
X#ifndef tolower
X extern int tolower();
X#endif
Xextern int strcmp();
X/** Functions in this file that need to be declared **/
Xint GetChar();
Xvoid Header(), Body();
X/** **/
X
Xextern char *progname;
Xvoid Filter();
X
Xchar *KeepThese[] = { /* these must be sorted on the first character */
X    "from",
X    "keywords",
X    "summary",
X    "subject",
X    "organisation",
X    "organization",
X    0
X};
X
Xint icstreq(s1, s2) /* case insensitive strcmp */
X    char *s1, *s2;
X{
X    register char ch1, ch2;
X
X    while (*s1 && *s2) {
X	if (*s1 != *s2) {
X	    if (isupper(*s1)) {
X		ch1 = tolower(*s1);
X		ch2 = (*s2);
X	    } else if (isupper(*s2)) {
X		/* Note that we only have to test one character for case! */
X		ch1 = (*s1);
X		ch2 = tolower(*s2);
X	    } else {
X		return 0; /* they are different */
X	    }
X	    if (ch1 != ch2) return 0; /* the strings differ */
X	}
X	s1++; s2++;
X    }
X    if (!*s1 && !*s2) {
X	return 1; /* they are the same */
X    }
X    return 0; /* they are different */
X}
X
Xint
XIsWanted(String)
X    char *String;
X{
X    char **p;
X    int ch = String[0];
X
X    if (isupper(ch)) ch = tolower(ch);
X
X    for (p = KeepThese; *p && **p; p++) {
X	if (ch < **p) return 0; /* gone too far */
X 	else if (icstreq(String, *p)) return 1;
X    }
X    return 0;
X}
X
X/* Filter -- filter one article: first the header, then the body.
X * Both halves read from InputFile and write to stdout; Name is passed on
X * to them (Header() uses it in its warning message).
X */
Xvoid
XFilter(InputFile, Name)
X    FILE *InputFile;
X    char *Name;
X{
X    Header(InputFile, Name);
X    Body(InputFile, Name);
X}
X
X/* Where Header() has got to on the current header line: */
Xtypedef enum {
X    F_NotSeenAnythingYet, /* no word character seen on this line so far */
X    F_InTheFirstWord, /* collecting the first word of the line */
X    F_AfterTheFirstWord /* first word complete and looked up */
X} t_FirstWord;
X
X/* Non-zero if the last character looked at was part of a word.
X * Set by Header() and Body(), and also read by GetChar().
X */
Xint InWord = 0;
X
Xvoid
XHeader(InputFile, Name)
X    FILE *InputFile;
X    char *Name;
X{
X    int AtStartOfLine = 1;
X    int IgnoreLine = 1; /* initialised for lint and gcc -W really... */
X    t_FirstWord FirstWord = F_NotSeenAnythingYet;
X    int ch;
X    static int BufLen;
X    static char *Buffer = 0;
X    int AtStartOfWord;
X    register char *q;
X
X    if (Buffer == 0) {
X	BufLen = 24;
X	Buffer = emalloc(BufLen);
X    }
X
X    q = Buffer;
X    InWord = 0;
X
X    while ((ch = GetChar(InputFile)) != EOF) {
X	if (ch == '\n') {
X	    if (AtStartOfLine) { /* a blank line */
X		putchar('\n');
X		return;
X	    }
X	}
X
X	InWord = InWord ? WithinWord(ch) : StartsWord(ch);
X
X	switch (FirstWord) {
X	case F_NotSeenAnythingYet:
X	    if (InWord) {
X		FirstWord = F_InTheFirstWord;
X		if (q - Buffer >= BufLen - 1) {
X		    int where = q - Buffer;
X
X		    BufLen += 24;
X		    Buffer = erealloc(Buffer, BufLen);
X		    q = &Buffer[where];
X		}
X		*q++ = ch;
X	    } else {
X		putchar(' ');
X	    }
X	    break;
X	case F_InTheFirstWord:
X	    if (InWord) {
X		if (q - Buffer >= BufLen - 1) {
X		    int where = q - Buffer;
X
X		    BufLen += 24;
X		    Buffer = erealloc(Buffer, BufLen += 24);
X		    q = &Buffer[where];
X		}
X		*q++ = ch;
X		break;
X	    } else { /* reached the end of the first word on the line */
X		*q = '\0';
X		/* See if it's a keyword */
X  		if ((IgnoreLine = !IsWanted(Buffer)) != 0) {
X		    /* Turn the word into one that won't get indexed,
X		     * so that word counmts are unaffected:
X		     * We use qxxxxxxx (any number of x's) for this.
X		     */
X		    for (q = Buffer; *q; q++) {
X			putchar((q == Buffer) ? 'q' : 'x');
X		    }
X		    putchar (ch == '\n' ? '\n' : ' ');
X		} else {
X		    printf("%s%c", Buffer, ch == '\n' ? ch : ' ');
X		}
X		FirstWord = F_AfterTheFirstWord;
X	    }
X	    break;
X	default:
X	    if ((AtStartOfLine = (ch == '\n'))) {
X		IgnoreLine = 0;
X		q = Buffer;
X		FirstWord = F_NotSeenAnythingYet;
X		AtStartOfWord = 1;
X	    }
X	    if (InWord && !IgnoreLine) {
X		putchar(ch);
X	    } else {
X		if (AtStartOfWord && InWord) {
X		    putchar('q');
X		    AtStartOfWord = 0;
X		} else if (InWord) {
X		    putchar('x');
X		} else if (isspace(ch)) {
X		    putchar(ch);
X		} else {
X		    putchar(' ');
X		}
X	    }
X	    if (!InWord) AtStartOfWord = 1;
X	}
X	if ((AtStartOfLine = (ch == '\n'))) {
X	    IgnoreLine = 0;
X	    q = Buffer;
X	    FirstWord = F_NotSeenAnythingYet;
X	    AtStartOfWord = 1;
X	}
X    }
X    if (ch == EOF) {
X	fprintf(stderr, "%s: warning: Mail folder %s has no message body\n",
X			progname, Name);
X    }
X}
X
Xvoid
XBody(InputFile, Name)
X    FILE *InputFile;
X    char *Name;
X{
X    int ch;
X
X    while ((ch = GetChar(InputFile)) != EOF) {
X	if (InWord = InWord ? WithinWord(ch) : StartsWord(ch)) {
X	    putchar(ch);
X	} else {
X	    putchar((ch == '\n') ? '\n' : ' ');
X	}
X    }
X}
X
X#ifdef __GNU__
Xinline
X#endif
Xint
XGetChar(fd)
X    FILE *fd;
X{
X    static int LastChar = 0;
X
X    if (LastChar) {
X	int ch = LastChar;
X	LastChar = 0;
X	return ch;
X    }
X
X    /* Only return a single quote if it is surrounded by letters */
X    if ((LastChar = getc(fd)) == '\'') {
X	LastChar = getc(fd);
X	if (InWord && isalpha(LastChar)) return '\'';
X	else return ' ';
X    } else {
X	int ch = LastChar;
X	LastChar = 0;
X	return ch;
X    }
X}
X
X/*
X * $Log:	NewsFilter.c,v $
X * Revision 1.5  90/10/06  00:57:27  lee
X * Prepared for first beta release.
X * 
X * Revision 1.4  90/09/20  16:36:59  lee
X * Fixed icstrcmp() and IsWanted() so that the unwanted parts of headers
X * get deleted again.... (oops!)
X * 
X * Revision 1.3  90/09/19  21:19:50  lee
X * Now supports turning unindexed stuff into qxxxxx-words.
X * 
X * Revision 1.2  90/08/29  21:56:58  lee
X * Alpha release.
X * 
X * Revision 1.1  90/08/09  19:17:57  lee
X * Initial revision
X * 
X * Revision 1.2  89/09/16  21:16:01  lee
X * First demonstratable version.
X * 
X * Revision 1.1  89/09/07  21:05:48  lee
X * Initial revision
X * 
X */
@@@End of lq-text/src/filters/NewsFilter.c
echo x - lq-text/src/h/Liamdbm.h 1>&2
sed 's/^X//' >lq-text/src/h/Liamdbm.h <<'@@@End of lq-text/src/h/Liamdbm.h'
X/* Liamdbm.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* $Id: Liamdbm.h,v 1.2 90/10/06 02:18:14 lee Rel1-10 $
X *
X * This is used with gdbm.  I have not linked with gdbm, and, if you
X * wish to do so, you must be careful not to violate any copyright
X * notices... (sigh)
X *
X * The version of gdbm for which I had a manual is rather old and had no
X * ndbm compatibility.
X */
X
X#include "gdbm.h"
Xextern datum gdbm_fetch();
Xextern datum gdbm_firstkey();
Xextern datum gdbm_nextkey();
X
X/* NOTE(review): DBM is presumably only ever used as (DBM *), to hold
X * whatever gdbm_open() returns -- confirm against gdbm.h.
X */
Xtypedef char DBM;
X
X/* Map the ndbm-style names used elsewhere onto gdbm.
X * dbm_store drops its mode argument, and dbm_open ignores its third
X * argument, passing the fixed values 512 and 0 to gdbm_open instead.
X */
X#define dbm_store(db, key, data, mode) gdbm_store(db, key, data)
X/* gdbm_open is stupider than ndbm_open.... */
X#define dbm_open(FileName, Mode, m) gdbm_open(FileName, 512, Mode, 0)
X#define dbm_fetch gdbm_fetch
X#define dbm_close gdbm_close
X#define dbm_firstkey gdbm_firstkey
X#define dbm_nextkey gdbm_nextkey
X
X/*
X * $Log:	Liamdbm.h,v $
X * Revision 1.2  90/10/06  02:18:14  lee
X * Prepared for first beta release.
X * 
X *
X */
@@@End of lq-text/src/h/Liamdbm.h
echo x - lq-text/src/h/Revision.h 1>&2
sed 's/^X//' >lq-text/src/h/Revision.h <<'@@@End of lq-text/src/h/Revision.h'
X/* This header file gets updated with every distributed change to any source
X * file anywhere in the lq-text package.
X * A short description of the change is added to the Log here, too.
X * Lee.
X */
X
X#define LQTEXTREVISION "Release 1.10"
X
X/* $Revision: 1.10 $
X *
X * Revision 1.6  90/10/04  17:12:45  lee
X * lqtext now compiles and mostly works under BSD.
X * Fixes bug in phrase matching -- PhraseMatchLevel now works on one-word
X * phrases.
X * 
X * Revision 1.5  90/09/28  22:19:36  lee
X * Made GetChar() a macro in lqaddfile -- speed improvement...
X * 
X * Revision 1.4  90/09/20  16:37:35  lee
X * Fixed Mail and News filters so that they throw away the unwanted header
X * parts correctly.
X * 
X * Revision 1.3  90/09/20  12:51:24  lee
X * Major sdbm initialisation bug fixed.
X * 
X * Revision 1.2  90/09/20  11:52:35  lee
X * Fixed the filters so that lqshow highlights the right word (the qxx fix)
X * 
X * Revision 1.1  90/09/20  11:52:18  lee
X * Initial revision
X * 
X *
X */
@@@End of lq-text/src/h/Revision.h
echo x - lq-text/src/h/blkheader.h 1>&2
sed 's/^X//' >lq-text/src/h/blkheader.h <<'@@@End of lq-text/src/h/blkheader.h'
X/* blkheader.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X *
X * (was called blockheader.h, but this was too long on SysV for RCS)
X */
X
X/* describe the physical WordPlace database...
X *
X * $Header: /usr/src/cmd/lq-text/src/h/RCS/blkheader.h,v 1.2 90/03/20 20:57:46 lee Rel1-10 $
X *
X * $Log:	blkheader.h,v $
X * Revision 1.2  90/03/20  20:57:46  lee
X * removed WID from the block.  This reduces checking, but should also
X * noticeably reduce the size of the database.
X * 
X * Revision 1.1  90/03/20  20:54:44  lee
X * Initial revision
X *
X */
X
X/* The header of each block -- I can't use sReadNumber, because I don't know
X * the size of NextOffset until I get to the end, and it's too late by then!
X *
X * I should really store the block offset, and not the byte offset.  This
X * would save a whole byte -- I could use 3 bytes for the NextBlock!
X */
Xtypedef struct {
X    unsigned long NextOffset; /* a byte offset */
X    char Data[1]; /* the address of this is where the numbers start... */
X} t_BlockHeader;
@@@End of lq-text/src/h/blkheader.h
echo x - lq-text/src/h/emalloc.h 1>&2
sed 's/^X//' >lq-text/src/h/emalloc.h <<'@@@End of lq-text/src/h/emalloc.h'
X/* emalloc.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* emalloc.h -- header file for emalloc.c, Liam Quin's malloc() wrapper
X *
X * $Id: emalloc.h,v 1.5 91/03/02 19:40:04 lee Rel1-10 $
X *
X * $Log:	emalloc.h,v $
X * Revision 1.5  91/03/02  19:40:04  lee
X * Simpler version of malloc defines if MALLOCTRACE unused...
X * 
X * Revision 1.4  91/03/02  18:31:21  lee
X * Simpler call to malloc wrappers if MALLOCTRACE undefined.
X * 
X * Revision 1.3  90/10/06  02:18:26  lee
X * Prepared for first beta release.
X * 
X * Revision 1.2  90/08/29  21:57:44  lee
X * removed most of the testing code
X * 
X * Revision 1.1  90/08/09  19:14:48  lee
X * Initial revision
X * 
X * Revision 2.2  89/10/08  20:45:20  lee
X * Working version of nx-text engine.  Addfile and wordinfo work OK.
X * 
X *
X */
X
X/* Set by InitScr() to the result of initscr(), and cleared by EndWin() */
Xextern int _LiamIsInCurses;
X
X/* Wrappers for initscr() and endwin() that keep _LiamIsInCurses up to
X * date; EndWin() only calls endwin() if the flag is set.
X */
X#define InitScr() (_LiamIsInCurses = initscr())
X#define EndWin() (_LiamIsInCurses ? (_LiamIsInCurses = 0), endwin() : 0)
X
Xextern char *_emalloc(), *_erealloc(), *_ecalloc();
Xextern void _efree();
X
X/* With MALLOCTRACE defined, every call also passes the source file name
X * and line number of the caller to the wrapper; without it, the e* names
X * are simply other names for the _e* functions.
X */
X#ifdef MALLOCTRACE
X#define emalloc(u) _emalloc(u, __FILE__, __LINE__)
X#define erealloc(s, u) _erealloc(s, u, __FILE__, __LINE__)
X#define ecalloc(n, siz) _ecalloc(n, siz, __FILE__, __LINE__)
X#define efree(s) _efree(s, __FILE__, __LINE__)
X#else
X#define emalloc _emalloc
X#define erealloc _erealloc
X#define ecalloc _ecalloc
X#define efree _efree
X#endif
@@@End of lq-text/src/h/emalloc.h
echo x - lq-text/src/h/fileinfo.h 1>&2
sed 's/^X//' >lq-text/src/h/fileinfo.h <<'@@@End of lq-text/src/h/fileinfo.h'
X/* fileinfo.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* Internal structure used by NX-Text to represent a file */
X
X/* Needs: sys/types.h */
X
X/* $Id: fileinfo.h,v 1.2 90/10/06 02:18:27 lee Rel1-10 $
X *
X * $Log:	fileinfo.h,v $
X * Revision 1.2  90/10/06  02:18:27  lee
X * Prepared for first beta release.
X * 
X * Revision 1.1  90/08/09  19:14:57  lee
X * Initial revision
X * 
X * Revision 2.2  89/10/08  20:45:57  lee
X * Working version of nx-text engine.  Addfile and wordinfo work OK.
X * 
X * Revision 2.1  89/10/02  01:14:29  lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X * 
X * Revision 1.2  89/09/16  21:15:19  lee
X * First demonstratable version.
X * 
X * Revision 1.1  89/09/07  21:00:34  lee
X * Initial revision
X * 
X *
X */
X
Xtypedef unsigned long t_FID;
X
Xtypedef struct {
X    char *Name;
X    t_FID FID; /* File Identifier */
X    int FilterType; /* command to ASCIIify, 0 unknown, 1 none */
X    time_t Date; /* when the file was last indexed */
X    FILE *Stream;
X} t_FileInfo;
X
X#define FindFile(name) ((*(name) == '/') ? (name) : _FindFile(name))
Xextern char *_FindFile();
@@@End of lq-text/src/h/fileinfo.h
echo x - lq-text/src/h/filter.h 1>&2
sed 's/^X//' >lq-text/src/h/filter.h <<'@@@End of lq-text/src/h/filter.h'
X/* filter.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* filter.h -- define filter table for NX-Text, Liam Quin's text retrieval
X * program.
X * This table is built from a file like a simplified /etc/magic, normally
X * stored in /usr/local/lib/nx-text/lib/filtertable
X * but you can set this either here or in the Makefile.
X *
X * NEEDS: stdio.h
X *
X * $Id: filter.h,v 1.6 91/03/02 18:45:04 lee Rel1-10 $
X *
X * $Log:	filter.h,v $
X * Revision 1.6  91/03/02  18:45:04  lee
X * Spell MAILFILTER correctly in the ifdef...
X * 
X * Revision 1.5  90/10/13  03:11:31  lee
X * Now defines filters for easier stand-alone testing of stuff...
X * 
X * Revision 1.4  90/10/06  02:18:28  lee
X * Prepared for first beta release.
X * 
X * Revision 1.3  90/09/28  23:03:16  lee
X * Now use MAILFILTER and NEWSFILTER...
X * 
X * Revision 1.2  90/08/29  21:57:57  lee
X * removed most of the testing code
X * 
X * Revision 1.1  90/08/09  19:15:01  lee
X * Initial revision
X * 
X * Revision 2.2  89/10/08  20:46:04  lee
X * Working version of nx-text engine.  Addfile and wordinfo work OK.
X * 
X * Revision 2.1  89/10/02  01:14:33  lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X * 
X *
X */
X
X#define FTYPE_NEWS  1
X#define FTYPE_MAIL  2
X#define FTYPE_CDMS  3
X#define FTYPE_MOSTLYASCII 4
X#define FTYPE_C_SOURCE 5
X
X/* The Type field in each array entry is so that I can do some very simple
X * checking...
X */
Xextern int fclose(), pclose();
Xstruct s_FilterTable {
X    int Type;
X    int (* close)(); /* how to close the darned stream */
X    char *String;
X};
X#ifndef FILTERDEF
Xextern struct s_FilterTable FilterTable[];
X#else
Xstruct s_FilterTable FilterTable[] = {
X    { 0, fclose, 0 }, /* use fopen() */
X#ifndef NEWSFILTER
X# define NEWSFILTER "NewsFilter"
X#endif
X    { FTYPE_NEWS, pclose, NEWSFILTER },
X#ifndef MAILFILTER
X# define MAILFILTER "MailFilter"
X#endif
X    { FTYPE_MAIL, pclose, MAILFILTER },
X#ifdef FTYPE_CDMS /* CrystalWriter from Syntactics... */
X    { FTYPE_CDMS, pclose, "CDMSFilter" },
X#endif
X#ifdef FTYPE_NTROFF
X    { FTYPE_NTROFF, pclose, "lqderoff" }, /* not yet released, sorry */
X#endif
X    { FTYPE_MOSTLYASCII, pclose, "AsciiFilter" },
X#ifdef FTYPE_C_SOURCE
X    { FTYPE_C_SOURCE, pclose, "CFilter" }, /* leave me last! */
X#endif
X    /* If you add more, you MUST update MaxFilterType */
X    { 0, 0, 0 }
X};
X#endif
X#define MaxFilterType FTYPE_C_SOURCE
@@@End of lq-text/src/h/filter.h
echo x - lq-text/src/h/globals.h 1>&2
sed 's/^X//' >lq-text/src/h/globals.h <<'@@@End of lq-text/src/h/globals.h'
X/* globals.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X *
X * $Id: globals.h,v 1.6 91/02/20 19:26:53 lee Rel1-10 $
X *
X * (see Log at end of this file for change history.  Keep this up to date
X * using rcs if you have it...)
X */
X
X/* globals.h -- declarations of globally accessible variables, and also
X * of configurable parameters.
X *
X * Some of the configuration options might be given in ../Makefile, so
X * you must check in there too.
X *
X * Everything that includes this file must be linked with Defaults.c
X */
X
X/* 
X * DOCPATH gives the list of directories in which to search in order
X * to find files to retrieve and to index.  The default can be wired
X * in here, or can be simply "." (in which case relative pathnames will
X * be from wherever one invokes the commands, and absolute pathnames
X * will be absolute.  For example,
X * #define DFLTDOCPATH "/usr/man:."
X * In any case, it can be overridden by a DOCPATH line in the configuration
X * file for a given database (README in the database directory), and also
X * by an environment variable DOCPATH (the latter taking precedence over
X * the former).
X *
X * Use ((char *) 0) to disable the default -- in this case, you always have
X * to give one, either with the $DOCPATH variable or in the database file.
X *
X */
X#ifndef DFLTDOCPATH
X# define DFLTDOCPATH ((char *) 0)
X#endif
X
X/* LQTEXTDIR: if the programs can't find the directory to use -- i.e.,
X * there was no -d option and $(LQTEXTDIR) is unset, we either
X * look in UNDERHOME (if that was defined here) or in wherever LQTEXTDIR
X * is defined to point.
X */
X#ifndef LQTEXTDIR
X# define LQTEXTDIR "/usr/spool/lqtextdir"
X#endif
X
X/* If UNDERHOME is set, look there for a directory -- e.g. 
X * #define UNDERHOME "sockdrawer"
X * would make lqtext programs look for a directory something like
X * /users/liam/sockdrawer
X * (where /users/liam is my login directory)
X */
X#ifndef UNDERHOME
X# define UNDERHOME "LQTEXTDIR"
X#endif
X
X/* The name of a configuration file found in the database directory:
X */
X#define CONFIGFILE "README"
X
X/* If the config file doesn't give a filename for a list of common
X * words, we look for one called DFLTCOMMONFILE (and don't mind if we
X * don't find it).  Use "/dev/null" or ((char *) 0) if you want to
X * disable the default.
X * It's case sensitive, of course.
X */
X#define DFLTCOMMONFILE "CommonWords"
X
X#ifndef PAGER
X/* The default pager to use if the user doesn't set $PAGER.  This is only
X * used in lqshow, the browser.  Good things to try are
X * more, "less -Ce", and (generally only on System V) "pg -ns".
X * Specify an absolute path if possible.  It's often a lot faster, and
X * it's somewhat safer...
X */
X# ifdef BSD
X#  define PAGER "/usr/ucb/more"
X# else
X#  define PAGER "/usr/bin/pg -ns"
X# endif
X#endif
X
X#ifndef DBMCREAT
X/* If you are using dbm or gdbm (?), you will need to create the dbm files
X * by hand yourself.  Defining DBMCREAT as 0 makes the software do this
X * automatically, with a very slight performance penalty.
X *
X * ndbm and sdbm can use O_CREAT, so set it to 1 here for them.
X * You will also have to look at ../Makefile, ../PORTING, smalldb.h and
X * ../lqlib/smalldb.h, making whatever changes are needed.
X */
X# define DBMCREAT 1 /* 1 for ndbm, 0 for dbm */
X#endif
X
X#ifdef sparc
X# define NEEDALIGN
X#endif
X
X#ifdef mips /* e.g. SGI machines */
X# define NEEDALIGN
X#endif
X
X/* NEEDALIGN is for C compilers that require C structures to start at
X * word boundaries.  You need this on sparc and sgi machines...
X */
X
X/***
X *** If you want to change anything beyond here...
X ***
X *** well, you can.
X *** After all, it's your copy.
X ***
X *** But don't come running back to me if it doesn't work!
X *** At least not until you have tried
X ***	+  understanding what the problem is;
X ***	+  looking at the source to see why;
X ***	+  fixing the problem;
X ***	+  taking off your shoes and socks and grinning for a while.
X ***
X *** Liam.
X ***
X ***/
X
X/* The following let you reconfigure the names of the files that form
X * part of the database, but there is no point in doing so unless you
X * are porting to some strange system that has absurd filename restrictions!
X */
X#ifndef WORDINDEX
X# define WORDINDEX "wordlist"
X    /* This is a dbm file, so you'll get two files, one with ".pag"
X     * stuck on the end and one with ".dir" on the end.
X     * It contains an entry for every word in the database, enabling
X     * the software to go from a word to an integer (well, a t_WID)
X     * very quickly.
X     * It tends to be a little over one tenth of the size of the DATABASE.
X     */
X#endif
X#ifndef WIDINDEXFILE
X# define WIDINDEXFILE "WIDIndex"
X    /* WIDINDEXFILE contains each word in the database, together with some
X     * information and the first few bytes of data.
X     * It contains WIDBLOCKSIZE bytes for every word, but this has to
X     * be at least MAXWORDLEN + 10 bytes long (see WordInfo.c).
X     */
X#endif
X#ifndef DATABASE
X# define DATABASE "data"
X    /* For those words whose data doesn't fit into the first WIDBLOCKSIZE
X     * bytes, space is allocated in this file in BLOCKSIZE chunks.  Make
X     * BLOCKSIZE small, or you will waste a lot of space -- on the other
X     * hand, there's a 4-byte-per-block overhead at the moment.
X     * This file gets very  b  i  g  indeed.
X     */
X#endif
X#ifndef FILEINDEX
X# define FILEINDEX "FileList"
X    /* This is a list of every file in the database, again in dbm format,
X     * so there are actually two files (a .pag and a .dir) involved.
X     * If your files are short, it will quickly grow to a tenth of the size
X     * of the database.
X     * It stores the filename, and some other information.
X     */
X#endif
X#ifndef FIDFILE
X# define FIDFILE "FIDFile"
X    /* This contains the largest currently used file number... you can
X     * look at it to see how many files have been indexed.
X     * It is only a few bytes long.
X     */
X#endif
X#ifndef WIDFILE
X# define WIDFILE "WIDFile"
X    /* This contains the largest currently used word number... you can
X     * look at it to see how many unique words have been seen.
X     * It is only a few bytes long.
X     */
X#endif
X
X#ifndef WIDBLOCKSIZE
X# define WIDBLOCKSIZE	32
X/* WIDBLOCKSIZE absolutely must be large enough to fit at least one byte
X * of actual data, or all hell will break loose.
X * (actually that could be fixed...).
X * In any case, it has to contain (apart from the >= 1 byte of data):
X * + the length count (1 byte) and the word itself (no null on the end)
X * + the block number in the database (1..5 bytes)
X * + the number of matches (1..5 bytes)
X *
X * It helps efficiency very, very slightly if these are a power of two
X * bytes, as then they never cross Unix block boundaries.
X *
X */
X#endif
X
X#ifndef BLOCKSIZE
X#define BLOCKSIZE	64
X/* BLOCKSIZE is the size of blocks in the data file.  There are several
X * tradeoffs:
X * + there is a 4-bytes-per-block overhead for list pointers, so it's
X *   a good idea to make them large
X * + there's a bit of work involved in fetching the blocks, so things go
X *   faster if they're larger...
X * + many blocks are not full, so it's a good idea to make them small.
X *   On average, a little over (BLOCKSIZE - 4) / 2 bytes are wasted for
X *   every word chain.
X * + since many of the blocks are not full, it's a good idea to make them
X *   small, minimising the amount of extra data that gets copied around by
X *   the Unix kernel.  If the blocks are smaller it'll go faster...
X *
X * It helps efficiency very, very slightly if these are a power of two
X * bytes, as then they never cross Unix block boundaries.
X *
X */
X#endif
X
X/**** Some useful macros */
X
X/* STREQ(a,b) is much faster than strcmp() in the (common) case that the
X * first characters of the two strings differ.
X * It is due (as far as I know) to Henry Spencer, at the University of
X * Toronto Zoology Dept.,
X * utzoo!henry
X */
X#ifndef STREQ
X# define STREQ(henry,utzoo) (*(henry) == *(utzoo) && !strcmp(henry, utzoo))
X#endif
X
X/* Inline functions are functions that get expanded inline during
X * compilation -- sort of like macros with real local arguments.
X * Not all compilers support them.
X */
X#ifdef __GNUC__
X#  define INLINE inline
X#else
X#  define INLINE /* not supported */
X#endif
X
X#ifdef DefineThem
X# define DECL(name, type, value)   type name = value
X# define EXTERN /* just define them please */
X#else
X# define EXTERN extern /* declare but do not define */
X# define DECL(name, type, value)   EXTERN type name
X#endif
X
X/****/
X
X/* Now declare (or define) things: */
X
Xextern char *progname; /* from progname.c, for error messages */
XDECL(CommonWordFile, char *, DFLTCOMMONFILE); 
XDECL(DatabaseDir, char *, LQTEXTDIR); 
XDECL(FileIndex, char *, FILEINDEX);
XDECL(WordIndex, char *, WORDINDEX);
XDECL(DataBase, char *, DATABASE);
XDECL(FidFile, char *, FIDFILE);
XDECL(WidFile, char *, WIDFILE);
XDECL(WidIndexFile, char *, WIDINDEXFILE);
XDECL(DocPath, char *, DFLTDOCPATH); 
X
X/*
X * $Log:	globals.h,v $
X * Revision 1.6  91/02/20  19:26:53  lee
X * Added NEEDALIGN on mips systems
X * (thanks to Mark Moraes, moraes@cs.toronto.edu)
X * 
X * Revision 1.5  90/10/07  20:41:20  lee
X * Added NEEDALIGN for fussy architectures.
X * 
X * Revision 1.4  90/10/06  02:21:21  lee
X * Prepared for first beta release.
X * 
X * Revision 1.3  90/10/03  21:31:54  lee
X * Added definition of PAGER, which has moved here from lqshow.c
X * 
X * Revision 1.2  90/08/09  19:15:03  lee
X * after BSD lint and saber-C
X * 
X * Revision 1.1  90/03/23  17:32:11  lee
X * Initial revision
X * 
X *
X */
@@@End of lq-text/src/h/globals.h
echo x - lq-text/src/h/numbers.h 1>&2
sed 's/^X//' >lq-text/src/h/numbers.h <<'@@@End of lq-text/src/h/numbers.h'
X/* numbers.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* ReadNumber and WriteNumber take/return a long, using a compression
X * algorithm to reduce the amount of data taken.
X *
X * The versions whose names are prefixed with an s use (char *) pointers
X * instead.
X *
X * $Id: numbers.h,v 1.3 90/10/06 02:18:30 lee Rel1-10 $
X *
X */
X
Xextern INLINE unsigned long fReadNumber();
Xextern INLINE unsigned long sReadNumber();
X
Xextern INLINE void fWriteNumber();
Xextern INLINE void sWriteNumber();
X
X/*
X * $Log:	numbers.h,v $
X * Revision 1.3  90/10/06  02:18:30  lee
X * Prepared for first beta release.
X * 
X * Revision 1.2  90/08/09  19:15:42  lee
X * after BSD lint and saber-C
X * 
X * Revision 1.1  90/04/19  19:27:04  lee
X * Initial revision
X * 
X * Revision 2.2  89/10/08  20:46:43  lee
X * Working version of nx-text engine.  Addfile and wordinfo work OK.
X * 
X * Revision 1.2  89/09/16  21:15:40  lee
X * First demonstratable version.
X * 
X * Revision 1.1  89/09/07  21:06:02  lee
X * Initial revision
X * 
X */
@@@End of lq-text/src/h/numbers.h
echo x - lq-text/src/h/pblock.h 1>&2
sed 's/^X//' >lq-text/src/h/pblock.h <<'@@@End of lq-text/src/h/pblock.h'
X/* pblock.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X#ifndef PBLOCK_H /* the matching endif is at the end of the file... */
X
X# define PBLOCK_H
X/* The physical Word Database...
X *
X * First, there is the WID (from 1 to 4 bytes)
X *
X * Then, there is a NEXT pointer (or 0).
X *
X * Then, there is a list of (FID, OFFSET) pairs.
X *
X * $Header: /usr/src/cmd/lq-text/src/h/RCS/pblock.h,v 1.2 90/08/09 19:15:45 lee Rel1-10 $
X *
X * $Log:	pblock.h,v $
X * Revision 1.2  90/08/09  19:15:45  lee
X * after BSD lint and saber-C
X * 
X * Revision 1.1  90/03/01  23:54:37  lee
X * Initial revision
X * 
X * Revision 2.2  89/10/08  20:47:04  lee
X * Working version of nx-text engine.  Addfile and wordinfo work OK.
X * 
X * Revision 2.1  89/10/02  01:15:36  lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X * 
X * Revision 1.2  89/09/16  21:15:43  lee
X * First demonstratable version.
X * 
X * Revision 1.1  89/09/07  21:06:09  lee
X * Initial revision
X * 
X *
X */
X
Xtypedef struct {
X    t_FID FID;
X    unsigned long BlockInFile;
X    unsigned short WordInBlock;
X    unsigned short Flags;
X    unsigned char StuffBefore; /* preceding ignored garbage */
X} t_WordPlace;
X
X/* This structure is really only used by addfile; elsewhere arrays of
X * WordPlace are used.
X */
X
Xtypedef struct s_WordPlaceList {
X    char *Word;
X    t_WordPlace WordPlace;
X    struct s_WordPlaceList *Next;
X} t_WordPlaceList;
X
X/* Warning: One cannot use structure copy for a pblock! */
X
X/* This does *NOT* correspond to the physical disk layout -- see pblock.c */
Xtypedef struct {
X    t_WID WID; /* for checking; */
X    unsigned long ChainStart;
X    unsigned long NumberOfWordPlaces;
X    t_WordPlace WordPlaces[1]; /* made by joining lots of disk blocks... */
X} t_pblock;
X
X#endif
@@@End of lq-text/src/h/pblock.h
echo x - lq-text/src/h/phrase.h 1>&2
sed 's/^X//' >lq-text/src/h/phrase.h <<'@@@End of lq-text/src/h/phrase.h'
X/* phrase.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* LQ-Text -- Liam's Text Retrieval Package
X * Liam R. Quin, September 1989, and later...
X *
X * phrase.h -- data structures for handling entire phrases
X *
X */
X
X/* $Id: phrase.h,v 1.2 90/10/06 02:18:33 lee Rel1-10 $
X *
X */
X
X/* Represent a Phrase as a linked list of WordInfo pointers, plus a list
X * of matches.
X */
X
X/* One word of a phrase; the words of a phrase form a singly linked list. */
Xtypedef struct s_PhraseItem {
X    t_WordInfo *Word; /* the WordInfo for this word */
X    struct s_PhraseItem *Next; /* next item in the list */
X    unsigned long SearchIndex; /* For phrase-matching */
X    char *WordStart; /* pointer into original phrase */
X} t_PhraseItem;
X
Xtypedef enum {
X    PCM_AnyCase, /* Ignore case entirely */
X    PCM_HalfCase, /* Upper only matches upper; lower matches either */
X    PCM_SameCase, /* Exact matching */
X} t_PhraseCaseMatch;
X
X/* One matched word: its WID and a pointer to the place it was found.
X * Next chains t_Match nodes together (presumably the successive words of
X * one matched phrase -- confirm against the phrase-matching code).
X */
Xtypedef struct s_Match {
X    t_WID WID; /* identifier of the matched word */
X    t_WordPlace *Where; /* the occurrence that matched */
X    struct s_Match *Next; /* next node in this chain */
X} t_Match;
X
X/* A singly linked list whose nodes each point at a t_Match chain. */
Xtypedef struct s_MatchList {
X    t_Match *Match; /* head of one t_Match chain */
X    struct s_MatchList *Next; /* next node in the list */
X} t_MatchList;
X
X
X/* A phrase: its words (as a list of t_PhraseItem), the text it was
X * built from, and the list of matches found for it.
X */
Xtypedef struct s_Phrase {
X    t_PhraseItem *Words; /* list of words and pblocks */
X    char *OriginalString; /* as supplied by the user */
X    char *ModifiedString; /* after deleting short/unindexed words */
X    unsigned long NumberOfMatches; /* presumably the length of Matches -- confirm */
X    t_MatchList *Matches; /* the matches found for this phrase */
X    struct s_Phrase *Next; /* for use when we're in a list of phrases... */
X    unsigned short HasUnknownWords; /* NOTE(review): looks like a boolean flag -- confirm */
X} t_Phrase;
X
X/* This is for FilleList() */
X/* A singly linked list of strings. */
Xtypedef struct s_Answer {
X    char *Answer; /* the string held by this node */
X    struct s_Answer *Next; /* next node in the list */
X} t_Answer;
X
X/*
X * $Log:	phrase.h,v $
X * Revision 1.2  90/10/06  02:18:33  lee
X * Prepared for first beta release.
X * 
X * Revision 1.1  90/08/09  19:15:49  lee
X * Initial revision
X * 
X * Revision 1.1  89/09/17  23:03:37  lee
X * Initial revision
X * 
X */
@@@End of lq-text/src/h/phrase.h
echo x - lq-text/src/h/smalldb.h 1>&2
sed 's/^X//' >lq-text/src/h/smalldb.h <<'@@@End of lq-text/src/h/smalldb.h'
X/* smalldb.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* $Id: smalldb.h,v 1.3 91/03/03 00:12:56 lee Exp $
X */
X
X/* You must include fcntl.h before this file. */
X
X/* Choose a dbm implementation.  If one of the macros ndbm, sdbm or
X * ozmahash is defined, the matching header is included and both
X * FoundDbmOK and NDBM are defined.  If none of them is defined,
X * Liamdbm.h is used instead (and NDBM is not defined here).
X */
X
X#ifdef ndbm
X# include <ndbm.h>
X# define FoundDbmOK
X# define NDBM
X#endif
X
X#ifdef sdbm
X# include "sdbm.h"
X# define FoundDbmOK
X# define NDBM /* it's compatible */
X#endif
X
X#ifdef ozmahash
X# include "ozmadbm.h"
X# define FoundDbmOK
X# define NDBM /* it's compatible as well... */
X#endif
X
X/* No known dbm package was selected above: fall back to Liamdbm.h */
X#ifndef FoundDbmOK
X# include "Liamdbm.h"
X#endif
X
X/* If O_RDWR is still undefined, fcntl.h has presumably not been
X * included yet, so include it here.
X */
X#ifndef O_RDWR
X# include <fcntl.h>
X#endif
X
X#define CACHE 2 /* size of DBM cache in startdb() -- I only use two! */
X/* If you rip out the dbm cache stuff for use elsewhere, increase the 2
X * to something like 5 or so!!!  Each entry uses two file pointers.
X * Lee
X */
X
X/* Only if CACHE is not defined (i.e. the #define above has been removed):
X * startdb() and enddb() become simple macros round dbm_open() and
X * dbm_close().
X */
X#ifndef CACHE
X# define startdb(FilePrefix) dbm_open(FilePrefix, O_RDWR|O_CREAT, 0640)
X# define enddb(db) { if (db) dbm_close(db); }
X#endif
X
X
X/* startdb is not a macro, so declare it as a function returning DBM *. */
X#ifndef startdb
XDBM *startdb();
X#endif
X
X/* enddb is not yet a macro: with CACHE defined it expands to nothing;
X * otherwise it is declared as a function.
X */
X#ifndef enddb
X# ifdef CACHE
X#  define enddb(db) /* nothing to do, because of the cache */
X# else
X   void enddb();
X# endif /* CACHE */
X#endif /* !enddb */
X
X/*
X * $Log:	smalldb.h,v $
X * Revision 1.3  91/03/03  00:12:56  lee
X * Integrated ozmahash.
X * 
X * Revision 1.2  90/10/06  02:18:36  lee
X * Prepared for first beta release.
X * 
X * Revision 1.1  90/08/09  19:16:00  lee
X * Initial revision
X * 
X * Revision 2.2  89/10/08  20:47:19  lee
X * Working version of nx-text engine.  Addfile and wordinfo work OK.
X * 
X * Revision 2.1  89/10/02  01:16:01  lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X * 
X * Revision 1.2  89/09/16  21:15:45  lee
X * First demonstratable version.
X * 
X * Revision 1.1  89/09/07  21:06:12  lee
X * Initial revision
X * 
X */
@@@End of lq-text/src/h/smalldb.h
echo x - lq-text/src/h/wordindex.h 1>&2
sed 's/^X//' >lq-text/src/h/wordindex.h <<'@@@End of lq-text/src/h/wordindex.h'
X/* wordindex.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* (this file is currently empty, but might return...) */
X
X/*
X * $Id: wordindex.h,v 1.2 90/10/06 02:18:38 lee Rel1-10 $
X *
X * $Log:	wordindex.h,v $
X * Revision 1.2  90/10/06  02:18:38  lee
X * Prepared for first beta release.
X * 
X * Revision 1.1  90/08/09  19:16:02  lee
X * Initial revision
X * 
X * Revision 2.1  89/10/02  01:16:06  lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X * 
X * Revision 1.2  89/09/16  21:15:47  lee
X * First demonstratable version.
X * 
X * Revision 1.1  89/09/07  21:06:13  lee
X * Initial revision
X * 
X *
X */
@@@End of lq-text/src/h/wordindex.h
echo x - lq-text/src/h/wordinfo.h 1>&2
sed 's/^X//' >lq-text/src/h/wordinfo.h <<'@@@End of lq-text/src/h/wordinfo.h'
X/* wordinfo.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/*
X * $Id: wordinfo.h,v 1.3 90/10/06 02:21:30 lee Rel1-10 $
X */
X
X/* Word IDentifier.  This typedef comes before the inclusion of pblock.h
X * because t_pblock (declared there) has a t_WID member.
X */
Xtypedef unsigned long t_WID;
X
X#ifndef PBLOCK_H
X# include "pblock.h"
X#endif
X
X/* WIDBLOCKSIZE may be defined before this header is included to override
X * the default of 32.
X */
X#ifndef WIDBLOCKSIZE
X#define WIDBLOCKSIZE 32
X#endif
X
Xextern char *WidIndexFile; /* Default.c */
X
X/* this is a hack for speed: */
X/* (after this point the name GetNextWID is replaced by SpoofGetNextWID) */
X#define GetNextWID SpoofGetNextWID
X
X/** A t_WordInfo describes a single word, in terms of
X ** where it came from
X ** how to find its database entries
X ** how to find the in-core database entries (a copy of the above)
X **/
X
X/* There would be a performance benefit if this struct was smaller.
X * It was foolish of me to use WordInfo for so many different things in
X * addfile, and now I pay the price.
X * Addfile may end up calling malloc for 10,000 of these things...
X *
X * There should be:
X * t_WordPlace (exists, pblock.h)
X *	for recording a specific occurrence of a given word in a given file
X * t_WordInfo (definition follows... look down...)
X *	for recording information about a WID's entry in the database
X * t_WordPlaceList
X *	for addfile to make a list of word places...
X * t_pblock (exists, see pblock.h)
X *	for containing the list of WordPlaces found in the database for a
X *	given word, or for putting them there.  Uses arrays rather than
X *	lists to squeeze a few extra milliseconds.  Some hope :-( :-)
X *
X * t_WordPlaceList will almost certainly happen in the next major edit phase...
X * t_WordInfo will then be somewhat smaller.
X * All of the entries marked with a leading comment (below) should
X * be elsewhere (and some of them were, in the Grand Design!).
X *
X */
X/* Everything known about one word: the word itself, its identifier,
X * and where its entries are, both in the database and in memory.
X */
Xtypedef struct s_WordInfo {
X    char *Word; /* the word itself */
X    t_WID WID; /* My Word Identifier */
X    unsigned long NumberOfWordPlaces; /* total */
X    t_FID FID; /* where we got it from */
X    unsigned long Offset; /* word entry position in the data base */
X    struct s_WordInfo *Next; /* for making lists of WordInfo structs */
X    char *DataBlock; /* for writing me out to the index */
X    char *WordPlaceStart; /* NOTE(review): presumably points into DataBlock -- confirm */
X    t_WordPlace *WordPlaces; /* first few pairs */
X    t_WordPlace WordPlace; /* For addfile -- this is due to go!!!! */
X    /* shorts are at the end to obviate alignment padding... */
X    /* (WordPlacesInHere is an unsigned long; only Length is a short) */
X    unsigned long WordPlacesInHere; /* presumably the number of entries in WordPlaces -- confirm */
X    unsigned short Length; /* Word length; reduce the need for strlen */
X#if 0
X    unsigned char Flags;
X	/* Flags serve two purposes:
X	 * the LSB says whether the entry is sorted.
X	 * the remainder are a logical AND of all entries in a sorted
X	 * block.  NOTE: if the block is unsorted, the other bits should
X	 * still be up to date.
X	 */
X#endif
X} t_WordInfo;
X
X/*
X * $Log:	wordinfo.h,v $
X * Revision 1.3  90/10/06  02:21:30  lee
X * Prepared for first beta release.
X * 
X * Revision 1.2  90/08/09  19:16:04  lee
X * after BSD lint and saber-C
X * 
X * Revision 2.2  89/10/08  20:47:27  lee
X * Working version of nx-text engine.  Addfile and wordinfo work OK.
X * 
X * Revision 2.1  89/10/02  01:16:15  lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X * 
X * Revision 1.3  89/09/17  23:04:52  lee
X * Various fixes; NumberInBlock now a short...
X * 
X * Revision 1.2  89/09/16  21:15:49  lee
X * First demonstratable version.
X * 
X * Revision 1.1  89/09/07  21:06:16  lee
X * Initial revision
X * 
X */
@@@End of lq-text/src/h/wordinfo.h
echo x - lq-text/src/h/wordrules.h 1>&2
sed 's/^X//' >lq-text/src/h/wordrules.h <<'@@@End of lq-text/src/h/wordrules.h'
X/* wordrules.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
X * This code is NOT in the public domain.
X * See the file COPYRIGHT for full details.
X */
X
X/* $Id: wordrules.h,v 1.2 90/10/06 02:18:39 lee Rel1-10 $
X *
X */
X
X/* Rules for determining what an indexable word looks like;
X * These are implemented by the various filters, as well as by
X * the indexing software itself.  This means that the filters
X * don't need to keep track of word lengths, as addfile will do this,
X * but that they should not emit non-word stuff if they can help it,
X * turning it into the equivalent amount (in bytes) of white-space
X * instead.
X * They should also turn words they don't want indexed into "qxxx",
X * with the right number of x's (e.g. "bare" --> "qxxx").
X */
X
X/* A "word" is a letter followed by any combination of
X * letters, digits or '_'.  An embedded (not trailing) ' is also allowed
X * (_ is allowed so that one can index programming languages; strictly
X * speaking, a lot of languages allow _ at the start too, but I don't
X * want to get confused by nroff output etc., which contains lines of
X * underscores)
X *
X * This scheme currently excludes numbers...
X * 31, 31.4 and 31.9e4 will all be ignored.  So will 1987.
X */
X
X#define StartsWord(ch) isalpha(ch)
X#define WithinWord(ch) (isalnum(ch) || (ch == '_') || (ch == '\''))
X#define EndsWord(ch) isalnum(ch)
X
X/* Don't index words unless they are at least MinWordLength characters
X * long!
X */
X#define MinWordLength 3
X#define MaxWordLength 18 /* truncate words to this */
X/* The following is for *.WordPlace.BlockInFile.  If words are constrained
X * to be 3 or more characters long, there can be at most
X * (FileBlockSize / 4) of them in a block (since words must be separated
X * by at least one character).
X * Hence, 7 bits, which allows 0..127 giving 128 distinct values,
X * gives us a block that is 128 * (MinWordLength + 1) bytes long.
X * With MinWordLength set to 3, that is 512 bytes.
X */
X#define FileBlockSize (128 * (MinWordLength + 1))
X
X/* WordPlace Flags:
X * When a plural word is found, or a possessive word, it is reduced to
X * being singular, and flags are set appropriately.
X * Also, a flag is set to say if the word started with a Capital Letter.
X * This puts Window, windows, and Window's all together, but enables them
X * to be differentiated for searching if required.
X * These flags are implemented by WordInfo and addfile, not by the various
X * filters, but the filters must preserve capitalisation of the first letter
X * in each word, and pass through apostrophes within words (like this's).
X */
X
X/* The values are octal; each flag is a single distinct bit, and together
X * the eight flags occupy the low eight bits.
X */
X#define WPF_WASPLURAL		0001 /* The word...  ended in s */
X#define WPF_UPPERCASE		0002 /* ...Started with a capital letter */
X#define WPF_POSSESSIVE		0004 /* ...ended in 's */
X#define WPF_ENDEDINING		0010 /* ...ended in ing */
X#define WPF_LASTWASCOMMON	0020 /* the previous word was common */
X#define WPF_LASTHADLETTERS	0040 /* we skipped some letters to get here */
X#define WPF_HASSTUFFBEFORE	0100 /* Other than 1 byte of garbage before */
X#define WPF_LASTINBLOCK		0200 /* I'm the last word in this block */
X
X/* new note (jan 90):
X * You can't currently have both plural and possessive in the most common case
X * of the boys' muddy feet (for example), as the trailing ' gets deleted.
X * this doesn't matter, but perhaps that combination should be reserved for
X * had-another-standard-ending??? e.g. -ed or -ing, that isn't often followed by
X * -s or -'s...
X *
X * Also, ENDEDINING (ended in "ing") is currently unused entirely.
X * Perhaps if it is set, the plural and possessive bits should index which of
X * four endings was found, although this would preclude special-casing of the
X * s's combination.  Probably better that way.
X *
X * I should very much like to have another flag or two, perhaps embedded in
X * one of the other fields.  This might be feasible if there is a pre-scan
X * when the index is written to determine the most common (modal) flags and
X * distance (currently I assume 1) and to omit these whenever they are the default.
X * In this case, the fact that every occurrence of Jesus starts with a capital
X * letter (and ends in -s, *blush*), can still lead to most of the flags being
X * omitted.
X *
X * The next revision will separate the list of FIDs from the rest of the information,
X * in which case the embedding of the flags becomes a little trickier.  This
X * belongs in the TODO file now, sorry.
X *
X * Liam Quin, January 22nd 1990, at home in Warrington, England (ugh)
X *
X */
X
X/*
X * $Log:	wordrules.h,v $
X * Revision 1.2  90/10/06  02:18:39  lee
X * Prepared for first beta release.
X * 
X * Revision 1.1  90/08/09  19:16:05  lee
X * Initial revision
X * 
X * Revision 2.2  89/10/08  20:47:35  lee
X * Working version of nx-text engine.  Addfile and wordinfo work OK.
X * 
X * Revision 2.1  89/10/02  01:16:19  lee
X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
X * 
X * Revision 1.2  89/09/16  21:15:52  lee
X * First demonstratable version.
X * 
X * Revision 1.1  89/09/07  21:06:17  lee
X * Initial revision
X * 
X */
@@@End of lq-text/src/h/wordrules.h
echo end of part 02
-- 
Liam R. E. Quin,  lee@sq.com, SoftQuad Inc., Toronto, +1 (416) 963-8337