[comp.sources.misc] v02i042: brwsr - browse text files using inverted index

science@nems.ARPA (Mark Zimmermann) (02/03/88)

Comp.sources.misc: Volume 2, Issue 42

Submitted-By: "Mark Zimmerman" <science@nems.ARPA>

Archive-Name: brwsr


Comp.sources.misc: Volume 2, Issue 42
Submitted-By: "Mark Zimmerman" <science@nems.ARPA>
Archive-Name: brwsr

Following C code uses the index files built by qndxr.c and lets you move
around in them ... view occurrences of words in context, retrieve full
text, do simple proximity searching, etc. ... seems to work on Suns,
VAXen, and Macintoshes ... heavily commented ... ^z

/* transportable version of the browser program -- 871008 ^z
 * version 0.11 -- with fixes to allow for non-ASCII characters
 * in the document/database file....
 *
 * This program lets a user browse through indices created by
 * the qndxr program....
 *
 * Contact Mark Zimmermann, 9511 Gwyndale Drive, Silver Spring, MD  20910
 * ("science (at) nems.arpa, [75066,2044] on CompuServe, 301-565-2166
 * for further information....
 */


/* commands:
 *
 *    ?            print out helpful info
 *    :            quit program
 *    :fnord       open document file 'fnord' for browsing
 *    >grok        append copy of future output to notes file 'grok'
 *    >            end copying to notes file
 *    'DON.MAC     append comment string 'DON.MAC' to notes file
 *    xyzzy        jump to INDEX item 'XYZZY'
 *    ".012345     take string '.012345' as target for jump
 *    -17          back up 17 lines from here
 *    -            back up one line; same as '-1' command
 *    +23          jump down 23 lines from here
 *    + or <ret>   move down one line; same as '+1' command
 *    .29          print 29 lines from here
 *    =            descend:  INDEX -> CONTEXT -> TEXT display
 *    ^            ascend:  TEXT -> CONTEXT -> INDEX display
 *    *            empty 'working subset' proximity filter (nothing valid)
 *    **           fill 'working subset' (everything valid)
 *    &            add word-neighborhood of current index item to subset
 *    &&           add sentence-neighborhood to subset
 *    &&&          add paragraph-neighborhood to subset
 *    ~            invert (logical NOT) entire working subset
 *    ;            repeat previous command
 *    
 */
 

#include <stdio.h>            /* for FILE, printf(), etc. */
#include <strings.h>
#include <ctype.h>

/* -----------------header stuff---------------- */

/* some definitions for the brwsr program...
 * 870805-07-... ^z
 */

/* tell the compiler that we are using Lightspeed C ... mostly so that
 * certain sections of non-transportable code will be activated to
 * compensate for the use of 16-bit int variables in LSC.... */
/* #define LIGHTSPEED */

/* KEY_LENGTH is the number of letters we have in each record.... */
#define KEY_LENGTH    28

/* CONTEXT_LINE_LENGTH is the total length of a key-word-in-context
 * display line.... */
#define CONTEXT_LINE_LENGTH  75

/* CONTEXT_OFFSET is the distance from the start of the context line
 * to where the key word itself begins.... */
#define CONTEXT_OFFSET  30

/* STRLEN is the most characters to allow in a line from the
 * file at one time, and the maximum length of a command line.... */
#define STRLEN  256

/* CMASK masks off characters to avoid negativity problems with funny
 * non-ASCII symbols... */
#define CMASK  0xFF

/* NEIGHBORHOOD is the distance in bytes that defines the proximity
 * neighborhood of a word in the index ... used when defining a
 * subset for proximity searching.... NEIGHBORHOOD * 8 is the
 * compression factor for squeezing the document file down into the
 * array subset[] ...
 * Thus, NEIGHBORHOOD = 32, a good choice, defines a chunkiness of 32
 * characters in making comparisons for proximity determination purposes....
 */
#define NEIGHBORHOOD  32

/* WORD_RANGE, SENTENCE_RANGE, and PARAGRAPH_RANGE define how many chunks
 * of size NEIGHBORHOOD on each side of an item that the neighborhood of
 * each type extends.  Values 1, 10, and 100 work pretty well....
 */
#define WORD_RANGE         1
#define SENTENCE_RANGE    10
#define PARAGRAPH_RANGE  100

/* define a value to help recognize long operations, for which
 * the user should be warned of a likely delay in response ...
 */
#define GIVE_WARNING  200

/* structure of the records in the index key file:
 *    a fixed-length character string kkey, filled out with blanks and
 *        containing the unique alphanumeric 'words' in the document file,
 *        changed to all-capital letters;
 *    a cumulative count of how many total occurrences of words, including
 *        the current one, have happened up to this point in the alphabetized
 *        index.
 *
 *    Obviously, the number of occurrences of the nth word is just the
 *        difference between the nth cumulative count and the (n-1)th
 *        cumulative count.  Also, the (n-1)th cumulative count is a
 *        pointer to the first occurrence of the nth word in the ptr_file
 *        (which holds all the index offset pointers for the document file).
 *
 *    See the index building program "ndxr.c" for further details of the
 *        index structure....
 */
typedef struct
  {
    char kkey[KEY_LENGTH];
    long ccount;
  }  KEY_REC;

/* the pointer records are simply long integers, offsets from the start
 * of the document to the indexed words....*/
#define PTR_REC  long

/* define the values for the three levels that the user may be browsing
 * on:  INDEX is the display of unique words and their occurrence counts;
 * CONTEXT is the key-word-in-context (KWIC) display; TEXT is the full
 * text of the document.  */
#define INDEX    0
#define CONTEXT  1
#define TEXT     2

/* symbolic values for yes/no answers... */
#define TRUE  1
#define FALSE 0

/* ---------------prototypes------------------- */

/* prototypes for my brwsr functions ...
 * 870805...  ^z
 */

#ifdef LIGHTSPEED

/* in file_utils.c */
void open_files (char cmd[]);
void close_files (void);
void init_items (void);

/* in brwsr.c */
void main (void);

/* in show_items.c */
void show_current_item (void);
void show_current_index_item (void);
void show_current_context_item (void);
void show_current_text_item (void);

/* in brws_utils.c */
void get_key_rec (KEY_REC *recp, long n);
PTR_REC get_ptr_rec (long n);
long start_of_line (long p);
long next_line (long p);
void beep (void);

/* in do_help/files.c */
void do_help (void);
void do_open (char cmd[]);
void do_redirection (char cmd[]);
void do_comments (char cmd[]);

/* in do_moves_etc.c */
void do_move (char cmd[]);
int make_move (long n);
int move_in_text (long move);
int make_subindex_move (long move);

/* in do_targets.c */
void do_descend (void);
void do_ascend (void);
void do_target_jump (char cmd[]);
int zstrcmp (char *s1, char *s2);
void init_target_item (KEY_REC *rp, char cmd[]);
void do_multiprint (char cmd[]);

/* in do_subset.c */
void do_make_subset (char cmd[]);
void create_subset (void);
void destroy_subset (void);
void do_invert_subset (void);
void do_add_neighborhood (char cmd[]);
void set_neighborhood_bits (int n);
void set_subset_bit (long offset);
int get_subset_bit (long offset);
long count_subset_recs (KEY_REC *this_rec, KEY_REC *prev_rec);

#endif

/* ------------------------main program-------------- */


/* First, define a few external variables to hold:
 *    - pointers to the document, key, pointer, and notes files;
 *    - numbers indicating the current and the minimum/maximum values for:
 *        INDEX item (one line for each unique word in the document);
 *        CONTEXT item (one line for each occurrence of chosen INDEX word);
 *        TEXT item (byte offset from start of file to current line).
 *    - the actual KEY_RECs corresponding to the current INDEX item and
 *        the INDEX item just before that one
 *    - the subset array pointer used for proximity searching, the flag
 *        empty_subset, and the count of how many current records are in
 *        the working subset subset_rec_count
 *    - the level of user operations:
 *        0     means browsing the INDEX
 *        1    means browsing the CONTEXT (key-word-in-context = KWIC)
 *        2    means browsing the TEXT
 *
 * These items are referred to by various routines at scattered
 * locations, and it seems easiest to let them reside in external
 * variables....
 */
 
FILE *doc_file = NULL, *key_file = NULL, *ptr_file = NULL,
        *notes_file = NULL;
long current_item[3] = {0, 0, 0};
long min_item[3] = {0, 0, 0};
long max_item[3] = {0, 0, 0};
KEY_REC this_rec, prev_rec;
char *subset = NULL;
int empty_subset = TRUE;
long subset_rec_count = 0;
int level = INDEX;

void main()
  {
    char cmd[STRLEN], prevcmd[STRLEN], *gets(), *strcpy();
    void do_help(), do_open(), do_redirection(), do_comments(), do_move(),
        do_make_subset(), do_invert_subset(), do_add_neighborhood(),
        do_descend(), do_ascend(), do_target_jump(), do_multiprint(),
        close_files();
    
    prevcmd[0] = '\0';
    printf ("Greetings, program! ... type '?' for help.\n");
    printf ("Browser version 0.11 by ^z - 871007\n");
    
    while (gets (cmd))
      {
         do_cmd:
         switch (cmd[0])
          {
            case '?':
                  do_help ();
                  break;
            case '\0':
                strcpy (cmd, "+");
                /* <return> is just + */
            case '+':
                /* + or - commands are both moves */
            case '-':
                do_move (cmd);
                break;
            case ':':
                do_open (cmd);
                break;
            case '>':
                do_redirection (cmd);
                break;
            case '\'':
                do_comments (cmd);
                break;
            case '=':
                do_descend ();
                break;
            case '^':
                do_ascend ();
                break;
            case '.':
                do_multiprint (cmd);
                break;
            case '*':
                do_make_subset (cmd);
                break;
            case '&':
                do_add_neighborhood (cmd);
                break;
            case '~':
                do_invert_subset ();
                break;
            case ';':
                strcpy (cmd, prevcmd);
                goto do_cmd;
                break;
            case '"':
                do_target_jump (cmd + 1);
                break;
            default:
                do_target_jump (cmd);
                break;
          }
        strcpy (prevcmd, cmd);
      }
  }


/* ---------------------brws_utils.c----------------- */

/* miscellaneous utility functions for helping the higher-level routines
 * browse around in indexed files....
 * 870805-13-...  ^z
 */

/* function to get the 'n'th KEY_REC from key_file and store it in the
 * KEY_REC space pointed to by recp ... note that if an illegal value
 * of 'n' is requested, a 'null' KEY_REC is produced....
 */
 
void get_key_rec (recp, n)
  KEY_REC *recp;
  register long n;
  {
    extern FILE *key_file;
    extern long min_item[], max_item[];
    char *strncpy();
    void beep();
    
    if (n < min_item[INDEX] || n > max_item[INDEX])
      {
        strncpy ((char *)recp->kkey, "                    ",
                    KEY_LENGTH);
        recp->ccount = 0;
        return;
      }
    
    if (fseek (key_file, sizeof (KEY_REC) * n, 0) != 0)
      {
        beep ();
        printf ("Fatal error in fseek() getting key record #%ld!\n", n);
        exit();
      }
    
    if (fread ((char *)recp, sizeof (KEY_REC), 1, key_file) == 0)
      {
        beep ();
        printf ("Fatal error in fread() getting key record #%ld!\n", n);
        exit();
      }
  }


/* routine to get the nth pointer record (a long integer) from ptr_file;
 * the pointer record gives the actual offset in the document file of
 * the nth word in the expanded index....
 */
 
PTR_REC get_ptr_rec (n)
  register long n;
  {
    PTR_REC p;
    extern FILE *ptr_file;
    void beep();
    
    if (fseek (ptr_file, sizeof (PTR_REC) * n, 0) != 0)
      {
        beep ();
        printf ("Fatal error in fseek() getting pointer record #%ld!\n", n);
        exit();
      }
    
    if (fread ((char *)&p, sizeof (PTR_REC), 1, ptr_file) == 0)
      {
        beep ();
        printf ("Fatal error in fread() getting pointer record #%ld!\n", n);
        exit();
      }
    
    return (p);
  }


/* Move backwards in document file to find the start of the line of text
 * which has offset p (from start of file) somewhere in that line.
 * However, don't go back beyond STRLEN characters at a time... that
 * is, put a ceiling of STRLEN on the maximum 'length of a line', to
 * avoid pathological input files with no \n's....
 * Do the work by grabbing a buffer full and then scanning back in that,
 * rather than by laboriously lseek() and fgetc() back one character at
 * a time....
 */
 
long start_of_line (p)
  register long p;
  {
    extern FILE *doc_file;
    char buf[STRLEN];
    register long i;
    register int n;
    void beep();

    if (p <= 0)
        return (0L);

    i = p - STRLEN;
    i = ((i < 0) ? 0 : i);
    if (fseek (doc_file, i, 0) != 0)
      {
        beep ();
        printf ("Fatal error in fseek() backing up in document!\n");
        exit();
      }
    if ((n = fread (buf, sizeof (char), p - i, doc_file)) == 0)
      {
        beep ();
        printf ("Fatal error in fread() backing up in document!\n");
        exit();
      }

    while (--n >= 0 && buf[n] != '\n');
    
    return (i + n + 1);
  }


/* scan forward until reaching the beginning of the next line of text;
 * if at or beyond the end of the file, return a pointer to the last
 * character in the file ...
 * Don't go beyond STRLEN characters, however, in scanning forward....
 */

long next_line (p)
  long p;
  {
    register int c, n;
    extern long max_item[];
    void beep();
    
    if (p >= max_item[TEXT])
        return (max_item[TEXT]);
    
    if (fseek (doc_file, p, 0) != 0)
      {
        beep ();
        printf ("Fatal error in fseek() scanning forward in document!\n");
        exit();
      }
    
    for (n = 0; n < STRLEN; ++n)
        if ((c = fgetc (doc_file)) == EOF || c == '\n')
            break;
    
    return (ftell (doc_file));
  }


/* paging James Bond ....
 */

void beep ()
  {
    putchar ('\007');
  }


/* ---------------------do_help/files.c---------------- */



/* function to print out helpful information -- a command summary
 * for the user....
 */

void do_help ()
  {
    printf ("  ?        print this help\n");
    printf ("  :        quit the program\n");
    printf ("  :fnord   open document file 'fnord' for browsing\n");
    printf ("  >grok    open notes file 'grok' and append output\n");
    printf ("  >        close notes file\n");
    printf ("  'DON.MAC append comment 'DON.MAC' to notes file\n");
    printf ("  xyzzy    jump to 'XYZZY' in INDEX\n");
    printf ("  \".123    jump to '.123' in INDEX\n");
    printf ("  -17      back up 17 lines\n");
    printf ("  -        back up one line\n");
    printf ("  +0       print current line\n");
    printf (" <return>  move down one line\n");
    printf ("  +23      move down 23 lines\n");
    printf ("  =        INDEX --> CONTEXT --> TEXT\n");
    printf ("  ^        INDEX <-- CONTEXT <-- TEXT\n");
    printf ("  .9       move down and print out 9 lines\n");
    printf ("  *        create an empty working subset\n");
    printf ("  **       fill the working subset (complete database)\n");
    printf ("  &        add word-neighborhood to subset\n");
    printf ("  &&       add sentence-neighborhood to subset\n");
    printf ("  &&&      add paragraph-neighborhood to subset\n");
    printf ("  ~        invert (logical NOT) entire working subset\n");
    printf ("  ;        repeat previous command\n");
  }


/* function to handle a ":" command, to either quit the program or to
 * open a new database for browsing.... display some amusing data
 * about the file that is opened, if successful (total length in bytes,
 * total number of words, and number of unique words in the file)...
 */

void do_open (cmd)
  char cmd[];
  {
    char qcmd[STRLEN];
    extern FILE *doc_file;
    extern long max_item[];
    void destroy_subset (), open_files (), init_items ();
    
    if (cmd[1] == '\0')
      {
        printf ("Quit?\n");
        gets (qcmd);
        if (qcmd[0] == 'y' || qcmd[0] == 'Y')
          {
            printf ("Good-bye ... ^z\n");
            exit ();
          }
        else
          {
            printf ("Cancelled!\n");
            return;
          }
      }
    
    destroy_subset ();
    open_files (cmd);
    if (doc_file == NULL)
        return;
    init_items ();
    printf ("File \"%s\" contains:\n", cmd + 1);
    printf ("%9ld characters\n%9ld total words\n%9ld unique words\n",
        max_item[TEXT] + 1, max_item[CONTEXT] + 1, max_item[INDEX] + 1);
  }


/* function to open or close a notes file ... copies of browser
 * program output gets appended to that notes file as long as it
 * is open, and the user may add comment strings with the '"' command ....
 */

void do_redirection (cmd)
  char cmd[];
  {
     extern FILE *notes_file;
     FILE *fopen ();
     void beep();
     
    if (notes_file != NULL)
         fclose (notes_file);
     
     if (cmd[1] == '\0')
         return;

     if ((notes_file = fopen (cmd + 1, "a")) == NULL)
       {
        beep ();
        printf ("Error opening notes file \"%s\"!\n", cmd + 1);
        return;
       }
  }


/* function to append a comment line to the notes_file... does the job
 * for the '"' command....
 */

void do_comments (cmd)
  char cmd[];
  {
     extern FILE *notes_file;
     void beep();
     
     if (notes_file == NULL)
       {
        beep ();
        printf ("No notes file open!\n");
        return;
       }
     else
        fprintf (notes_file, "%s\n", cmd + 1);
  }
  
 /* -------------------do_moves.c---------------------- */
 
 

/* function to interpret a command to move around in the current level
 * of the browser display ... handles "+", "-", and "<return>" ... calls
 * routine make_move() to do the real work....
 */

void do_move (cmd)
  char cmd[];
  {
    long move;
    extern FILE *doc_file;
    char *strcat();
    void beep(), show_current_item();
    
    if (doc_file == NULL)
      {
        beep ();
        printf ("No file open!\n");
        return;
      }
    
    if (cmd[1] == '\0')
        strcat (cmd, "1");
    move = atol (cmd);
    make_move (move);
    show_current_item ();
  }


/* function to do the actual moving around in the INDEX or CONTEXT or
 * TEXT displays .... Safety features prevent user from going
 * off the end of the file in either direction....
 * make_move() returns FALSE if it fails to completely execute the move
 * (i.e., if the move would have run off either end of the file) and
 * beeps at the user ... it returns TRUE if the move succeeds....
 */

int make_move (n)
  long n;
  {
    int r;
    PTR_REC get_ptr_rec ();
    extern long current_item[], min_item[], max_item[];
    extern int level;
    extern char *subset;
    void beep();
    
    r = TRUE;
    switch (level)
      {
        case INDEX:
            current_item[INDEX] += n;
            if (current_item[INDEX] < min_item[INDEX])
              {
                current_item[INDEX] = min_item[INDEX];
                r = FALSE;
              }
            if (current_item[INDEX] > max_item[INDEX])
              {
                current_item[INDEX] = max_item[INDEX];
                r = FALSE;
              }
            break;

        case CONTEXT:
            if (subset == NULL)
              {
                current_item[CONTEXT] += n;
                if (current_item[CONTEXT] < min_item[CONTEXT])
                  {
                    current_item[CONTEXT] = min_item[CONTEXT];
                    r = FALSE;
                  }
                if (current_item[CONTEXT] > max_item[CONTEXT])
                  {
                    current_item[CONTEXT] = max_item[CONTEXT];
                    r = FALSE;
                  }
              }
            else
                r = make_subindex_move (n);

            current_item[TEXT] = get_ptr_rec (current_item[CONTEXT]);
            break;

        case TEXT:
            r = move_in_text (n);
            break;
      }
    
    if (r == FALSE)
        beep ();
    return (r);
  }


/* move up or down the chosen number of lines of text, and put the result
 * into current_item[TEXT] ... return FALSE if attempted move would have
 * run off the end of the file, non-zero if the move was completed without
 * error....
 */
 
int move_in_text (move)
  register long move;
  {
    int result;
    register long loc;
    extern long current_item[];
    
    result = TRUE;
    loc = current_item[TEXT];
    
    if (move < 0)
        for ( ; move < 0; ++move)
          {
            if (loc <= 0)
              {
                result = FALSE;
                break;
              }
            loc = start_of_line (loc - 1);
          }
            
    else if (move > 0)
        for ( ; move > 0; --move)
          {
            loc = next_line (loc);
            if (loc >= max_item[TEXT])
              {
                result = FALSE;
                loc = start_of_line (max_item[TEXT]);
                break;
              }
          }
    
    current_item[TEXT] = loc;
    return (result);
  }


/* make a move in the working subindex ... must just step along in
 * whichever direction is desired until either hitting the end (in
 * which case the last valid item found in the subindex should be
 * returned as the current one) or finishing the requested move.
 * Return FALSE if hit a wall, TRUE otherwise....
 *
 * This routine is only called when level == CONTEXT ... since
 * in TEXT level we are just moving around in the full text of
 * the document file, and in the INDEX level we count and display
 * even items with zero occurrences in the subindex...
 *
 * This routine is only called when subset != NULL, since if there
 * is no working subset the move becomes trivial arithmetic.
 *
 * This routine is only called when either current_item[CONTEXT]
 * is a good item in the subset already, or when there is a certainty
 * that a good item will be found in the subsequent scan (specifically,
 * when scanning down to find the first good item when called from
 * do_descend () ....).
 */

int make_subindex_move (move)
  long move;
  {
    register long i, last_good_item;
    int result;
    PTR_REC get_ptr_rec ();
    extern long current_item[], min_item[], max_item[];
    extern char *subset;

    result = TRUE;
    last_good_item = current_item[CONTEXT];

    if (move >= 0)
      {
        for (i = 0; i < move; ++i)
          {
            while (++current_item[CONTEXT] <= max_item[CONTEXT] &&
                ! get_subset_bit ((long)
                    get_ptr_rec (current_item[CONTEXT])));
            if (current_item[CONTEXT] > max_item[CONTEXT])
                break;
            last_good_item = current_item[CONTEXT];
          }
        if (current_item[CONTEXT] > max_item[CONTEXT])
          {
            result = FALSE;
            current_item[CONTEXT] = last_good_item;
          }
      }
    else
      {
        for (i = 0; i > move; --i)
          {
            while (--current_item[CONTEXT] >= min_item[CONTEXT] &&
                ! get_subset_bit ((long)
                    get_ptr_rec (current_item[CONTEXT])));
            if (current_item[CONTEXT] < min_item[CONTEXT])
                break;
            last_good_item = current_item[CONTEXT];
          }
        if (current_item[CONTEXT] < min_item[CONTEXT])
          {
            result = FALSE;
            current_item[CONTEXT] = last_good_item;
          }
      }
    return (result);
  }
  
 /* -------------------do_subset.c------------------- */
 
/* words to handle subset (proximity filter) tasks....
 * ^z -- 870810-13-....
 *
 * Subset (or subindex) restrictions are useful in doing searches for
 * combinations of terms, phrases, etc., where the terms are themselves
 * common words.  Subsets also become indispensible when working in
 * very large databases, where even slightly-exotic terms occur too many
 * times for convenient browsing through their CONTEXT displays.
 *
 * When a subset has been defined, then instead of showing an INDEX
 * item as something like:
 *      3141 UNIX
 * that item is displayed with a count of how many occurrences are
 * 'valid' in addition to the total number of occurrences:
 *   27/3141 UNIX
 * Thus, in this case only 27 out of the 3141 appearances of the term
 * 'UNIX' happened to be in the neighborhood of the set of words chosen
 * by the user to define the current working subset.
 * 
 * The 'neighborhood' (a half dozen words or so on each side) of a chosen
 * index term is added (logical OR) into the working subset by giving
 * the '&' command when that term's INDEX display is the current item.
 * If a larger neighborhood (half a dozen sentences or so each way) is
 * preferable, use the '&&' command; for a still larger neighborhood
 * (a few paragraphs on each side), use the '&&&' command.
 *
 *
 * IMPLEMENTATION NOTES:
 *
 * Subest searching is handled by an array of flag bits -- one bit
 * for each chunk of NEIGHBORHOOD bytes in the original text.  Bits are
 * held in the array subset[]; they are zero for 'invalid' regions
 * of the original document, and one for 'valid' regions that have
 * been defined by proximity to index terms selected by the user.
 *
 * A value of 32 for NEIGHBORHOOD seems to work very well for defining
 * proximity.  The compression factor from the original document to the
 * array subset[] is NEIGHBORHOOD*8 = 256 in that case.  So, even for a
 * document that is tens of megabytes long, the subset[] array only
 * requires a few hundred kB and it is quite feasible to hold it in
 * memory.
 *
 * The presence of a subset is signalled when the global array pointer
 * 'subset' is non-NULL ... in which case show_current_index_item,
 * make_move, and other words modify their behavior in order to
 * reflect the chosen subset.  If the subset is completely
 * empty, after having just been created or flushed, then the
 * global variable empty_subset is nonzero (true) and short-cuts
 * are taken in counting and displaying words....
 *
 * Consider a single occurrence of a word.  Suppose we want to add the
 * immediate neighborhood of that word to our working subset.  To avoid
 * edge-effects due to the chunkiness of the definition of neighborhood,
 * the routine to set bits in subset[] actually sets 2*WORD_RANGE+1 bits:
 * the bit corresponding to the chunk wherein the chosen word falls,
 * and WORD_RANGE extra bits on each side.
 *
 * Thus, for the choice WORD_RANGE = 1, any index item which is
 * within NEIGHBORHOOD (=32) bytes of our chosen word is certain
 * to be included in the proximity subset; any item which occurs
 * at least 2*NEIGHBORHOOD (=64) bytes away is certain NOT to be
 * included, and items in the intermediate range have a
 * linearly-decreasing probability of false inclusion.  The choice
 * of 32 for NEIGHBORHOOD and 1 for WORD_RANGE therefore gives
 * essentially all items within half a dozen or so words
 * (average word length is 5-6 letters) of the term used to
 * define the subset....
 *
 * In extensive experimentation with the MacForth-based Browser v.244+
 * program on the Macintosh, the above definition of proximity seemed
 * to be by far the most useful one in real life.  It is also quite
 * straightforward to implement with good efficiency.  In addition
 * to the 'word'-neighborhood proximity (within WORD_RANGE chunks),
 * it is occasionally convenient to add a somewhat larger
 * neighborhood to the working subset -- corresponding to proximity
 * within a few sentences or even a few paragraphs.  To do that,
 * the && (sentence-proximity) and &&& (paragraph-proximity) commands
 * simply set more bits on either side of the word's location in
 * subset[].  Specifically, && sets SENTENCE_RANGE bits on each side,
 * and &&& sets PARAGRAPH_RANGE bits on each side.  Values of 10 and 100
 * respectively for those parameters seem to work quite well.
 */



/* Handle the * command to empty out the working subset before
 * beginning to do proximity-filtered searching (that is, to create
 * the array subset[] in empty form). 
 * Also handle the ** command to destroy subset[] (which makes it
 * seem as if it is "full" from the user's viewpoint)
 *
 * Force the user back to INDEX level when the working subindex
 * is emptied or filled, to avoid inconsistencies in moving around
 * the CONTEXT level....
 */

void do_make_subset (cmd)
  char cmd[];
  {
    void beep (), create_subset (), destroy_subset (),
            show_current_item ();
    extern int level;
    
    if (cmd[1] == '\0')
        create_subset ();
    else if (cmd[1] == '*')
        destroy_subset ();
    else
        beep ();
    
    level = INDEX;
    show_current_item ();
  }


/* Do the job of creating the empty array subset[] ... note that if
 * running on a machine with 16-bit int variables such as Lightspeed C
 * on the Mac, the calloc() function won't work for files bigger
 * than 16 MB or so (for NEIGHBORHOOD = 32) ....must modify this
 * to use clalloc(), the non-ANSI standard function for allocating
 * and clearing bigger memory chunks.  Hence, the #ifdef LIGHTSPEED
 * lines below....
 *
 * Note that there is no need to provide a "No file open!" error msg
 * since show_current_item() in calling function will do that for
 * us....
 */

void create_subset ()
  {
    extern char* subset;
    extern FILE *doc_file;
    extern int empty_subset;
    extern long max_item[];
#ifdef LIGHTSPEED
    char *clalloc ();
#else
    char *calloc ();
#endif
    void beep (), destroy_subset ();
    
    if (doc_file == NULL)
      return;

    destroy_subset ();
#ifdef LIGHTSPEED
    if ((subset = clalloc ((unsigned long)(1 +
            max_item[TEXT]/(NEIGHBORHOOD*8)), (long)sizeof(char))) == NULL)
#else    
    if ((subset = calloc ((unsigned int)(1 +
            max_item[TEXT]/(NEIGHBORHOOD*8)), sizeof(char))) == NULL)
#endif
      {
        beep ();
        printf ("Not enough memory for subset!\n");
        return;
      }
    
    empty_subset = TRUE;
  }


/* routine to destroy a subset (used in response to a ** command,
 * or when changing over to a new file with a : command, or when about
 * to create a new empty subset inside the * command)
 */

void destroy_subset ()
  {
    extern char *subset;
    
    if (subset != NULL)
        free (subset);
    subset = NULL;
  }


/* routine to invert the working subindex (response to a ~ command) --
 * perform a logical NOT on the entire set, so that what was once a
 * member of the set becomes a non-member, and vice versa....
 *
 * Must set empty_subset FALSE since we don't a priori know that the
 * resulting set is empty or full or what....
 *
 * As with * and ** commands, kick the user back to the INDEX level
 * when this command is executed, for safety's sake....
 */

void do_invert_subset ()
  {
    register long i, lim;
    extern char *subset;
    extern long max_item[];
    extern int empty_subset;
    void beep();

    if (doc_file == NULL)
      {
        beep ();
        printf ("No file open!\n");
        return;
      }
    if (subset == NULL)
      {
        beep ();
        printf ("No subset!\n");
        return;
      }

    lim = max_item[TEXT]/(NEIGHBORHOOD*8);
    for (i = 0; i <= lim; ++i)
        subset[i] = ~subset[i];

    empty_subset = FALSE;
    level = INDEX;
    show_current_item ();
  }
  

/* handle the '&' command to add the current word's neighborhood to
 * the working subset:
 *    &    adds a few-words neighborhood
 *    &&    adds a few-sentences neighborhood
 *    &&&    adds a few-paragraphs neighborhood
 *
 * Note that instead of calling show_current_item() to finish up
 * this function, since we know that every occurrence of the
 * current item will be in the working subset we can save time
 * and avoid re-counting them all ... hence, the final lines
 * of this routine
 */

void do_add_neighborhood (cmd)
  char cmd[];
  {
    int nsize;
    extern int level;
    extern char *subset;
    extern FILE *doc_file, *notes_file;
    void beep (), set_neighborhood_bits (), get_key_rec ();
    extern KEY_REC this_rec, prev_rec;
    extern long current_item[], subset_rec_count;

    if (doc_file == NULL)
      {
        beep ();
        printf ("No file open!\n");
        return;
      }
    if (subset == NULL)
      {
        beep ();
        printf ("No subset!\n");
        return;
      }

    nsize = WORD_RANGE;
    if (cmd[1] == '&')
      {
        nsize = SENTENCE_RANGE;
        if (cmd[2] == '&')
            nsize = PARAGRAPH_RANGE;
      }
    
    level = INDEX;
    set_neighborhood_bits (nsize);
    empty_subset = FALSE;
    get_key_rec (&prev_rec, current_item[INDEX] - 1);
    get_key_rec (&this_rec, current_item[INDEX]);
    subset_rec_count = this_rec.ccount - prev_rec.ccount;
    printf ("%6ld/%-6ld %.28s\n", subset_rec_count, subset_rec_count,
                this_rec.kkey);
    if (notes_file != NULL)
        fprintf (notes_file, "%6ld/%-6ld %.28s\n", subset_rec_count,
                    subset_rec_count, this_rec.kkey);
  }


/* function to set n bits on each side of each current index term in the
 * subset[] array ... note that it is important to set min_item[CONTEXT]
 * and max_item[CONTEXT] values before using them, since they are only
 * set ordinarily when descending ('=' command) into the CONTEXT level...
 * values of prev_rec and this_rec are set every time an index item
 * is displayed, so they will be all right for us already....
 */

void set_neighborhood_bits (n)
  register int n;
  {
    register long i;
    register PTR_REC j, j0, j1;
    extern long min_item[], max_item[];
    extern KEY_REC prev_rec, this_rec;
    PTR_REC get_ptr_rec ();
    void set_subset_bit ();
    
    min_item[CONTEXT] = prev_rec.ccount;
    max_item[CONTEXT] = this_rec.ccount - 1;
    
    if ((this_rec.ccount-prev_rec.ccount) * n > GIVE_WARNING)
        printf ("Please wait (%ld bits to set)...\n",
            (this_rec.ccount - prev_rec.ccount) * n * 2 + 1);

    for (i = min_item[CONTEXT]; i <= max_item[CONTEXT]; ++i)
      {
        j = get_ptr_rec (i);
        j0 = j - n * NEIGHBORHOOD;
        if (j0 < min_item[TEXT])
            j0 = min_item[TEXT];
        j1 = j + n * NEIGHBORHOOD;
        if (j1 > max_item[TEXT])
            j1 = max_item[TEXT];
        for (j = j0; j <= j1; j += NEIGHBORHOOD)
            set_subset_bit (j);
      }
  }


/* function to set a bit in the array subset[] for a chosen character
 * offset from the start of the file ... just convert the offset into
 * byte and bit numbers and then "OR" the 1 into that slot....
 *
 * (An attempt to optimize for the specific value NEIGHBORHOOD = 32,
 * using & and >> in place of % and /, did not yield a significant
 * improvement in speed... ^z 870813)
 */

void set_subset_bit (offset)
  long offset;
  {
    int bit_no;
    long byte_no;
    extern char *subset;

    bit_no = (offset % (8 * NEIGHBORHOOD)) / NEIGHBORHOOD;
    byte_no = offset / (8 * NEIGHBORHOOD);
    
    subset[byte_no] |= 1 << bit_no;
  }


/* function to return the value of a bit in the array subset[] for a
 * chosen character offset from the start of the document file ...
 * as in set_subset_bit() above, just convert offset into byte and
 * bit numbers, extract the relevant bit, shift it down, and return
 * it....
 *
 * (An attempt to optimize for the specific value NEIGHBORHOOD = 32,
 * using & and >> in place of % and /, did not yield a significant
 * improvement in speed... ^z 870813)
 */

int get_subset_bit (offset)
  long offset;
  {
    int bit_no;
    long byte_no;
    extern char *subset;

    bit_no = (offset % (8 * NEIGHBORHOOD)) / NEIGHBORHOOD;
    byte_no = offset / (8 * NEIGHBORHOOD);

    return ((subset[byte_no] >> bit_no) & 1);
  }


/* function to count and return the number of occurrences of this_rec
 * in the working subindex (i.e., the number of times the word comes
 * up with its subset[] bit turned ON) ....
 */
 
long count_subset_recs (this_recp, prev_recp)
  KEY_REC *this_recp, *prev_recp;
  {
    register long i, n;
    extern int empty_subset;
    PTR_REC get_ptr_rec ();
    int get_subset_bit ();
    
    if (empty_subset)
        return (0L);

    if (this_recp->ccount - prev_recp->ccount > GIVE_WARNING)
        printf ("Please wait (%ld words to check)...\n",
                    this_recp->ccount - prev_recp->ccount);
    
    for (n = 0, i = prev_recp->ccount; i < this_recp->ccount; ++i)
        n += get_subset_bit ((long)get_ptr_rec (i));
    
    return (n);
  }


/* ----------------------do_targets.c----------------- */

/* some functions to respond to various user inputs...
 *  870806-13-... ^z
 */


/* function to move down a level in the browser hierarchy:
 *    INDEX --> CONTEXT --> TEXT
 * includes various safety features against errors, and takes
 * care of initializations for the level being entered ... also
 * displays the current line when entering the new level, for the user's
 * convenience....
 *
 * Modified for subset operations, ^z - 870813.  Note that we must check
 * here to be sure that we are descending into a non-empty subset when
 * a subset is active and we are descending from INDEX --> CONTEXT;
 * we also must make sure in that case that we initialize the CONTEXT
 * display to start with the first good item, not just min_item[CONTEXT].
 */

void do_descend ()
  {
    extern int level, empty_subset;
    extern long current_item[], min_item[], max_item[], subset_rec_count;
    extern KEY_REC this_rec, prev_rec;
    extern FILE *doc_file;
    extern char *subset;
    void beep();
    
    if (doc_file == NULL)
      {
        beep ();
        printf ("No file open!\n");
        return;
      }

    if (current_item[INDEX] < 0)
      {
        beep ();
        printf ("No current index item!\n");
        return;
      }
    
    if (subset != NULL && (empty_subset || subset_rec_count == 0))
      {
        beep ();
        printf ("No items in subset!\n");
        return;
      }

    ++level;

    if (level == CONTEXT)
      {
        min_item[CONTEXT] = prev_rec.ccount;
        max_item[CONTEXT] = this_rec.ccount - 1;
        if (subset == NULL)
            current_item[CONTEXT] = min_item[CONTEXT];
        else
          {
            current_item[CONTEXT] = min_item[CONTEXT] - 1;
            make_subindex_move (1L);
          }
        current_item[TEXT] = get_ptr_rec (current_item[CONTEXT]);
      }

    if (level == TEXT)
        current_item[TEXT] =
                start_of_line (current_item[TEXT]);

    if (level > TEXT)
      {
        beep ();
        printf ("Can't descend past full text display!\n");
        level = TEXT;
      }

    show_current_item ();
  }

/* function to move up a level in the browser hierarchy:
 *       TEXT --> CONTEXT --> INDEX
 * includes a safety net against going past INDEX, and
 * displays the current line when entering the new level...
 * note that when coming up from TEXT, must reset the current_item[TEXT]
 * before displaying CONTEXT line, as TEXT browsing changes it....
 */


void do_ascend ()
  {
    extern int level;
    void beep();
    
    --level;
    
    if (level < INDEX)
      {
        beep ();
        printf ("Can't ascend past index display!\n");
        level = INDEX;
      }

    if (level == CONTEXT)
        current_item[TEXT] = get_ptr_rec (current_item[CONTEXT]);

    show_current_item ();
  }


/* function to jump the INDEX display to a target entered by the
 * user ... does a simple binary search (see K&R p.54) to get there
 * and then shows that line of the INDEX ... and if the
 * target word is not found, it beeps and shows the preceeding
 * word available in the index ... if called from another level, it
 * jumps to the INDEX level ...
 */

void do_target_jump (cmd)
  char cmd[];
  {
    register int c, diff;
    register long low, high, mid;
    KEY_REC target_item, this_item;
    extern long current_item[], max_item[];
    extern FILE *key_file;
    extern int level;
    void beep(), init_target_item();
    
    if (key_file == NULL)
      {
        beep ();
        printf ("No file open!\n");
        return;
      }
    
    level = INDEX;
    init_target_item (&target_item, cmd);
    low = min_item[INDEX];
    high = max_item[INDEX];

    while (low <= high)
      {
        mid = (low + high) / 2;
        get_key_rec (&this_item, mid);
        diff = zstrcmp (target_item.kkey, this_item.kkey);
        if (diff < 0)
            high = mid - 1;
        else if (diff > 0)
            low = mid + 1;
        else
          break;
      }

    current_item[INDEX] = mid;
    if (diff < 0)
        --current_item[INDEX];
    if (current_item[INDEX] < 0)
        current_item[INDEX] = 0;
    if (diff != 0)
        beep ();
    show_current_item ();
  }



/* my function to compare two strings and give a result as to who is
 * alphabetically earlier.  Note that this is almost the same as strncmp()
 * with the fixed value of KEY_LENGTH as the maximum comparison distance,
 * except that I must be sure to mask the characters to make them positive
 * (since we want to be able to handle the non-ASCII funny letters in
 * the Apple character set properly/consistently).  If the masking isn't
 * done, then inconsistent results can occur with those non-ASCII chars!
 */

int zstrcmp (s1, s2)
  register char *s1, *s2;
  {
    register int n = KEY_LENGTH;
    
    for (; --n && ((*s1 & 0xFF) == (*s2 & 0xFF)); s1++, s2++)
        if (!*s1) break;
        
    return ((*s1 & 0xFF) - (*s2 & 0xFF));
  }



/* initialize the target KEY_REC kkey string to the value typed in by
 * the user ... convert the user's string to all capital letters, and
 * pad it out to KEY_LENGTH with trailing blanks as needed ... use the
 * toupper() function warily (assume that it may be nonstandard and
 * mess up the value of non-lowercase characters, as it seems to on
 * the VAX and Sun C compilers I've tried -- so check and don't apply
 * toupper() except to lower-case characters!)....
 */

void init_target_item (rp, cmd)
  KEY_REC *rp;
  char cmd[];
  {
    register int c, n;

    strncpy (rp->kkey, "                                    ", KEY_LENGTH);
    for (n = 0; n < KEY_LENGTH && (c = cmd[n]) != 0; ++n)
        rp->kkey[n] = (islower (c) ? toupper (c) : c);
  }


/* handle the printing out of N lines due to a ".N" user command ... do
 * the job simply by marching down a step and displaying that line N times
 * ... precisely equivalent to hitting the <return> key N times.
 */

void do_multiprint (cmd)
  char cmd[];
  {
    register long i, n;
    extern FILE *doc_file;
    void beep();
    
    if (doc_file == NULL)
      {
        beep ();
        printf ("No file open!\n");
        return;
      }
      
    if (cmd[1] == '\0')
        strcat (cmd, "1");
    n = atol (cmd + 1);
    for (i = 0; i < n; ++i)
      {
        if (! make_move (1L))
            break;
        show_current_item ();
      }
  }

/* ---------------------file_utils.c--------------------- */

/* initialize, open, etc. various things for brwsr program...
 * 870805-13... ^z
 */


/* open the document, key, and pointer files ...
 * for now, demand that if the document file is named * then the
 * key file must be named *.k and the pointer file must be *.p
 * ... just the way that the ndxr program builds the index.....
 */

void open_files (cmd)
  char cmd[];
  {
    char ocmd[STRLEN], *strcat(), *strcpy();
    FILE *fopen();
    extern FILE *doc_file, *key_file, *ptr_file;
    void beep();

    close_files ();
    strcpy (ocmd, cmd + 1);
    if ((doc_file = fopen (ocmd, "r")) == NULL)
      {
        beep ();
        printf ("Error opening document file \"%s\"!\n", ocmd);
        close_files ();
        return;
      }

    strcat (ocmd, ".k");
    if ((key_file = fopen (ocmd, "rb")) == NULL)
      {
        beep ();
        printf ("Error opening key file \"%s\"!\n", ocmd);
        close_files ();
        return;
      }

    ocmd[strlen (ocmd) - 2] = '\0';
    strcat (ocmd, ".p");
    if ((ptr_file = fopen (ocmd, "rb")) == NULL)
      {
        beep ();
        printf ("Error opening ptr file \"%s\"!\n", ocmd);
        close_files ();
        return;
      }
  }


/* close off the document, key, and pointer files
 */
 
void close_files ()
  {
    extern FILE *doc_file, *key_file, *ptr_file;
    
    if (doc_file != NULL)
      {
        fclose (doc_file);
        doc_file = NULL;
      }
    if (key_file != NULL)
      {
        fclose (key_file);
        key_file = NULL;
      }
    if (ptr_file != NULL)
      {
        fclose (ptr_file);
        ptr_file = NULL;
      }
  }


/* set up initial values for current_item and max_item arrays ...
 * initially, current_item is set to an illegal value, -1, and
 * max_item[CONTEXT] and max_item[TEXT] convey information about
 * the sizes of the pointer and document files respectively...
 */

void init_items ()
  {
    extern long current_item[], max_item[];
    extern FILE *doc_file, *key_file, *ptr_file;
    extern int level;
    
    level = INDEX;
    
    current_item[INDEX] = -1;
    current_item[CONTEXT] = -1;
    current_item[TEXT] = -1;

    fseek (key_file, 0L, 2);
    max_item[INDEX] = ftell (key_file) / sizeof (KEY_REC) - 1;
    
    fseek (ptr_file, 0L, 2);
    max_item[CONTEXT] = ftell (ptr_file) / sizeof (PTR_REC) - 1;

    fseek (doc_file, 0L, 2);
    max_item[TEXT] = ftell (doc_file) - 1;
  }


/* --------------------show_items.c------------------ */

/* functions to show a chosen item from an index key file,etc.
 * 870805-13...  ^z
 */


/* function to show whatever the current item is, in whatever level
 * the user is at the moment.... basically, it just routes the
 * request to whichever is the appropriate agent to take care of
 * it, for INDEX, CONTEXT, or TEXT levels....
 */
 
void show_current_item ()
  {
    extern int level;
    extern FILE *doc_file;
    void beep(), show_current_index_item(), show_current_context_item(),
        show_current_text_item();
    
    if (doc_file == NULL)
      {
        beep ();
        printf ("No file open!\n");
        return;
      }
    
    if (level == INDEX)
        show_current_index_item ();
    else if (level == CONTEXT)
        show_current_context_item ();
    else
        show_current_text_item ();
  }


/* function to show the current INDEX item (count of occurrences followed
 * by the indexed word itself) ... send copy to notes file, if open....
 */

void show_current_index_item ()
  {
    extern KEY_REC this_rec, prev_rec;
    extern long current_item[], subset_rec_count;
    extern FILE *notes_file;
    extern char *subset;
    long count_subset_recs ();
    void get_key_rec ();

    get_key_rec (&prev_rec, current_item[INDEX] - 1);
    get_key_rec (&this_rec, current_item[INDEX]);
    
    if (subset == NULL)
      {
        printf ("%6ld %.28s\n", this_rec.ccount - prev_rec.ccount,
                    this_rec.kkey);
        if (notes_file != NULL)
            fprintf (notes_file, "%6ld %.28s\n",
                    this_rec.ccount - prev_rec.ccount, this_rec.kkey);
      }
    else
      {
          subset_rec_count = count_subset_recs (&this_rec, &prev_rec);
        printf ("%6ld/%-6ld %.28s\n", subset_rec_count,
                    this_rec.ccount - prev_rec.ccount, this_rec.kkey);
        if (notes_file != NULL)
        fprintf (notes_file, "%6ld/%-6ld %.28s\n", subset_rec_count,
                    this_rec.ccount - prev_rec.ccount, this_rec.kkey);
      }
  }


/* function to show the current CONTEXT item, a line in the form of a
 * key-word-in-context (KWIC) display with the indexed term in the'
 * middle ... the line is filtered so that the presence of tabs, <cr>'s,
 * or other nasty control characters won't cause any problems... 
 * a little testing is needed to take care of end-effects, so that 
 * nothing happens for context lines near either end of the document
 * file ... finally, send a copy to the notes file, if open....
 */

void show_current_context_item ()
  {
    register int n, m;
    register long p;
    char buf[CONTEXT_LINE_LENGTH + 1];
    extern long current_item[];
    extern FILE *doc_file;
    void beep();
    
    p = current_item[TEXT] - CONTEXT_OFFSET;
    n = 0;
    if (p < 0)
      {
        for ( ; n < -p; ++n)
            buf[n] = ' ';
        p = 0;
      }
    if (fseek (doc_file, p, 0) != 0)
          {
            beep ();
            printf ("Error in fseek() getting a context line to show!\n");
            return;
          }
    if ((m = fread (buf + n, sizeof (char), CONTEXT_LINE_LENGTH - n,
                        doc_file)) == 0)
          {
            beep ();
            printf ("Error in fread() getting a context line to show!\n");
            return;
          }
    n += m;
    for ( ; n < CONTEXT_LINE_LENGTH; ++n)
        buf[n] = ' ';
    for (n = 0; n < CONTEXT_LINE_LENGTH; ++n)
        if (! isprint (buf[n] & 0xFF))
            buf[n] = ' ';
    buf[CONTEXT_LINE_LENGTH] = '\0';
    printf ("%s\n", buf);
    if (notes_file != NULL)
        fprintf (notes_file, "%s\n", buf);
  }


/* function to show the current TEXT item, a line from the document file
 * itself ... copy to notes file also, if so desired....
 */

void show_current_text_item ()
  {
    char buf[STRLEN + 1];
    extern long current_item[];
    extern FILE *doc_file;
    void beep();
    
    if (fseek (doc_file, current_item[TEXT], 0) != 0)
          {
            beep ();
            printf ("Error in fseek() getting a text line to show!\n");
            return;
          }
    if (fgets (buf, STRLEN, doc_file) == NULL)
          {
            beep ();
            printf ("Error in fgets() getting a text line to show!\n");
            return;
          }
    printf ("%s", buf);
    if (notes_file != NULL)
            fprintf (notes_file, "%s", buf);
  }



-------