[mod.sources] v06i050: Binary search for strings in a file

sources-request@mirror.UUCP (07/11/86)

Submitted by: hplabs!hpfcla!ajs
Mod.sources: Volume 6, Issue 50
Archive-name: bsearchstr

[  I compiled and tested this on a 4.2BSD Vax750 and had no problems.
   The README is right -- just do a "vanilla cc"; add -DDEBUG to
   get a test stub with main.  --r$ ]

#!/bin/sh
# This is a shell archive.  Remove anything before this line,
# then unpack it by saving it in a file and typing "sh file".
#
# Wrapped by mirror!rs on Fri Jul 11 15:00:17 EDT 1986
# Contents:  README bsearchstr.3 bsearchstr.c
 
echo x - README
sed 's/^XX//' > "README" <<'@//E*O*F README//'
XXInformation on bsearchstr(3):

XX1.  This library routine is like bsearch(3), but generalized to work
XX    with files (not memory images) of random length strings (not fixed
XX    length objects).

XX2.  Unlike bsearch(), it is well-behaved.  If more than one data item
XX    (text line) matches the pattern, it returns the first matching item,
XX    not a random one.

XX3.  It's been around a while, well tested, and shown itself to be
XX    generally useful, mainly for lookups into large, sorted lists.

XX4.  It runs on HP-UX (AT&T V.2); hasn't been tested on other variants.

XX5.  No makefile provided or needed -- compilation and linking are
XX    vanilla.

XX6.  No warranty express or implied accompanies this software.  Caveat
XX    emptor.  See the manual entry.

XXAlan Silverstein, Hewlett-Packard Systems Software Operation, Fort Collins,
XXColorado; {ihnp4 | hplabs}!hpfcla!ajs; 303-229-3053; (lat-long on request :-)
@//E*O*F README//
chmod u=rw,g=rw,o=rw README
 
echo x - bsearchstr.3
sed 's/^XX//' > "bsearchstr.3" <<'@//E*O*F bsearchstr.3//'
XX.TH BSEARCHSTR 3 "HP Experimental"
XX.SH NAME
XXbsearchstr \- binary search a sorted file of random-length strings
XX.SH SYNOPSIS
XX.B "long bsearchstr (stream, pattern, caseins, tellnext)"
XX.br
XX.B "FILE\0\(**stream;"
XX.br
XX.B "char\0\(**pattern;"
XX.br
XX.B "int\0\0caseins;"
XX.br
XX.B "int\0\0tellnext;"
XX.ad b
XX.SH HP-UX COMPATIBILITY
XX.TP 10
XXLevel:
XXHP-UX/RUN ONLY
XX.TP
XXOrigin:
XXHewlett-Packard
XX.SH DESCRIPTION
XX.B Bsearchstr
XXis a binary search routine which understands ordinary files containing sorted,
XXrandom-length strings (lines) terminated by newline characters.
XXIt returns the byte offset of the first line in the file which begins with the
XXgiven pattern, taken as a simple string of characters (regular expressions are
XXnot supported).
XXIt also sets the file location to the line beginning at the offset returned,
XXusing \fIfseek\fR\^(3s).
XX.PP
XX\fIStream\fR must be a currently open file containing lines previously sorted
XXin increasing order by the simple numeric values of the characters.
XX.PP
XX\fIPattern\fR must be a valid pointer to char.
XXA nil pattern (e.g. \(**pattern == 0) matches anything, so
XX.B bsearchstr
XXreturns zero (start of file) in this case.
XXA nil line in the file (e.g. nothing but a newline) is treated as being less
XXthan any non-nil pattern.
XX.PP
XXIf
XX.I caseins
XXis zero, searching is done case-sensitively, and the data file should be sorted
XXaccordingly.
XXIf
XX.I caseins
XXis non-zero, searching is done case-insensitively, and the data must have been
XXsorted using "sort \-f" or equivalent.
XXThe routine uses
XX.IR malloc (3)
XXto get memory to make an uppercased copy of
XX.IR pattern ,
XXso as not to modify the passed-in value.
XX.PP
XXIf \fItellnext\fR is zero, and no line in the file begins with \fIpattern\fR,
XX.B bsearchstr
XXreturns \-1.
XXIf \fItellnext\fR is non-zero, it returns the offset of the first line
XXlexically after \fIpattern\fR.
XXIt returns the size of the file if the last line is before \fIpattern\fR.
XX.PP
XXEach time
XX.B bsearchstr
XXhalves the remainder of the file, it seeks to a
XXlocation which is usually not the start of a line.
XXIt searches forwards, then backwards, a character at a time, to locate the
XXnext start of line.
XXThis is normally more efficient than linear searching the file, especially if
XXthe file is large and no lines are extremely long.
XXAlso,
XX.B bsearchstr
XXworks fine even if the last line in the file is not
XXterminated by a newline.
XX.SH SEE ALSO
XXsort(1), bsearch(3c), fgets(3s), fopen(3s), hsearch(3c), lsearch(3c),
XXqsort(3c), tsearch(3c).
XX.SH DIAGNOSTICS
XXReturns \-1 if no line matches and \fItellnext\fR is zero.
XX.PP
XXReturns \-2 if any
XX.IR fseek (3s),
XX.IR ftell (3s),
XXor
XX.IR malloc (3c)
XXerror occurs.
XXIn this case the current file location is unpredictable.
@//E*O*F bsearchstr.3//
chmod u=rw,g=rw,o=rw bsearchstr.3
 
echo x - bsearchstr.c
sed 's/^XX//' > "bsearchstr.c" <<'@//E*O*F bsearchstr.c//'
XXstatic char Uni_id[] = "@(#)28.1";
XX/* UNISRC_ID: @(#)bsearchstr.c	28.1	86/01/04  */
XX/*
XX * Binary search routine for sorted file of strings (lines).
XX * Compile with -DDEBUG for standalone testing and extra output.
XX */

XX#include <stdio.h>
XX#include <ctype.h>

XX#define	chNull	('\0')
XX#define	cpNull	((char *) NULL)


XX#ifdef DEBUG
XX/*
XX * Test usage:  cmd file pattern [caseins [tellnext]]
XX *
XX * If there is a third  argument, sends caseins  == 1.
XX * If there is a fourth argument, sends tellnext == 1.
XX */

XXmain (argc, argv)
XX	int	argc;
XX	char	**argv;
XX{
XX	FILE	*fp;
XX	char	buf[BUFSIZ];

XX	fp = fopen (argv[1], "r");
XX	printf ("==min=== ==max=== ==pos1== ==pos2== cmp match\n");
XX	printf ("Search returns %ld\n",
XX		bsearchstr (fp, argv[2], (argc > 3), (argc > 4)));
XX	printf ("%s\n", fgets (buf, BUFSIZ, fp));
XX}
XX#endif /* DEBUG */


XX/***************************************************************************
XX * B S E A R C H S T R
XX *
XX * Binary search a stream consisting of sorted lines of data for the first
XX * line that starts with the given pattern, or the next line if none and
XX * tellnext == 1.  Set the file location to the first byte of that line
XX * and return the offset.
XX *
XX * When caseins is set, takes pains not to modify the memory pointed to by
XX * pattern.  Uses multi-statement macros for efficiency.
XX *
XX * See bsearchstr(3) for more information.
XX *
XX * Author:  Alan Silverstein, Hewlett-Packard Fort Collins Systems Division
XX */

XX#define	LT 0
XX#define	EQ 1
XX#define	GT 2

XX#define	RETURN(value)	{ if (caseins) free (pattern); return (value); }
XX#define	SETPOS		{ if (fseek (filep, pos, 0) < 0) RETURN (-2L); }

XXlong bsearchstr (filep, pattern, caseins, tellnext)
XX	FILE	*filep;			/* file to search	   */
XX	char	*pattern;		/* start of line to match  */
XX	int	caseins;		/* case insensitive?	   */
XX	int	tellnext;		/* tell next if not found? */
XX{
XX	/* start of first line not yet compared (lower limit) */
XX	long	min = 0;
XX	/* start of line after last line not yet compared (upper limit) */
XX	long	max;

XX	 long	pos;			/* current offset in file  */
XX	 char	*patp;			/* pattern pointer	   */
XX	 /* warning: this must be an int, not a char */
XXregister int	ch;			/* current char to compare */
XXregister int	cmp;			/* results of comparison   */
XX	 int	match = 0;		/* true if match found	   */

XX	 char	*malloc();

XX/*
XX * PREPARE PATTERN FOR CASE-INSENSITIVE SEARCH:
XX *
XX * Use toupper(), not tolower(), to be compatible with the actual function
XX * of sort -f.
XX */

XX	if (caseins)
XX	{
XX	    int	  len = strlen (pattern) + 1;	/* chars to uppercase	*/
XX	    char  *cp;				/* for copying data	*/
XX	    char  *cpend;			/* end of copy range	*/

XX	    if ((cp = patp = malloc (len)) == cpNull)
XX		return (-2L);

XX	    /* now remember to free (pattern) before returning	*/
XX	    /* can use the RETURN() macro to accomplish this	*/

XX	    cpend = cp + len;

XX	    while (cp < cpend)			/* copy chNull also	     */
XX		*cp++ = toupper (*pattern++);	/* toupper() is not a macro! */

XX	    pattern = patp;			/* "move" to new location */
XX	}

XX/*
XX * SET MAX TO END OF FILE (byte past last) by seeking there:
XX */

XX	if ((fseek (filep, 0L, 2) < 0) || ((max = ftell (filep)) < 0))
XX	    RETURN (-2L);

XX/*
XX * SET NEXT STARTING POSITION (where to find string and compare):
XX */

XX	while (min < max)			/* do until closure happens */
XX	{
XX	    pos = (min + max) / 2;
XX	    SETPOS;

XX#ifdef DEBUG
XX	    printf ("%8ld %8ld %8ld ", min, max, pos);
XX#endif

XX/*
XX * LOOK FOR THE NEXT END OF LINE IN THE FORWARD DIRECTION:
XX */

XX	    while ((pos < max) && (getc (filep) != '\n'))
XX		pos++;

XX	    pos++;				/* skip newline */

XX/*
XX * If we ran into max, we are nearly done, assuming the current last line is
XX * not really huge compared to all the others (but this will work out OK too).
XX * Simply reset pos = min and check the first current line.
XX */

XX	    if (pos >= max)
XX		pos = min;

XX/*
XX * COMPARE THE CURRENT LINE TO THE PATTERN:
XX *
XX * If pattern[0] == chNull, never does the for loop, so it matches anything.
XX */

XX	    SETPOS;

XX	    for (cmp = EQ, patp = pattern; (*patp != chNull); patp++)
XX	    {
XX		ch = caseins ? toupper (getc (filep)) : getc (filep);

XX		if ((ch < *patp) || (ch == '\n') || (ch == EOF))
XX		{				/* line < pattern */
XX		    cmp = LT;
XX		    break;
XX		}
XX		else if (ch > *patp)		/* line > pattern */
XX		{
XX		    cmp = GT;
XX		    break;
XX		}
XX	    }

XX	    match += (cmp == EQ);		/* note a match */

XX#ifdef DEBUG
XX	    printf ("%8ld  %c  %5d\n", pos,
XX		    ((cmp == LT) ? '<' : ((cmp == EQ) ? '=' : '>')), match);
XX#endif
XX		
XX/*
XX * MOVE MIN OR MAX according to the results of comparison:
XX *
XX * If pattern[0] == chNull, cmp == EQ, which is "safe".
XX */

XX	    if (cmp == LT)				/* min => next line */
XX	    {
XX		min = pos + (patp - pattern);

XX		while ((ch != '\n') && (min < max))	/* find end */
XX		{
XX		    ch = getc (filep);
XX		    min++;
XX		}
XX		min++;					/* skip newline */
XX	    }
XX	    else					/* max => this line */
XX	    {
XX		max = pos;
XX	    }

XX	} /* while */

XX/*
XX * FINISH UP:
XX *
XX * Note that min could be one too large if the last line is unterminated
XX * and less than pattern, so we use max instead.
XX */

XX	if (match || tellnext)			/* success */
XX	{
XX	    pos = max;
XX	    SETPOS;
XX	    RETURN (pos);
XX	}
XX	else					/* failure */
XX	{
XX	    RETURN (-1L);
XX	}

XX} /* bsearchstr */
@//E*O*F bsearchstr.c//
chmod u=rw,g=rw,o=rw bsearchstr.c
 
echo Inspecting for damage in transit...
temp=/tmp/sharin$$; dtemp=/tmp/sharout$$
trap "rm -f $temp $dtemp; exit" 0 1 2 3 15
cat > $temp <<\!!!
      23     133     905 README
      87     435    2678 bsearchstr.3
     220     923    5190 bsearchstr.c
     330    1491    8773 total
!!!
wc  README bsearchstr.3 bsearchstr.c | sed 's=[^ ]*/==' | diff -b $temp - >$dtemp
if test -s $dtemp
then echo "Ouch [diff of wc output]:" ; cat $dtemp
else echo "No problems found."
fi
exit 0