[net.bugs.usg] "fgrep" doesn't always match everything it should

guy@sun.uucp (Guy Harris) (04/18/86)

The S5 "fgrep" has a bug in it.  If you have a file with

	shxp
	hers
	his
	xk

in it, and you feed that as a pattern file to the S5 "fgrep", it will NOT
match on "shxk"!

Here's a fix.  It also includes Doug Gwyn's recent fixes, and some other
changes:

	Checks for I/O errors reading files.
	Better error messages - uses "perror".
	Runs much faster - doesn't call a subroutine to do character
	    matching (I posted this fix a while ago).
	Disallows multiple "-f" flags.  (Should also disallow multiple
	    "-e" flags as well - I just noticed this, so it's not in
	    this "diff" listing.)
	Allows "-e" flag when reading standard input.

*** fgrep.c.orig	Thu Apr 17 21:27:30 1986
--- fgrep.c.fixed	Thu Apr 17 21:32:27 1986
***************
*** 9,14
   */
  
  #include <stdio.h>
  
  #define	MAXSIZ 6000
  #define QSIZE 400

--- 9,15 -----
   */
  
  #include <stdio.h>
+ #include <ctype.h>
  
  #define	MAXSIZ 6000
  #define QSIZE 400
***************
*** 30,35
  int	nsucc;
  long	tln;
  FILE	*wordf;
  char	*argptr;
  extern	char *optarg;
  extern	int optind;

--- 31,37 -----
  int	nsucc;
  long	tln;
  FILE	*wordf;
+ char	*wordfname;
  char	*argptr;
  extern	char *optarg;
  extern	int optind;
***************
*** 34,39
  extern	char *optarg;
  extern	int optind;
  
  main(argc, argv)
  char **argv;
  {

--- 36,45 -----
  extern	char *optarg;
  extern	int optind;
  
+ /* The following macro was inserted to allow for the "-i" option */
+ 
+ #define	same(a, b) ((a) == (b) || iflag && ((a) ^ (b)) == ' ' && letter(a) == letter(b))
+ 
  main(argc, argv)
  char **argv;
  {
***************
*** 63,68
  			continue;
  
  		case 'f':
  			fflag++;
  			wordf = fopen(optarg, "r");
  			if (wordf==NULL) {

--- 69,78 -----
  			continue;
  
  		case 'f':
+ 			if (fflag) {
+ 				fprintf(stderr, "fgrep: Only one \"-f\" option allowed\n");
+ 				exit(2);
+ 			}
  			fflag++;
  			wordfname = optarg;
  			wordf = fopen(wordfname, "r");
***************
*** 64,70
  
  		case 'f':
  			fflag++;
! 			wordf = fopen(optarg, "r");
  			if (wordf==NULL) {
  				fprintf(stderr, "fgrep: can't open %s\n", optarg);
  				exit(2);

--- 74,81 -----
  				exit(2);
  			}
  			fflag++;
! 			wordfname = optarg;
! 			wordf = fopen(wordfname, "r");
  			if (wordf==NULL) {
  				fprintf(stderr, "fgrep: ");
  				perror(wordfname);
***************
*** 66,72
  			fflag++;
  			wordf = fopen(optarg, "r");
  			if (wordf==NULL) {
! 				fprintf(stderr, "fgrep: can't open %s\n", optarg);
  				exit(2);
  			}
  			continue;

--- 77,84 -----
  			wordfname = optarg;
  			wordf = fopen(wordfname, "r");
  			if (wordf==NULL) {
! 				fprintf(stderr, "fgrep: ");
! 				perror(wordfname);
  				exit(2);
  			}
  			continue;
***************
*** 92,98
  	}
  
  	argc -= optind;
! 	if (errflg || ((argc <= 0) && !fflag)) {
  		printf("usage: fgrep %s\n",usage);
  		exit(2);
  	}

--- 104,110 -----
  	}
  
  	argc -= optind;
! 	if (errflg || ((argc <= 0) && !fflag && !eflag)) {
  		printf("usage: fgrep %s\n",usage);
  		exit(2);
  	}
***************
*** 128,134
  	char *nlp;
  	if (file) {
  		if ((fptr = fopen(file, "r")) == NULL) {
! 			fprintf(stderr, "fgrep: can't open %s\n", file);
  			retcode = 2;
  			return;
  		}

--- 140,147 -----
  	char *nlp;
  	if (file) {
  		if ((fptr = fopen(file, "r")) == NULL) {
! 			fprintf(stderr, "fgrep: ");
! 			perror(file);
  			retcode = 2;
  			return;
  		}
***************
*** 187,192
  					blkno += ccount;
  				   }
  			}
  			if ( (vflag && (failed == 0 || xflag == 0)) || (vflag == 0 && xflag && failed) )
  				goto nomatch;
  	succeed:	nsucc = 1;

--- 200,207 -----
  					blkno += ccount;
  				   }
  			}
+ 			if (ccount <= 0 && ferror(fptr))
+ 				goto done;
  			if ( (vflag && (failed == 0 || xflag == 0)) || (vflag == 0 && xflag && failed) )
  				goto nomatch;
  	succeed:	nsucc = 1;
***************
*** 198,204
  			}
  			else {
  				if (nfile > 1) printf("%s:", file);
! 				if (bflag) printf("%d:", (blkno-ccount-1)/BUFSIZ);
  				if (nflag) printf("%ld:", lnum);
  				if (p <= nlp) {
  					while (nlp < &buf[2*BUFSIZ]) putchar(*nlp++);

--- 213,219 -----
  			}
  			else {
  				if (nfile > 1) printf("%s:", file);
! 				if (bflag) printf("%ld:", (blkno-ccount-1)/BUFSIZ);
  				if (nflag) printf("%ld:", lnum);
  				if (p <= nlp) {
  					while (nlp < &buf[2*BUFSIZ]) putchar(*nlp++);
***************
*** 221,226
  				failed = 0;
  			}
  	}
  	fclose(fptr);
  	if (cflag) {
  		if (nfile > 1)

--- 236,249 -----
  				failed = 0;
  			}
  	}
+ done:
+ 	if (ccount <= 0 && ferror(fptr)) {
+ 		fprintf(stderr, "fgrep: Read error on ");
+ 		perror(file ? file : "standard input");
+ 		retcode = 2;
+ 		fclose(fptr);
+ 		return;
+ 	}
  	fclose(fptr);
  	if (cflag) {
  		if (nfile > 1)
***************
*** 232,241
  getargc()
  {
  	register c;
! 	if (wordf)
! 		return(getc(wordf));
! 	if ((c = *argptr++) == '\0')
! 		return(EOF);
  	return(c);
  }
  

--- 255,274 -----
  getargc()
  {
  	register c;
! 	if (wordf) {
! 		c = getc(wordf);
! 		if (c == EOF) {
! 			if (ferror(wordf)) {
! 				fprintf(stderr, "fgrep: Read error on ");
! 				perror(wordfname);
! 				exit(1);
! 			}
! 			fclose(wordf);
! 		}
! 	} else {
! 		if ((c = *argptr++) == '\0')
! 			return(EOF);
! 	}
  	return(c);
  }
  
***************
*** 303,309
  }
  
  overflo() {
! 	fprintf(stderr, "wordlist too large\n");
  	exit(2);
  }
  cfail() {

--- 336,342 -----
  }
  
  overflo() {
! 	fprintf(stderr, "fgrep: wordlist too large\n");
  	exit(2);
  }
  cfail() {
***************
*** 310,315
  	struct words *queue[QSIZE];
  	struct words **front, **rear;
  	struct words *state;
  	register char c;
  	register struct words *s;
  	s = w;

--- 343,349 -----
  	struct words *queue[QSIZE];
  	struct words **front, **rear;
  	struct words *state;
+ 	int bstart;
  	register char c;
  	register struct words *s;
  	s = w;
***************
*** 328,333
  			front = queue;
  		else front++;
  	cloop:	if ((c = s->inp) != 0) {
  			*rear = (q = s->nst);
  			if (front < rear)
  				if (rear >= &queue[QSIZE-1])

--- 362,368 -----
  			front = queue;
  		else front++;
  	cloop:	if ((c = s->inp) != 0) {
+ 			bstart = 0;
  			*rear = (q = s->nst);
  			if (front < rear)
  				if (rear >= &queue[QSIZE-1])
***************
*** 337,343
  			else
  				if (++rear == front) overflo();
  			state = s->fail;
! 		floop:	if (state == 0) state = w;
  			if (state->inp == c) {
  			qloop:	q->fail = state->nst;
  				if ((state->nst)->out == 1) q->out = 1;

--- 372,381 -----
  			else
  				if (++rear == front) overflo();
  			state = s->fail;
! 		floop:	if (state == 0) {
! 				state = w;
! 				bstart = 1;
! 			}
  			if (state->inp == c) {
  			qloop:	q->fail = state->nst;
  				if ((state->nst)->out == 1) q->out = 1;
***************
*** 345,350
  			}
  			else if ((state = state->link) != 0)
  				goto floop;
  		}
  		if ((s = s->link) != 0)
  			goto cloop;

--- 383,392 -----
  			}
  			else if ((state = state->link) != 0)
  				goto floop;
+ 			else if(bstart == 0){
+ 				state = 0;
+ 				goto floop;
+ 			}
  		}
  		if ((s = s->link) != 0)
  			goto cloop;
***************
*** 351,357
  	}
  }
  
! /* The following functions were inserted to allow for the "-i" option */
  
  same(a, b)
  register int a, b;

--- 393,399 -----
  	}
  }
  
! /* The following function was inserted to allow for the "-i" option */
  
  letter(c)
  register int c;
***************
*** 353,364
  
  /* The following functions were inserted to allow for the "-i" option */
  
- same(a, b)
- register int a, b;
- {
- 	return (a == b || iflag && (a ^ b) == ' ' && letter(a) == letter(b));
- }
- 
  letter(c)
  register int c;
  {

--- 395,400 -----
  
  /* The following function was inserted to allow for the "-i" option */
  
  letter(c)
  register int c;
  {
***************
*** 362,370
  letter(c)
  register int c;
  {
! 	if (c >= 'a' && c <= 'z')
! 		return (c);
! 	if (c >= 'A' && c <= 'z')
! 		return (c + 'a' - 'A');
  	return(0);
  }

--- 398,406 -----
  letter(c)
  register int c;
  {
! 	if (islower(c))
! 		return(c);
! 	if (isupper(c))
! 		return(_tolower(c));
  	return(0);
  }

The use of a macro for character comparison, as well as this bug fix, came
from the 4.2BSD version of "fgrep".

I think this bug is also in the V7 "fgrep"; the section of code in the S5
"fgrep" where this bug lies is similar to the V7 one and lacks the changes
from 4.2.

Here's an edited "diff" listing of the interesting differences between the
V7 and 4.2 "fgrep", in case you're curious:

24a12
> #include <ctype.h>
37c25
< int	bflag, cflag, fflag, lflag, nflag, vflag, xflag;
---
> int	bflag, cflag, fflag, lflag, nflag, vflag, xflag, yflag;
93a83,86
> 		case 'i':		/* Berkeley */
> 		case 'y':		/* Btl */
> 			yflag++;
> 			continue;
95c88
< 			fprintf(stderr, "egrep: unknown flag\n");
---
> 			fprintf(stderr, "fgrep: unknown flag\n");
104c97
< 			fprintf(stderr, "egrep: can't open %s\n", *argv);
---
> 			fprintf(stderr, "fgrep: can't open %s\n", *argv);
125a119,120
> # define ccomp(a,b) (yflag ? lca(a)==lca(b) : a==b)
> # define lca(x) (isupper(x) ? tolower(x) : x)
161c158
< 			if (c->inp == *p) {
---
> 			if (ccomp(c->inp, *p)) {
174c171
< 					if (c->inp == *p) {
---
> 					if (ccomp(c->inp ,  *p)) {
319a317
> 	int bstart;
337a336
> 			bstart = 0;
347c346,349
< 		floop:	if (state == 0) state = w;
---
> 		floop:	if (state == 0) {
> 				state = w;
> 				bstart = 1;
> 			}
349c351
< 				q->fail = state->nst;
---
> 			qloop:	q->fail = state->nst;
351c353
< 				continue;
---
> 				if ((q = q->link) != 0) goto qloop;
354a357,360
> 			else if(bstart == 0){
> 				state = 0;
> 				goto floop;
> 			}

The interesting things to note are:

	1) The "ignore case" flags - "-i" and "-y" - are not in the
	   V7 version, even though the Berkeley code claims "-y"
	   is a BTL flag.

	   The S5 version does have such a flag - but, despite what
	   the comments might make you expect, it's "-i", not "-y"!

	2) The error messages in the V7 "fgrep" call it "egrep", not
	   "fgrep".

	3) The bug in question is fixed by the stuff at the end that
	   involves the variable "bstart".  (I *think* the problem is that
	   the "fgrep" implementation of the Aho/Corasick algorithms -
	   "Efficient String Matching: An Aid to Bibliographic Search",
	   Alfred V. Aho and Margaret J. Corasick, *Communications of the
	   ACM, June 1975, Volume 18, Number 6, pp. 333 - 340 - was lacking
	   the part of Algorithm 3 reading

		while g(state, a) = fail do state := f(state)

	    Since "fgrep" doesn't care what particular pattern was
	    matched, and doesn't bother doing any further string matching
	    once it's found one match on a line, I think that this part
	    isn't completely necessary; however, the stuff added by the
	    bug fix is necessary.)

	4) Either this fix, or the one involving "qloop", or both, were
	   in some update to V7 which I remember seeing.  This update
	   also had "fsck", some fixes to bugs mentioned by Dennis Ritchie
	   in a talk which was commented on in some issue of ";login"
	   (one such bug, I remember, was that some error bit for some
	   DEC tape driver was incorrectly #defined in the driver), new
	   versions of "f77" and "awk" and some other programs (the
	   new versions ended up in 4BSD, but didn't completely make it
	   into System N until S5R2 or so).

This bit of UNIX trivia not brought to you by Abbott Horn (or is it Abbott
and Horn?), makers of Trivial Pursuit.

Another note, on a standard I/O "feature"/bug:

While testing this, I noticed that if you ran "fgrep" and just typed "shkx"
followed by control-D, it waited for another control-D before exiting.  I
built it with the 4.2BSD standard I/O library, and it exited as soon as you
typed ^D.

There was a lot of flaming about the so-called "sticky EOF" feature of the
4.2BSD standard I/O a while ago.  The 4.2BSD standard I/O returns an EOF on
all attempts to read from a stream which has already gotten an EOF (i.e.,
any stream where the _IOEOF bit is set, so that "feof(stream)" returns
TRUE).  This change caused programs which read something from standard input
until EOF, and then start reading more data from standard input without
doing a "clearerr(stdin)", to break.  These programs were assuming that
standard input was a terminal (unless they did an "fseek" after the first
EOF, there is no way that subsequent reads from a regular file wouldn't get
EOF, assuming somebody wasn't writing to the file they were reading from).

I believe the change was made in 4.2BSD to fix "fread"s which saw an EOF
before they got all the data requested.  This change also fixes the problem
described above, as well as making the behavior of "feof" consistent with
the behavior of reads from the file; other versions of standard I/O can have
"feof(stream)" be true, but have reads from "stream" not return an
end-of-file indication!
-- 
	Guy Harris
	{ihnp4, decvax, seismo, decwrl, ...}!sun!guy
	guy@sun.arpa

guy@sun.UUCP (04/23/86)

> 	4) Either this fix, or the one involving "qloop", or both, were
> 	   in some update to V7 which I remember seeing.  This update
> 	   also had "fsck", some fixes to bugs mentioned by Dennis Ritchie
> 	   in a talk which was commented on in some issue of ";login"
> 	   (one such bug, I remember, was that some error bit for some
> 	   DEC tape driver was incorrectly #defined in the driver), new
> 	   versions of "f77" and "awk" and some other programs (the
> 	   new versions ended up in 4BSD, but didn't completely make it
> 	   into System N until S5R2 or so).

It sure was.  From the README file from that addendum tape:

	Addenda to UNIX 7th edition distribution tape, 12/2/80.

		fgrep.c: new source for fgrep(1) corrects certain
		      troubles with keys with common prefixes

The "V7 addendum" and 4.2BSD versions of "fgrep" are almost identical; the
only significant differences are that the 4.2BSD version accepts "-i" as
well as "-y" for the "case-independent matching" flag (which is interesting,
considering the S5 version accepts "-i" but not "-y") and the 4.2BSD version
uses a block size of BUFSIZ (1024 bytes on 4.xBSD) rather than 512.

This, of course, raises the question "why didn't this fix make it into
System V, and why did System V's "-i" flag use a subroutine for character
comparison rather than a macro?"  I suspect the answer is the same as for
the question "why didn't the new 'f77' or 'awk' make it in until S5R2 or
so", which is "nobody in the USDL knew about it."  One suspects the lines of
communication between BTL research and AT&T-IS need some Roto-Rooter(TM)
work...

> This bit of UNIX trivia not brought to you by Abbott Horn (or is it Abbott
> and Horn?), makers of Trivial Pursuit.

It's Horn Abbott, you nitwit.
-- 
	Guy Harris
	{ihnp4, decvax, seismo, decwrl, ...}!sun!guy
	guy@sun.arpa