[news.software.b] Inpaths modification for crossposting

jim@aob.aob.mn.org (Jim Anderson) (05/24/89)

Watching the recent flamage about inpaths not getting rid of cross-posted
articles and the perl scripts and flames about perl availability, etc, I
decided to modify inpaths.c to minimize the effect of cross-posted articles
by not counting articles which have already been seen.  In the process of
making these modifications, a problem with malloc on machines where
sizeof(char *) != sizeof(int) showed up.  The fix for that problem, along
with the cross-posting modification, is included.

Also, while I was drifting through the article, I noticed that if the
open was unsuccessful, inpaths still attempted to read the header from the
nonexistent article.  I made some modifications so that if an article
is unreadable, inpaths skips it and tries the next article.

The general idea behind the modification is to keep track of a number of
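inode numbers of articles.  Currently, it keeps track of 4000 inode numbers
(the limit is the MAXINODETRACK constant; note that the patch as posted
below defines it as 8000, so adjust it to suit your machine).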
It counts the number of cross-postings of each article, and once all the
occurrences are accounted for, it removes that inode from the list, freeing
space for another cross-posted article.  Granted, this technique does not
get rid of all
cross-posted articles, but it does at least get rid of many of them, and
the higher the tracking is set, the more it gets rid of.  Running on an
80286 as I am, 4000 is a practical limit for this tracking.

Anyway, justification behind us, these are the changes I made to accomplish
this:

*** inpaths.old	Tue May 23 19:03:30 1989
--- inpaths.c	Tue May 23 20:42:18 1989
***************
*** 36,41
  #include <fcntl.h>
  #include <ctype.h>
  #include <sys/types.h>
  
  #define SURVEYPERIOD 21		/* Maximum number of days in survey period */
  #define	INTERVAL	SURVEYPERIOD*60*60*24

--- 36,42 -----
  #include <fcntl.h>
  #include <ctype.h>
  #include <sys/types.h>
+ #include <sys/stat.h>
  
  #define SURVEYPERIOD 21		/* Maximum number of days in survey period */
  #define	INTERVAL	SURVEYPERIOD*60*60*24
***************
*** 40,45
  #define SURVEYPERIOD 21		/* Maximum number of days in survey period */
  #define	INTERVAL	SURVEYPERIOD*60*60*24
  #define HEADBYTES 1024
  
  main (argc,argv)
    int argc;

--- 41,47 -----
  #define SURVEYPERIOD 21		/* Maximum number of days in survey period */
  #define	INTERVAL	SURVEYPERIOD*60*60*24
  #define HEADBYTES 1024
+ #define MAXINODETRACK	8000
  
  struct	sInodeTrack {
  	int	inodeNum; /* File inode number */
***************
*** 41,46
  #define	INTERVAL	SURVEYPERIOD*60*60*24
  #define HEADBYTES 1024
  
  main (argc,argv)
    int argc;
    char **argv;

--- 43,53 -----
  #define HEADBYTES 1024
  #define MAXINODETRACK	8000
  
+ struct	sInodeTrack {
+ 	int	inodeNum; /* File inode number */
+ 	int	count;	/* Number of references remaining. */
+ } inodeTrack[MAXINODETRACK];
+ 
  main (argc,argv)
    int argc;
    char **argv;
***************
*** 47,53
   {
      char linebuf[1024], jc, *lptr, *cp, *cp1, *cp2;
      char rightdelim;
!     char *pathfield;
      char artbuf[HEADBYTES];
      char * scanlimit;
      char *hostname;

--- 54,60 -----
   {
      char linebuf[1024], jc, *lptr, *cp, *cp1, *cp2;
      char rightdelim;
!     char *pathfield,*malloc();
      char artbuf[HEADBYTES];
      char * scanlimit;
      char *hostname;
***************
*** 76,81
      } ;
      struct nrec *hosthash[128], *hnptr, *list, *relay;
      struct trec *rlist;
      int i, article, gotbytes, c;
      extern errno;
  

--- 83,89 -----
      } ;
      struct nrec *hosthash[128], *hnptr, *list, *relay;
      struct trec *rlist;
+     struct stat sbuf;
      int i, article, gotbytes, c;
      int ignoreArticle;
      extern errno;
***************
*** 77,82
      struct nrec *hosthash[128], *hnptr, *list, *relay;
      struct trec *rlist;
      int i, article, gotbytes, c;
      extern errno;
  
      hostname = "unknown";

--- 85,91 -----
      struct trec *rlist;
      struct stat sbuf;
      int i, article, gotbytes, c;
+     int ignoreArticle;
      extern errno;
  
      for (i=0;i<MAXINODETRACK;i++) {
***************
*** 79,84
      int i, article, gotbytes, c;
      extern errno;
  
      hostname = "unknown";
      verbose = 2;
      while (( c=getopt(argc, argv, "sml" )) != EOF)

--- 88,97 -----
      int ignoreArticle;
      extern errno;
  
+     for (i=0;i<MAXINODETRACK;i++) {
+ 	inodeTrack[i].inodeNum = 0;
+ 	inodeTrack[i].count = 0;
+     }
      hostname = "unknown";
      verbose = 2;
      while (( c=getopt(argc, argv, "sml" )) != EOF)
***************
*** 126,131
  /* Open the file for reading */
  	article = open(lptr, O_RDONLY);
  	isopen = (article > 0);
  
  /* Read in the first few bytes of the article; find the end of the header */
  	gotbytes = read(article, artbuf, HEADBYTES);

--- 139,153 -----
  /* Open the file for reading */
  	article = open(lptr, O_RDONLY);
  	isopen = (article > 0);
+ 	if (!isopen)
+ 	    goto bypass; /* Go back and read another line */
+ 	fstat(article,&sbuf);
+ 	for (i=0;i<MAXINODETRACK && inodeTrack[i].inodeNum!=0 &&
+ 		inodeTrack[i].inodeNum!=sbuf.st_ino;i++)
+ 		;
+ 	ignoreArticle = 0;
+ 	if (i<MAXINODETRACK && inodeTrack[i].inodeNum == sbuf.st_ino) {
+ 		/* Found a cross-posted article */
  
  		ignoreArticle = 1;
  		inodeTrack[i].count--;
***************
*** 127,132
  	article = open(lptr, O_RDONLY);
  	isopen = (article > 0);
  
  /* Read in the first few bytes of the article; find the end of the header */
  	gotbytes = read(article, artbuf, HEADBYTES);
  	if (gotbytes < 10) goto bypass;

--- 149,182 -----
  	if (i<MAXINODETRACK && inodeTrack[i].inodeNum == sbuf.st_ino) {
  		/* Found a cross-posted article */
  
+ 		ignoreArticle = 1;
+ 		inodeTrack[i].count--;
+ 		if (inodeTrack[i].count==0) {
+ 			/* Last occurance of this inode - delete it */
+ 			int	j;
+ 
+ 			inodeTrack[i].inodeNum = 0;
+ 			for (j=i+1;inodeTrack[j].inodeNum!=0;j++) {
+ 				inodeTrack[j-1].inodeNum =
+ 					inodeTrack[j].inodeNum;
+ 				inodeTrack[j-1].count = inodeTrack[j].count;
+ 			}
+ 		}
+ 	} else {
+ 		/* It wasn't in the list - Maybe add it?? */
+ 		if (sbuf.st_nlink>1) {
+ 			/* It wasn't in the list and we should see it again-
+ 				add it */
+ 			if (i<MAXINODETRACK-1) {
+ 				/* We have enough room to add it */
+ 				inodeTrack[i].inodeNum = sbuf.st_ino;
+ 				inodeTrack[i].count = sbuf.st_nlink-1;
+ 			}
+ 		} /* else no room to add it - maybe later */
+ 	}
+ 	if (ignoreArticle)
+ 	    goto bypass; /* Go back and read another file name */
+ 
  /* Read in the first few bytes of the article; find the end of the header */
  	gotbytes = read(article, artbuf, HEADBYTES);
  	if (gotbytes < 10) goto bypass;
***************
*** 188,194
  		if(list == NULL) {
  			/* get storage and splice in a new one */
  			hnptr = (struct nrec *) malloc(sizeof (struct nrec));
! 			hnptr->id = (char *) strcpy(malloc(1+strlen(cp1)),cp1);
  			hnptr->link = hosthash[*cp1];
  			hnptr->rlink = (struct trec *) NULL;
  			hnptr->sentto = (long) 0;

--- 238,244 -----
  		if(list == NULL) {
  			/* get storage and splice in a new one */
  			hnptr = (struct nrec *) malloc(sizeof (struct nrec));
! 			(void) strcpy((hnptr->id = malloc(1+strlen(cp1))),cp1);
  			hnptr->link = hosthash[*cp1];
  			hnptr->rlink = (struct trec *) NULL;
  			hnptr->sentto = (long) 0;
-- 
Jim Anderson			(612) 636-2869
Anderson O'Brien, Inc		New mail:jim@aob.mn.org
2575 N. Fairview Ave.		Old mail:{rutgers,gatech,amdahl}!bungia!aob!jim
St. Paul, MN  55113		"Fireball... Let me see... How did that go?"