[can.uucp] faster version of inpaths

lyndon@cs.AthabascaU.CA (Lyndon Nerenberg) (10/25/89)
After watching inpaths beat our disks to a pulp, I decided there
had to be a better way to collect the Path: info. The current method
involves opening *every* article in the news tree. This seems wasteful
when you can collect the Path: header quite easily as the articles arrive.

What I've done is add an entry to our sys file that pipes each article
through a script to extract the Path: header and append it to a file.
I then modified inpaths.c to take its data directly from this file.
The new version is *much* faster than the original.

This method also generates more accurate numbers. Most sites expire
news after a week (or less). Our relay machines expire after 48 hours.
The old method reported unrealistically low numbers. This one gives an
accurate count of what the host actually did during the month.

There are some disadvantages to this version. First, the path data
file gets rather large. 'atha' takes in pretty well everything except
talk and soc. This is sufficient to generate a 4.5 MByte data file
every month. It also starts three processes for each incoming article
while grabbing the header. This slows down article processing, and
increases the size of your system accounting files. If this bugs you,
rewrite the filter in C.

Since the diffs are bigger than the source I'm appending the whole
thing. Enjoy.

--lyndon

[ Watch out for trailing .sig's ]

/*
 *
 * This program inputs a list of filenames of news articles, and outputs a
 * data report which should be mailed to the decwrl Network Monitoring
 * Project at address "pathsurvey@decwrl.dec.com".
 *
 *
 * Run it like this:
 *
 *  cd /usr/spool/news
 *  find . -type f -print | inpaths "yourhost" | mail pathsurvey@decwrl.dec.com
 *
 *  where "yourhost" is the host name of your computer.
 *
 * If you have a huge amount of news spooled and don't want to run 
 * all of it through inpaths, you can do something like
 *
 *   find . -type f -mtime -10 -print | ...
 *
 * If you expire in less than three weeks but want accurate stats,
 * define PATHSFILE to point to a file containing the Path: header
 * from each locally processed article. Then, add an entry like this
 * to your sys file:
 *
 * PathSurvey:all::sed -e '/^$/,$d' | grep '^Path:' >> /usr/spool/batch/PATHS
 *
 * and run the following script out of cron once per month:
 *
 *   pathsdir=/usr/spool/batch
 *   pathsfile=PATHS
 *   inpaths=/usr/local/lib/inpaths
 *   cd $pathsdir
 *   mv $pathsfile $pathsfile.work
 *   sleep 60	# Let currently running rnews complete
 *   $inpaths -l atha | mail usenet pathsurvey@decwrl.dec.com
 *   rm $pathsfile.work
 *
 * 
 * There are 3 options: -s, -m, and -l, for a short, medium, or long report.
 * The default is to produce a long report. If you are worried about mail
 * expenses you can send a shorter report. The long report is typically
 * about 50K bytes for a major site, and perhaps 25K bytes for a smaller
 * site. 
 *
 * If you define PATHSFILE there is a fourth option, '-f' which allows
 * you to override the default path data filename.
 *
 * Brian Reid
 *	V1.0	 Sep 1986
 *	V2.0	 May 1989
 *
 * Support for PATHSFILE added Oct 1989 by lyndon@cs.AthabascaU.CA
 *     
 */

#ifndef lint
static char *RCSID = "$Id: inpaths.c,v 1.3 89/10/24 15:00:25 aubin Beta $";
#endif /* lint */
#define VERSION "2.2"
#include <sys/types.h>
#include <ctype.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Length of the survey period in days; printed in the report header lines. */
#define SURVEYPERIOD 21		/* Maximum number of days in survey period */
/*
 * The survey period expressed in seconds.  The expansion is parenthesised
 * so that it behaves as a single operand in any expression, and is computed
 * in long arithmetic so that it cannot overflow a 16-bit int.
 */
#define	INTERVAL	(SURVEYPERIOD*60L*60L*24L)
/* Number of bytes of each article (or paths-file line) that is examined. */
#define HEADBYTES 1024

/*
 * Define PATHSFILE to point to a file where the Path: header from
 * each locally processed article will be saved (see comments above).
 *
 * NOTE: This file gets LARGE. On 'atha' this file typically grows
 *       to 4.5 MBytes after four weeks. If you're running under
 *       System V, be sure to turn your ulimit *way* up.
 */
#define PATHSFILE "/usr/spool/batch/PATHS.work"

main (argc,argv)
  int argc;
  char **argv;
 {
    char jc, *lptr, *cp, *cp1, *cp2;
    char rightdelim;
    char *pathfield;
    char artbuf[HEADBYTES];
    char * scanlimit;
    char *hostname;
    char hostString[128];
#ifndef PATHSFILE
    char linebuf[1024];
    int article, isopen;
#endif /* ! PATHSFILE */
    int needHost;
    static int passChar[256];
    int columns,verbose,totalTraffic;
#ifdef PATHSFILE
    FILE *pfile;
    char *pfilename;
#endif /* PATHSFILE */

	/* definitions for getopt */
    extern int optind;
    extern char *optarg;

 /* structure used to tally the traffic between two hosts */
    typedef struct trec {
	struct trec *rlink;
	struct nrec *linkid;
	int tally;
    } ;

 /* structure to hold the information about a host */
    typedef struct nrec {
	struct nrec *link;
	struct trec *rlink;
	char *id;
	long sentto; /* tally of articles sent to somebody from here */
    } ;
    struct nrec *hosthash[128], *hnptr, *list, *relay;
    struct trec *rlist;
    int i, gotbytes, c;
    extern errno;

    hostname = "unknown";
    verbose = 2;
#ifdef PATHSFILE
    pfilename = PATHSFILE;
#endif /* PATHSFILE */
    while (( c=getopt(argc, argv, "smlf:" )) != EOF)
    switch (c) {
	case 's': verbose=0; break;
	case 'm': verbose=1; break;
	case 'l': verbose=2; break;
#ifdef PATHSFILE
	case 'f': pfilename = optarg; break;
#endif /* PATHSFILE */
	case '?': (void) fprintf(stderr,
#ifdef PATHSFILE
	"usage: %s [-s] [-m] [-l] [-f pathsfile] hostname\n",argv[0]);
#else
	"usage: %s [-s] [-m] [-l] hostname\n",argv[0]);
#endif /* PATHSFILE */
	exit(1);
    }
    if (optind < argc) {
        hostname = argv[optind];
    } else {
#ifdef PATHSFILE
	(void) fprintf(stderr,"usage: %s [-s] [-m] [-l] [-f pathsfile] `hostname`\n",argv[0]);
#else
	(void) fprintf(stderr,"usage: %s [-s] [-m] [-l] `hostname`\n",argv[0]);
#endif /* PATHSFILE */
	exit(1);
    }

    (void) fprintf(stderr,"computing %s inpaths for host %s\n",
	verbose==0 ? "short" : (verbose==1 ? "medium" : "long"),hostname);
    for (i = 0; i<128; i++) hosthash[i] = (struct nrec *) NULL;

/* precompute character types to speed up scan */
    for (i = 0; i<=255; i++) {
    	passChar[i] = 0;
	if (isalpha(i) || isdigit(i)) passChar[i] = 1;
	if (i == '-' || i == '.' || i == '_') passChar[i] = 1;
    }
    totalTraffic = 0;    

#ifndef PATHSFILE
    while (gets(linebuf) != NULL) {
        lptr = linebuf;
	isopen = 0;

/* Skip files that do not have pure numeric names */
	i = strlen(lptr)-1;
	do {
	    if (!isdigit(linebuf[i])) {
	        if (linebuf[i]=='/') break;
		goto bypass;
	    }
	    i--;
	} while (i>=0);

/* Open the file for reading */
	article = open(lptr, O_RDONLY);
	isopen = (article > 0);

/* Read in the first few bytes of the article; find the end of the header */
	gotbytes = read(article, artbuf, HEADBYTES);
	if (gotbytes < 10) goto bypass;

#else /* PATHSFILE */

    if ((pfile = fopen(pfilename, "r")) == NULL) {
	(void) fprintf(stderr, "%s: fopen failed\n", pfilename);
	exit(1);
    }

    lptr = pfilename;	/* kludge */
    while (fgets(artbuf, HEADBYTES, pfile) != NULL) {
	gotbytes = strlen(artbuf);
#endif /* ! PATHSFILE */
	
/* Find "Path:" header field */
	pathfield = (char *) 0;
	scanlimit = &artbuf[gotbytes];
	for (cp=artbuf; cp <= scanlimit; cp++) {
	    if (*cp == '\n') break;
	    if (pathfield) break;
	    if (strncmp(cp, "Path: ", 6) == 0) {
		pathfield = cp; goto gotpath;
	    }
	    while (*cp != '\n' && cp <= scanlimit) cp++;
	}
	(void) fprintf(stderr,"%s: didn't find 'Path:' in 1st %d bytes.\n",
	    lptr,HEADBYTES);
	goto bypass; 

gotpath: ;

/* Extract all of the host names from the "Path:" field and put them in our
host table.								 */
	cp = pathfield;
	while (*cp != NULL && *cp != '\n') cp++;
	if (cp == NULL) {
	    (void) fprintf(stderr,"%s: end of Path line not in buffer.\n",lptr);
	    goto bypass;
	}

	totalTraffic++;
	*cp = 0;
	pathfield += 5;	/* skip 'Path:' */
	cp1 = pathfield;
	relay = (struct nrec *) NULL;
	rightdelim = '!';
	while (cp1 < cp) {
	    /* get next field */
	    while (*cp1=='!') cp1++;
	    cp2 = ++cp1;
	    while (passChar[(int) (*cp2)]) cp2++;

	    rightdelim = *cp2; *cp2 = 0;
	    if (rightdelim=='!' && *cp1 != (char) NULL) {
	    /* see if already in the table */
		list = hosthash[*cp1];
		while (list != NULL) {
		    /*
		     * Attempt to speed things up here a bit.  Since we hash
		     * on the first char, we see if the second char is a match
		     * before calling strcmp()
		     */
		    if (list->id[1] == cp1[1] && !strcmp(list->id, cp1)) {
			hnptr = list;
			break;		/* I hate unnecessary goto's */
		    }
		    list = list->link;
		}
		if(list == NULL) {
			/* get storage and splice in a new one */
			hnptr = (struct nrec *) malloc(sizeof (struct nrec));
			hnptr->id = (char *) strcpy(malloc(1+strlen(cp1)),cp1);
			hnptr->link = hosthash[*cp1];
			hnptr->rlink = (struct trec *) NULL;
			hnptr->sentto = (long) 0;
			hosthash[*cp1] = hnptr;
		}
	    }
/* 
At this point "hnptr" points to the host record of the current host. If
there was a relay host, then "relay" points to its host record (the relay
host is just the previous host on the Path: line. Since this Path means
that news has flowed from host "hnptr" to host "relay", we want to tally
one message in a data structure corresponding to that link. We will
increment the tally record that is attached to the source host "hnptr".
*/

	    if (relay != NULL && relay != hnptr) {
		rlist = relay->rlink;
		while (rlist != NULL) {
		    if (rlist->linkid == hnptr) goto have2;
		    rlist = rlist->rlink;
		}
		rlist = (struct trec *) malloc(sizeof (struct trec));
		rlist->rlink = relay->rlink;
		relay->rlink = rlist;
		rlist->linkid = hnptr;
		rlist->tally = 0;

    have2:      rlist->tally++;
		hnptr->sentto++;
	    }

	    cp1 = cp2;
	    relay = hnptr;
	    if (rightdelim == ' ' || rightdelim == '(') break;
	}
bypass:
#ifndef PATHSFILE
	 if (isopen) (void) close(article)
#endif /* ! PATHSFILE */
    ; }
#ifdef PATHSFILE
    (void) fclose(pfile);
#endif /* PATHSFILE */
/* Now dump the host table */
    (void) printf("ZCZC begin inhosts %s %s %d %d %d\n",
    	VERSION,hostname,verbose,totalTraffic,SURVEYPERIOD);
    for (jc=0; jc<127; jc++) {
	list = hosthash[jc];
	while (list != NULL) {
	    if (list->rlink != NULL) {
		if (verbose > 0 || (100*list->sentto > totalTraffic))
		   (void) printf("%d\t%s\n",list->sentto, list->id);
	    }
	    list = list->link;
	}
    }
   (void) printf("ZCZC end inhosts %s\n",hostname);

   (void) printf("ZCZC begin inpaths %s %s %d %d %d\n",
        VERSION,hostname,verbose,totalTraffic,SURVEYPERIOD);
    for (jc=0; jc<127; jc++) {
	list = hosthash[jc];
	while (list != NULL) {
	    if (verbose > 1 || (100*list->sentto > totalTraffic)) {
		if (list->rlink != NULL) {
		    columns = 3+strlen(list->id);
		    (void) sprintf(hostString,"%s H ",list->id);
		    needHost = 1;
		    rlist = list->rlink;
		    while (rlist != NULL) {
		        if (
			     (100*rlist->tally > totalTraffic)
			  || ((verbose > 1)&&(5000*rlist->tally>totalTraffic))
			   ) {
			    if (needHost)(void) printf("%s",hostString);
			    needHost = 0;
			    relay = rlist->linkid;
			    if (columns > 70) {
				(void) printf("\n%s",hostString);
				columns = 3+strlen(list->id);
			    }
			    (void) printf("%d Z %s U ", rlist->tally, relay->id);
			    columns += 9+strlen(relay->id);
			}
			rlist = rlist->rlink;
		    }
		    if (!needHost) (void) printf("\n");
		}
	    }
	    list = list->link;
	}
    }
    (void) printf("ZCZC end inpaths %s\n",hostname);
    (void) fclose(stdout);
    exit(0);
}

-- 
Lyndon Nerenberg  VE6BBM / Computing Services / Athabasca University
  {alberta,decwrl,lsuc}!atha!lyndon || lyndon@cs.AthabascaU.CA

                  The Connector is the Notwork.