[net.sources] Pathprune

matt@ncr-sd.SanDiego.NCR.COM (Matt Costello) (03/07/87)

:	shar:	Shell Archiver
# Run this text with /bin/sh to create:
#	README
#	pathprune.1
#	pathprune.c

sed 's/^X//' <<'SHAR_EOF' >README; chmod 644 README
XHere is a program to make your paths file (from pathalias) smaller
Xby removing redundant entries.  I wrote it because we currently
Xcreate two different paths files for the systems in our domain.
XMost systems get a file containing only the hosts within the domain,
Xwhile the domain gateway(s) get the full database distributed in
Xthe newsgroups mod.map and ncr.maps.  The large paths file is too
Xlarge (~500k) to want to ship all over.  Pathprune can often remove
Xenough of a paths file to make transmission of it using uucp feasible.
X
X
XThe two tables here show the actual compression on two different systems,
Xwith the pathalias entries broken down into three types:
X	gateway		domain gateway entry (starts with a '.')
X	domain		host belongs to a domain (host has a '.' in it)
X	host		simple host name (in the .uucp domain)
X
XThe numbers listed are the number of lines and percentage of original
Xlines.  The "domain" values are slightly skewed because Berkeley lists
X344 hosts in the berkeley.edu domain without any domain gateway.
X
Xfor ncr-sd (domain gateway)
X            orig        -v              -vt             -vut
Xgateway      199     128  0.64       124  0.62       124  0.62
Xdomain      1540     652  0.42       652  0.42       652  0.42
Xhost        8515    8515  1.00      8515  1.00      8515  1.00
Xtotal      10254    9295  0.91      9291  0.91      9291  0.91
X
Xfor se-sd (internal node)
X            orig        -v              -vt             -vut
Xgateway      199     124  0.62       120  0.60       120  0.60
Xdomain      1540     623  0.40       623  0.40       623  0.40
Xhost        8515    8515  1.00      8515  1.00         2  0.00
Xtotal      10254    9262  0.90      9258  0.90       745  0.07
X
X
XThe program will compile under both SYSV and BSD4.2, and will
Xshould also compile under V7.  See the beginning of the source
Xfile for compilation instructions.  It does use getopt(3).
SHAR_EOF
sed 's/^X//' <<'SHAR_EOF' >pathprune.1; chmod 644 pathprune.1
X.\" @(#)pathprune.1	matt.costello@sandiego.ncr.com 87/03/06
X.TH PATHPRUNE 1 
X.SH NAME
Xpathprune \- prune unnecessary entries in pathalias database
X.SH SYNOPSIS
X.B pathprune
X[
X.B \-vutj
X] [
X.BI \-d \0domain
X] [
X.BI \-h \0domain
X] [
X.BI \-r \0domain
X] [
X.I infile
X[
X.I outfile
X] ]
X.ad b
X.SH DESCRIPTION
X.I Pathprune
Xprunes unnecessary entries from a sorted \fIpathalias\fR(1) database.
XIt does this by reading a sorted database and then writing out a
Xsmaller one.  It preserves the original sorted order.
X.PP
XAny subdomain gateway whose path is the same as the parent domain
Xgateway is unnecessary.  The subdomain gateway is also unnecessary if
Xits path passes through the parent domain gateway.  This is subject
Xto one very important rule; if the path is \fB%s\fR then the domain gateway
Xis always necessary.  This rule allows the local mailer to detect invalid
Xhosts in the domains for which it is a gateway.
X.PP
XA host entry is unnecessary if its path arrives at or passes through
Xthe corresponding domain gateway entry.  For simple host names this is the
Xpseudo-domain ".uucp".
X.PP
XThis can more easily explained by example.  Assume the input of:
X.tr ~.
X.RS
X.nf
X~com			ncr-sd!scubed!seismo!%s
X~ncr.com		ncr-sd!%s
X~sandiego.ncr.com	ncr-sd!%s
X~other			ncr-sd!%s
X~uucp			ncr-sd!%s
Xfalstaf.sandiego.ncr.com	ncr-sd!falstaf!%s
Xncr-sd.sandiego.ncr.com	ncr-sd!%s
Xse-sd.sandiego.ncr.com	%s
Xtower5.sandiego.ncr.com	tower5!%s
X.fi
X.RE
XAfter processing, the output will be:
X.RS
X.nf
X~com			ncr-sd!scubed!seismo!%s
X~ncr.com		ncr-sd!%s
X~other			ncr-sd!%s
X~uucp			ncr-sd!%s
Xse-sd.sandiego.ncr.com	%s
Xtower5.sandiego.ncr.com	tower5!%s
X.fi
X.RE
XThe domain gateway .sandiego.ncr.com was removed because .ncr.com will get
Xus to the same place.  If the \fB\-t\fR option had been specified the .com
Xand .ncr.com gateways would also have been removed.  All hosts that lie
Xbeyond the .ncr.com (or .other for \fB\-t\fR) gateway were also removed
Xas being unnecessary.
X.PP
XThe
X.I pathprune
Xoptions are:
X.TP 6
X.B \-v
XReport some statistics on the standard error output.
X.TP
X.B \-u
XDelete simple hosts (host.uucp) if they pass through the domain gateway for
X".uucp".
X.TP
X.B \-t
XPossibly delete the gateways for top-level domain names.  The pseudo-domain
Xgateway for ".other" must be present; this domain gateway is used as a
Xsmarter machine for domain names.
X.TP
X.B \-j
XJunk all hosts that do not belong to a valid top-level domain.
X.TP
X.BI \-d \0domain
XPreserve the domain gateway for \fIdomain\fR and any subdomain gateways of
X\fIdomain\fR.  Normally subdomain gateways will be removed if they pass
Xthrough the domain gateway.  This option will preserve these gateways in
Xthe database in case the database is used for domain qualification.
XSpecifying \fB\-d other\fR will preserve all domain gateways in the database.
X.TP
X.BI \-h \0domain
XPreserve all host names in the \fIdomain\fR, or subdomains of \fIdomain\fR.
X.TP
X.BI \-r \0domain
XRemove all host names in the \fIdomain\fR, or subdomains of \fIdomain\fR.
XThis is useful for domains like berkeley.edu which list several
Xhundred hosts in the domain, but have no domain gateway.  Of course, you
Xcould always confuse ucbvax by listing it as the domain gateway.
X.IP
XSpecifying \fB\-r uucp\fR will remove all simple host names.
X.PP
XThe \fB\-d\fR, \fB\-h\fR and \fB\-r\fR options may be specified as
Xmany times as wanted.
X.SH BUGS
X.I Pathprune
Xassumes bang-routed paths and will not match gateway paths containing '@'s.
X.br
X.B .other
Xis specific to NCR's version of smail.
X.SH "SEE ALSO"
Xpathalias(1)
X.SH AUTHOR
XMatt Costello	<matt.costello@sandiego.ncr.com>
SHAR_EOF
sed 's/^X//' <<'SHAR_EOF' >pathprune.c; chmod 444 pathprune.c
X#ifndef lint
Xstatic char sccsID[] = "@(#)pathprune.c	1.1 Delta: 14:58:26 3/6/87";
X#endif
X/*
X *  Prune down a pathalias file by throwing out all unnecessary entries.
X *
X *  Usage
X *	pathprune [options] [ infile [outfile] ]
X *
X *	-u	prune .uucp(implied) entries
X *	-t	prune top level (via .other) entries
X *	-d dom	sacred domain, do not prune domain gateways
X *	-h dom	sacred hosts, do not prune hosts in this domain (or subdomains)
X *	-r dom	remove all hosts in this domain (and subdomains)
X *	-j	junk all hosts with bogus top-level domains
X *	-v	verbose, print statistics
X *
X *  Compilation
X *	cc -O -o pathprune pathprune.c			# for USG systems
X *	cc -DBSD -O -o pathprune pathprune.c getopt.o	# for BSD systems
X *
X *  Disclaimer
X *	Pathprune is in the public domain.  It may be used by any person or
X *	organization, in any way and for any purpose.  There is no warranty
X *	of any kind for this program.  What you see is what you get.
X *
X *  History
X *	1.1	Mar 6, 1987
X *		Written by Matt Costello
X */
X# include <stdio.h>
X
X# define VOID	(void)	/* define empty for non-voids */
X# define UUCPNAME	"uucp"
X# define OTHERNAME	"other"
X
X# ifdef BSD
X
X# include <strings.h>
X# define strchr(s,c) index(s,c)
X# define strrchr(s,c) rindex(s,c)
Xextern char	*gets();
X
Xchar *
Xstrtok( str, delims )
Xregister char *str, *delims;
X{
X	static char *strnext;
X	char *token;
X	if (!str) str=strnext;
X	while (*str && index(delims,*str))
X		str++;	/* skip leading delimiters */
X	if (!*str)
X		return (NULL);
X	token = str;
X	while (*str && !index(delims,*str))
X		str++;	/* skip token characters */
X	if (*str)	strnext = str;
X	else 		*strnext++ = '\0';
X	return (token);
X}
X
X# else /* !BSD */
X
X# include <string.h>
X# include <memory.h>
Xextern void	exit();
Xextern void	perror();
X
X# endif /* !BSD */
X
Xextern char	*malloc();
Xextern char	*calloc();
Xchar		*stralloc();		/* FORWARD */
X
Xstruct dnode {
X	struct dnode *lnode;	/* pointer to littler node name */
X	struct dnode *bnode;	/* pointer to bigger node name */
X	struct dnode *children;	/* pointer to childrens node names */
X	struct dnode *order;	/* pointer to next node in original order */
X	char *fullname;		/* full name of this domain gateway */
X	char *nodename;		/* last portion of the domain name */
X	char *pathname;		/* path name to gateway for this node */
X	char *line;		/* the full line from the paths file */
X	int flags;
X};
X# define SACRED_DOMAIN	001
X# define SACRED_HOST	002
X# define REMOVE_HOST	004
X
Xstruct dnode *rootnode = NULL;	/* root of the domain (node) tree */
Xstruct dnode *look_node();	/* look for a node */
Xstruct dnode *add_node();	/* look for a node */
X
Xstruct dnode *firstnode = NULL;	/* to keep original order */
Xstruct dnode *lastnode = NULL;	/* for performance */
X
Xchar *		otherpath = "!!!!!!!";
Xchar *		uucppath = "!!!!!!!";
Xint		flg_topdel;
Xint		flg_dontprune;
Xint		flg_junkbogus;
Xint		flg_uucpdel;
Xint		flg_removeuucp;
Xint		flg_verbose;
X
Xint	gw_in, gw_out;
Xint	dom_in, dom_out;
Xint	hst_in, hst_out;
X
Xmain(argc,argv)
Xint argc;
Xchar **argv;
X{
X	register struct dnode *p;
X	register i;
X	extern int optind;
X	extern char *optarg;
X	char linebuf[BUFSIZ];
X
X	while ((i = getopt(argc,argv,"utjvd:h:r:")) != EOF) switch (i) {
X		case 'u':	/* prune host.uucp entries where possible */
X			flg_uucpdel++;
X			break;
X		case 't':	/* prune top level domain names */
X			flg_topdel++;
X			break;
X		case 'j':	/* just hosts in bogus top-level domains */
X			flg_junkbogus++;
X			break;
X		case 'v':
X			flg_verbose++;
X			break;
X		case 'd':
X			domflag( optarg, SACRED_DOMAIN );
X			break;
X		case 'h':
X			domflag( optarg, SACRED_HOST );
X			break;
X		case 'r':
X			domflag( optarg, REMOVE_HOST );
X			break;
X
X		default:
XUsage:
XVOID fputs("Usage: pathprune [-vutj] [-[dhr] dom] [ infile [outfile] ]\n",
X				stderr );
X			exit(2);
X	}
X	if ( (optind < argc) &&
X	     (freopen( argv[optind++], "r", stdin ) == NULL) ) {
X		VOID fputs("pathprune: cannot open ", stderr );
X		perror( argv[--optind] );
X		exit(1);
X	}
X	if ( (optind < argc) &&
X	     (freopen( argv[optind++], "w", stdout ) == NULL) ) {
X		VOID fputs("pathprune: cannot create ", stderr );
X		perror( argv[--optind] );
X		exit(1);
X	}
X	if (optind < argc) {
X		goto Usage;
X	}
X
X
X	/*  Read in the file a line at a time.  There are two kinds of lines:
X	 *  node lines and gateway lines.  While we are reading gateway lines
X	 *  we just read them in and save them.  Once we get a non-gateway
X	 *  line, we prune.
X	 */
X
X	/*  Read in all the domain gateways.
X	 */
X	for (;;) {
X		if (gets( linebuf ) == NULL)
X			fatal("unexpected EOF");
X		if (linebuf[0] != '.')
X			break;
X		gw_in++;
X		add_gateway( linebuf );
X	}
X	if (rootnode == NULL)
X		fatal("no domain gateways");
X
X	/*  Now prune out any unnecessary domain gateways.  They will be
X	 *  unnecessary if they are a subdomain and the path passes
X	 *  through the domain gateway.  We will leave alone any entries
X	 *  for ourselves (%s) because we may use them to qualify.
X	 */
X	if ((p = look_node( &rootnode, UUCPNAME )) != NULL) {
X		uucppath = p->pathname;
X		p->flags |= SACRED_DOMAIN;
X		if (p->flags & REMOVE_HOST)
X			flg_removeuucp++;
X	}
X	if ((p = look_node( &rootnode, OTHERNAME )) != NULL) {
X		if (flg_topdel)
X			otherpath = p->pathname;
X		if (p->flags & SACRED_DOMAIN)
X			flg_dontprune++;
X		p->flags |= SACRED_DOMAIN;
X	}
X
X	if (!flg_dontprune)
X		prune_gateway( rootnode, otherpath );
X
X	/*  Now print out the gateway entries in the original order.
X	 */
X	for ( p = firstnode; p != NULL; p = p->order ) {
X		if (p->line != NULL) {
X			gw_out++;
X			VOID puts( p->line );
X		}
X	}
X
X	/*  All the remaining lines in the file will be node names.
X	 *  Look up the domain name to see what gateway we would use;
X	 *  If the gateway is on the path to the node name then we do
X	 *  not need this entry.
X	 *  If they are simple node names we have to pass them right
X	 *  through unless we have a usefull ".uucp" gateway along
X	 *  the way.
X	 */
X	for (;;) {
X		check_needed( linebuf );
X
X		/*  Now get another line.
X		 */
X		if (gets( linebuf ) == NULL)
X			break;
X		if (linebuf[0] == '.')
X			fatal("unexpected domain gateway");
X	}
X
X	if (flg_verbose) {
X		VOID fprintf(stderr,"gateway\t%8d%8d\t%5.2f\n",
X			gw_in, gw_out, (double)gw_out / (double)gw_in );
X		VOID fprintf(stderr,"domain\t%8d%8d\t%5.2f\n",
X			dom_in, dom_out, (double)dom_out / (double)dom_in );
X		VOID fprintf(stderr,"host\t%8d%8d\t%5.2f\n",
X			hst_in, hst_out, (double)hst_out / (double)hst_in );
X	}
X
X	exit(0);
X}
X
X
Xdomflag( name, flags )
Xregister char *name;
Xint flags;
X{
X	register struct dnode **pp;
X	register struct dnode *p;
X	register char *s;
X
X	pp = &rootnode;
X	while ((s = strrchr( name, '.' )) != NULL) {
X		*s++ = '\0';	/* to most significant part of domain name */
X		p = add_node( pp, s );
X		pp = &(p->children);
X	}
X	if (*name)
X		p = add_node( pp, name );
X
X	p->flags |= flags;
X}
X
Xadd_gateway( buf )
Xregister char *buf;
X{
X	register struct dnode **pp;
X	register struct dnode *p;
X	register char *s;
X	char * line;
X	char * gw;
X	char * path;
X
X	/*  Save the original line because we will need it later.
X	 */
X	line = stralloc( buf );
X
X	if ((gw = strtok( buf, " \t" )) == NULL)
X		fatal("missing gateway on line");
X	gw = stralloc( gw );
X	if ((path = strtok( (char *)NULL, " \t" )) == NULL)
X		fatal("missing path on gateway line");
X	path = stralloc( path );
X
X	/*  Go down through all the nodes looking for this one.
X	 *  We won't actually find it but will create everything
X	 *  along the way.
X	 */
X	pp = &rootnode;
X	while ((s = strrchr( buf, '.' )) != NULL) {
X		*s++ = '\0';	/* to most significant part of domain name */
X		p = add_node( pp, s );
X		pp = &(p->children);
X	}
X
X	/*  Make sure that this one hasn't already been used.
X	 */
X	if (p->line != NULL)
X		fatal("duplicate gateway name found");
X	p->line = line;
X	p->fullname = gw;
X	p->pathname = path;
X
X	/*  Build the correct pointers so we can process these nodes
X	 *  in the original order when it comes time to print them
X	 *  back out.
X	 */
X	if (firstnode == NULL)
X		firstnode = p;
X	if (lastnode != NULL)
X		lastnode->order = p;
X	lastnode = p;
X}
X
X
Xstruct dnode *
Xlook_node( pp, name )
Xregister struct dnode **pp;
Xregister char *name;
X{
X	register i;
X	/*  Search down the node tree looking for a node with the
X	 *  given name.
X	 */
X	while (*pp != NULL) {
X		i = strcmp( name, (*pp)->nodename );
X		if (i == 0)
X			return (*pp);
X		else if (i < 0)
X			pp = &((*pp)->lnode);
X		else
X			pp = &((*pp)->bnode);
X	}
X	return (NULL);
X}
X
Xstruct dnode *
Xadd_node( pp, name )
Xregister struct dnode **pp;
Xregister char *name;
X{
X	register i;
X	/*  Search down the node tree looking for a node with the
X	 *  given name.
X	 */
X	while (*pp != NULL) {
X		i = strcmp( name, (*pp)->nodename );
X		if (i == 0)
X			return (*pp);
X		else if (i < 0)
X			pp = &((*pp)->lnode);
X		else
X			pp = &((*pp)->bnode);
X	}
X
X	/*  Couldn't find such a node, so create one.
X	 */
X	*pp = (struct dnode *) calloc( 1, sizeof(struct dnode) );
X	if (*pp == NULL)
X		fatal("cannot allocate node");
X	(*pp)->nodename = stralloc( name );
X	return (*pp);
X}
X
Xprune_gateway( p, parpath )
Xregister struct dnode *p;
Xregister char *parpath;		/* parents path */
X{
X	register parlen = strlen( parpath ) - 2;
X	while (p != NULL) {
X		/*  Do the left side of the tree first.
X		 */
X		prune_gateway( p->lnode, parpath );
X
X		/*  Check to see if we share a common path with our
X		 *  parent.  If we do this is a redundant gateway and
X		 *  can be eliminated.
X		 */
X		if ( (parlen <= 0) || (p->line == NULL) ||
X		     (p->flags & SACRED_DOMAIN) ||
X		     (strcmp( p->pathname, "%s" ) == 0) ) {
X			/*  We leave these alone.
X			 */
X		} else if (strncmp( p->pathname, parpath, parlen ) == 0) {
X			/*  This is not one we need to know.  Remove it
X			 *  from the ranks of the living.
X			 */
X			p->line = NULL;
X		}
X
X		/*  Check our subdomains for uselessness.
X		 */
X		if (!(p->flags & SACRED_DOMAIN))
X			prune_gateway( p->children,
X				       (p->line) ? p->pathname : parpath );
X
X		/*  Do the right side of the tree using tail
X		 *  recursion.
X		 */
X		p = p->bnode;
X	}
X}
X
X
Xcheck_needed( buf )
Xregister char *buf;
X{
X	register struct dnode **pp;
X	register struct dnode *p;
X	register char *s;
X	char * path;
X	char * gwpath;
X	int	isdomain;
X	char	tmpbuf[BUFSIZ];
X
X	VOID strcpy( tmpbuf, buf );
X	/*  Extract the host/domain name.  */
X	if (strtok( tmpbuf, " \t" ) == NULL)
X		fatal("blank line");
X	if (strchr( tmpbuf, '.' ) != NULL) {
X		isdomain = 1;
X		dom_in++;
X	} else {
X		isdomain = 0;
X		hst_in++;
X	}
X	/*  Extract the path.  */
X	if ((path = strtok( (char *)NULL, " \t" )) == NULL)
X		fatal("missing path");
X	if (strcmp(path, "%s") == 0) {
X		/*  This is for us.  We don't dare delete it.
X		 */
X		goto keep;
X	}
X
X	/*  Go down through all the nodes looking for this one.
X	 *  We won't actually find it but will create everything
X	 *  along the way.
X	 */
X	if (isdomain)
X		gwpath = otherpath;	/*  It is in a domain.  */
X	else if (flg_removeuucp)
X		return;			/*  They all go away.  */
X	else if (flg_uucpdel)
X		gwpath = uucppath;	/*  .uucp may be sufficient  */
X	else
X		gwpath = NULL;		/*  Keep this one.  */
X	pp = &rootnode;
X	while ((s = strrchr( tmpbuf, '.' )) != NULL) {
X		*s++ = '\0';	/* to most significant part of domain name */
X		p = look_node( pp, s );
X		if (p == NULL) {
X			/*  Haven't seen this domain before.  If this is
X			 *  a bugs top-level domain name we want to junk it.
X			 */
X			if (flg_junkbogus && pp == &rootnode)
X				return;
X			break;
X		}
X		if (p->flags & SACRED_HOST) {
X			gwpath = NULL;
X			break;
X		}
X		if (p->flags & REMOVE_HOST) {
X			/*  Don't want any hosts in this domain.  */
X			return;
X		}
X		if (p->line != NULL)
X			gwpath = p->pathname;
X		pp = &(p->children);
X	}
X
X	if ( (gwpath == NULL) || (strcmp( gwpath, "%s" ) == 0) ||
X	     (strncmp( path, gwpath, strlen(gwpath)-2 ) != 0) ) {
Xkeep:
X		if (isdomain)	dom_out++;
X		else		hst_out++;
X		VOID puts( buf );
X	}
X}
X
Xchar *
Xstralloc( str )
Xchar *str;
X{
X	register char *s;
X	if ((s = malloc( (unsigned)(strlen(str) + 1) )) == NULL)
X		fatal("cannot allocate string");
X	VOID strcpy( s, str );
X	return (s);
X}
X
Xfatal( reason )
Xchar * reason;
X{
X	VOID fprintf(stderr,"pathprune: %s\n", reason );
X	exit(1);
X}
SHAR_EOF
exit 0
-- 
Matt Costello	<matt.costello@SanDiego.NCR.COM>
		{sdcsvax,cbatt,dcdwest,nosc.ARPA,ihnp4}!ncr-sd!matt