[news.sysadmin] Awk script for C News statistics

lmb@vicom.com (Larry Blair) (07/02/89)

This posting contains a modified version of Erik Fair's awk script for
netnews traffic statistics.  It has been modified to create statistics from
a C News that has been compiled with my improved logging patch.  Note that
the script will not work with the release version (not enough info in the
logs) and will not work with news 2.11, 2.10, or TMNN.

The necessary patch to produce the log file was posted in
<1989Jul1.205126.13674@vicom.com>.

#  USAGE: awk -f report_awk /usr/lib/news/log
#  AWK script which eats netnews log files and produces a summary of USENET
#  traffic over the period of time that the log was collected.
#
#  C news version - for use with log file patches
#
#  6/30/89
#
#  Erik E. Fair <dual!fair>
#  Original Author, May 22, 1984
#
#  Brad Eacker <onyx!brad>
#  Modified to simplify the record processing and to sort the output.
#
#  Erik E. Fair <dual!fair>
#  Modified to provide information about control messages.
#
#  Erik E. Fair <dual!fair>
#  Bug in system name extraction fixed. It was assumed that the fourth field
#  (system name) always had a dot. "local" is one that doesn't. Some others
#  (including 2.9 sites) don't either.
#
#  Earl Wallace <pesnta!earlw>
#  The "sent" field was changed from $5 to $6 in 2.10.2 (beta)
#  named "newstats" and called with no arguments.
#
#  Erik E. Fair <dual!fair>
#  Remove support for 2.10.1, revise for 2.10.2 to provide information
#  about junked articles, garbled articles, and bad newsgroups
#
#  Erik E. Fair <ucbvax!fair>
#  Minor bug fix to bad newsgroup reporting, also now counting ``old''
#  articles as junked, with counter for number that are `old'.
#
#  Erik E. Fair <ucbvax!fair>
#  Fix up the domain & local hosts support
#
#  Erik E. Fair <ucbvax!fair>
#  Fix up the counting of gatewayed material, add counting of "linecount"
#  problems. Additional cleanup to make things faster.
#
#  Larry Blair <lmb@vicom.com>
#  Rewritten for C news with modified logging.  Removed many of the B news
#  counts, such as linecount mismatch.
#
BEGIN {
#
#	---- Site configuration: edit this section for your system ----
#
#	ourname is the C news name of this system; records whose system
#	name matches it are reported under "local".  The old B news
#	lprefix handling is not needed, since C news uses one common
#	naming scheme.
#
	ourname = "vsi1";

#
#	Alias tables for phony site names (for example, when running a
#	group batching scheme under a made-up site name).  Each value is
#	a comma-separated list of real sites.
#
#	alias_add[phony]: transmissions logged to "phony" are credited
#			  to every site in the list.
#	alias_sub[phony]: transmissions logged to "phony" are taken back
#			  off every site in the list, for a site that was
#			  credited through another alias.
#
#	Example:
#		alias_add["leaf_main"] = "sitea,siteb,sitec"
#		alias_sub["leaf_rest"] = "sitec"
#
#	leaf_main alone is attributed to sitea, siteb, and sitec;
#	leaf_main together with leaf_rest is attributed to sitea and siteb.
#
	alias_add["leaf_main"] = "daver,teraida,zorch,frame,ubvax,octela,altos86";
	alias_sub["leaf_rest"] = "zorch";

#	"news" is ames' netnews system.
	alias_add["news"] = "ames";

#
#	Bi-directional gatewaying (mailing list <-> newsgroup): pseudo is
#	the sys file name used to mail articles to the lists, and rptname
#	is the label that such traffic gets in the report.
#
#	NOTE: I have not tested this stuff with C news. {lmb}
#
	pseudo  = "internet";
	rptname = "(GATEWAY)";

#
#	Top level domain names, in both cases, that mark a system name
#	as gatewayed material.  Every one maps to rptname.
#
	ntld = split("ARPA arpa EDU edu GOV gov COM com MIL mil ORG org NET net UK uk DEC dec CSNET csnet BITNET bitnet MAILNET mailnet UUCP uucp OZ oz AU au", tld, " ");
	for (t = 1; t <= ntld; t++)
		domains[tld[t]] = rptname;

#
#	Sentinel key made of tildes; tilde was chosen because it is
#	ASCII 126 (don't change this).
#
	invalid = "~~~~~~";

#
#	Create every per-system counter with a zero sentinel entry.
#
	accept[invalid] = reject[invalid] = xmited[invalid] = 0;
	control[invalid] = junked[invalid] = tossed[invalid] = 0;
	neighbor[invalid] = 0;

#	Count of "f" (failed cancel) log lines.
	canfail = 0;
}
{
#
#	Runs for every log record: work out "sys", the name under which
#	this record is counted by the rules that follow.
#
#	$4 names the system.  If it is a route address (it shouldn't be,
#	particularly with C news), only the part after the last "@" is used.
#
	nparts = split($4, route, "@");
	site = route[nparts];
#
#	Split on dots: the first label is the host name, the last label
#	is the root domain.  A name without dots is both at once.
#
	nlabels = split(site, label, ".");
	sys = label[1];
#
#	Our own system is reported as "local".  Anything else whose root
#	domain is in the domains[] table is reported under that table's
#	network name instead of its host name.
#
	if (sys != ourname) {
		net = domains[label[nlabels]];
		if (net) sys = net;
	} else {
		sys = "local";
	}
}
#
#	Accepted articles.  Count the newsgroups and who we sent it to.
#
$5 == "+" {
#	Credit the article to the system it came from, and remember that
#	system as a neighbor.
	neighbor[sys] = 1;
	accept[sys]++;

#	$7 is the comma-separated newsgroup list.  Count each group under
#	its top-level category (the text before the first dot, or the
#	whole name when there is no dot).
	ngroups = split($7, group, ",");
	for (g = 1; g <= ngroups; g++) {
		top = group[g];
		cut = index(top, ".");
		if (cut) top = substr(top, 1, cut - 1);
		if (top) newsgcnt[top]++;
	}

#	Fields 8..NF are the systems the article was sent on to.  The
#	gateway pseudo-site is counted under rptname and is not marked
#	as a neighbor.
	for (fld = 8; fld <= NF; fld++) {
		dest = $(fld);
		if (dest == pseudo)
			dest = rptname;
		else
			neighbor[dest] = 1;
		xmited[dest]++;
	}
	next;
}

#
#	Rejected article.  At this point, we just count them.  The "tossed"
#	count is for groups that were "x'ed" in the active file, but it's
#	not currently being printed in the report.  This section should
#	be expanded.
#
$5 == "-" {
#	A $7 of "all" is additionally tallied in tossed[]; every
#	rejection is counted against the offering system.
	if ($7 == "all")
		tossed[sys]++;
	reject[sys]++;
	next;
}
#	These are the cancels that precede the article being cancelled.
#	Erik used to call them "failed", so I left it alone.  Note that
#	the cancel has already been counted on the "c" line.
#
$5 == "f" {
#	One more failed cancel; nothing else on the line is used.
	canfail++;
	next;
}
#  
#	Count the junk.
# 
$5 == "j" {
#	Tally the junked article against the system it came from.
	junked[sys]++;
	next;
}
#
#	Control messages.  This is not fully tested; there may be some
#	others that use more than one field.
#
$5 == "c" {
#	A control message counts as an accepted article as well as a
#	control message for the sending system; $8 is the control verb.
	ctot++;
	control[sys]++;
	accept[sys]++;
	verb = $8;
	ctlcnt[verb]++;

#	The list of systems it was sent on to starts one field later for
#	cancel, newgroup and rmgroup than for the other verbs.
	if (verb == "cancel" || verb == "newgroup" || verb == "rmgroup")
		first = 10;
	else
		first = 9;

#	Same accounting as for ordinary articles: the gateway pseudo-site
#	is counted under rptname and is not marked as a neighbor.
	for (fld = first; fld <= NF; fld++) {
		dest = $(fld);
		if (dest == pseudo)
			dest = rptname;
		else
			neighbor[dest] = 1;
		xmited[dest]++;
	}
	next;
}
#
#	Summarize and print the report
#
END{
#
#	Summarize the counters gathered from the log and print the
#	report: a per-system table, a totals line, the control message
#	breakdown, and the received top-level newsgroup categories.
#
#	special processing for Duplicates, because we can't tell if
#	they came from a netnews neighbor or from the gatewaying
#	activities until we have processed the entire log.
#
#	The reject[] subscripts are copied out first: the loop below can
#	create new subscripts, and the effect of adding elements while a
#	for-in scan is running is not defined.
#
	nkeys = 0;
	for( hostname in reject ) key[++nkeys] = hostname;
	for( k = 1; k <= nkeys; k++ ) {
		hostname = key[k];
#
#	get the root domain name, and the hostname
#
		ndoms = split(hostname, doms, ".");
		domain = doms[ndoms];
		sys = doms[1];
#
#	Known neighbors keep their own name; anything else is folded into
#	"local" or into its network name.
#
		if (! neighbor[sys]) {
			if (sys == ourname) {
				sys = "local";
			} else {
				dom = domains[domain];
				if (dom) sys = dom;
			}
		}
		cnt = reject[hostname];
		reject[hostname] = 0;
		reject[sys] += cnt;
	}

#
#	list[] collects every system that deserves a line in the report.
#
	rtot = 0;
	for( i in reject ) {
		if (reject[i] > 0) {
			list[i] = 1;
			rtot += reject[i];
		}
	}

	atot = 0;
	for( i in accept ) {
		list[i] = 1;
		atot += accept[i];
	}

#
#	Resolve phony site names.  The counts are accumulated (+= / -=)
#	rather than assigned, so a site's direct transmissions are kept
#	and the result does not depend on the order in which the aliases
#	happen to be visited.  The subscripts are snapshotted first
#	because alias targets may be new xmited[] entries.
#
	nkeys = 0;
	for( i in xmited ) key[++nkeys] = i;
	for( k = 1; k <= nkeys; k++ ) {
		i = key[k];
		if(alias_add[i] != "")
		{
			nal = split(alias_add[i], ala, ",");
			for (j = 1; j <= nal; j++)
			{
				list[ala[j]] = 1;
				xmited[ala[j]] += xmited[i];
			}
			xmited[i] = 0;
			continue;
		}
		if(alias_sub[i] != "")
		{
			nal = split(alias_sub[i], als, ",");
			for (j = 1; j <= nal; j++)
			{
				xmited[als[j]] -= xmited[i];
			}
			xmited[i] = 0;
		}
	}
	xtot = 0;
	for( i in xmited ) {
		if(xmited[i] != 0)
			list[i] = 1;
		xtot += xmited[i];
	}

	ctot = 0;
	for( i in control ) {
		list[i] = 1;
		ctot += control[i];
	}

	jtot = 0;
	for( i in junked ) {
		list[i] = 1;
		jtot += junked[i];
	}
#
# ctot is already part of atot (the "c" rule bumps accept[] as well as
# control[]), so it is not added in to the grand total again.
# jtot is not added either.
# NOTE(review): the "j" rule does not touch accept[]; whether the log also
# carries a "+" line for a junked article cannot be told from here.
#
	totarticles = atot + rtot;
#
# Separate divisor so that an empty log still reports 0 articles.
#
	divisor = totarticles;
	if (divisor == 0) divisor = 1;

	printf("\nSystem       \tAccept\tReject\tJunked\tXmit to\tControl\t%% total\t%% rejct\n");
	for( ; ; ) {
#
# selection sort: pick the smallest name not yet printed.  "invalid" is
# all tildes, so it works as the upper bound and as the "nothing left"
# marker.
#
		i = invalid;
		for( j in list ) {
			if ( list[j] > 0 && j < i ) i = j;
		}
		if ( i == invalid ) break;
		list[i] = 0;
#
#	Control messages are counted under accept.
#
		sitetot = accept[i] + reject[i];
		if (sitetot == 0) sitetot = 1;
#
# What an 'orrible printf spec
#
		printf("%-14s\t%6d\t%6d\t%6d\t%7d\t%7d\t%6d%%\t%6d%%\n", i, accept[i], reject[i], junked[i], xmited[i], control[i], (sitetot * 100) / divisor, (reject[i] * 100) / sitetot);
#
	}
	printf("\nTOTALS        \t%6d\t%6d\t%6d\t%7d\t%7d\t%6d%%\t%6d%%\n", atot, rtot, jtot, xtot, ctot, 100, (rtot * 100) / divisor);
	printf("\nTotal Articles processed %d", totarticles);
	printf("\n");

#
#	Control message breakdown; failed cancels are shown on the
#	"cancel" line.
#
	if (ctot) {
		printf("\nControl	Invocations\n");
		for( i in ctlcnt ) {
			if (i == "cancel") {
				printf("%-12s %6d", i, ctlcnt[i]);
				if (canfail) printf(", %d failed", canfail);
				printf("\n");
			} else {
				printf("%-12s %6d\n", i, ctlcnt[i]);
			}
		}
	}

#
#	Top-level newsgroup categories, most articles first.  The name
#	column is sized to the longest category name.
#
	if (atot) {
		printf("\nNetnews Categories Received\n");
		l = 0;
		for( i in newsgcnt ) {
			if (l < length(i)) l = length(i);
		}
		fmt = sprintf("%%-%ds %%6d\n", l);
		for( ; ; ) {
#
# selection sort: print the largest remaining count, then zero it so
# that it is not picked again.
#
			max = 0;
			for( j in newsgcnt ) {
				if (newsgcnt[j] > max) {
					i = j;
					max = newsgcnt[j];
				}
			}
			if (max == 0) break;
			printf(fmt, i, newsgcnt[i]);
			newsgcnt[i] = 0;
		}
	}
}
-- 
Larry Blair   ames!vsi1!lmb   lmb@vicom.com