[alt.sources] buffering and a news archiver

rae%alias@csri.toronto.edu (Reid Ellis) (03/27/90)

A problem and a program.

The problem -

    I have a program that archives all of the source groups for me at
    night [comp.sources.*] which uses an 'rc' file in the home
    directory.  The program is called 'narc' and the rc file is named
    '.narcrc' and everytime I run narc, which is a perl script, I get
    8k of NULs appended to the end of my '.narcrc' [which JUST SO
    HAPPENS to be == BUFSIZ]..

The program -

    To help people find my problem :) here is my news archival
    program, written in Perl.

#!/bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #!/bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create the files:
#	narc.pl
# This archive created: Mon Mar 26 15:46:44 1990
# By:	Reid Ellis (Alias Research)
export PATH; PATH=/bin:$PATH
if test -f 'narc.pl'
then
	echo shar: over-writing existing file "'narc.pl'"
fi
cat << \SHAR_EOF > 'narc.pl'
#!/usr/local/bin/perl
#
# $Source: /repousr/u/rae/src/TEXT/perl/RCS/narc.pl,v $
# $Revision: 2.0 $
# $Author: rae $
# $Date: 90/03/25 15:43:00 $

# narc -- News ARChiver

# to avoid the buffering problems
#
$| = 0;

# Check for a NARCRC environment variable and use it if
# it's there.
#
if($ENV{'NARCRC'})
	{
	$narcrc = $ENV{'NARCRC'};

	# if NARCRC doesn't start with a '/', prepend
	# $HOME to it
	#
	if($narcrc !~ /\/.*/)
		{
		$narcrc = $ENV{"HOME"} . "/" . $narcrc;
		}
	}
else
	{
	# Default NARCRC: "$HOME/.narcrc"
	#
	$narcrc = $ENV{'HOME'} . "/.narcrc";
	}

# testing flag. Usually zero.
#
$verbose=0;
$standalone = "stand";

# get the name of the news server.  Can be replaced with
# a hardcoded '$server=machine;' if you like
#
$server_file = "/usr/local/lib/news/server";
$server = `cat $server_file` || die "Can't open $server_file";
chop $server;

# Where to find nntp
#
$nntp = "/usr/local/bin/nntp $server";

# open narcrc for input.  The format is:
#
#	/full/path/to/archive/directory
#	dirname   newsgroup   last-time-of-access
#	dirname   newsgroup   last-time-of-access
#	dirname   newsgroup   last-time-of-access
#	 ...
#
# 'dirname' is appended to the initial path.  The special
# case dirname of 'default' will use the last element of
# the newsgroup name.  For example, 
#	default  comp.sources.unix  19900101 000000
# will use the dirname 'unix'

open(NARCRC, "<$narcrc");
$SIG{'INT'} = 'handler';
$SIG{'QUIT'} = 'handler';

# The first line of the narcrc is where to store everythign
#
$repo = <NARCRC>;
chop($repo);
print "Saving files in $repo\n" if $verbose;

# $narc_line[$narc_index] is where each new line of the narcrc is
# stored .  This only gets written out after a successful 'narc'.
#
$narc_index = 1;

# for each line in the narcrc
#	chop it into separate bits
#	ask nntp for all new news for the group
#	for each new news item
#		read article into /tmp/narc$$
#		scan for magic string 'Archive-name'
#		get the subdir and part-name from this line
#	if hit eof, then no magic string was found, do next news item
#	create all leading directories if they're not there already
#	move /tmp/narc$$ to full pathname of archived article
while($narc_entry = <NARCRC>)
	{
	chop($narc_entry);

	# This is a software end-of-file because I am having trouble with
	# garbage showing up at the ends of file.  With shar files it's
	# not such a big deal, since they usually have 'exit 0' at the end
	# of them, but with the .narcrc, it might be accidentally parsed.
	#
	last if($narc_entry eq "EOF");

	($real_dirname, $newsgroup, $lastdate, $lasttime) = split(' ', $narc_entry);

	# check for degenerate empty case
	#	
	last if $newsgroup eq "" || $lastdate eq "" || $lasttime eq "";

	# if "default", set dirname to last element
	# of the newsgroup name
	#
	if($real_dirname eq "default" && $newsgroup =~ /.*\.([^.]*)$/)
		{
		$dirname = $1;
		}
	else
		{
		$dirname = $real_dirname;
		}

	# tell the user what's going on
	#
	"$lastdate $lasttime" =~ /^(..)(..)(..) (..)(..)(..)$/ &&
		printf("%s %s 19%s-%s-%s %s:%s\n",
			$newsgroup, "[$dirname]", $1, $2, $3, $4, $5);

	# open the stdout of the nntp command
	#
	open(MSGIDS, "$nntp newnews $newsgroup $lastdate $lasttime|")
		|| die "Can't get message ID's for $newsgroup from $server";

	# flag to see if we do anything
	$we_did_something = 0;

	# for each message ID..
	#
	while($msgid = <MSGIDS>)
		{
		# set the we_did_something flag so we know to update
		# the narcrc
		#
		$we_did_something = 1;

		# Zap the trailing newline on $msgid
		#
		chop($msgid);
		print "\t$msgid\n" if $verbose;

		# Get the article and store it in /tmp so we don't
		# need to use multiple [slow] nntp's for the same article
		#
		system("$nntp article '$msgid' > /tmp/narc$$");

		# Open the article and check for "Archive-name"
		#
 		open(BODY, "</tmp/narc$$") || die "Can't read retrieved article $msgid in file /tmp/narc$$";
		while(<BODY>)
			{
			if(/^Archive-name:/)
				{
				($foo, $arc_name) = split;
				@tree = split('/', $arc_name);
				# Put it in the '$standalone' dir if it's a single shar
				unshift(tree, $standalone)	if $#tree == 0;
				print "\t", join('/', @tree), "\n";
				last;
				} #if Archive-name
			} # while BODY

		# if eof then "Archive-name" wasn't found
		#
		last if eof;

		# close the file
		#
		close(<BODY>);

		# Now build the full pathname of where the file is
		# supposed to live
		$fullpath = $repo . '/' . $dirname . '/' . join('/', @tree);
		$name = pop(@tree);
		$relpath = $dirname . '/' . join('/', @tree);

		# make sure the dirs are there
		#
		&mkpath($repo . '/' . $dirname . '/' . join('/', @tree));

		if(-f $fullpath)
			{
			# A copy of the file already exists, so name this one the
			# same as the existing one, but with a number appended to
			# it to delineate its version.
			#
			for($i=2; -f $fullpath . $i; $i++)
				{ }

			# tack the number onto the end of the file name
			#
			$fullpath .= $i;
			}

		# and finally, put it there
		system("mv /tmp/narc$$ $fullpath");

		} # while MSGIDS
	close(<MSGIDS>);

	# only update the timestamp if we did anything
	#
	$timestamp = $we_did_something ? &now() : "$lastdate $lasttime";

	# Save the narcrc lines in memory in order so we can save them in
	# their original order.  Just using 'each' on %NARCRC currently
	# inverts their order [as of Perl 3.0]
	#
	$narc_line[$narc_index++] = sprintf("%s\t%s\t%s", $real_dirname, $newsgroup, $timestamp);
	} # while narc_entry
close(<NARCRC>);

# Save an updated narcrc
#
print "Saving $narcrc..";
open(NARCRC, ">$narcrc");
print NARCRC $repo, "\n";
for($i=1; $i < $narc_index; $i++)
	{
	print NARCRC $narc_line[$i], "\n";
	}
# software EOF
print NARCRC "EOF\n";
close(<NARCRC>);
print "\n";

# Subroutine to return the current date and time in
# nntp format, which is "YYMMDD HHMMSS"
#
sub now
	{
	($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time());
	$mon++;
	sprintf("%02d%02d%02d %02d%02d%02d", $year,$mon,$mday,$hour,$min,$sec);
	}

sub handler
	{
	local($sig) = @_;
	print "\n\nCaught signal $sig -- exiting $ARGV[0]\n";
	close(<ARTICLE>);
	close(<BODY>);
	close(<MSGIDS>);
	close(<NARCRC>);
	close(<SAVEFILE>);
	exit 0;
	}

sub mkpath
	{
	local($fullpath) = @_;
	local(@tree) = split('/', $fullpath);
	local(@tmp) = ();

	if($tree[0] eq "")
		{
		push(@tmp, shift(@tree));
		}
	foreach $dir (@tree)
		{
		push(@tmp, $dir);
		$path = join('/', @tmp);
		print "\t%% checking for '$path'\n" if $verbose;
		mkdir("$path", 0777) || die "Can't create directory $path"
			if ! -d "$path";
		}
	}	

# Emacs cutsomisation
#
# Local Variables:
# mode:fundamental
# tab-width:4
# End:
SHAR_EOF
chmod +x 'narc.pl'
#	End of shell archive
exit 0
Reid Ellis  264 Broadway Avenue, Toronto ON, M4P 1V9               Canada
rae@gpu.utcs.toronto.edu || rae%alias@csri.toronto.edu || +1 416 487 1383