[news.software.b] unbatcher out of sync - another cure

rob@mtdiablo.Concord.CA.US (Rob Bernardo) (03/18/91)

I've also had some difficulty lately with 'out of sync' unbatching
problems. Unfortunately, Eamonn McManus's patchbatch didn't work.
Below is a shell archive for a more robust program to fix batches
with bad article character counts. Accompanying it are a makefile
and a wrapper shell script, both of which you may need to tailor
to your systems. The main significant difference between my batch fixer
and E.M.'s is that mine does not presume that the correct article
size is close to the article size given in the bad news batch file.
Mine goes through the batch finding each article and recalculating
its size from scratch.

The syntax of the command line of the compiled program (dorebatch) is

    dorebatch [-v] -i input_file -o output_file

where the input file is a compressed newsbatch file (e.g. as found in
your in.coming/bad directory) and where the output file is not
compressed.  The wrapper shell script will take care of compressing the
output file (so that you can just wait till cron next runs
input/newsrun).

I wrote this real quick this morning, and it worked for me. Please
let me know of any problems or suggestions.

#!/bin/sh
# shar:	Shell Archiver  (v1.22)
#	Packed Sun Mar 17 18:30:16 PST 1991 by mtdiablo!rob
#	from directory /usr/local/src/rebatch
#
#	Run the following text with /bin/sh to create:
#	  Makefile
#	  rebatch
#	  dorebatch.c
#
# ---- Makefile ----
# Extract the Makefile unless a file of that name already exists.  Each
# payload line carries a leading X that sed strips; the here-document
# delimiter is quoted, so the payload is written out without expansion.
# Afterwards the extracted file's byte count is compared with the size
# recorded at packing time (262) and a mismatch is reported.
if test -f Makefile; then echo "File Makefile exists"; else
echo "x - extracting Makefile (Text)"
sed 's/^X//' << 'SHAR_EOF' > Makefile &&
XNEWSOWNER=bin
XNEWSGRP=bin
XMODE=755
XNEWSLIB=/usr/local/lib
X
Xall:	dorebatch
X
Xinstall:	dorebatch
X	install -g ${NEWSGRP} -o ${NEWSOWNER} -m ${MODE} rebatch ${NEWSLIB}/newsbin/batch
X	install -g ${NEWSGRP} -o ${NEWSOWNER} -m ${MODE} dorebatch ${NEWSLIB}/newsbin/batch
SHAR_EOF
chmod 0644 Makefile || echo "restore of Makefile fails"
set `wc -c Makefile`;Sum=$1
if test "$Sum" != "262"
then echo original size 262, current size $Sum;fi
fi
# ---- rebatch ----
# Extract the wrapper script unless a file named rebatch already exists.
# The script sources the news configuration, then for every file in
# $NEWSARTS/in.coming/bad runs dorebatch on it, compresses the result,
# moves it into $NEWSARTS/in.coming under the original name and removes
# the bad batch.  Payload lines carry a leading X that sed strips; the
# byte count checked below must match the payload exactly.
if test -f rebatch; then echo "File rebatch exists"; else
echo "x - extracting rebatch (Text)"
sed 's/^X//' << 'SHAR_EOF' > rebatch &&
X#! /bin/sh
X# =()<. ${NEWSCONFIG-@<NEWSCONFIG>@}>()=
X. ${NEWSCONFIG-/usr/local/lib/news/bin/config}
Xumask $NEWSUMASK
X
XPATH=$NEWSCTL/bin:$NEWSBIN/batch:$NEWSBIN:$NEWSPATH ; export PATH
Xfor file in  ${NEWSARTS}/in.coming/bad/*
Xdo
X    # An unmatched glob is left as a literal pattern; skip it.
X    test -f "$file" || continue
X    barename=`basename "$file"`
X    newbatch=${NEWSARTS}/in.coming/$barename
X    echo "rebatching $barename"
X    dorebatch -i "$file" -o "$newbatch" && compress "$newbatch" && mv "$newbatch.Z" "$newbatch" && rm "$file"
Xdone
SHAR_EOF
chmod 0644 rebatch || echo "restore of rebatch fails"
set `wc -c rebatch`;Sum=$1
if test "$Sum" != "546"
then echo original size 546, current size $Sum;fi
fi
# ---- dorebatch.c ----
# Extract the C source unless a file named dorebatch.c already exists.
# The program takes -i infile -o outfile (and optional -v).  It makes a
# second link to infile named infile.Z, reads it through a "zcat" pipe,
# and requires the first line to start with "#! rnews".  Every following
# line that starts with "#! rnews" is treated as an article boundary:
# the article collected so far in a temporary file is written to outfile
# preceded by a fresh "#! rnews <count>" line, where <count> is the
# temporary file's length.
# Payload lines carry a leading X that sed strips; the byte count checked
# below must match the payload exactly.
if test -f dorebatch.c; then echo "File dorebatch.c exists"; else
echo "x - extracting dorebatch.c (Text)"
sed 's/^X//' << 'SHAR_EOF' > dorebatch.c &&
X#define ZCAT "zcat"
X#define RNEWSSTRING "#! rnews"
X
X#include <stdio.h>
X
Xint verbose;
Xextern int errno;
Xextern int sys_nerr;
Xextern char *sys_errlist[];
X
X#define ERRNO_MSG	(errno < sys_nerr ? sys_errlist[errno] : "unknown errno")
X
Xmain(argc, argv)
Xint argc;
Xchar **argv;
X{
X    char zfile[BUFSIZ], command[BUFSIZ], dataline[BUFSIZ];
X    char *infile = NULL, *outfile = NULL, *arttempfile;
X    FILE *infp, *outfp, *arttempfp;
X    int eofreached, optret, rnewslen, charcnt, artcnt, argerr;
X    extern int optind;
X    extern char *optarg;
X
X    /* Process command line */
X    argerr = 0;
X    while ((optret = getopt(argc, argv, "vi:o:")) != -1) {
X	switch (optret) {
X	case 'v':
X		verbose = 1;
X		break;
X	case 'i':
X		infile = optarg;
X		break;
X
X	case 'o':
X		outfile = optarg;
X		break;
X
X	default:
X		argerr++;
X	}
X    }
X
X    if(!infile || !outfile || argerr) {
X	fprintf(stderr, "%s: usage: %s -i [infile] -o [outfile]\n",
X	        argv[0], argv[0]);
X	exit(-1);
X    }
X
X    /* Open data stream of infile uncompressed */
X    strcpy(zfile, infile);
X    strcat(zfile, ".Z");
X    if(link(infile, zfile) == -1) {
X	fprintf(stderr, "%s: link(%s, %s): %s\n", argv[0],
X		infile, zfile, ERRNO_MSG);
X	exit(errno);
X    }
X    sprintf(command, "%s %s\n", ZCAT, zfile);
X    if((infp = popen(command, "r")) == NULL) {
X	fprintf(stderr, "%s: popen(%s, \"r\") failed\n", argv[0], command);
X	unlink(zfile);
X	exit(errno?errno:-1);
X    }
X
X    /* Open output file */
X    if((outfp = fopen(outfile, "w+")) == NULL) {
X	fprintf(stderr, "%s: fopen(%s, \"w+\") failed\n", argv[0], outfile);
X	pclose(infp);
X	unlink(zfile);
X	exit(errno?errno:-1);
X    }
X
X    /* Get file name for temp file to hold each article */
X    arttempfile = tmpnam(NULL);
X
X    /* Verify first line is rnews */
X    if(fgets(dataline, BUFSIZ, infp) == NULL) {
X	fprintf(stderr, "%s: premature end of file %s\n", argv[0], infile);
X	pclose(infp);
X	unlink(zfile);
X	exit(errno?errno:-1);
X    }
X    rnewslen = strlen(RNEWSSTRING);
X    if(strncmp(dataline, RNEWSSTRING, rnewslen)) {
X	fprintf(stderr, "%s: file %s not a compressed news archive\n",
X		argv[0], infile);
X	pclose(infp);
X	unlink(zfile);
X	exit(errno?errno:-1);
X    }
X    eofreached = 0;
X    artcnt = 0;
X
X    if(verbose)
X	printf("input = %s output = %s\n", infile, outfile);
X
X    /* Process each article in batch */
X    while(!eofreached) {
X
X	/* Open temp file for storing next article */
X	if((arttempfp = fopen(arttempfile, "w+")) == NULL) {
X	    fprintf(stderr, "%s: fopen(%s, \"w+\") failed\n", argv[0],
X		    arttempfile);
X	    pclose(infp);
X	    unlink(zfile);
X	    exit(errno?errno:-1);
X	}
X
X	/* Read a line from pipe. If eof or end of article,
X	 * write rnews line with count on output file,
X	 * copy article (in tempfile) to output file and quit loop.
X	 * Otherwise append this next line of article to end
X	 * of temp file and continue with next line from pipe.
X	 */
X	while (1) {
X	    if(fgets(dataline, BUFSIZ, infp) == NULL)
X		eofreached = 1;
X
X	    if(eofreached||(!strncmp(dataline, RNEWSSTRING, rnewslen))) {
X
X		charcnt = ftell(arttempfp);
X		rewind(arttempfp);
X		fprintf(outfp, "%s %d\n", RNEWSSTRING, charcnt);
X		if(verbose)
X		    printf("article %d charcnt %d\n", artcnt, charcnt);
X
X		while(fgets(dataline, BUFSIZ, arttempfp)) {
X		    if(fputs(dataline, outfp) == EOF) {
X			fprintf(stderr, "%s: fputs to %s failed\n",
X				argv[0], outfile);
X			pclose(infp);
X			unlink(zfile);
X			fclose(arttempfp);
X			unlink(arttempfile);
X			fclose(outfp);
X			exit(errno?errno:-1);
X		    }
X		    charcnt -= strlen(dataline);
X		} 
X		if(charcnt)
X		    fprintf(stderr, "%s: error in count by %d\n", argv[0],
X			    charcnt);
X
X		fclose(arttempfp);
X		artcnt++;
X		break;
X
X	    } else 
X		fputs(dataline, arttempfp);
X	}
X    }
X
X    pclose(infp);
X    unlink(zfile);
X    unlink(arttempfile);
X    fclose(outfp);
X    exit(0);
X}
SHAR_EOF
chmod 0644 dorebatch.c || echo "restore of dorebatch.c fails"
set `wc -c dorebatch.c`;Sum=$1
if test "$Sum" != "3789"
then echo original size 3789, current size $Sum;fi
fi
# Stop here so sh never tries to run the signature and follow-up text below.
exit 0
-- 
Rob Bernardo					Mt. Diablo Software Solutions
email: rob@mtdiablo.Concord.CA.US		phone: (415) 827-4301

em@dce.ie (Eamonn McManus) (03/26/91)

rob@mtdiablo.Concord.CA.US (Rob Bernardo) writes:
>I've also had some difficulty lately with 'out of sync' unbatching
>problems. Unfortunately, Eamonn McManus's patchbatch didn't work.
>Below is a shell archive for a more robust program to fix batches
>with bad article character counts.

There are advantages and disadvantages to each of our programs.  Patchbatch
is designed to be run automatically on all incoming batches, whereas Rob's
program (rebatch) is to be run by hand on known bad batches.  Running
automatically from newsrun means that the fixer doesn't have to worry
about decompression and the like.

The reason I wrote patchbatch to fish around in the vicinity of the
supposed article end, rather than scanning through every line as rebatch
does, was that it provides a greater degree of transparency.  If an
article happens to contain the string "#! rnews" at the beginning of a
line, rebatch will assume it ends there.  Patchbatch is only susceptible
to problems if an article contains such a string very near the end.  Also,
if an article is truncated in the middle of a line, so that the "#! rnews"
of the following article is not preceded by a newline, rebatch will not
find that article.  Of course if it were changed to look for "#! rnews"
anywhere in a line it would go ape on articles like this one.

There is a problem with hacks like these, of striking a balance between
fixing corrupt batches and leaving alone correct ones.  Patchbatch stays
closer to the latter at the expense of sometimes failing to do the former.
However, I think people should try increasing the value of FUDGE before
resorting to a more promiscuous program like rebatch.  You might also need
to change the size of the buf[] array when doing this; I can't remember if
the version I posted had a magic constant 64 as the size (ugh).

Another noteworthy difference between the programs is that patchbatch
modifies the batch in place rather than creating a replacement.  This
means that it is much faster.  In particular, if you only occasionally get
corrupt batches you can afford to run patchbatch over every incoming
batch, since there is very little overhead in checking through a correct
batch.  There is a theoretical problem, in that the size of an article may
change from an n-digit number to a (n+1)-digit number, in which case
patchbatch will fail.  I never saw this happen in practice.

,
Eamonn