[alt.sources] unbatcher out of sync?

em@dce.ie (Eamonn McManus) (01/16/91)

henry@zoo.toronto.edu (Henry Spencer) writes:
>It means "something's wrong with your batch":  relaynews did not find a
>"#! rnews nnnnn" line where one should have been.  Typically this means
>garbling during preparation or transmission.  One notorious trouble spot
>is that the batch format cannot tolerate transformations of newlines to
>CR-LF pairs; the byte counts in the "#! rnews" lines must be spot-on.

We had `unbatcher out of sync' problems at a site I was involved in, which
was fed its news by mail from a VMS site (ugh).  The VMS mailer (PMDF) got
confused when lines exceeded 256 characters, as References lines often do,
and would make a total hash of the header when this happened.  As a result,
the "#! rnews" count would always be off by a small amount for the affected
article.  C News resyncs at the next "#! rnews" line, but if the count is
too long for the actual article contents it will have missed the start of
the article following the garbled one.

To kludge around this problem I wrote a program `patchbatch' which zips
through a news batch looking for "#! rnews" lines with incorrect counts.
If it finds one, it hunts back and forth a small amount for the next "#!
rnews" line and adjusts the incorrect one to point to it.  This was
surprisingly effective: while it was running I believe it never failed to
correct a munged batch.

I'm including the source of patchbatch in case it is of use to the original
poster, or anyone else.

,
Eamonn

/* patchbatch.c - patch a news batch. */

/* By Eamonn McManus <emcmanus@cs.tcd.ie>, February 1990.
 * This program is not copyrighted.
 *
 * Blast through a news batch checking the offsets after `#! rnews'.
 * If we find that the offset does not lead to another `#! rnews' line
 * or EOF, we search around for the line somewhere in the vicinity.  If
 * it is found, we go back and patch the original offset to point to the
 * correct place.  This is useful for example on systems where long lines
 * get truncated or split in transmission, since in this case the stated
 * offset will be wrong.
 *
 * This is the hackiest program I have written in a long time.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/fcntl.h>	/* For O_RDWR. */
#include <sys/types.h>
#include <sys/stat.h>

/* Explicit declaration of strtol(), which patchbatch() uses to parse the
 * article length that follows `#! rnews '. */
extern long strtol();

/* Set to 1 by main() when -v is given; patchbatch() then reports on stderr
 * every article length it rewrites. */
char verbose;
/* getopt()'s argument index; main() starts its loop over file names here. */
extern int optind;


int patchbatch(char *name);	/* Defined later in this file. */

/* usage - print the command synopsis on stderr and exit with status 2. */
static void usage(void)
{
	fprintf(stderr, "Usage: patchbatch [-v] file [...]\n");
	exit(2);
}

/* main - parse the command line and patch every batch file named on it.
 *
 * Options:
 *   -v   set `verbose', so each rewritten article length is reported.
 *
 * At least one file name is required.  Every file is attempted even if an
 * earlier one fails.
 *
 * Exit status: 0 if every file was processed successfully, 1 if
 * patchbatch() failed for at least one file, 2 on a usage error.
 */
int main(int argc, char **argv)
{
	int i, status;

	while ((i = getopt(argc, argv, "v")) != -1) {
		switch (i) {
		case 'v':
			verbose = 1;
			break;
		default:
			usage();
			/* NOTREACHED */
		}
	}
	if (optind == argc)
		usage();

	status = 0;
	for (i = optind; i < argc; i++)
		if (patchbatch(argv[i]) < 0)
			status = 1;
	return status;
}


static const char lead[] = "#! rnews ";
#define LEADLEN	((int)(sizeof lead - 1))	/* Length of lead, without its NUL. */
#define FUDGE	(2 * (int)sizeof lead)	/* How far before a stated offset we look. */
#define WINDOW	63			/* Bytes examined per probe of the file. */

/* read_window - read exactly WINDOW bytes starting at file position pos.
 *
 * buf must have room for WINDOW + 1 bytes; whatever was read is
 * NUL-terminated.  Returns 1 if a full window was read, 0 if end of file
 * came first, and -1 (with errno set) if lseek() or read() failed.
 */
static int read_window(int fd, long pos, char *buf)
{
	size_t got = 0;
	ssize_t n;

	if (lseek(fd, (off_t)pos, SEEK_SET) < 0)
		return -1;
	while (got < (size_t)WINDOW) {
		n = read(fd, buf + got, (size_t)WINDOW - got);
		if (n < 0)
			return -1;
		if (n == 0)
			break;
		got += (size_t)n;
	}
	buf[got] = '\0';
	return got == (size_t)WINDOW;
}

/* find_header - locate the first complete copy of lead inside a window.
 *
 * buf holds WINDOW bytes read from file position winstart.  Matches at or
 * before artstart are skipped: they would be the current article's own
 * header and would yield a non-positive article length.  Returns the file
 * position of the match, or -1 if there is none.
 */
static long find_header(const char *buf, long winstart, long artstart)
{
	int i;

	for (i = 0; i + LEADLEN <= WINDOW; i++)
		if (winstart + i > artstart &&
		    memcmp(buf + i, lead, (size_t)LEADLEN) == 0)
			return winstart + i;
	return -1;
}

/* rewrite_count - overwrite the length field of the header at `here'.
 *
 * The field is numsize bytes wide and cannot grow, so the new length is
 * zero-padded on the left to exactly that width.  If it needs more digits
 * than the field has, a complaint is printed and the file is left alone;
 * that is not treated as fatal, so the caller can carry on with later
 * articles.  Returns 0 in both of those cases and -1 on an I/O error.
 */
static int rewrite_count(int fd, char *name, long here, int numsize,
			 long offset)
{
	char digits[WINDOW + 1];
	int len;

	len = snprintf(digits, sizeof digits, "%0*ld", numsize, offset);
	if (len < 0 || len > numsize) {
		fprintf(stderr, "%s: no room to change article "
			"length to %ld, file offset %ld\n", name, offset, here);
		return 0;
	}
	if (lseek(fd, (off_t)(here + LEADLEN), SEEK_SET) < 0 ||
	    write(fd, digits, (size_t)numsize) != (ssize_t)numsize) {
		perror(name);
		return -1;
	}
	if (verbose)
		fprintf(stderr, "%s: changed article length to "
			"%ld, file offset %ld\n", name, offset, here);
	return 0;
}

/* patch_fd - check and repair every `#! rnews' length in an open batch.
 *
 * Walks the batch header by header.  When a stated length does not land on
 * the next header or on end of file, the true position is looked for in a
 * WINDOW-byte probe beginning FUDGE bytes before the stated position, and
 * the length field is rewritten to match.  A length that runs past end of
 * file, or lands too close to it for a full probe, is taken to belong to
 * the last article and is set to reach exactly end of file.
 *
 * Returns 0 on success and -1 after printing a diagnostic on failure.
 * The caller owns fd and closes it.
 */
static int patch_fd(int fd, char *name)
{
	struct stat st;
	char buf[WINDOW + 1];
	long size, here;
	int got;

	if (fstat(fd, &st) < 0) {
		perror(name);
		return -1;
	}
	size = (long)st.st_size;

	got = read_window(fd, 0L, buf);
	if (got <= 0) {
		if (got < 0)
			perror(name);
		else
			fprintf(stderr, "%s: too short for a news batch\n",
				name);
		return -1;
	}
	if (strncmp(buf, lead, (size_t)LEADLEN) != 0) {
		fprintf(stderr, "%s: not a news batch (should start with %s)\n",
			name, lead);
		return -1;
	}

	here = 0;
	for (;;) {
		/* Invariant: buf holds the header that starts at `here'. */
		char *end;
		long offset, artstart, remaining, newpos;
		int numsize;

		offset = strtol(buf + LEADLEN, &end, 10);
		if (offset <= 0) {
			fprintf(stderr,
				"%s: bad value after %s, file offset %ld\n",
				name, lead, here);
			return -1;
		}
		numsize = (int)(end - (buf + LEADLEN));
		artstart = here + LEADLEN + numsize + 1/*\n*/;
		remaining = size - artstart;
		if (remaining <= 0) {
			/* The header itself runs up to end of file. */
			fprintf(stderr, "%s: last article too short\n", name);
			return -1;
		}
		if (offset == remaining)
			return 0;	/* Last article, length correct. */

		if (offset < remaining) {
			long winstart;
			int skew;

			newpos = artstart + offset;
			/* Never probe before the start of the file. */
			winstart = newpos > FUDGE ? newpos - FUDGE : 0;
			skew = (int)(newpos - winstart);
			got = read_window(fd, winstart, buf);
			if (got < 0) {
				perror(name);
				return -1;
			}
			if (got > 0) {
				if (memcmp(buf + skew, lead,
					   (size_t)LEADLEN) == 0) {
					/* Length was right: slide the next
					 * header to the front of buf. */
					memmove(buf, buf + skew,
						(size_t)(WINDOW + 1 - skew));
					here = newpos;
					continue;
				}
				newpos = find_header(buf, winstart, artstart);
				if (newpos < 0) {
					fprintf(stderr, "%s: can't find next article "
						"with offset %ld from file pos %ld\n",
						name, offset, here);
					return -1;
				}
				offset = newpos - artstart;
			} else {
				/* Too near end of file for a full probe. */
				offset = remaining;
			}
		} else {
			/* Stated length runs past end of file. */
			offset = remaining;
		}

		if (rewrite_count(fd, name, here, numsize, offset) < 0)
			return -1;
		if (offset >= remaining)
			return 0;
		newpos = artstart + offset;
		got = read_window(fd, newpos, buf);
		if (got < 0) {
			perror(name);
			return -1;
		}
		if (got == 0) {
			fprintf(stderr, "%s: last article too short\n", name);
			return -1;
		}
		here = newpos;
	}
}

/* patchbatch - repair the article lengths in the news batch file `name'.
 *
 * The file is opened read/write and patched in place.  Diagnostics go to
 * stderr.  Returns 0 on success and -1 if the file could not be opened,
 * is not a news batch, could not be resynchronised, or an I/O error
 * occurred.  The descriptor is closed on every path.
 */
int patchbatch(char *name)
{
	int fd, status;

	if ((fd = open(name, O_RDWR)) < 0) {
		perror(name);
		return -1;
	}
	status = patch_fd(fd, name);
	/* The file was written to, so a failed close is a real failure. */
	if (close(fd) < 0 && status == 0) {
		perror(name);
		status = -1;
	}
	return status;
}

grant@bluemoon.uucp (Grant DeLorean) (02/20/91)

em@dce.ie (Eamonn McManus) writes:

>henry@zoo.toronto.edu (Henry Spencer) writes:
>>It means "something's wrong with your batch":  relaynews did not find a
>>"#! rnews nnnnn" line where one should have been.  Typically this means
>>garbling during preparation or transmission.  One notorious trouble spot
>>is that the batch format cannot tolerate transformations of newlines to
>>CR-LF pairs; the byte counts in the "#! rnews" lines must be spot-on.

>We had `unbatcher out of sync' problems at a site I was involved in, which
>was fed its news by mail from a VMS site (ugh).  The VMS mailer (PMDF) got
>confused when lines exceeded 256 characters, as References lines often do,
>and would make a total hash of the header when this happened.  As a result,
>the "#! rnews" count would always be off by a small amount for the affected
>article.  C News resyncs at the next "#! rnews" line, but if the count is
>too long for the actual article contents it will have missed the start of
>the article following the garbled one.

 The other way that relaynews can get out of sync is if there are two
or three spaces between "#! rnews" and the number. I know at least
ufgate's batcher does this (who cares, right?). The simple solution
to this one is to have relaynews look again to find the number with
the string length incremented by 1 (doing it again with the string length
2 longer than originally intended doesn't seem like a bad idea). 
-- 
 Grant DeLorean  (grant@bluemoon)    {n8emr|nstar}!bluemoon!grant