[comp.archives] A program to interpret comp.archives stuff

comparc@twwells.uucp (comp.archives) (11/15/88)

I have been working on a program to validate the DB: articles.  I
added an option to display the site entries in something more human
readable and so I thought it was time to send it out.  Right at the
moment, it does a limited amount of interpretation of site entries
and just passes on the rest of the article. The human-readable part
could also stand quite a bit of work.  However, it is a start, and I
thought some of you would like to see it, primitive as it is.

---
Bill
{uunet|novavax}!proxftl!twwells!bill

send comp.archives postings to twwells!comp-archives
send comp.archives related mail to twwells!comp-archives-request

---

Warning: this is long!

-------------------------<CUT HERE>-------------------------------------------
#include <stdio.h>
#include <ctype.h>

/* This is a program to deal with the comp.archives database.  This
   version is the earliest; it only verifies the DB: input files.  */

extern int      exit();
extern char     *malloc();
extern char     *realloc();
extern char     *strchr();
extern char     *optarg;
extern int      optind;
extern int      opterr;

/* A singly linked list of strings. This is used when constructing the
   database in memory; it lets us keep track of several lines of the
   same kind of information. */

typedef struct STRLIST {
	struct STRLIST *s_next;         /* The next node; null at the end. */
	char    *s_string;              /* The string held by this node. */
} STRLIST;

/* Here is a table of time zones. The entries come in pairs: the
   abbreviation that is matched against a TM line, followed by the
   name that is stored for that zone. A pair of null pointers ends
   the table. @@It is incomplete, but it is better than nothing. */

char *Zones[] = {
	"EST",  "Eastern",
	"PST",  "Pacific",
	0,      0,
};

/* The state stuff. There are three parallel tables, all indexed by
   the state number: the enum names the states, States[] holds the
   function that handles an input line in each state, and Sname[]
   holds the name printed in the debugging output. Anything added to
   one of them has to be added at the same position in the other two.
   Note that ST_DONE must be zero or the state loop won't work as
   expected; its slot in States[] is exit(). */

int                         st_newart(), st_dbart(), st_delcmd(), st_addcmd(),
			    st_snm(),    st_sen(),   st_stm(),    st_stt(),
			    st_sad(),    st_sma(),   st_sco(),    st_six(),
			    st_skw(),    st_sde(),   st_skip(),   st_sxx(),
			    st_end();
enum {ST_DONE,              ST_NEWART,   ST_DBART,   ST_DELCMD,   ST_ADDCMD,
			    ST_SNM,      ST_SEN,     ST_STM,      ST_STT,
			    ST_SAD,      ST_SMA,     ST_SCO,      ST_SIX,
			    ST_SKW,      ST_SDE,     ST_SKIP,     ST_SXX,
			    ST_END};
int  (*States[])() = {exit, st_newart,   st_dbart,   st_delcmd,   st_addcmd,
			    st_snm,      st_sen,     st_stm,      st_stt,
			    st_sad,      st_sma,     st_sco,      st_six,
			    st_skw,      st_sde,     st_skip,     st_sxx,
			    st_end};

char *Sname[] = {"DONE",    "ST_NEWART", "ST_DBART", "ST_DELCMD", "ST_ADDCMD",
			    "ST_SNM",    "ST_SEN",   "ST_STM",    "ST_STT",
			    "ST_SAD",    "ST_SMA",   "ST_SCO",    "ST_SIX",
			    "ST_SKW",    "ST_SDE",   "ST_SKIP",   "ST_SXX",
			    "ST_END"};

/* Input buffer stuff. All of this is maintained by readline(). */

char    *Inbuf;                 /* The current input line, nul terminated. */
int     Inend;                  /* Flag: at end of input. */
int     Inlen;                  /* Allocated size of the input buffer. */
int     Inline;                 /* Line number of the input. */

/* Lines for the site database. Each list collects the strings taken
   from one kind of line of the site entry being read; st_sxx() frees
   them all once the entry has been dealt with. */

STRLIST *S_nm;                  /* The name of the site. */
STRLIST *S_en;                  /* Who entered the entry. */
STRLIST *S_tm;                  /* Time zone name, then times to call. */
STRLIST *S_tt;                  /* Archive site title. */
STRLIST *S_ad;                  /* Archive administrator. */
STRLIST *S_ma;                  /* Mailing address. */
STRLIST *S_co;                  /* Communications method. */
STRLIST *S_ix;                  /* Index file. */
STRLIST *S_kw;                  /* Keyword file. */
STRLIST *S_de;                  /* Description. */

/* Miscellaneous declarations: forward declarations for functions that
   are used before they are defined, and the program wide flags. */

void    nzfree();
char    *myalloc();
void    readline();

int     Debug;                  /* Set (by -x) to print debugging output. */
int     Error;                  /* Set if the current entry has an error. */
int     Print;                  /* Set (by -p) to print the data. */
char    Options[] = "px";       /* The option letters given to getopt(). */
char    *Program_name;          /* The program name, used in messages. */

/* Report a command line error on the standard error, follow it with
   a summary of how to run the program, and exit with status 1. The
   message is given without its trailing period. */

void
usage(msg)
char    *msg;
{
	char    *me;

	me = Program_name;
	fprintf(stderr, "%s: %s.\n", me, msg);
	fprintf(stderr, "usage: %s -%s file\n", me, Options);
	exit(1);
}

/* The program: parse the options, attach the named file to the
   standard input, and run the state machine over its lines.

   Options: -p prints each valid site entry in readable form; -x
   prints a trace of the states and the input lines.

   Exit status: 0 when the input has been processed, 1 for a usage
   error, 3 if the file can't be opened. */

int
main(argc, argv)
int     argc;
char    **argv;
{
	extern char *strrchr();
	int     state;

	/* Parse the arguments. The program name is the last component
	   of argv[0], so look for the final slash, not the first. */

	if (Program_name = strrchr(argv[0], '/')) {
		++Program_name;
	} else {
		Program_name = argv[0];
	}
	opterr = 0;
	while (1) {
		switch (getopt(argc, argv, Options)) {
		case -1:                        break;
		default:  usage("illegal option");
		case 'p': Print = 1;            continue;
		case 'x': Debug = 1;            continue;
		}
		break;
	}
	argv += optind;
	argc -= optind;
	switch (argc) {
	default: usage("too many arguments");
	case 1:
		if (!freopen(argv[0], "r", stdin)) {
			fprintf(stderr, "%s: unable to open %s\n",
			    Program_name, argv[0]);
			exit(3);
		}
		break;
	case 0:
		usage("missing file argument");
	}
	/* Read each line of the input. The current state determines
	   what is done with it. If a state returns a negative state
	   number, that means to go on to that state without reading
	   a new line. */

	for (state = ST_NEWART; ; state = (*States[state])()) {
		if (state > 0) {
			readline();
		} else {
			state = -state;
		}
		if (Debug) {
			if (Inend) {
				printf("%s:\n", Sname[state]);
			} else {
				printf("%s:\"%s\"\n", Sname[state], Inbuf);
			}
		}
		/* State zero is ST_DONE. Its table entry is exit(), but
		   calling that through the table passes it no argument
		   and so leaves the exit status undefined; exit here,
		   with a definite status, instead. */

		if (state == 0) {
			exit(0);
		}
	}
}

/* Report a problem with the input on the standard error and mark the
   current entry as being in error. The message is followed by the
   number and text of the current line or, when there is no more
   input, by a note saying so. */

void
complain(msg)
char    *msg;
{
	Error = 1;
	if (Inend) {
		fprintf(stderr, "%s: %s, at the end of the input\n",
		    Program_name, msg);
		return;
	}
	fprintf(stderr, "%s: %s, line %d: \"%s\"\n",
	    Program_name, msg, Inline, Inbuf);
}

/* Release allocated memory. A null pointer is accepted and ignored;
   it is never handed on to free(). */

void
nzfree(ptr)
char    *ptr;
{
	if (!ptr) {
		return;
	}
	free(ptr);
}

/* Allocate memory, or change the size of an existing allocation.
   With a null obuf this allocates size bytes; otherwise obuf is
   resized to size bytes. A size of zero frees obuf and returns a
   null pointer. This never returns a failure: if the memory can't
   be had, it prints a message and exits with status 2. */

char *
myalloc(obuf, size)
char    *obuf;                  /* old buffer, if reallocating */
int     size;                   /* allocation size */
{
	char    *nbuf;

	if (size == 0) {
		nzfree(obuf);
		return (0);
	}
	if (obuf) {
		nbuf = realloc(obuf, (unsigned)size);
	} else {
		nbuf = malloc((unsigned)size);
	}
	if (!nbuf) {
		fprintf(stderr, "%s: out of memory\n", Program_name);
		exit(2);
	}
	return (nbuf);
}

/* Return a freshly allocated copy of a nul terminated string. The
   space comes from myalloc(), so this does not return on failure. */

char *
stralloc(ptr)
char    *ptr;
{
	int     size;
	char    *copy;

	size = strlen(ptr) + 1;         /* room for the nul as well */
	copy = myalloc((char *)0, size);
	strcpy(copy, ptr);
	return (copy);
}

/* Read a line into the input buffer; extend the buffer if necessary.
   It returns the number of characters in the line, counting the end
   of the line. */

void
readline()
{
	int     cc;
	int     len;

	if (Inend) {
		return;
	}
	++Inline;
	len = 0;
	do {
		switch (cc = getchar()) {
		case 0:
			fprintf(stderr, "%s: warning, nul on line %d\n",
			    Inline);
			continue;
		case EOF:
			if (!len) {
				Inend = 1;
				if (Inbuf) {
					Inbuf[0] = 0;
				}
				return;
			}
			/* no break */
		case '\n':
			cc = 0;
			break;
		}
		if (len >= Inlen) {
			Inlen += Inlen < 480 ? Inlen + 32 : 100;
			Inbuf = myalloc(Inbuf, Inlen);
		}
		Inbuf[len++] = cc;
	} while (cc);
}

/* Add a copy of a string to the end of a list of strings. Leading
   white space is removed from the string. The value returned points
   at the s_next field of the new node, so a caller adding several
   strings can pass it back in and avoid walking the list again. */

STRLIST **
add_string(list, buf)
STRLIST **list;
char    *buf;
{
	STRLIST *node;

	while (*list) {
		list = &(*list)->s_next;
	}
	/* The cast keeps characters with the high bit set from being
	   handed to isspace() as negative values. */

	while (isspace((unsigned char)*buf)) {
		++buf;
	}
	node = (STRLIST *)myalloc((char *)0, (int)sizeof(STRLIST));
	node->s_next = 0;
	node->s_string = stralloc(buf);
	*list = node;
	return (&node->s_next);
}

/* Add a part of a string to the end of a list of strings. The part
   starts at buf and stops just before end; leading white space within
   it is removed. The value returned points at the s_next field of
   the new node, as for add_string(). */

STRLIST **
add_segment(list, buf, end)
STRLIST **list;
char    *buf;
char    *end;
{
	STRLIST *node;
	char    *ptr;
	int     len;

	while (*list) {
		list = &(*list)->s_next;
	}
	/* The cast keeps characters with the high bit set from being
	   handed to isspace() as negative values. */

	while (buf < end && isspace((unsigned char)*buf)) {
		++buf;
	}
	len = (int)(end - buf);

	node = (STRLIST *)myalloc((char *)0, (int)sizeof(STRLIST));
	node->s_next = 0;
	node->s_string = ptr = myalloc((char *)0, len + 1);
	strncpy(ptr, buf, len);
	ptr[len] = 0;                   /* strncpy won't have added this */
	*list = node;
	return (&node->s_next);
}

/* Split a line at its semicolons and add each field, in order, to a
   list of strings. A line with no semicolon is a single field. */

void
add_fields(list, ptr)
STRLIST **list;
char    *ptr;
{
	char    *semi;

	while ((semi = strchr(ptr, ';')) != 0) {
		list = add_segment(list, ptr, semi);
		ptr = semi + 1;
	}
	/* Whatever follows the last semicolon is the final field. */

	(void)add_string(list, ptr);
}

/* Count the number of semicolon separated fields on a line. This is
   one more than the number of semicolons, so an empty line has one
   (empty) field. The scan starts at the first character: the count
   has to agree with the number of strings add_fields() would store,
   even when the first field is empty. */

int
count_fields(ptr)
char    *ptr;
{
	char    *ep;
	int     cnt;

	cnt = 1;
	while ((ep = strchr(ptr, ';')) != 0) {
		++cnt;
		ptr = ep + 1;
	}
	return (cnt);
}

/* Print each string of a list on a line of its own, with msg in
   front of it. */

void
prstrlist(msg, sp)
char    *msg;
STRLIST *sp;
{
	while (sp) {
		printf("%s%s\n", msg, sp->s_string);
		sp = sp->s_next;
	}
}

/* Free every node of a list of strings, along with the strings, and
   leave the list head null so that the list can be used again. */

void
freestrlist(sp0)
STRLIST **sp0;
{
	STRLIST *cur;
	STRLIST *next;

	cur = *sp0;
	while (cur) {
		/* Get the link before the node goes away. */

		next = cur->s_next;
		nzfree(cur->s_string);
		nzfree((char *)cur);
		cur = next;
	}
	*sp0 = 0;
}

/* This takes text of the form "address (real name)" and adds two
   strings to the list: the address, then the real name. Nothing may
   follow the closing parenthesis. If the text doesn't have that form,
   a complaint is made and nothing is added. */

void
add_name(list, buf)
STRLIST **list;
char    *buf;
{
	char    *p1;                    /* the space that ends the address */
	char    *p2;                    /* the start of the real name */
	char    *p3;                    /* the ')' that ends the real name */

	/* The casts keep characters with the high bit set from being
	   handed to isspace() as negative values. */

	while (isspace((unsigned char)*buf)) {
		++buf;
	}
	if (!(p1 = strchr(buf, ' '))) {
		complain("missing real name");
		return;
	}
	p2 = p1;
	while (isspace((unsigned char)*p2)) {
		++p2;
	}
	if (*p2 != '(') {
		complain("missing the '('");
		return;
	}
	++p2;
	for (p3 = p2; *p3 && *p3 != ')'; ++p3)
		;
	if (*p3 != ')' || p3[1] != 0) {
		complain("the end of the real name is missing.");
		return;
	}
	list = add_segment(list, buf, p1);
	(void)add_segment(list, p2, p3);
}

/* This takes the text of an "EN" line, which has the form
   "address (real name) rest", and adds three strings to the list:
   the address, the real name, and whatever follows the closing
   parenthesis. If the text doesn't have that form, a complaint is
   made and nothing is added. */

void
add_enterer(list, buf)
STRLIST **list;
char    *buf;
{
	char    *p1;                    /* the space that ends the address */
	char    *p2;                    /* the start of the real name */
	char    *p3;                    /* the ')' that ends the real name */
	char    *p4;                    /* the rest of the line */

	/* The casts keep characters with the high bit set from being
	   handed to isspace() as negative values. */

	while (isspace((unsigned char)*buf)) {
		++buf;
	}
	if (!(p1 = strchr(buf, ' '))) {
		complain("missing real name");
		return;
	}
	p2 = p1;
	while (isspace((unsigned char)*p2)) {
		++p2;
	}
	if (*p2 != '(') {
		complain("missing the '('");
		return;
	}
	++p2;
	for (p3 = p2; *p3 && *p3 != ')'; ++p3)
		;
	if (*p3 != ')') {
		complain("missing the ')'");
		return;
	}
	p4 = p3 + 1;
	while (isspace((unsigned char)*p4)) {
		++p4;
	}
	list = add_segment(list, buf, p1);
	list = add_segment(list, p2, p3);
	(void)add_string(list, p4);
}

/* This is called when at the start of a new article. It looks for
   the Subject: line; when it finds one that starts with "DB:" it
   prints the line and goes to state ST_DBART. A line starting with
   '@' means that the Subject: line was never seen; that line is
   passed on to ST_DBART as it is. Other lines are skipped. */

int
st_newart()
{
	char    *ptr;

	if (Inend) {
		complain("where's the article?");
		return (ST_DONE);
	}
	ptr = Inbuf;
	if (strncmp(ptr, "Subject:", 8) == 0) {
		ptr += 8;
		/* The cast keeps characters with the high bit set from
		   being handed to isspace() as negative values. */

		while (isspace((unsigned char)*ptr)) {
			++ptr;
		}
		if (strncmp(ptr, "DB:", 3) != 0) {
			complain("invalid Subject:");
		} else {
			printf("%s\n", Inbuf);
			return (ST_DBART);
		}
	} else if (*ptr == '@') {
		complain("missing Subject:");
		return (-ST_DBART);
	}
	return (ST_NEWART);
}

/* This is the state for the body of the article, before any of the
   database information. Lines are skipped until one starts with '@';
   that line is handed to ST_DELCMD. An '@' anywhere else in a line
   is reported, since it may be a command that was mistyped. */

int
st_dbart()
{
	int     next;

	next = ST_DBART;
	if (Inend) {
		complain("no database commands");
		next = ST_DONE;
	} else if (Inbuf[0] == '@') {
		next = -ST_DELCMD;
	} else if (strchr(Inbuf, '@')) {
		complain("possible misplaced @");
	}
	return (next);
}

/* Process any delete commands. These always come first. An "@ADD "
   line is handed to ST_ADDCMD and "@END" goes to ST_END. Each delete
   that is recognized is reported on a line of its own; anything else
   is complained about and skipped. */

int
st_delcmd()
{
	if (Inend) {
		complain("missing @END");
		return (ST_DONE);
	}
	if (strncmp(Inbuf, "@ADD ", 5) == 0) {
		return (-ST_ADDCMD);
	}
	if (strcmp(Inbuf, "@END") == 0) {
		return (ST_END);
	}
	if (strncmp(Inbuf, "@DEL ", 5) == 0) {
		if (strncmp(Inbuf + 5, "INFO ", 5) == 0) {
			/*@@validate the delete */
			printf("deleting %s from info database\n", Inbuf + 10);
		} else if (strncmp(Inbuf + 5, "SITE ", 5) == 0) {
			/*@@validate the delete */
			printf("deleting %s from site database\n", Inbuf + 10);
		} else if (strncmp(Inbuf + 5, "INDEX ", 6) == 0) {
			/*@@validate the delete */
			printf("deleting %s from index database\n", Inbuf + 11);
		} else {
			complain("invalid delete command");
		}
	} else if (strncmp(Inbuf, "@DELALL ", 8) == 0) {
		/*@@validate the delete */
		printf("deleting all %s from index database\n", Inbuf + 8);
	} else {
		complain("command expected");
	}
	return (ST_DELCMD);
}

/* Having processed all the delete commands, now do any add commands.
   "@ADD SITE" starts a site entry and "@END" goes to ST_END. The
   other kinds of add are recognized but their entries are skipped.
   Anything else is complained about and the input is skipped up to
   the next line that starts with '@'. */

int
st_addcmd()
{
	if (Inend) {
		complain("missing @END");
		return (ST_DONE);
	}
	if (strcmp(Inbuf, "@END") == 0) {
		return (ST_END);
	}
	if (strncmp(Inbuf, "@ADD ", 5) == 0) {
		if (strcmp(Inbuf + 5, "INFO") == 0) {
			/*@@return (ST_INM);*/
			return (ST_SKIP);
		} else if (strcmp(Inbuf + 5, "SITE") == 0) {
			Error = 0;
			return (ST_SNM);
		} else if (strcmp(Inbuf + 5, "INDEX") == 0) {
			/*@@return (ST_XADD);*/
			return (ST_SKIP);
		} else {
			complain("invalid add command");
		}
	} else {
		complain("command expected");
	}
	/* A bad line that starts with '@' has to be used up here. If
	   it were passed to ST_SKIP unread, that state would pass it
	   straight back and the two would go around forever. Other
	   lines are passed on so that ST_SKIP can examine them. */

	return (Inbuf[0] == '@' ? ST_SKIP : -ST_SKIP);
}

/* We should have the site name, an "NM " line. If this isn't one,
   complain and let ST_SEN look at the same line. */

int
st_snm()
{
	if (strncmp(Inbuf, "NM ", 3) != 0) {
		complain("missing site name");
		return (-ST_SEN);
	}
	(void)add_string(&S_nm, Inbuf + 3);
	return (ST_SEN);
}

/* We should have the line for the enterer, an "EN " line. If this
   isn't one, complain and let ST_STM look at the same line. */

int
st_sen()
{
	if (strncmp(Inbuf, "EN ", 3) != 0) {
		complain("missing enterer");
		return (-ST_STM);
	}
	(void)add_enterer(&S_en, Inbuf + 3);
	return (ST_STM);
}

/* We should have the line for the time zone and the best times to
   call, a "TM " line. The text up to the first semicolon has to be
   an abbreviation found in Zones[]; the name of that zone is stored,
   followed by the remaining fields of the line. If this isn't a TM
   line, complain and let ST_STT look at the same line. */

int
st_stm()
{
	char    *semi;
	char    **entry;

	if (strncmp(Inbuf, "TM ", 3) != 0) {
		complain("missing timezone");
		return (-ST_STT);
	}
	/* Cut the line at the semicolon for the duration of the search,
	   so that just the abbreviation gets compared. */

	semi = strchr(Inbuf + 3, ';');
	if (semi) {
		*semi = 0;
	}
	entry = Zones;
	while (*entry && strcmp(*entry, Inbuf + 3) != 0) {
		entry += 2;
	}
	if (semi) {
		*semi = ';';
	}
	if (*entry) {
		(void)add_string(&S_tm, entry[1]);
		if (semi) {
			add_fields(&S_tm, semi + 1);
		}
	} else {
		complain("invalid time zone");
	}
	return (ST_STT);
}

/* We should have the archive title, a "TT " line. If this isn't
   one, complain and let ST_SAD look at the same line. */

int
st_stt()
{
	if (strncmp(Inbuf, "TT ", 3) != 0) {
		complain("missing archive title");
		return (-ST_SAD);
	}
	(void)add_string(&S_tt, Inbuf + 3);
	return (ST_SAD);
}

/* We should have the administrator, an "AD " line. If this isn't
   one, complain and let ST_SMA look at the same line. */

int
st_sad()
{
	if (strncmp(Inbuf, "AD ", 3) != 0) {
		complain("missing administrator");
		return (-ST_SMA);
	}
	(void)add_name(&S_ad, Inbuf + 3);
	return (ST_SMA);
}

/* We should have the administrator's mailing address, an "MA " line
   whose fields are the lines of the address. A bare "MA" is accepted
   and means that there is no address. If this is neither, complain
   and let ST_SCO look at the same line. */

int
st_sma()
{
	if (strcmp(Inbuf, "MA") == 0) {
		return (ST_SCO);
	}
	if (strncmp(Inbuf, "MA ", 3) != 0) {
		complain("missing administrator's address");
		return (-ST_SCO);
	}
	add_fields(&S_ma, Inbuf + 3);
	return (ST_SCO);
}

/* We should have a communications method line, a "CO " line. There
   may be several of them. A uucp line must have four fields and an
   ftp line six; a line that passes has its fields stored. The first
   line that isn't a CO line is handed to ST_SIX, after a complaint if
   no communications data has been stored. */

int
st_sco()
{
	char    *fields;
	char    *errmsg;
	int     want;

	if (strncmp(Inbuf, "CO ", 3) != 0) {
		if (!S_co) {
			complain("missing communications data");
		}
		return (-ST_SIX);
	}
	fields = Inbuf + 3;
	if (strncmp(fields, "uucp;", 5) == 0) {
		want = 4;
		errmsg = "wrong number of fields for uucp";
	} else if (strncmp(fields, "ftp;", 4) == 0) {
		want = 6;
		errmsg = "wrong number of fields for ftp";
	} else {
		complain("unknown communications method");
		return (ST_SCO);
	}
	if (count_fields(fields) == want) {
		add_fields(&S_co, fields);
	} else {
		complain(errmsg);
	}
	return (ST_SCO);
}

/* We may have an index line, an "IX " line with six fields. There
   may be several of them. A bare "IX" is used up and ends the index
   lines; any other line is handed to ST_SKW. */

int
st_six()
{
	char    *fields;

	if (strcmp(Inbuf, "IX") == 0) {
		return (ST_SKW);
	}
	if (strncmp(Inbuf, "IX ", 3) != 0) {
		return (-ST_SKW);
	}
	fields = Inbuf + 3;
	if (count_fields(fields) == 6) {
		add_fields(&S_ix, fields);
	} else {
		complain("wrong number of fields in index line");
	}
	return (ST_SIX);
}

/* We may have a keyword line, a "KW " line. There may be several of
   them. A bare "KW" is used up and ends the keyword lines; any other
   line is handed to ST_SDE. */

int
st_skw()
{
	if (strcmp(Inbuf, "KW") == 0) {
		return (ST_SDE);
	}
	if (strncmp(Inbuf, "KW ", 3) != 0) {
		return (-ST_SDE);
	}
	(void)add_string(&S_kw, Inbuf + 3);
	return (ST_SKW);
}

/* We may have a description line, a "DE " line. There may be several
   of them. A bare "DE" is accepted but nothing is stored for it. The
   first line that is neither is handed to ST_SXX. */

int
st_sde()
{
	if (strncmp(Inbuf, "DE ", 3) == 0) {
		(void)add_string(&S_de, Inbuf + 3);
	} else if (strcmp(Inbuf, "DE") != 0) {
		return (-ST_SXX);
	}
	return (ST_SDE);
}

/* Skip the input until a line is found with a leading '@'; that line
   is handed to ST_ADDCMD. This is used for entries that are not
   processed and to get going again after an error. An '@' anywhere
   else in a line is reported, since it may be a command that was
   mistyped. */

int
st_skip()
{
	int     next;

	next = ST_SKIP;
	if (Inend) {
		complain("missing @END");
		next = ST_DONE;
	} else if (Inbuf[0] == '@') {
		next = -ST_ADDCMD;
	} else if (strchr(Inbuf, '@')) {
		complain("possible misplaced @");
	}
	return (next);
}

/* Print an archive site entry in a form meant for people to read.
   This is only called for an entry that had no errors (see st_sxx()),
   and it relies on that: the required lists are used without being
   checked, and the S_co and S_ix lists are walked by position, on the
   understanding that each line added the number of fields that
   st_sco() and st_six() insist on. */

void
print_site()
{
	char    *sptr;
	char    *nptr;
	STRLIST *sp;

	printf("\nArchive site %s, %s\n", S_nm->s_string, S_tt->s_string);
	if (S_de) {
		prstrlist("\t", S_de);
	}
	if (S_kw) {
		printf("Here are some words that describe");
		printf(" what's in the archive:\n");
		prstrlist("\t", S_kw);
	}
	/* S_ad holds the address and then the real name; S_en holds the
	   same two followed by the rest of the EN line. */
	printf("This archive is administered by %s, %s",
	    S_ad->s_next->s_string, S_ad->s_string);
	if (strcmp(S_en->s_string, S_ad->s_string) == 0
	    && strcmp(S_en->s_next->s_string, S_ad->s_next->s_string) == 0) {
		/* The administrator entered it; don't say the name twice. */
		printf(",\nwho submitted this entry on %s.\n",
		    S_en->s_next->s_next->s_string);
	} else {
		printf(";\nthe archive entry was submitted by %s, %s,\n",
		    S_en->s_next->s_string, S_en->s_string);
		printf("\ton %s.\n", S_en->s_next->s_next->s_string);
	}
	if (S_ma) {
		printf("The archive mailing address is:\n");
		prstrlist("\t", S_ma);
	}
	/* The first S_tm string is the zone name; any others are the
	   remaining fields of the TM line. */
	printf("The archive is in the %s time zone.\n", S_tm->s_string);
	if (S_tm->s_next) {
		printf("Here are times the archive is less loaded:\n");
		prstrlist("\t", S_tm->s_next);
	}
	/* Each pass uses up the fields of one CO line. The first field
	   is the method and the second is the pattern. For uucp the
	   directory and the L.sys entry follow; for ftp, the domain
	   name, the internet address, the directory and the times. */
	for (sp = S_co; sp; sp = sp->s_next) {
		sptr = sp->s_string;
		sp = sp->s_next;
		printf("Files tagged with a pattern matching \"%s\"",
		    sp->s_string);
		sp = sp->s_next;
		if (strcmp(sptr, "uucp") == 0) {
			printf(" can be obtained via uucp.\n");
			printf("They can be found in directory \"%s\". ",
			    sp->s_string);
			sp = sp->s_next;
			printf("The L.sys entry to use is:\n\t%s\n",
			    sp->s_string);
		} else if (strcmp(sptr, "ftp") == 0) {
			printf(" can be obtained with ftp.\n");
			printf("The domain name is \"%s\"", sp->s_string);
			sp = sp->s_next;
			printf(" and its internet address is %s.\n",
			    sp->s_string);
			sp = sp->s_next;
			printf("The files are in directory \"%s\".\n",
			    sp->s_string);
			sp = sp->s_next;
			if (sp->s_string[0]) {
				printf("These are the times the files");
				printf(" may be accessed: %s.\n",
				    sp->s_string);
			}
		}
	}
	/* Each pass uses up the six fields of one IX line: the access
	   tag, the file name, the size, a field that is not printed,
	   the programs needed to uncompress the file, and a last field
	   that is printed as it is. */
	for (sp = S_ix; sp; sp = sp->s_next) {
		printf("There is an index file whose access tag is \"%s\" ",
		    sp->s_string);
		sp = sp->s_next;
		printf("in file \"%s\".", sp->s_string);
		sp = sp->s_next;
		if (sp->s_string[0]) {
			printf(" The file is %sK bytes.",
			    sp->s_string);
		}
		printf("\n");
		sp = sp->s_next;
		sp = sp->s_next;
		if (sp->s_string[0]) {
			printf("You will need these programs to uncompress");
			printf(" the file: %s.\n", sp->s_string);
		}
		sp = sp->s_next;
		if (sp->s_string[0]) {
			printf("\t%s\n", sp->s_string);
		}
	}
}

/* This is the end of the site entry, which should be a blank line.
   If it is, and the entry had no errors, the entry is displayed when
   printing was asked for, and the next state is ST_ADDCMD. Anything
   other than a blank line is complained about.

   The data for the entry is freed however the entry ended. If it
   were kept after an error, the next site entry would be added onto
   the end of the old lists and the old site's data would be printed
   in place of its own. */

int
st_sxx()
{
	int     next;

	if (Inend) {
		complain("missing @END");
		next = ST_DONE;
	} else if (Inbuf[0] == '@') {
		complain("missing blank line after add");
		next = -ST_ADDCMD;
	} else if (strchr(Inbuf, '@')) {
		complain("possible misplaced @");
		next = ST_SKIP;
	} else if (Inbuf[0]) {
		complain("extraneous data");
		next = ST_SKIP;
	} else {
		if (Print && !Error) {
			print_site();
		}
		next = ST_ADDCMD;
	}
	freestrlist(&S_nm);
	freestrlist(&S_en);
	freestrlist(&S_tm);
	freestrlist(&S_tt);
	freestrlist(&S_ad);
	freestrlist(&S_ma);
	freestrlist(&S_co);
	freestrlist(&S_ix);
	freestrlist(&S_kw);
	freestrlist(&S_de);
	return (next);
}

/* Have found an @END, skip to the end of the file, checking for
   possible misplaced @'s. At the end of the input the next state is
   ST_DONE, which stops the program. */

int
st_end()
{
	if (Inend) {
		return (ST_DONE);
	}
	if (strchr(Inbuf, '@')) {
		complain("possible misplaced @");
	}
	return (ST_END);
}
-------------------------<CUT HERE>-------------------------------------------