[alt.sources] Tool to find duplicate articles

jerry@olivey.olivetti.com (Jerry Aguirre) (08/17/90)

Here is a tool I threw together when my news history got corrupted and
users started complaining about seeing duplicates of articles.

===BEGIN histdups.c===
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define LINESIZ 1024
#define MAXF 32

/* Expects the stdin to be the history file, sorted.  Stdout is a list
 * of file names which are duplicates of earlier articles.  Run after
 * expire -r and then "rm" the files listed in the output.
 *
 *	sort <history | histdups >dupfiles; xargs <dupfiles rm
 *
 * If the news history becomes corrupted then you can wind up with
 * duplicates.  These are both a waste of space and a pain for people
 * reading news.
 *
 * B news expire -r will find the dups and then enter all of them into
 * the history file.  (It doesn't even match up the cross postings
 * to each other correctly.)  This program will output the names of all
 * but the first duplicate in each news group.  (Where "first" is based
 * on article numbering which presumably represents arrival order.)
 *
 * 16Aug90 Jerry Aguirre <jerry@atc.olivetti.com>
 */

/* File names (of the form group/number) gathered for the message-ID that is
 * currently being examined.  At most MAXF names are kept, each stored as a
 * NUL-terminated string in a LINESIZ-byte slot. */
char files[MAXF][LINESIZ];
/* Number of slots of files[] currently in use. */
int nf;

/* Old-style (non-prototype) declarations of library routines. */
long atol();
char *index();

int parsefiles(char *line);
int printdups(void);

/*
 * Read a sorted history file from stdin and write to stdout the names of
 * article files that duplicate an earlier article with the same message-ID.
 *
 * Lines are grouped by their first tab-separated field (the message-ID).
 * The first line of a group is only remembered in lastline; its file names
 * are parsed once a second line with the same ID shows up, so IDs that occur
 * only once never populate files[].  Whenever the ID changes, and once more
 * at end of input, printdups() reports the group that has just ended.
 *
 * Lines without a tab, with an empty message-ID, or too long for the line
 * buffer are ignored (the latter with a warning on stderr) rather than being
 * processed in truncated form.
 *
 * Returns 0.
 */
int main(void)
{
    char line[LINESIZ];
    char id[LINESIZ];           /* message-ID of the current group */
    char lastline[LINESIZ];     /* first line of the group, until parsed */
    char *tab;
    size_t len;
    int ch;

    nf = 0;
    id[0] = '\0';
    lastline[0] = '\0';
    while (fgets(line, sizeof line, stdin) != NULL) {
        len = strlen(line);
        if (len > 0 && line[len - 1] == '\n') {
            line[len - 1] = '\0';
        } else if (len == sizeof line - 1) {
            /* Buffer is full without a newline: either the line fits
             * exactly or it is too long.  Peek at the next character. */
            ch = getchar();
            if (ch != EOF && ch != '\n') {
                /* Too long.  A truncated file name could name the wrong
                 * article, so drop the whole line instead of guessing. */
                while ((ch = getchar()) != EOF && ch != '\n')
                    continue;
                fprintf(stderr, "histdups: overlong history line skipped\n");
                continue;
            }
        }

        tab = strchr(line, '\t');
        if (tab == NULL || tab == line)
            continue;           /* no fields, or empty message-ID */

        *tab = '\0';            /* isolate the message-ID for comparison */
        if (strcmp(line, id) == 0) {
            /* Same article again: make sure the first line of the group
             * has contributed its file names, then add this line's. */
            if (lastline[0] != '\0') {
                parsefiles(lastline);
                lastline[0] = '\0';
            }
            *tab = '\t';
            parsefiles(line);
        } else {
            /* New message-ID: report the previous group and start over. */
            printdups();
            nf = 0;
            strcpy(id, line);
            *tab = '\t';
            strcpy(lastline, line);
        }
    }

    /* The last group is not followed by an ID change; flush it here. */
    printdups();
    return 0;
}
parsefiles(line) char *line;
{
    char *pd, *pf, *p;

    pd = index(line, '\t');
    if (pd) pd++;
    else return;
    pf = index(pd, '\t');
    if (pf) pf++;
    else return;
    while (*pf) {
	while (*pf == ' ') pf++;
	if (*pf == '\0') return;
	if (nf >= MAXF) return;
	p = index(pf, ' ');
	if (p) *p = '\0';
	strcpy(files[nf], pf);
	nf++;
	if (p) {
	    pf = p + 1;
	    *p = ' ';
	}
	else return;
    }
}

printdups()
{
    int i1, i2, flags[MAXF];
    long n1, n2;
    char *p1, *p2;

    for (i1 = 0; i1 < nf; i1++) flags[i1] = 0;

    for (i1 = 0; i1 < nf; i1++) {
	p1 = index(files[i1], '/');
	if (!p1) continue;
	*p1 = '\0';
	n1 = atol(p1+1);
	for (i2 = i1 + 1; i2 < nf; i2++) {
	    p2 = index(files[i2], '/');
	    if (!p2) continue;
	    *p2 = '\0';
	    if (strcmp(files[i1], files[i2]) == 0) { /* same group */
		n2 = atol(p2+1);
		if (n2 > n1) flags[i2] = 1;	/* lowest number stays */
		else if (n2 < n1) flags[i1] = 1;
	    }
	    *p2 = '/';
	    n2 = atol(p2+1);
	}
	*p1 = '/';
    }
    for (i1 = 0; i1 < nf; i1++) {
	if (flags[i1] == 1) {
	    for (p1 = files[i1]; *p1; p1++) {
		if (*p1 == '.') putchar('/');
		else putchar(*p1);
	    }
	    putchar('\n');
	}
    }
}
===END histdups.c===