[comp.sources.unix] v16i052: Remove duplicates from a "bib" database

rsalz@uunet.uu.net (Rich Salz) (11/02/88)

Submitted-by: Brain in Neutral <bin@rhesus.primate.wisc.edu>
Posting-number: Volume 16, Issue 52
Archive-name: uniqbib

Uniqbib is used in conjunction with the suite of programs comprising the
refer system.  One problem with this system is that there is no provision
for removal of duplicates from bibliographic databases.  (Duplicates may
occur, for instance, if the output from two or more lookbib queries is
combined, where entries satisfy the keywords of more than one search.)
Uniqbib is used to remove such duplicates.  It reads its input and writes
all the unique entries therein to the standard output.  The entries in the
input do not have to be sorted (i.e., you do not have to run sortbib
first), nor do the entries produced by uniqbib come out in any particular
order.

#!/bin/sh
# shar:	Shell Archiver
#	Run the following text with /bin/sh to create:
#	uniqbib.c
#	bibinfo.h
#	BibAlloc.c
#	BibCmp.c
#	BibRead.c
#	BibWrite.c
#	panic.c
#	uniqbib.1
#	Makefile
#	bad1
#	bad2
#	bad3
echo shar: extracting uniqbib.c '(7434 characters)'
sed 's/^XX//' << \SHAR_EOF > uniqbib.c
XX/*
XX	uniqbib - uniq a bibliographic database to eliminate duplicates.
XX
XX	syntax: uniqbib [ -v ] [ file ]
XX
XX	-v means verbose: prints feedback.
XX
XX	Only one file may be explicitly named (use cat f1 f2 ... | uniqbib
XX	for > 1 file).
XX
XX	Strategy:
XX	Read input file, writing file position and summary information
XX	for each entry.  Sort on summary info.  For each set of entries
XX	having identical summary information, compare directly, and write
XX	out those that are different.  (For those entries that have unique
XX	summary info, the entry is unique, of course.)
XX
XX	The summary information is trivial:  the sum of the characters in
XX	the text of the entry.  This is invariant with respect to order of
XX	fields (records that are identical except for order of fields get
XX	identical checksums this way).
XX
XX	7 April 1988	Paul DuBois	dubois@rhesus.primate.wisc.edu
XX
XX	Change History:
XX
XX	08/10/88 Fixed failure to check for citations that are too long.
XX		Added -v flag.  Put better error messages in BERead
XX		(prints text of citation seen so far, input line number
XX		where error occurred).  Removed inability to uniq from
XX		a pipe or redirected input (citations are copied to
XX		holding file during reading phase 1).
XX*/
XX
XX# include	<stdio.h>
XX# include	<signal.h>
XX# include	"bibinfo.h"
XX
XX
XXint	count = 0;
XXint	verbose = 0;
XX
XXint	cleanup ();
XXint	onintr ();
XX
XXchar	keyFile[BUFSIZ];
XXchar	holdFile[BUFSIZ];
XX
XX
XXmain (argc, argv)
XXint	argc;
XXchar	**argv;
XX{
XXBibEnt	*be;
XXBibFld	*bf;
XXFILE	*kf, *hf;
XXchar	buf[BUFSIZ];
XXchar	*cp;
XXlong	sum;
XXint	len;
XXint	result;
XXint	empty = 1;
XXint	useHold = 0;
XXlong	holdPos = 0;
XX
XX/*
XX	Arrange for cleanup of temp files in
XX	the event of abnormal termination.
XX*/
XX
XX	setpanichook (cleanup);
XX	signal (SIGINT, onintr);
XX	signal (SIGHUP, onintr);
XX	signal (SIGQUIT, onintr);
XX	signal (SIGTERM, onintr);
XX
XX/*
XX	Process arguments and set up files.  If a file is named, use
XX	that for input, else read whatever's on stdin and arrange to
XX	hold a copy in a seekable temp file.
XX*/
XX
XX	--argc;
XX	++argv;
XX	if (argc > 0 && strcmp (*argv, "-v") == 0)
XX	{
XX		++verbose;
XX		--argc;
XX		++argv;
XX	}
XX	if (argc == 0)		/* reading from pipe or redirection */
XX	{
XX		sprintf (holdFile, "/tmp/ubh%05d", getpid ());
XX		if ((hf = fopen (holdFile, "w")) == NULL)
XX			panic ("Can't open temporary hold file");
XX		++useHold;
XX	}
XX	else if (argc == 1)
XX	{
XX		if (freopen (argv[0], "r", stdin) == NULL)
XX			panic ("Can't open: %s", argv[0]);
XX	}
XX	else
XX		panic ("Usage: uniqbib [-v] file");
XX
XX	sprintf (keyFile, "/tmp/ub%05d", getpid ());
XX	if ((kf = fopen (keyFile, "w")) == NULL)
XX		panic ("Can't open temporary key file.");
XX
XX/*
XX	Ready to make first pass through input.  Read each citation,
XX	compute the key and write out the key and file position of the
XX	citation to the key file.  If reading a pipe or redirected input
XX	write out the citation to a holding file so can re-read it.  In
XX	that case, the file position must be fixed, since the position
XX	in the holding file may well be different (e.g., if there are
XX	multiple blank lines between citations in the original input).
XX*/
XX
XX	if ((be = BEAlloc ()) == NULL)
XX		panic ("Out of memory.");
XX	if (verbose)
XX		fprintf (stderr, "Reading citations\n");
XX	while ((result = BERead (stdin, be)) > 0)
XX	{
XX		++count;
XX		if (verbose)
XX			fprintf (stderr, ".");
XX		empty = 0;
XX		sum = 0;
XX		cp = BEText (be);
XX		len = BELen (be);
XX		if (useHold)
XX		{
XX			BEWrite (hf, be);
XX			BEFilPos (be) = holdPos;
XX			holdPos += len + 1;	/* +1 for preceding newline */
XX		}
XX		while (len-- > 0)
XX			sum += *cp++;
XX		fprintf (kf, "%D %D\n", sum, BEFilPos (be));
XX	}
XX	fclose (kf);
XX	if (useHold)	/* if using hold file, close and attach to stdin */
XX	{
XX		fclose (hf);
XX		if (freopen (holdFile, "r", stdin) == NULL)
XX			panic ("Can't reopen hold file.");
XX	}
XX	if (result < 0 || empty)
XX	{
XX		cleanup ();
XX		exit (0);
XX	}
XX	if (verbose)
XX		fprintf (stderr, "\nPass 1 done (%d citations)\n", count);
XX
XX/*
XX	Pass two.  Sort the keys so duplicates will cluster, and uniq
XX	them.
XX*/
XX
XX	if (verbose)
XX		fprintf (stderr, "Sorting keys\n");
XX	sprintf (buf, "exec sort -o %s %s", keyFile, keyFile);
XX	system (buf);
XX	if (verbose)
XX		fprintf (stderr, "Sort done\n");
XX	if ((kf = fopen (keyFile, "r")) == NULL)
XX		panic ("Can't reopen key file.");
XX	UniqBib (kf);
XX	fclose (kf);
XX	if (verbose)
XX		fprintf (stderr, "\nDone\n");
XX	cleanup ();
XX	exit (0);
XX}
XX
XX
XX
XX/*
XX	The ugly heart of the program.
XX
XX	Read checksum/file-position pairs from f.  It's sorted on checksum,
XX	so that all groups of identical checksums will cluster.  The
XX	bib entries within each cluster may or *may not* be content-identical,
XX	so the algorithm hangs onto each entry until it either knows it's
XX	unique or that it's a dup, as follows:
XX
XX	First read one line and hold onto it for a reference.  Then read rest
XX	of lines.  If checksum is different, flush the reference and restart
XX	with the next line after the reference as the new reference.  If the
XX	checksum is the same, then do a direct compare of the bib entries
XX	themselves.  If they're the same, skip the reference and restart with
XX	the next line after the reference as the new reference.  If they are
XX	different, just keep reading.  (Eventually one will be found that's
XX	either a different checksum or identical, or EOF will be reached, so
XX	the reference can be either flushed or skipped.)
XX
XX	When restarting so that the reference is bounced to the next
XX	line in the summary file, check first whether the comparison
XX	is that line.  If so, don't bother to reread that line or to
XX	fetch the bibliographic entry itself.  Since except in perverse
XX	cases the comparison almost always becomes the next reference, this
XX	is a big win.
XX
XX*/
XX
XXUniqBib (f)
XXFILE	*f;
XX{
XXlong	refPos, comPos;		/* reference and comparison seek positions */
XXlong	refCkSum, comCkSum;	/* reference and comparison checksums */
XXint	getNextRef = 1;		/* non-zero if need to read ref sum, pos */
XXlong	refOff = 0, comOff;	/* offset of line after reference, comparison */
XXint	nCompares = 0;		/* number of comparisons made with ref */
XXBibEnt	b1, *beRef = &b1;
XXBibEnt	b2, *beCom = &b2;
XXBibEnt	*beTmp;
XXint	nondups = 0;
XX
XX	if (verbose)
XX		fprintf (stderr, "Comparing keys\n");
XX	for (;;)
XX	{
XX		if (verbose)
XX			fprintf (stderr, ".");
XX		if (getNextRef)
XX		{
XX			getNextRef = 0;
XX			if (nCompares == 1)	/* make comparison next ref */
XX			{
XX				refCkSum = comCkSum;
XX				refPos = comPos;
XX				refOff = comOff;
XX				beTmp = beRef;
XX				beRef = beCom;
XX				beCom = beTmp;
XX			}
XX			else	/* seek to correct spot, read summary */
XX			{	/* info and entry from bib file */
XX				fseek (f, refOff, 0);
XX				if (fscanf (f, "%D %D\n", &refCkSum, &refPos)
XX						!= 2)
XX					break;	/* no more refs in file */
XX				refOff = ftell (f);
XX				fseek (stdin, refPos, 0);
XX				if (!BERead (stdin, beRef))
XX					panic ("Can't read reference entry.");
XX			}
XX			nCompares = 0;
XX		}
XX		if (fscanf (f, "%D %D\n", &comCkSum, &comPos) != 2)
XX		{
XX			BEWrite (stdout, beRef);	/* flush reference */
XX			++nondups;
XX			++getNextRef;
XX			continue;
XX		}
XX		comOff = ftell (f);
XX		fseek (stdin, comPos, 0);
XX		if (!BERead (stdin, beCom))
XX			panic ("Can't read comparison entry.");
XX		++nCompares;
XX		if (refCkSum != comCkSum)		/* different - flush */
XX		{
XX			BEWrite (stdout, beRef);	/* current reference */
XX			++nondups;
XX		}
XX		else if (BECmp (beRef, beCom))		/* compare directly, */
XX			continue;			/* skip ref if diff */
XX		++getNextRef;
XX	}
XX	fprintf (stderr, "%d citations (%d + %d duplicates)\n",
XX			count, nondups, count-nondups);
XX}
XX
XX
XXcleanup ()
XX{
XX	(void) unlink (keyFile);
XX	(void) unlink (holdFile);
XX}
XX
XXonintr () { panic ("\nInterrupted..."); }
SHAR_EOF
if test 7434 -ne "`wc -c uniqbib.c`"
then
echo shar: error transmitting uniqbib.c '(should have been 7434 characters)'
fi
echo shar: extracting bibinfo.h '(2596 characters)'
sed 's/^XX//' << \SHAR_EOF > bibinfo.h
XX/*
XX	bibinfo.h - constants, types and variables to support operations
XX	on bibliography entries.
XX
XX	The BibEnt type is the basic unit of bibliographic information.
XX	It contains a buffer holding the text of the entry (unmodified),
XX	and structures which are set up when the entry is read in from
XX	a file.  These structures allow easy access to the various fields
XX	of the entry.  They also allow quick checks as to whether a field
XX	is present in the entry, how many authors there are, and so forth.
XX
XX	Macros and functions are used to access parts of records.  This
XX	allows the underlying implementation to change while preserving
XX	the programming interface.  Currently the implementation is
XX	as follows:
XX
XX	Lines of a bibliography entry are read one after the other
XX	into a buffer.  As each is read, the lines are categorized and
XX	information set up to allow each field to be accessed easily.
XX	For each field (except authors), one BibFld struct is set up.
XX	bfStart is the offset into the text buffer of the beginning of the
XX	field (starts at the % itself), and bfLen is the length of the
XX	field (includes length of all continuation lines).  Authors are
XX	anomalous, since there may be many of them; BibFld's for these are
XX	stored at the end of the regular field array, and there is a count
XX	variable.
XX*/
XX
XX# ifndef	NULL
XX# define	NULL	(0)
XX# endif
XX
XX
XX# define	beMaxBuf	1024	/* max size of entry's text */
XX# define	beMaxAuth	20	/* max authors allowed */
XX
XX
XX/*
XX	Location of one field within the text buffer of a BibEnt.
XX	A bfLen of zero is how a field is marked as absent.
XX*/
XXtypedef struct
XX{
XX	int	bfStart;	/* index of start of bib field text */
XX	int	bfLen;		/* length of bib field text */
XX} BibFld;
XX
XX
XX/*
XX	One bibliographic entry: the raw text plus an index of its fields.
XX	beFld slots 0..25 are addressed by field letter ('A'..'Z', see
XX	BEFldPtr); slots 26 and up hold the authors in order of
XX	appearance (see BEAuthPtr).
XX*/
XXtypedef struct
XX{
XX	int	beTextLen;		/* length of entry text */
XX	char	beText[beMaxBuf];	/* entry text */
XX	BibFld	beFld[26+beMaxAuth];	/* field information */
XX	int	beAuthCnt;		/* number of authors */
XX	long	beFilPos;		/* starting position in file */
XX} BibEnt;
XX
XX
XX/*
XX	Macros for accessing various pieces of the bibliographic
XX	entry.  Most of these can be used as lvalues.  Arguments
XX	should be as follows:
XX
XX	BibEnt	*be
XX	BibFld	*bf
XX	char	f	('B'..'Z')
XX	int	n	(0..BEAuthCnt(be)-1)
XX
XX*/
XX
XX# define	BEText(be)		(be->beText)
XX# define	BELen(be)		(be->beTextLen)
XX# define	BEFilPos(be)		(be->beFilPos)
XX
XX# define	BEHaveFld(be,f)		(BEFldLen(be,f) != 0)
XX# define	BEFldPtr(be,f)		(&(be->beFld[(f)-'A']))
XX
XX# define	BEFldStart(bf)		(bf->bfStart)
XX# define	BEFldLen(bf)		(bf->bfLen)
XX# define	BEFldText(be,bf)	(&(BEText(be)[BEFldStart(bf)]))
XX
XX# define	BEAuthPtr(be,n)		(BEFldPtr(be,'Z'+n+1))
XX# define	BEAuthLen(be,n)		(BEFldLen(be,'Z'+n+1))
XX# define	BEAuthText(be,n)	(BEFldText(be,'Z'+n+1))
XX
XX# define	BEAuthCnt(be)		(be->beAuthCnt)
XX
XX
XXBibEnt *BEAlloc ();
SHAR_EOF
if test 2596 -ne "`wc -c bibinfo.h`"
then
echo shar: error transmitting bibinfo.h '(should have been 2596 characters)'
fi
echo shar: extracting BibAlloc.c '(205 characters)'
sed 's/^XX//' << \SHAR_EOF > BibAlloc.c
XX/*
XX	BibEnt structure allocation and disposal
XX*/
XX
XX# include	"bibinfo.h"
XX
XXBibEnt *BEAlloc ()
XX{
XXchar	*calloc ();
XX
XX	return ((BibEnt *) calloc (1, sizeof (BibEnt)));
XX}
XX
XX
XX/*
XX	Dispose of a BibEnt by passing it to free.  Presumably bp came
XX	from BEAlloc; nothing here checks that.
XX*/
XX
XXBEFree (bp)
XXBibEnt	*bp;
XX{
XX	free (bp);
XX}
SHAR_EOF
if test 205 -ne "`wc -c BibAlloc.c`"
then
echo shar: error transmitting BibAlloc.c '(should have been 205 characters)'
fi
echo shar: extracting BibCmp.c '(1623 characters)'
sed 's/^XX//' << \SHAR_EOF > BibCmp.c
XX/*
XX	Compare two bibliographic entries for equality.  They do
XX	not need to have the fields in the same order (except that
XX	authors must be in the same order), just the same fields
XX	present, and the same information in corresponding fields.
XX
XX	Returns:
XX
XX	0	b1 = b2
XX	!0	b1 != b2
XX
XX	This should be extended to allow comparison to return ordering
XX	information by passing in sort ordering information.
XX
XX	First compare the number of authors.
XX	If equal, compare the authors by length and text.
XX	If equal, see if the same fields are present and have the same length
XX	and the same text.
XX	If equal, they're equal.
XX	Otherwise, not.
XX*/
XX
XX# include	<stdio.h>
XX# include	"bibinfo.h"
XX
XX# undef	DEBUG
XX# ifdef		DEBUG
XX# define	RETURN(x,y)	return (printf ("%s\n", x) ? y : y)
XX# else
XX# define	RETURN(x,y)	return (y)
XX# endif
XX
XXBECmp (b1, b2)
XXBibEnt	*b1, *b2;
XX{
XXint	i, len, n;
XXBibFld	*bfp1, *bfp2;
XX
XX	if ((n = BEAuthCnt (b1)) != BEAuthCnt (b2))
XX		RETURN ("author count", 1);
XX	for (i = 0; i < n; i++)
XX	{
XX		bfp1 = BEAuthPtr (b1, i);
XX		bfp2 = BEAuthPtr (b2, i);
XX		if ((len = BEFldLen (bfp1)) != BEFldLen (bfp2))
XX			RETURN ("author length", 1);
XX		if (strncmp (BEFldText (b1, bfp1), BEFldText (b2, bfp2), len))
XX			RETURN ("author text", 1);
XX	}
XX	for (i = 'B'; i <= 'Z'; i++)
XX	{
XX/*
XX	Don't have to check whether fields are present or not, since
XX	missing ones have length zero and will compare properly.
XX*/
XX		bfp1 = BEFldPtr (b1, i);
XX		bfp2 = BEFldPtr (b2, i);
XX		if ((len = BEFldLen (bfp1)) != BEFldLen (bfp2))
XX			RETURN ("field length", 1);
XX		if (strncmp (BEFldText (b1, bfp1), BEFldText (b2, bfp2), len))
XX			RETURN ("field text", 1);
XX	}
XX	RETURN ("same", 0);
XX}
SHAR_EOF
if test 1623 -ne "`wc -c BibCmp.c`"
then
echo shar: error transmitting BibCmp.c '(should have been 1623 characters)'
fi
echo shar: extracting BibRead.c '(2701 characters)'
sed 's/^XX//' << \SHAR_EOF > BibRead.c
XX/*
XX	Read a bibliographic entry from a file.
XX
XX	Read until have a non-blank line, then read until a blank line or
XX	EOF.  Return number of lines in citation.  (Zero means EOF, -1 error).
XX
XX	Errors are returned for the following conditions:
XX
XX	text of citation too long
XX	more than beMaxAuth authors
XX	first line does not begin with %
XX	% lines with non-capital letter following %
XX*/
XX
XX# include	<stdio.h>
XX# include	<ctype.h>
XX# include	<varargs.h>
XX# include	"bibinfo.h"
XX
XX
XXstatic	long	line = 0;
XXstatic	long	citation = 0;
XX
XX
XXBERead (f, bi)
XXFILE	*f;
XXBibEnt	*bi;
XX{
XXBibFld	*bf = NULL;
XXchar	*bp = BEText (bi);
XXint	left = sizeof (BEText (bi));
XXint	read = 1;
XXint	nLines = 0;
XXint	i, len;
XXint	c;
XX
XX/*
XX	Initialize entry to "nothing"
XX*/
XX
XX	BELen (bi) = 0;			/* no text */
XX	BEAuthCnt (bi) = 0;			/* no authors */
XX	for (i = 'A'; i <= 'Z'; i++)		/* no other fields, either */
XX		BEFldLen (BEFldPtr (bi, i)) = 0;
XX
XX	++citation;
XX
XX	while (read)
XX	{
XX		++line;
XX		if ((c = getc (f)) == EOF)
XX			break;
XX		if (c == '\n')			/* blank line */
XX		{
XX			read = (nLines == 0);	/* if haven't seen ref, keep */
XX			continue;		/* reading, else skip line */
XX		}
XX		if (++nLines == 1)		/* set pos on first line */
XX		{
XX			BEFilPos (bi) = ftell (f) - 1;
XX			if (c != '%')		/* make sure not cont. line */
XX			{
XX				BRError (bi, "First line in citation does not begin with '%%':\n%s\n", bp);
XX				return (-1);
XX			}
XX		}
XX		for (len = 0; left-- > 0; )	/* read line into buffer */
XX		{
XX			bp[len++] = c;
XX			if (c == '\n')		/* have full line now */
XX				break;
XX			if ((c = getc (f)) == EOF)
XX			{
XX				read = 0;	/* done reading */
XX				break;
XX			}
XX		}
XX		BELen (bi) += len;
XX		if (left < 0)
XX		{
XX			BRError (bi, "Citation too long, exceeds %d characters\n", sizeof (BEText (bi)));
XX			return (-1);
XX		}
XX		if (*bp == '%')			/* start of new field */
XX		{
XX			if (!isupper (bp[1]))
XX			{
XX				BRError (bi, "Bad key line\n");
XX				return (-1);
XX			}
XX			/*bp += 3;	/* skip to beginning of text */
XX			/*len -= 3;	/* adjust length of rest of line */
XX			if (bp[1] != 'A')
XX				bf = BEFldPtr (bi, bp[1]);
XX			else if (BEAuthCnt (bi) >= beMaxAuth)
XX			{
XX				BRError (bi, "Too many authors, only %d allowed.\n", beMaxAuth);
XX				return (-1);
XX			}
XX			else
XX				bf = BEAuthPtr (bi, BEAuthCnt (bi)++);
XX
XX			BEFldStart (bf) = bp - BEText (bi);
XX			BEFldLen (bf) = 0;
XX		}
XX		BEFldLen (bf) += len;
XX		bp += len;
XX	}
XX
XX	return (nLines);
XX}
XX
XX
XX
XXstatic BRError (va_alist)
XXva_dcl
XX{
XXva_list	args;
XXBibEnt	*bi;
XXchar	*fmt;
XX
XX	va_start (args);
XX	bi = va_arg (args, BibEnt *);
XX	fmt = va_arg (args, char *);
XX	fprintf (stderr, "Error at line %D (in citation %D)\n", line, citation);
XX	vfprintf (stderr, fmt, args);
XX	if (BELen (bi) > 0)
XX	{
XX		fprintf (stderr, "Text of entry up to error:\n");
XX		BEWrite (stderr, bi);
XX	}
XX	va_end (args);
XX}
SHAR_EOF
if test 2701 -ne "`wc -c BibRead.c`"
then
echo shar: error transmitting BibRead.c '(should have been 2701 characters)'
fi
echo shar: extracting BibWrite.c '(265 characters)'
sed 's/^XX//' << \SHAR_EOF > BibWrite.c
XX/*
XX	Write a bibliographic entry to a file.  This is simple; just
XX	write a newline followed by the text of the entry.
XX*/
XX
XX# include	<stdio.h>
XX# include	"bibinfo.h"
XX
XX
XXBEWrite (f, be)
XXFILE	*f;
XXBibEnt	*be;
XX{
XX	fputc ('\n', f);
XX	fwrite (BEText (be), 1, BELen (be), f);
XX}
SHAR_EOF
if test 265 -ne "`wc -c BibWrite.c`"
then
echo shar: error transmitting BibWrite.c '(should have been 265 characters)'
fi
echo shar: extracting panic.c '(670 characters)'
sed 's/^XX//' << \SHAR_EOF > panic.c
XX/*
XX	panic - print message and die with status 1.  Uses vfprintf
XX	so that panic can take variable argument lists.
XX
XX	setpanichook - install function to be called with no arguments
XX	after printing panic message and before exiting.  Can be used
XX	for additional cleanup, like removing temp files, etc.
XX*/
XX
XX# include	<stdio.h>
XX# include	<varargs.h>
XX
XX
XXstatic	int	(*panichook) () = NULL;
XX
XX/*
XX	Record p as the function panic calls, with no arguments, after
XX	printing its message and before exiting.  Passing NULL removes
XX	any hook, since panic skips the call when the hook is NULL.
XX*/
XXvoid setpanichook (p)
XXint	(*p) ();
XX{
XX	panichook = p;
XX}
XX
XX
XXvoid
XXpanic (va_alist)
XXva_dcl
XX{
XXva_list	args;
XXchar	*fmt;
XX
XX	va_start (args);
XX	fmt = va_arg (args, char *);
XX	vfprintf (stderr, fmt, args);
XX	va_end (args);
XX	fprintf (stderr, "\n");
XX	if (panichook != NULL)
XX		(*panichook) ();
XX	exit (1);
XX}
SHAR_EOF
if test 670 -ne "`wc -c panic.c`"
then
echo shar: error transmitting panic.c '(should have been 670 characters)'
fi
echo shar: extracting uniqbib.1 '(2518 characters)'
sed 's/^XX//' << \SHAR_EOF > uniqbib.1
XX.\" xroff -man.new % | lpr
XX.TH uniqbib 1
XX.UC 4
XX.SH NAME
XXuniqbib \- remove duplicates from bibliographic database
XX.SH SYNTAX
XX.B uniqbib
XX[
XX.B \-v
XX] [
XX.I file
XX]
XX.SH DESCRIPTION
XX.I Uniqbib
XXis used in conjunction with the suite of programs comprising the
XX.I refer
XXsystem.
XXOne problem with this system is that there is no provision for removal
XXof duplicates from bibliographic databases.
XX(Duplicates may occur, for instance, if the output from two or more
XX.I lookbib
XXqueries is combined, where entries
XXsatisfy the keywords of more than one search.)
XX.I Uniqbib
XXis used to remove such duplicates.
XXIt reads its input and writes all the unique entries therein to the
XXstandard output.
XXThe entries in the input do not have to be sorted (i.e., you do not have to
XXrun
XX.I sortbib
XXfirst), nor do the entries produced by
XX.I uniqbib
XXcome out in any particular order.
XX.PP
XXIf a file is named,
XX.I uniqbib
XXreads it; otherwise it reads its standard input.
XXThis means that
XX.I uniqbib
XXcan be used in pipelines.
XXIf more than one file is to be processed, use
XX.PP
XX.nf
XX.in +.5i
XXcat input1 input2 ... input\f2n\f1 | uniqbib > output
XX.in -.5i
XX.fi
XX.PP
XXEntries are considered duplicates if
XX(i) they contain the same authors, in the same order; and (ii)
XXthey contain the same non-author fields, and the content of corresponding
XXfields is identical.
XX.PP
XXFields (except for authors) do
XX.I not
XXhave to appear in the same order for two entries to be considered identical.
XXFor example, the following two entries are the same as far as
XX.I uniqbib
XXis concerned, because within the entries, the authors appear in the same
XXorder, and the other fields are identical with respect to content, even
XXthough they appear in different orders.
XX.PP
XX.nf
XX.in +.5i
XX.ta 2.5i
XX%A First Author		%T My title
XX%T My title		%J Some journal
XX%A Second Author	%A First Author
XX%D 1988	%A Second Author
XX%N 4	%D 1988
XX%J Some journal	%V 198
XX%V 198	%N 4
XX.in -.5i
XX.fi
XX.PP
XXThe following entries are
XX.I not
XXidentical:
XX.PP
XX.nf
XX.in +.5i
XX.ta 2.5i
XX%A First Author		%A Second Author
XX%A Second Author	%A First Author
XX%T My title		%T My title
XX%J Some journal	%J Some journal
XX%V 198	%V 198
XX%N 4	%N 4
XX%D 1988	%D 1988
XX.in -.5i
XX.fi
XX.PP
XXIf the
XX.B \-v
XXflag is given,
XX.I uniqbib
XXgoes into verbose mode;
XXit announces the beginning and end of each phase of analysis, and prints
XXa ``.'' as each citation is read during the initial and comparison phases.
XX.SH "SEE ALSO"
XXlookbib(1), refer(1), sortbib(1)
XX.SH "WHO-TO-BLAME"
XXPaul DuBois, dubois@rhesus.primate.wisc.edu
XX.SH BUGS
XXShould probably be named ``snail.''
SHAR_EOF
if test 2518 -ne "`wc -c uniqbib.1`"
then
echo shar: error transmitting uniqbib.1 '(should have been 2518 characters)'
fi
echo shar: extracting Makefile '(841 characters)'
sed 's/^XX//' << \SHAR_EOF > Makefile
XX# Where the uniqbib binary is installed, and the flags given to install.
XXBINDIR=/usr/local
XXINSTALL= -c -m 755 -o bin -g system
XX
XX# Archive holding the bibliography support routines.
XXLIBS=biblib
XX
XX# Formatter and macro package used by the "man" target.
XXTROFF=xroff
XXMANMACROS=man.new
XX
XX# Objects that make up biblib.
XXLOBJS=\
XX	BibAlloc.o\
XX	BibCmp.o\
XX	BibRead.o\
XX	BibWrite.o\
XX	panic.o
XX
XXall: uniqbib biblib
XXinstall: iuniqbib ibiblib
XXclean:
XX	rm -f *.o uniqbib biblib
XX
XXuniqbib: uniqbib.o biblib
XX	cc uniqbib.o ${LIBS} -o uniqbib
XXiuniqbib: uniqbib
XX	install ${INSTALL} uniqbib ${BINDIR}
XX
XXuniqbib.o: bibinfo.h
XX${LOBJS}: bibinfo.h
XX
XXbiblib: ${LOBJS}
XX	-rm -f biblib
XX	ar r biblib ${LOBJS}
XX	ranlib biblib
XX# The library is only used at link time, so there is nothing to install.
XXibiblib:
XX
XX# The macro package is selected with -${MANMACROS}, e.g. -man.new.
XXman: uniqbib.1
XX	${TROFF} -${MANMACROS} uniqbib.1 | lpr
XX
XXshar:
XX	cshar -a uniqbib.c bibinfo.h BibAlloc.c BibCmp.c BibRead.c \
XX		BibWrite.c panic.c uniqbib.1 Makefile \
XX		bad1 bad2 bad3 > uniqbib.sha
XX
XX# Run the freshly built binary on the three deliberately bad inputs.
XX# Each run is expected to report the error named just before it.
XXtest: uniqbib
XX	@echo ERROR IS: too many authors
XX	-./uniqbib bad1
XX	@echo ERROR IS: citation too long
XX	-./uniqbib bad2
XX	@echo ERROR IS: first line is non-key line
XX	-./uniqbib bad3
SHAR_EOF
if test 841 -ne "`wc -c Makefile`"
then
echo shar: error transmitting Makefile '(should have been 841 characters)'
fi
echo shar: extracting bad1 '(223 characters)'
sed 's/^XX//' << \SHAR_EOF > bad1
XX
XX%A auth 1
XX%A auth 2
XX%A auth 3
XX%A auth 4
XX%A auth 5
XX%A auth 6
XX%A auth 7
XX%A auth 8
XX%A auth 9
XX%A auth 10
XX%A auth 11
XX%A auth 12
XX%A auth 13
XX%A auth 14
XX%A auth 15
XX%A auth 16
XX%A auth 17
XX%A auth 18
XX%A auth 19
XX%A auth 20
XX%A auth 21
SHAR_EOF
if test 223 -ne "`wc -c bad1`"
then
echo shar: error transmitting bad1 '(should have been 223 characters)'
fi
echo shar: extracting bad2 '(1025 characters)'
sed 's/^XX//' << \SHAR_EOF > bad2
XX%A aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
XX%T ttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttt
XX%V vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
XX%N nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
XX%P PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP
XX%C ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
XX%I iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii
XX%D ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd
XX%K kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk
XX%O ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
XX%E eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
XX%B bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
XX%X xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
SHAR_EOF
if test 1025 -ne "`wc -c bad2`"
then
echo shar: error transmitting bad2 '(should have been 1025 characters)'
fi
echo shar: extracting bad3 '(3 characters)'
sed 's/^XX//' << \SHAR_EOF > bad3
XX
XXt
SHAR_EOF
if test 3 -ne "`wc -c bad3`"
then
echo shar: error transmitting bad3 '(should have been 3 characters)'
fi
#	End of shell archive
exit 0

-- 
Please send comp.sources.unix-related mail to rsalz@uunet.uu.net.