[comp.sources.misc] v17i013: finddup - Find duplicate files, Part01/01

davidsen@crdos1.crd.ge.com (Bill Davidsen) (02/24/91)

Submitted-by: Bill Davidsen <davidsen@crdos1.crd.ge.com>
Posting-number: Volume 17, Issue 13
Archive-name: finddup/part01
Supersedes: finddup: Volume 16, Issue 95

#!/bin/sh
# shar:	Shell Archiver  (v1.29)
#
#	Run the following text with /bin/sh to create:
#	  finddup.c
#	  finddup.1C
#	  makefile
#	  getopt.c
#
echo "x - extracting finddup.c (Text)"
sed 's/^X//' << 'SHAR_EOF' > finddup.c &&
X/****************************************************************\
X|  finddup.c - the one true find duplicate files program
X|----------------------------------------------------------------
X|  Bill Davidsen, last hacked 2/22/91
X|  Copyright (c) 1991 by Bill Davidsen, all rights reserved. This
X|  program may be freely used and distributed in its original
X|  form. Modified versions must be distributed with the original
X|  unmodified source code, and redistribution of the original code
X|  or any derivative program may not be restricted.
X|----------------------------------------------------------------
X|  Calling sequence:
X|   finddup [-l] checklist
X|
X|  where checklist is the name of a file containing filenames to 
X|  be checked, such as produced by "find . -type f -print >file"
X|  returns a list of linked and duplicated files.
X|
X|  If the -l option is used the hard links will not be displayed.
X\***************************************************************/
X
X#include <stdio.h>
X#include <sys/types.h>
X#include <sys/stat.h>
X#include <malloc.h>
X
X/* parameters */
X#define MAXFN	120             /* max filename length */
X
X/* constants */
X#define EOS		((char) '\0')	/* end of string */
X#define FL_CRC	0x0001			/* flag if CRC valid */
X#define FL_DUP	0x0002			/* files are duplicates */
X#define FL_LNK	0x0004			/* file is a link */
X
X/* macros */
X#ifdef DEBUG
X#define debug(X) if (DebugFlg) printf X
X#define OPTSTR	"lhd"
X#else
X#define debug(X)
X#define OPTSTR	"lh"
X#endif
X#define SORT qsort((char *)filelist, n_files, sizeof(filedesc), comp1);
X#define GetFlag(x,f) ((filelist[x].flags & (f)) != 0)
X#define SetFlag(x,f) (filelist[x].flags |= (f))
X
Xtypedef struct {
X	off_t length;				/* file length */
X	unsigned long crc32;		/* CRC for same length */
X	dev_t device;				/* physical device # */
X	ino_t inode;				/* inode for link detect */
X	off_t nameloc;				/* name loc in names file */
X	char flags;					/* flags for compare */
X} filedesc;
X
Xfiledesc *filelist;				/* master sorted list of files */
Xlong n_files = 0;				/* # files in the array */
Xlong max_files = 0;				/* entries allocated in the array */
Xint linkflag = 1;				/* show links */
Xint DebugFlg = 0;				/* inline debug flag */
XFILE *namefd;					/* file for names */
Xextern int
X	opterr,						/* error control flag */
X	optind;						/* index for next arg */
X
X/* help message, in a table format */
Xstatic char *HelpMsg[] = {
X	"Calling sequence:",
X    "",
X	"  finddup [options] list",
X	"",
X	"where list is a list of files to check, such as generated",
X	"by \"find . -type f -print > file\"",
X	"",
X	"Options:",
X	"  -l - don't list hard links",
X#ifdef DEBUG
X	"  -d - debug (must compile with DEBUG)"
X#endif /* ?DEBUG */
X};
Xstatic HelpLen = sizeof(HelpMsg)/sizeof(char *);
X
X#ifndef	lint
Xstatic char *SCCSid[] = {
X	"@(#)finddup.c v1.13 - 2/22/91",
X	"Copyright (c) 1991 by Bill Davidsen, all rights reserved"
X};
X#endif
X
Xint comp1();					/* compare two filedesc's */
Xvoid scan1();					/* make the CRC scan */
Xvoid scan2();					/* do full compare if needed */
Xvoid scan3();					/* print the results */
Xunsigned long get_crc();		/* get crc32 on a file */
Xchar *getfn();					/* get a filename by index */
X
Xmain(argc, argv)
Xint argc;
Xchar *argv[];
X{
X	char curfile[MAXFN];
X	struct stat statbuf;
X	int ch;
X	int firsterr = 0;			/* flag on 1st error for format */
X	int firsttrace = 0;			/* flag for 1st trace output */
X	off_t loc;            		/* location of name in the file */
X	int zl_hdr = 1;				/* need header for zero-length files list */
X	filedesc *curptr;			/* pointer to current storage loc */
X
X	/* parse options, if any */
X	opterr = 0;
X	while ((ch = getopt(argc, argv, OPTSTR)) != EOF) {
X		switch (ch) {
X		case 'l': /* set link flag */
X			linkflag = 0;
X			break;
X#ifdef DEBUG
X		case 'd': /* debug */
X			++DebugFlg;
X			break;
X#endif /* ?DEBUG */
X		case 'h': /* help */
X		case '?':
X			for (ch = 0; ch < HelpLen; ++ch) {
X				printf("%s\n", HelpMsg[ch]);
X			}
X			exit(1);
X		}
X	}
X
X	/* correct for the options */
X	argc -= (optind-1);
X	argv += (optind-1);
X
X	/* check for filename given, and open it */
X	if (argc != 2) {
X		fprintf(stderr, "Needs name of file with filenames\n");
X		exit(1);
X	}
X	namefd = fopen(argv[1], "r");
X	if (namefd == NULL) {
X		perror("Can't open names file");
X		exit(1);
X	}
X
X	/* start the list of name info's */
X	filelist = (filedesc *) malloc(50 * sizeof(filedesc));
X	if (filelist == NULL) {
X		perror("Can't start files vector");
X		exit(1);
X	}
X	/* finish the pointers */
X	max_files = 50;
X	debug(("First vector allocated @ %08lx, size %d bytes\n",
X		(long) filelist, 50*sizeof(filedesc)
X	));
X	fprintf(stderr, "build list...");
X
X	/* this is the build loop */
X	while (loc = ftell(namefd), fgets(curfile, MAXFN, namefd) != NULL) {
X		/* check for room in the buffer */
X		if (n_files == max_files) {
X			/* allocate more space */
X			max_files += 50;
X			filelist =
X				(filedesc *) realloc(filelist, (max_files)*sizeof(filedesc));
X			if (filelist == NULL) {
X				perror("Out of memory!");
X				exit(1);
X			}
X			debug(("Got more memory!\n"));
X		}
X		curfile[strlen(curfile)-1] = EOS;
X
X		/* add the data for this one */
X		if (stat(curfile, &statbuf)) {
X			fprintf(stderr, "%c  %s - ", 
X				(firsterr++ == 0 ? '\n' : '\r'), curfile
X			);
X			perror("ignored");
X			continue;
X		}
X
X		/* check for zero length files */
X		if ( statbuf.st_size == 0) {
X			if (zl_hdr) {
X				zl_hdr = 0;
X				printf("Zero length files:\n\n");
X			}
X			printf("%s\n", curfile);
X			continue;
X		}
X
X		curptr = filelist + n_files++;
X		curptr->nameloc = loc;
X		curptr->length = statbuf.st_size;
X		curptr->device = statbuf.st_dev;
X		curptr->inode = statbuf.st_ino;
X		curptr->flags = 0;
X		debug(("%cName[%d] %s, size %ld, inode %d\n",
X			(firsttrace++ == 0 ? '\n' : '\r'), n_files, curfile,
X			(long) statbuf.st_size, statbuf.st_ino
X		));
X	}
X
X	/* sort the list by size, device, and inode */
X	fprintf(stderr, "sort...");
X	SORT;
X
X	/* make the first scan for equal lengths */
X	fprintf(stderr, "scan1...");
X	scan1();
X
X	/* make the second scan for dup CRC also */
X	fprintf(stderr, "scan2...");
X	scan2();
X
X	fprintf(stderr, "done\n");
X
X#ifdef DEBUG
X	for (loc = 0; DebugFlg > 1 && loc < n_files; ++loc) {
X		curptr = filelist + loc;
X		printf("%8ld %08lx %6d %6d %02x\n",
X			curptr->length, curptr->crc32,
X			curptr->device, curptr->inode,
X			curptr->flags
X		);
X	}
X#endif
X
X	/* now scan and output dups */
X	scan3();
X
X	exit(0);
X}
X
X/* comp1 - compare two values */
Xint
Xcomp1(p1, p2)
Xchar *p1, *p2;
X{
X	register filedesc *p1a = (filedesc *)p1, *p2a = (filedesc *)p2;
X	register int retval;
X
X	(retval = p1a->length - p2a->length) ||
X	(retval = p1a->crc32 - p2a->crc32) ||
X	(retval = p1a->device - p2a->device) ||
X	(retval = p1a->inode - p2a->inode);
X	
X	return retval;
X}
X
X/* scan1 - get a CRC32 for files of equal length */
X
Xvoid
Xscan1() {
X	FILE *fp;
X	int ix, needsort = 0;
X
X	for (ix = 1; ix < n_files; ++ix) {
X		if (filelist[ix-1].length == filelist[ix].length) {
X			/* get a CRC for each */
X			if (! GetFlag(ix-1, FL_CRC)) {
X				filelist[ix-1].crc32 = get_crc(ix-1);
X				SetFlag(ix-1, FL_CRC);
X			}
X			if (! GetFlag(ix, FL_CRC)) {
X				filelist[ix].crc32 = get_crc(ix);
X				SetFlag(ix, FL_CRC);
X			}
X			needsort = 1;
X		}
X	}
X
X	if (needsort) SORT;
X}
X
X/* scan2 - full compare if CRC is equal */
X
Xvoid
Xscan2() {
X	int ix, ix2, lastix;
X	int inmatch;				/* 1st filename has been printed */
X	int need_hdr = 1;			/* Need a hdr for the hard link list */
X	int lnkmatch;				/* flag for matching links */
X	register filedesc *p1, *p2;
X	filedesc wkdesc;
X
X	/* mark links and output before dup check */
X	for (ix = 0; ix < n_files; ix = ix2) {
X		p1 = filelist + ix;
X		for (ix2 = ix+1, p2 = p1+1, inmatch = 0;
X			ix2 < n_files
X				&& p1->device == p2->device
X				&& p1->inode == p2->inode;
X			++ix2, ++p2
X		) {
X			SetFlag(ix2, FL_LNK);
X			if (linkflag) {
X				if (need_hdr) {
X					need_hdr = 0;
X					printf("\n\nHard link summary:\n\n");
X				}
X
X				if (!inmatch) {
X					inmatch = 1;
X					printf("\nFILE: %s\n", getfn(ix));
X				}
X				printf("LINK: %s\n", getfn(ix2));
X			}
X		}
X	}
X	debug(("\nStart dupscan"));
X
X	/* now really scan for duplicates */
X	for (ix = 0; ix < n_files; ix = lastix) {
X		p1 = filelist + ix;
X		for (lastix = ix2 = ix+1, p2 = p1+1, lnkmatch = 1;
X			ix2 < n_files
X				&& p1->length == p2->length
X				&& p1->crc32 == p2->crc32;
X			++ix2, ++p2
X		) {
X			if ((GetFlag(ix2, FL_LNK) && lnkmatch)
X				|| fullcmp(ix, ix2) == 0
X			) {
X				SetFlag(ix2, FL_DUP);
X				/* move if needed */
X				if (lastix != ix2) {
X					int n1, n2;
X
X					debug(("\n  swap %d and %d", lastix, ix2));
X					wkdesc = filelist[ix2];
X					for (n1 = ix2; n1 > lastix; --n1) {
X						filelist[n1] = filelist[n1-1];
X					}
X					filelist[lastix++] = wkdesc;
X				}
X				lnkmatch = 1;
X			}
X			else {
X				/* other links don't match */
X				lnkmatch = 0;
X			}
X		}
X	}
X}
X
X/* scan3 - output dups */
X
Xvoid
Xscan3()
X{
X	register filedesc *p1, *p2;
X	int ix, ix2, inmatch, need_hdr = 1;
X	char *fnptr;				/* pointer to the filename for sups */
X	char headfn[132];			/* temp holder of the filename */
X
X	/* now repeat for duplicates, links or not */
X	for (ix = 0; ix < n_files; ++ix) {
X		if (GetFlag(ix, FL_DUP)) {
X			/* put out a header if you haven't */
X			if (!inmatch) {
X				strcpy(headfn, getfn(ix-1));
X				fnptr = headfn;
X			}
X			inmatch = 1;
X			if (linkflag || !GetFlag(ix, FL_LNK)) {
X				/* header on the very first */
X				if (need_hdr) {
X					need_hdr = 0;
X					printf("\n\nList of files with duplicate contents");
X					if (linkflag) printf(" (includes hard links)");
X					putchar('\n');
X				}
X
X				/* 1st filename if any dups */
X				if (fnptr != NULL) {
X					printf("\nFILE: %s\n", fnptr);
X					fnptr = NULL;
X				}
X				printf("DUP:  %s\n", getfn(ix));
X			}
X		}
X		else inmatch = 0;
X	}
X}
X
X/* get_crc - get a CRC32 for a file */
X
Xunsigned long
Xget_crc(ix)
Xint ix;
X{
X	FILE *fp;
X	register unsigned long val1 = 0x90909090, val2 = 0xeaeaeaea;
X	register int carry;
X	int ch;
X	char fname[MAXFN];
X
X	/* open the file */
X	fseek(namefd, filelist[ix].nameloc, 0);
X	fgets(fname, MAXFN, namefd);
X	fname[strlen(fname)-1] = EOS;
X	debug(("\nCRC start - %s ", fname));
X	if ((fp = fopen(fname, "r")) == NULL) {
X		fprintf(stderr, "Can't read file %s\n", fname);
X		exit(1);
X	}
X
X	/* build the CRC values */
X	while ((ch = fgetc(fp)) != EOF) {
X		carry = (val1 & 0x8000000) != 0;
X		val1 = (val1 << 1) ^ ch + carry;
X		val2 += ch << (ch & 003);
X	}
X	fclose(fp);
X	debug(("v1: %08lx v2: %08lx ", val1, val2));
X
X	return ((val1 & 0xffff) << 12) ^ (val2 && 0xffffff);
X}
X
X/* getfn - get filename from index */
X
Xchar *
Xgetfn(ix)
Xoff_t ix;
X{
X	static char fnbuf[MAXFN];
X
X	fseek(namefd, filelist[ix].nameloc, 0);
X	fgets(fnbuf, MAXFN, namefd);
X	fnbuf[strlen(fnbuf)-1] = EOS;
X
X	return fnbuf;
X}
X
X/* fullcmp - compare two files, bit for bit */
X
Xint
Xfullcmp(v1, v2)
Xint v1, v2;
X{
X	FILE *fp1, *fp2;
X	char filename[MAXFN];
X	register int ch;
X
X	/* open the files */
X	strcpy(filename, getfn(v1));
X	fp1 = fopen(filename, "r");
X	if (fp1 == NULL) {
X		fprintf(stderr, "%s: ", filename);
X		perror("can't access for read");
X		exit(1);
X	}
X	debug(("\nFull compare %s\n         and", filename));
X
X	strcpy(filename, getfn(v2));
X	fp2 = fopen(filename, "r");
X	if (fp2 == NULL) {
X		fprintf(stderr, "%s: ", filename);
X		perror("can't access for read");
X		exit(1);
X	}
X	debug(("%s", filename));
X
X	/* now do the compare */
X	while ((ch = getc(fp1)) != EOF) {
X		if (ch - getc(fp2)) break;
X	}
X
X	/* close files and return value */
X	fclose(fp1);
X	fclose(fp2);
X	debug(("\n      return %d", !(ch == EOF)));
X	return (!(ch == EOF));
X}
SHAR_EOF
chmod 0444 finddup.c || echo "restore of finddup.c fails"
echo "x - extracting finddup.1C (Text)"
sed 's/^X//' << 'SHAR_EOF' > finddup.1C &&
X.TH FINDDUP 1 LOCAL
X.SH NAME
Xfinddup - find duplicate files in a list
X.SH SYNOPSIS
Xfinddup [options] filename
X.SH DESCRIPTION
X.ds fd \fBfinddup\fP
X\*(fd reads a list of filenames from the named file and scans them,
Xbuilding a list of duplicate files and hard links. These are then
Xwritten to stdout for the user's information. This can be used to reduce
Xdisk usage, etc.
X.SS OPTIONS
X  -l - don't show info on hard links
X  -d - debug. May be used more than once for more info
X.SS How it works
X\*(fd stats each name and saves the file length, device, and inode. It
Xthen sorts the list and builds a CRC for each file which has the same
Xlength as another file. For files which have the same length and CRC, a
Xbyte by byte comparison is done to be sure that they are duplicates.
X.sp
XThe CRC step for N files of size S bytes requires reading n*S total
Xbytes, while the byte by byte check must be done for every file against
Xevery other, and read S*N*(N-1) bytes. Thus the CRC is a large timesaver
Xin most cases.
X.SH EXAMPLES
X $ find /u -type f -print > file.list.tmp
X $ finddup file.list.tmp
X.SH FILES
XOnly the file with the filenames.
X.SH SEE ALSO
Xfind(1).
X.SH DIAGNOSTICS
XIf files are named in the specification file but not present they will
Xbe ignored. If an existing file can not be read the program will
Xterminate rather than generate an incomplete list of duplicates.
X.SH LIMITATIONS
XAn option to generate a partial list could be added when a file can't be
Xaccessed. An option to list only duplites which are not hard links could
Xbe added.
X.SH AUTHOR
XBill Davidsen, davidsen@crdos1.crd.ge.com
X.SH Copyright
XCopyright (c) 1991 by Bill Davidsen, all rights reserved. This
Xprogram may be freely used and distributed in its original
Xform. Modified versions must be distributed with the original
Xunmodified source code, and redistribution of the original code
Xor any derivative program may not be restricted.
SHAR_EOF
chmod 0666 finddup.1C || echo "restore of finddup.1C fails"
echo "x - extracting makefile (Text)"
sed 's/^X//' << 'SHAR_EOF' > makefile &&
X# makefile for finddup
X
X# ================> custom here
XCC		= /bin/cc
XCFLAGS		= -O
XLFLAGS		=
XLIBS		=
X# if you don't have getopt on your library, add getopt.o here
XOPBLST		= finddup.o
X# your favorite shar prog
XSHAR		= shar
XSHARLST		= finddup.c finddup.1C makefile getopt.c
XBACKLST		= s.finddup.c finddup.1C makefile getopt.c
X# ================> custom ends
X
Xfinddup:	$(OBJLST)
X	$(CC) -o finddup $(LFLAGS) $(OBJLST) -s
X
Xdebug:		rfinddup
X
X# note that this does NOT leave an object file
Xrfinddup:	finddup.c
X	$(CC) -o rfinddup -g -DDEBUG finddup.c
X
Xshar:		finddup.shar
X
Xfinddup.shar:	$(SHARLST)
X	$(SHAR) $(SHARLST) > finddup.shar
X
Xclean:
X	if [ -f p.finddup.c -o ! -f s.finddup.c ]; then exit 1; fi
X	rm -f finddup.[co] finddup rfinddup finddup.shar
X
Xbackup:		clean finddup.zoo
X
Xfinddup.zoo:	$(BACKLST)
X	zoo aunM finddup $(BACKLST)
SHAR_EOF
chmod 0644 makefile || echo "restore of makefile fails"
echo "x - extracting getopt.c (Text)"
sed 's/^X//' << 'SHAR_EOF' > getopt.c &&
X/*
X**	@(#)getopt.c	2.5 (smail) 9/15/87
X*/
X
X/*
X * Here's something you've all been waiting for:  the AT&T public domain
X * source for getopt(3).  It is the code which was given out at the 1985
X * UNIFORUM conference in Dallas.  I obtained it by electronic mail
X * directly from AT&T.  The people there assure me that it is indeed
X * in the public domain.
X * 
X * There is no manual page.  That is because the one they gave out at
X * UNIFORUM was slightly different from the current System V Release 2
X * manual page.  The difference apparently involved a note about the
X * famous rules 5 and 6, recommending using white space between an option
X * and its first argument, and not grouping options that have arguments.
X * Getopt itself is currently lenient about both of these things White
X * space is allowed, but not mandatory, and the last option in a group can
X * have an argument.  That particular version of the man page evidently
X * has no official existence, and my source at AT&T did not send a copy.
X * The current SVR2 man page reflects the actual behavor of this getopt.
X * However, I am not about to post a copy of anything licensed by AT&T.
X */
X
X
X/*LINTLIBRARY*/
X#define NULL	0
X#define EOF	(-1)
X#define ERR(s, c)	if(opterr){\
X	extern int write();\
X	char errbuf[2];\
X	errbuf[0] = c; errbuf[1] = '\n';\
X	(void) write(2, argv[0], (unsigned)strlen(argv[0]));\
X	(void) write(2, s, (unsigned)strlen(s));\
X	(void) write(2, errbuf, 2);}
X
Xextern char *strchr();
X
Xint	opterr = 1;
Xint	optind = 1;
Xint	optopt;
Xchar	*optarg;
X
Xint
Xgetopt(argc, argv, opts)
Xint	argc;
Xchar	**argv, *opts;
X{
X	static int sp = 1;
X	register int c;
X	register char *cp;
X
X	if(sp == 1)
X		if(optind >= argc ||
X		   argv[optind][0] != '-' || argv[optind][1] == '\0')
X			return(EOF);
X		else if(strcmp(argv[optind], "--") == NULL) {
X			optind++;
X			return(EOF);
X		}
X	optopt = c = argv[optind][sp];
X	if(c == ':' || (cp=strchr(opts, c)) == NULL) {
X		ERR(": illegal option -- ", c);
X		if(argv[optind][++sp] == '\0') {
X			optind++;
X			sp = 1;
X		}
X		return('?');
X	}
X	if(*++cp == ':') {
X		if(argv[optind][sp+1] != '\0')
X			optarg = &argv[optind++][sp+1];
X		else if(++optind >= argc) {
X			ERR(": option requires an argument -- ", c);
X			sp = 1;
X			return('?');
X		} else
X			optarg = argv[optind++];
X		sp = 1;
X	} else {
X		if(argv[optind][++sp] == '\0') {
X			sp = 1;
X			optind++;
X		}
X		optarg = NULL;
X	}
X	return(c);
X}
SHAR_EOF
chmod 0644 getopt.c || echo "restore of getopt.c fails"
exit 0
--
bill davidsen (ibmbin-request@crdgw1.crd.ge.com, uunet!crdgw1!ibmbin-request)
  moderator c.b.i.p newsgroup and 386users mailing list.
  binary submissions to ibmbin@crdgw1.crd.ge.com (uunet!crdgw1!ibmbin)
            "Stupidity, like virtue, is its own reward" -me

exit 0 # Just in case...
-- 
Kent Landfield                   INTERNET: kent@sparky.IMD.Sterling.COM
Sterling Software, IMD           UUCP:     uunet!sparky!kent
Phone:    (402) 291-8300         FAX:      (402) 291-4362
Please send comp.sources.misc-related mail to kent@uunet.uu.net.