[alt.sources] cmpall -- find identical

lee@sqarc.sq.com (Liam R. E. Quin) (08/03/90)

I wrote cmpall some time ago, when I found that I had lots of copies and
duplicated directory hierarchies.

You say (for example)
	find $HOME | cmpall
and it says
	same /home/lee/src/lqtext/doc/README /home/lee/doc/README
and so forth.

This version only compares files that have the same name and size.
You could change this to compare files that have the same size if you
wanted -- I wasn't too bothered by that.  It won't (of course) detect
duplicates if one is compressed.

It works unchanged on SysV and SunOS, should be OK on other Unix systems.

No manpage.  Just do
	make cmpall
(you don't even need a makefile).

Let me know if it helps, or if you change it...

Lee

: To unbundle, sh this file
echo x - cmpall.c 1>&2
sed 's/^X//' >cmpall.c <<'@@@End of cmpall.c'
X/* $Header: /home/lee/src/cmpall/cmpall.c,v 1.2 90/08/02 17:22:12 lee Exp Locker: lee $
X *
X */
X
X#include <stdio.h>
X#include <malloc.h>
X#include <errno.h>
X#include <fcntl.h>
X#include <sys/types.h>
X#include <sys/stat.h>
X
Xextern int errno;
X
Xtypedef struct s_listel {
X    char *Name;
X    struct stat *statbuf;
X    int n_seen;
X    struct s_listel *Next;
X} t_listel;
X
X#define STREQ(henry,utzoo) ((*(henry)== *(utzoo))&&!strcmp(henry,utzoo))
X#define new(type) ((type *) malloc(sizeof(type)))
X
Xint UseBytes = 0;
Xchar *progname;
X
X/* read filenames a line at a time.
X * whenever we get a file we've seen before, compare it
X * with the other of the sames name.
X * If they are the same, print "choose -s first second"
X * otherwise, print "choose -d first second"
X */
Xmain(argc, argv)
X    int argc;
X    char *argv[];
X{
X    t_listel *NewEntry();
X    void AddEntry();
X    char *GetLine();
X    char *Path;
X
X    progname = argv[0];
X
X    if (argc != 1) {
X	if (argc == 2 && STREQ(argv[1], "-b")) {
X	    UseBytes = 1;
X	} else {
X	    (void) fprintf(stderr, "usage: find <dir> ... -print | %s [-b]\n",
X								progname);
X	    exit(1);
X	}
X    }
X
X    while ((Path = GetLine(stdin, "/dev/stdin")) != (char *) 0) {
X	AddEntry(stdin, "/dev/stdin", Path);
X    }
X
X    exit(0);
X}
X
Xt_listel *SeenSoFar = (t_listel *) 0;
X
Xvoid
XAddEntry(fd, Name, Path)
X    FILE *fd;
X    char*Name;
X    char *Path;
X{
X    extern char *strrchr();
X
X    t_listel *e;
X    t_listel **lpp;
X
X    if ((e = new(t_listel)) == (t_listel *) 0) {
X	(void) fprintf(stderr, "no RAM reading \"%s\" ", Name);
X	/* possibly we dumped core just then... */
X	(void) fprintf(stderr, "at \"%s\"\n", Path);
X	exit(1);
X    }
X
X    e->Name = Path;
X    e->n_seen = 1;
X    e->statbuf = (struct stat *) 0;
X
X    for (lpp = &SeenSoFar; *lpp; lpp = &(*lpp)->Next) {
X	register char *Old = (*lpp)->Name;
X	int i;
X	char *PathLastSlash;
X	char *OldLastSlash;
X
X	if ((PathLastSlash = strrchr(Path, '/')) == (char *) 0) {
X	    PathLastSlash = Path;
X	} else {
X	    ++PathLastSlash;
X	}
X
X	if ((OldLastSlash = strrchr(Old, '/')) == (char *) 0) {
X	    OldLastSlash = Old;
X	} else {
X	    ++OldLastSlash;
X	}
X
X	if (*PathLastSlash == *OldLastSlash) {
X	    i = strcmp(PathLastSlash, OldLastSlash);
X	} else {
X	    i = *PathLastSlash - *OldLastSlash;
X	}
X
X	if (i < 0) continue; /* not there yet */
X	if (i == 0) {
X	    /* found it... */
X
X	    /* If the files are the same, we should add them to a single
X	     * linked list and traverse it at the end, I suppose!
X	     */
X	    if (Compare(fd, Name, *lpp, e) == 0) {
X		/* They were the same, so no point storing this one... */
X		(void) printf("same '%s' '%s'\n", (*lpp)->Name, e->Name);
X		if (e->statbuf) {
X		    (void) free((char *) e->statbuf);
X		}
X		(void) free((char *) e);
X		(void) free(Path);
X		return;
X	    }
X	    /* the files were different, so add the new one to the list */
X	    /* FALLTHROUGH */
X	}
X	break;
X    }
X
X    /* lpp points either at the last "Next", or at the Next
X     * of the element before which we must insert a new entry...
X     */
X
X    e->Next = (*lpp);
X    *lpp = e;
X    return;
X}
X
X/*ARGSUSED*/
Xint
XCompare(fd, Name, Old, New)
X    FILE *fd;
X    char *Name;
X    t_listel *Old;
X    t_listel *New;
X{
X    /* We return
X     * 0 if the files are identical
X     * -1 if we can't stat Path
X     * 1 if the files are (or might be) different
X     * 2 if they are definitely different
X     *
X     * If we can stat Path but not e, we return 3, and delete e
X     * from the list.
X     */
X    if (Old->statbuf == (struct stat *) 0) {
X	if ((Old->statbuf = new(struct stat)) == (struct stat *) 0) {
X	    (void) fprintf(stderr, "reading \"%s\", no RAM to stat \"%s\"\n",
X							Name, Old->Name);
X	    exit(1);
X	}
X	if (stat(Old->Name, Old->statbuf) < 0) {
X	    Delete(Old);
X	    return 3;
X	}
X    }
X
X    if (New->statbuf == (struct stat *) 0) {
X	if ((New->statbuf = new(struct stat)) == (struct stat *) 0) {
X	    (void) fprintf(stderr, "reading \"%s\", no RAM to stat \"%s\"\n",
X							Name, New->Name);
X	    exit(1);
X	}
X	if (stat(New->Name, New->statbuf) < 0) {
X	    return -1;
X	}
X    }
X
X    /* if they are not normal files, they are the same if the major/minor
X     * numbers are the same:
X     */
X    /* NOTDONE */
X    
X    /** Now compare the files: */
X
X    /*  first, are they the same size? */
X    if (Old->statbuf->st_size != New->statbuf->st_size) {
X	return 2; /* definitely different */
X    }
X
X    if (Old->statbuf->st_ino == New->statbuf->st_ino &&
X	Old->statbuf->st_dev == New->statbuf->st_dev &&
X	Old->statbuf->st_nlink == New->statbuf->st_nlink) {
X
X	/* they are linked to each other, so they are the same! */
X	return 0; /* this is only really a heuristic */
X    }
X
X    switch (CmpFile(Old->Name, New->Name)) {
X    case 0: return 0;
X    case 1: return 2; /* different! */
X    case -1: /* no a */
X    case -2: /* no b */
X	return -1;
X    default:
X	return -1;
X    }
X}
X
Xint
XCmpFile(a, b)
X    char *a, *b;
X{
X    /* return 0 if same, -ve on error, 1 otherwise */
X    int afd = open(a, O_RDONLY, 0);
X    int bfd = open(b, O_RDONLY, 0);
X    char ablk[1024], bblk[1024];
X    int ar, br;
X
X    if (afd < 0) return -1;
X    if (bfd < 0) return -2;
X
X    while ((ar = read(afd, ablk, 1024)) == (br = read(bfd, bblk, 1024))) {
X	if ((ar == 0) || memcmp(ablk, bblk, ar) != 0) {
X	    (void) close(afd);
X	    (void) close(bfd);
X	    /* ar == br == 0: got to EOF, they are the same */
X	    return (ar == 0) ? 0 : 1;
X	} else if (ar < 0) {
X	    return -3; /* read error! */
X	}
X    }
X    /* Since we use stat to check the size before calling this function,
X     * this should never happen... except in the case of a read error.
X     */
X    return 1; /* different lengths */
X}
X
Xchar *
XGetLine(fd, Name)
X    FILE *fd;
X    char *Name;
X{
X    char *Result;
X    register char *p;
X    unsigned int len = 20;
X    int ch;
X
X    if ((Result = malloc(len)) == (char *) 0)  {
X	(void) fprintf(stderr, "out of mem reading \"%s\"\n", Name);
X	exit(1);
X    }
X
X    p = Result;
X
X    while ((ch = getc(fd)) != EOF) {
X	if (ch == '\n') break;
X	if (p - Result >= len - 1) {
X	    int WhereWeWere = p - Result;
X
X	    if ((Result = realloc(Result, len += 20)) == (char *) 0) {
X		(void) fprintf(stderr, "no more mem in \"%s\"\n", Name);
X		exit(1);
X	    }
X
X	    /* Realloc() might have returned a different address,
X	     * so we must ensure that p points to the new array...
X	     */
X	    p = &Result[WhereWeWere];
X	}
X	*p++ = ch;
X    }
X    *p = '\0';
X
X    if (ch == EOF) {
X	if (Result[0] != '\0') {
X	    return Result;
X	}
X	return (char *) 0;
X    }
X    if (p - Result > 1) {
X	Result = realloc(Result, (unsigned) (p - Result + 1));
X	if (Result == (char *) 0) {
X	    (void) fprintf(stderr, "can't save line in \"%s\"\n", Name);
X	    exit(1);
X	}
X    }
X    return Result;
X}
X
XDelete(e)
X    t_listel *e;
X{
X    register t_listel **lpp;
X
X    for (lpp = &SeenSoFar; *lpp; lpp = &(*lpp)->Next) {
X	if ((*lpp)->Name == e->Name) { /* same address, NOT strcmp! */
X	    t_listel *next = e->Next;
X
X	    (void) free(e->Name);
X	    if (e->statbuf) {
X		(void) free((char *) e->statbuf);
X	    }
X	    (void) free((char *) e);
X	    *lpp = next;
X
X	    break;
X	}
X    }
X}
X
X/*
X * $Log:	cmpall.c,v $
X * Revision 1.2  90/08/02  17:22:12  lee
X * delinted it a little...
X * 
X * Revision 1.1  90/08/02  17:04:26  lee
X * Initial revision
X * 
X *
X */
@@@End of cmpall.c
ls -l cmpall.c
exit 0

Lee
-- 
Liam R. E. Quin,  lee@sq.com, {utai,utzoo}!sq!lee,  SoftQuad Inc., Toronto
``He left her a copy of his calculations [...]  Since she was a cystologist,
  she might have analysed the equations, but at the moment she was occupied
  with knitting a bootee.''  [John Boyd, Pollinators of Eden, 217]

merlyn@iwarp.intel.com (Randal Schwartz) (08/03/90)

In article <1990Aug2.212411.3078@sq.sq.com>, lee@sqarc (Liam R. E. Quin) writes:
| I wrote cmpall some time ago, when I found that I had lots of copies and
| duplicated directory hierarchies.

"That's not a knife... *This* (pulls out his piece) is a knife!"

Here's a little ditty called "findsame".  It first does what you do,
in that it finds files that are the same length.  For all files of the
same length, it then runs "sum" to see who's actually the same, and
then "ls -l"'s the matching pairs combinations.

Perl, of course.

================================================== cut here
#!/local/usr/bin/perl

$| = 1;

@ARGV = ('.') unless $#ARGV >= 0;

open(F,"find @ARGV -type f -print|") || die "Cannot open find ($!)";
while (<F>) {
	chop;
	@stat = stat($_);
	$bysize{$stat[7]} .= "$_\n";
}
close(F);

sub numeric {
	0+$a < 0+$b ? -1 : 1;
}

for $asize (sort numeric keys(bysize)) {
	@files = split(/\n/, $bysize{$asize});
	next if $#files <= 0;
	unless(open(S,"-|")) {
		exec "sum", @files;
	}
	%bysum = ();
	while (<S>) {
		chop;
		@F = split;
		$bysum{$F[0]} .= "$F[2]\n";
	}
	close(S);
	for $asum (sort numeric keys(bysum)) {
		@files = split(/\n/, $bysum{$asum});
		next if $#files <= 0;
		system 'ls','-li', @files;
		print "\n"; # to separate them by blank lines
	}
}
==================================================

It's not the best Perl code (I wrote it a long time ago).  I'd
probably rewrite it quite a bit now. :-)

Just another Perl hacker,
-- 
/=Randal L. Schwartz, Stonehenge Consulting Services (503)777-0095 ==========\
| on contract to Intel's iWarp project, Beaverton, Oregon, USA, Sol III      |
| merlyn@iwarp.intel.com ...!any-MX-mailer-like-uunet!iwarp.intel.com!merlyn |
\=Cute Quote: "Welcome to Portland, Oregon, home of the California Raisins!"=/