[alt.sources] duplicate image detection utilities

phillips@cs.ubc.ca (George Phillips) (08/07/90)

Here are a couple of programs that I find useful for making a
first pass at detecting duplicate images.  The first program is
a PBM+ tool called pnmsig which computes a simple hash function for
a PBM, PGM or PPM file.  For PBM files, the value is the number of
black bits, for PGM files it is the sum of all the gray levels in
the image and for PPM files it is the sum of each R, G and B
component of each pixel.  The value computed is fairly arbitrary,
but at least the PBM sum is not changed by adding or removing white
borders; this is nice for cropped MacPaint images.  The PPM sum
doesn't depend on a particular colourmap, which makes finding GIF
duplicates a little easier.

The second program is a perl script (sigcheck) which reads in lists of
signatures and reports duplicates.  I have a master list of image
names and signatures.  When I get some more images, I can do a quick
check for duplicates like so (csh syntax):

foreach f (*.gif)
	echo -n $f ' ' >>SIG
	giftoppm $f | pnmsig >>SIG
end
sigcheck MASTERLIST SIG

Thrillsville, huh?  Now, if only ftp sites with images would adopt these
programs we could save a lot of useless image grabbing.....

#! /bin/sh
# This is a shell archive.  Remove anything before this line, then unpack
# it by saving it into a file and typing "sh file".  To overwrite existing
# files, type "sh file -c".  You can also feed this as standard input via
# unshar, or by typing "sh <file".  If this archive is complete, you
# will see the following message at the end:
#		"End of shell archive."
# Contents:  pnmsig.c sigcheck
# Wrapped by phillips@grads.cs.ubc.ca on Mon Aug  6 14:48:19 1990
# Search only the system directories for the commands used below.
PATH=/bin:/usr/bin:/usr/ucb ; export PATH
# Unpack pnmsig.c unless it already exists and "-c" (overwrite) was not given.
if test -f 'pnmsig.c' -a "${1}" != "-c" ; then
  echo shar: Will not clobber existing file \"'pnmsig.c'\"
else
echo shar: Extracting \"'pnmsig.c'\" \(2743 characters\)
# Every payload line carries a leading X, which sed strips on extraction.
sed "s/^X//" >'pnmsig.c' <<'END_OF_FILE'
X/* pnmsig.c - read a portable anymap and produce a numeric signature
X**
X** Copyright 1990 George Phillips
X**
X** Permission to use, copy, modify, and distribute this software and its
X** documentation for any purpose and without fee is hereby granted, provided
X** that the above copyright notice appear in all copies and that both that
X** copyright notice and this permission notice appear in supporting
X** documentation.  This software is provided "as is" without express or
X** implied warranty.
X*/
X
X#include <stdio.h>
X#include "pnm.h"
X
X#ifdef PPM
X#include "ppm.h"
X#include "libppm.h"
X#endif /*PPM*/
X
X#ifdef PGM
X#include "pgm.h"
X#include "libpgm.h"
X#endif /*PGM*/
X
X#ifdef PBM
X#include "pbm.h"
X#include "libpbm.h"
X#endif /*PBM*/
X
X
X/* Print a signature line for each file named on the command line, or
X** for standard input when no file is named.
X*/
Xmain(argc, argv)
Xint argc;
Xchar *argv[];
X{
X	int	i;
X
X	pm_progname = argv[0];
X
X	if (argc < 2)
X		pnm_sig(NULL);
X	else {
X		for (i = 1; i < argc; i++)
X			pnm_sig(argv[i]);
X	}
X	exit(0);
X}
X
X/* Read one anymap and print its size, type and signature.
X** fname names the file to read; NULL means standard input.
X**
X** The signature is the sum of every sample in the image.  It is kept
X** in an unsigned long: a signed int could overflow on a large image
X** and print as a negative number, which sigcheck does not recognise.
X*/
Xpnm_sig(fname)
Xchar* fname;
X{
X	FILE* ifp;
X	xelval maxval;
X	register xel *xelrow, *xP;
X	int rows, cols, format, row, col;
X	unsigned long sig;
X
X	if (fname != NULL)
X		ifp = pm_openr(fname);
X	else
X		ifp = stdin;
X
X	pnm_readpnminit(ifp, &cols, &rows, &maxval, &format);
X	xelrow = pnm_allocrow(cols);
X	sig = 0;
X
X	for (row = 0; row < rows; row++) {
X		pnm_readpnmrow(ifp, xelrow, cols, maxval, format);
X		for (col = 0, xP = xelrow; col < cols; col++, xP++)
X			switch (format) {
X#ifdef PPM
X			case PPM_FORMAT:
X			case RPPM_FORMAT:
X				sig += PPM_GETR(*xP) + PPM_GETG(*xP) + PPM_GETB(*xP);
X				break;
X#endif /*PPM*/
X
X#ifdef PGM
X			case PGM_FORMAT:
X			case RPGM_FORMAT:
X				sig += (gray) PNM_GET1(*xP);
X				break;
X#endif /*PGM*/
X
X#ifdef PBM
X			case PBM_FORMAT:
X			case RPBM_FORMAT:
X				sig += (bit) PNM_GET1(*xP);
X				break;
X#endif /*PBM*/
X
X			default:
X				pm_error( "can't happen", 0,0,0,0,0 );
X			}
X	}
X
X	if (fname != NULL) {
X		pm_close(ifp);
X		printf("%s: ", fname);
X	}
X	printf("%d x %d x ", cols, rows);
X	switch (format) {
X#ifdef PPM
X	case PPM_FORMAT:
X	case RPPM_FORMAT:
X		printf("%d colour", maxval + 1);
X		break;
X#endif /*PPM*/
X#ifdef PGM
X	case PGM_FORMAT:
X	case RPGM_FORMAT:
X		printf("%d grayscale", maxval + 1);
X		break;
X#endif /*PGM*/
X#ifdef PBM
X	case PBM_FORMAT:
X	case RPBM_FORMAT:
X		printf("2 bitmap");
X		break;
X#endif /*PBM*/
X	}
X	printf(" [%lu]\n", sig);
X
X	pnm_freerow(xelrow);
X}
END_OF_FILE
# The expected size must match the payload above byte for byte.
if test 2743 -ne `wc -c <'pnmsig.c'`; then
    echo shar: \"'pnmsig.c'\" unpacked with wrong size!
fi
# end of 'pnmsig.c'
fi
# Unpack sigcheck unless it already exists and "-c" (overwrite) was not given.
if test -f 'sigcheck' -a "${1}" != "-c" ; then
  echo shar: Will not clobber existing file \"'sigcheck'\"
else
echo shar: Extracting \"'sigcheck'\" \(684 characters\)
# Every payload line carries a leading X, which sed strips on extraction.
sed "s/^X//" >'sigcheck' <<'END_OF_FILE'
X#!/cs/public/bin/perl
X#
X# sigcheck -- check signatures of pictures to see if they could be the
X# same.
X#
X# Reads a list of signatures with <>.  Typical usage will be to take a list
X# of new signatures and a list of old signatures and spit out the duplicates.
X#
X# The signature is the bracketed number that ends each line.  It is matched
X# at the end of the line so that a file name such as "pic[2].gif" is not
X# mistaken for it.  Lines without a signature are skipped.
X
Xwhile (<>) {
X	if (!/\[(\d+)\]\s*$/) {
X	#	print "ignoring: ";
X	#	print;
X		next;
X	}
X	$s = $1;
X	if ($sig_got{$s} == 1) {
X		print "\n";
X		print;
X		print $info{$s};
X	}
X	else {
X		$sig_got{$s} = 1;
X		$info{$s} = $_;
X	}
X}
END_OF_FILE
# The expected size must match the payload above byte for byte.
if test 684 -ne `wc -c <'sigcheck'`; then
    echo shar: \"'sigcheck'\" unpacked with wrong size!
fi
chmod +x 'sigcheck'
# end of 'sigcheck'
fi
# Completion marker: the header says to expect this message when the whole
# archive has been unpacked.
echo shar: End of shell archive.
exit 0