phillips@cs.ubc.ca (George Phillips) (08/07/90)
Here are a couple of programs that I find useful for making a first pass
at detecting duplicate images.

The first program is a PBM+ tool called pnmsig which computes a simple
hash function for a PBM, PGM or PPM file.  For PBM files the value is the
number of black bits; for PGM files it is the sum of all the gray levels
in the image; and for PPM files it is the sum of the R, G and B components
of every pixel.  The value computed is fairly arbitrary, but at least the
PBM sum is not changed by adding or deleting white space around the image;
this is nice for cropped macpaint images.  The PPM sum doesn't depend on a
particular colourmap, which makes finding GIF duplicates a little easier.

The second program is a perl script (sigcheck) which reads in lists of
signatures and reports duplicates.

I have a master list of image names and signatures.  When I get some more
images, I can do a quick check for duplicates like so:

    foreach f (*.gif)
        echo -n $f ' ' >>SIG
        giftoppm $f | pnmsig >>SIG
    end
    sigcheck MASTERLIST SIG

Thrillsville, huh?  Now, if only ftp sites with images would adopt these
programs we could save a lot of useless image grabbing.....

#! /bin/sh
# This is a shell archive.  Remove anything before this line, then unpack
# it by saving it into a file and typing "sh file".  To overwrite existing
# files, type "sh file -c".  You can also feed this as standard input via
# unshar, or by typing "sh <file", e.g..  If this archive is complete, you
# will see the following message at the end:
#		"End of shell archive."
# Contents:  pnmsig.c sigcheck
# Wrapped by phillips@grads.cs.ubc.ca on Mon Aug  6 14:48:19 1990
#
# NOTE: the embedded files have been edited since the archive was wrapped
# (see the comments inside pnmsig.c), so the original "wc -c" size checks
# no longer apply and have been dropped.
PATH=/bin:/usr/bin:/usr/ucb ; export PATH
if test -f 'pnmsig.c' -a "${1}" != "-c" ; then
  echo shar: Will not clobber existing file \"'pnmsig.c'\"
else
echo shar: Extracting \"'pnmsig.c'\"
sed "s/^X//" >'pnmsig.c' <<'END_OF_FILE'
X/* pnmsig.c - read a portable anymap and produce a numeric signature
X**
X** Copyright 1990 George Phillips
X**
X** Permission to use, copy, modify, and distribute this software and its
X** documentation for any purpose and without fee is hereby granted, provided
X** that the above copyright notice appear in all copies and that both that
X** copyright notice and this permission notice appear in supporting
X** documentation.  This software is provided "as is" without express or
X** implied warranty.
X*/
X
X#include <stdio.h>
X#include "pnm.h"
X
X#ifdef PPM
X#include "ppm.h"
X#include "libppm.h"
X#endif /*PPM*/
X
X#ifdef PGM
X#include "pgm.h"
X#include "libpgm.h"
X#endif /*PGM*/
X
X#ifdef PBM
X#include "pbm.h"
X#include "libpbm.h"
X#endif /*PBM*/
X
Xvoid pnm_sig();
X
X/* Print one signature line for every file named on the command line, or
X** for standard input when no file is named.  Always returns 0.
X*/
Xint
Xmain(argc, argv)
Xint argc;
Xchar *argv[];
X{
X    int i;
X
X    pm_progname = argv[0];
X
X    if (argc < 2) {
X        /* No prototype is in scope, so the null pointer must be cast. */
X        pnm_sig((char *) NULL);
X    } else {
X        for (i = 1; i < argc; i++)
X            pnm_sig(argv[i]);
X    }
X    return 0;
X}
X
X/* Read the anymap in file `fname' (standard input when fname is NULL) and
X** print a line of the form
X**
X**     [fname: ]COLS x ROWS x DEPTH KIND [SIGNATURE]
X**
X** on standard output.  KIND is "colour", "grayscale" or "bitmap" and DEPTH
X** is maxval + 1 (always 2 for bitmaps).  SIGNATURE is the sum over every
X** pixel of R + G + B for PPM input, of the gray value for PGM input, and
X** of the bit value for PBM input.
X*/
Xvoid
Xpnm_sig(fname)
Xchar *fname;
X{
X    FILE *ifp;
X    xelval maxval;
X    register xel *xelrow, *xP;
X    int rows, cols, format, row, col;
X    /* Unsigned on purpose: a very large image wraps around instead of
X    ** overflowing a signed int, so the printed signature is never
X    ** negative and always matches the \d+ pattern sigcheck looks for.
X    */
X    unsigned long sig;
X
X    if (fname != NULL)
X        ifp = pm_openr(fname);
X    else
X        ifp = stdin;
X
X    pnm_readpnminit(ifp, &cols, &rows, &maxval, &format);
X    xelrow = pnm_allocrow(cols);
X    sig = 0;
X
X    for (row = 0; row < rows; row++) {
X        pnm_readpnmrow(ifp, xelrow, cols, maxval, format);
X        for (col = 0, xP = xelrow; col < cols; col++, xP++) {
X            switch (format) {
X#ifdef PPM
X            case PPM_FORMAT:
X            case RPPM_FORMAT:
X                sig += (unsigned long) PPM_GETR(*xP);
X                sig += (unsigned long) PPM_GETG(*xP);
X                sig += (unsigned long) PPM_GETB(*xP);
X                break;
X#endif /*PPM*/
X
X#ifdef PGM
X            case PGM_FORMAT:
X            case RPGM_FORMAT:
X                sig += (gray) PNM_GET1(*xP);
X                break;
X#endif /*PGM*/
X
X#ifdef PBM
X            case PBM_FORMAT:
X            case RPBM_FORMAT:
X                sig += (bit) PNM_GET1(*xP);
X                break;
X#endif /*PBM*/
X
X            default:
X                pm_error( "can't happen", 0,0,0,0,0 );
X            }
X        }
X    }
X
X    if (fname != NULL) {
X        pm_close(ifp);
X        printf("%s: ", fname);
X    }
X    printf("%d x %d x ", cols, rows);
X    switch (format) {
X#ifdef PPM
X    case PPM_FORMAT:
X    case RPPM_FORMAT:
X        printf("%d colour", (int) maxval + 1);
X        break;
X#endif /*PPM*/
X#ifdef PGM
X    case PGM_FORMAT:
X    case RPGM_FORMAT:
X        printf("%d grayscale", (int) maxval + 1);
X        break;
X#endif /*PGM*/
X#ifdef PBM
X    case PBM_FORMAT:
X    case RPBM_FORMAT:
X        printf("2 bitmap");
X        break;
X#endif /*PBM*/
X    }
X    printf(" [%lu]\n", sig);
X
X    pnm_freerow(xelrow);
X}
END_OF_FILE
# end of 'pnmsig.c'
fi
if test -f 'sigcheck' -a "${1}" != "-c" ; then
  echo shar: Will not clobber existing file \"'sigcheck'\"
else
echo shar: Extracting \"'sigcheck'\"
sed "s/^X//" >'sigcheck' <<'END_OF_FILE'
X#!/cs/public/bin/perl
X#
X# sigcheck -- check signatures of pictures to see if they could be the
X#             same.
X#
X# Reads a list of signatures with <>.  Typical usage will be to take a list
X# of new signatures and a list of old signatures and spit out the duplicates.
X#
X# A signature is the number inside the first [...] on a line.  The first
X# line seen with a given signature is remembered; every later line with the
X# same signature is printed, followed by the remembered line.
X
Xwhile (<>) {
X    if (!/\[(\d+)\]/) {
X        # print "ignoring: ";
X        # print;
X        next;
X    }
X    $s = $1;
X    if ($sig_got{$s} == 1) {
X        print "\n";
X        print;
X        print $info{$s};
X    }
X    else {
X        $sig_got{$s} = 1;
X        $info{$s} = $_;
X    }
X}
END_OF_FILE
chmod +x 'sigcheck'
# end of 'sigcheck'
fi
echo shar: End of shell archive.
exit 0