[mod.sources] Software similarity tester for C programs

sources-request@panda.UUCP (02/11/86)

Mod.sources:  Volume 3, Issue 119
Submitted by: Dick Grune <talcott!seismo!mcvax!vu44!dick>


	The enclosed shar-archive contains a program that will detect
stretches in C-programs that look similar (or are just plain equal).
This is useful for finding "borrowed" soft-ware or for isolating
possible subroutines in large software systems.  It is very fast and
gives results.  We have been using it for about half a year now.

					Dick Grune
					Vrije Universiteit
					de Boelelaan 1081
					1081 HV  Amsterdam
					the Netherlands
					..!mcvax!vu44!dick
					dick@vu44.UUCP



: This is a shar archive.  Extract with sh, not csh.
: This archive ends with exit, so do not worry about trailing junk.
: --------------------------- cut here --------------------------
PATH=/bin:/usr/bin
echo Extracting \R\E\A\D\_\M\E
sed 's/^X//' > \R\E\A\D\_\M\E << '+ END-OF-FILE '\R\E\A\D\_\M\E
XSat Jan 11 15:47:16 1986
X
XThis program tests for similar (or equal) stretches in one or more C-programs.
XSee sim.1
X
XTo compile, call "make", which will generate one executable called sim, and
Xwill run two small tests to show sample output.
X
XTo install, examine the Makefile and reset BINDIR and MANDIR to sensible
Xpaths, and call "make install"
X
XTo change the default run size or the page width, adjust the file params.h
Xand recompile.
X
XTo add another language X, write a file X.l along the lines of clang.l,
Xextend the Makefile and recompile.  All knowledge about C in located in
Xclang.l; the rest of the programs expect each C lexical unit to be a single
Xcharacter.
X
X					Dick Grune
X					Vrije Universiteit
X					de Boelelaan 1081
X					1081 HV  Amsterdam
X					the Netherlands
X					..!mcvax!vu44!dick
X					dick@vu44.UUCP
+ END-OF-FILE READ_ME
chmod 'u=rw,g=r,o=r' \R\E\A\D\_\M\E
set `sum \R\E\A\D\_\M\E`
sum=$1
case $sum in
26812)	:;;
*)	echo 'Bad sum in '\R\E\A\D\_\M\E >&2
esac
echo Extracting \M\a\k\e\f\i\l\e
sed 's/^X//' > \M\a\k\e\f\i\l\e << '+ END-OF-FILE '\M\a\k\e\f\i\l\e
X#	This file is part of the software similarity tester SIM.
X#	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X#
X
XBINDIR = /user1/dick/bin#		# where to put the binary (sim)
XMANDIR = /user1/dick/man#		# where to put the manual page (sim.1)
X
X#
X#	Each module (set of programs that together perform some function)
X#	has the following sets defined for it:
X#		FLS	all files of that module, for, e.g.,
X#			printing, sharring, inventory, etc.
X#		SRC	the source files, from which other files derive
X#		CFS	the C-files, from which the object files derive
X#		OBJ	objects
X#		GRB	garbage produced by compiling the module
X#
X#	(This is a feeble attempt at software-engineering a Makefile.)
X#
X
XTEST =	pass3.c#			# guinea pig
X
Xall:	sim.res lang.res
X
X# The C Language module
XCLN_OBJ =	clang.o
XCLN_CFS =	clang.c
XCLN_SRC =	clang.l
XCLN_FLS =	$(CLN_SRC)
X
Xclang.c:	clang.l
X	lex -t clang.l >clang.c
XCLN_GRB =	clang.c
X
X# Common modules:
XCOM_OBJ =	stream.o idf.o buff.o error.o
XCOM_CFS =	stream.c idf.c buff.c error.c
XCOM_SRC =	$(COM_CFS)
XCOM_FLS =	stream.h idf.h buff.h $(COM_SRC)
X
X# The top-package:
XTOP_OBJ =	top.o
XTOP_CFS =	top.c
XTOP_SRC =	$(TOP_CFS)
XTOP_FLS =	top.p top.g top.h $(TOP_SRC)
X
X# The similarity tester:
XSIM_OBJ =	sim.o pass1.o hash.o compare.o add_run.o pass2.o pass3.o
XSIM_CFS =	sim.c pass1.c hash.c compare.c add_run.c pass2.c pass3.c
XSIM_SRC =	$(SIM_CFS)
XSIM_FLS =	params.h text.h debug.h $(SIM_SRC)
X
XSIM =	$(CLN_OBJ) $(COM_OBJ) $(TOP_OBJ) $(SIM_OBJ)
XCFS =	$(CLN_CFS) $(COM_CFS) $(TOP_CFS) $(SIM_CFS)
X
Xsim:	$(SIM)
X	$(CC) $(SIM) -o sim
X
Xsim.res:	sim $(TEST)
X	sim -hr 20 $(TEST)
X
Xlint:	$(CFS)
X	lint -xa $(CFS)
XSIM_GRB =	sim
X
X# The language streamliner as a main program:
XSTR_OBJ =	main.o
XSTR_CFS =	main.c
XSTR_SRC =	$(STR_CFS)
XSTR_FLS =	$(STR_SRC)
X
XLANG_CFS =	$(CLN_CFS) $(STR_CFS) $(COM_CFS)
XLANG_OBJ =	$(CLN_OBJ) $(STR_OBJ) $(COM_OBJ)
Xlang:	$(LANG_OBJ)
X	$(CC) $(LANG_OBJ) -o lang
X
Xlang.res:	lang $(TEST)
X	lang -1 $(TEST) >lang1.res
X	lang -2 $(TEST) >lang2.res
X	wc lang[12].res $(TEST)
XLANG_GRB =	lang lang1.res lang2.res
X
Xlang.lint:
X	lint -xa $(LANG_CFS)
X
X# various other entries
XFLS =	READ_ME Makefile sim.1 \
X	$(COM_FLS) $(TOP_FLS) $(SIM_FLS) $(STR_FLS) $(CLN_FLS)
XSRC =	$(COM_SRC) $(TOP_SRC) $(SIM_SRC) $(STR_SRC) $(CLN_SRC)
XOBJ =	$(COM_OBJ) $(TOP_OBJ) $(SIM_OBJ) $(STR_OBJ) $(CLN_OBJ)
X
Xprint:	$(FLS)
X	pr $(FLS) >print
XPRINT_GRB =	print
X
Xshar:	$(FLS)
X	shar $(FLS) >shar
X
Xfiles:	Makefile
X	ls $(FLS) >files
X
Xcchk:
X	cchk $(CFS)
X
Xsimsim:	sim
X	sim -hfr 20 $(SRC)
X
Xsimsimx:	sim
X	sim -hfxr 20 $(SRC)
X
Xtags:	$(SRC)
X	ctags $(SRC)
X
Xinstall:	$(BINDIR)/sim $(MANDIR)/sim.1
X
X$(BINDIR)/sim:	sim
X	cp sim $(BINDIR)/sim
X
X$(MANDIR)/sim.1:	sim.1
X	cp sim.1 $(MANDIR)/sim.1
X
Xclean:
X	rm -f $(OBJ) $(CLN_GRB) $(SIM_GRB) $(LANG_GRB) $(PRINT_GRB) \
X		a.out core
X
X#------------------------------------------------------------------------
Xadd_run.o: buff.h text.h top.p top.h
Xbuff.o: buff.h
Xclang.o: idf.h stream.h
Xcompare.o: buff.h text.h top.p top.h
Xhash.o: buff.h text.h
Xidf.o: idf.h
Xpass1.o: buff.h text.h
Xpass2.o: text.h top.p top.h debug.h
Xpass3.o: params.h text.h top.p top.h debug.h buff.h
Xsim.o: params.h
Xstream.o: stream.h
Xtop.o: text.h top.p top.h top.g
+ END-OF-FILE Makefile
chmod 'u=rw,g=r,o=r' \M\a\k\e\f\i\l\e
set `sum \M\a\k\e\f\i\l\e`
sum=$1
case $sum in
32570)	:;;
*)	echo 'Bad sum in '\M\a\k\e\f\i\l\e >&2
esac
echo Extracting \s\i\m\.\1
sed 's/^X//' > \s\i\m\.\1 << '+ END-OF-FILE '\s\i\m\.\1
X.\"	This file is part of the software similarity tester SIM.
X.\"	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X.\"
X.TH SIM I
X.SH NAME
Xsim \- find similarities in C-files
X.SH SYNOPSIS
X.B sim
X[
X.B \-[fns]
X.BI \-r N
X]
Xfile ...
X.SH DESCRIPTION
X.I Sim
Xreads the C-files
X.I file ...
Xand looks for pieces of text that are similar; two pieces of C-text
Xare similar if they only differ in layout, comment, identifiers and
Xthe contents of numbers, strings and characters.  If any runs
Xof sufficient length
Xare found, they are reported on standard output; the default length
Xminimum is 24, but can be reset by the
X.BR \-r -parameter.
X.PP
XThe program can be used for finding copied pieces of code in
Xpurportedly unrelated programs (with the
X.BR \-s -flag),
Xor for finding accidentally duplicated code in larger projects
X(without the
X.BR \-s -flag
Xbut with the
X.BR \-f -flag).
X.PP
XSince it reads the files several times, it cannot read from standard input.
X.PP
XThere are the following options:
X.TP
X.B \-f
XRuns are restricted to pieces with balancing parentheses, to isolate
Xpotential functions.
X.TP
X.B \-n
XSimilarities found are only summarized, not displayed.
X.TP
X.B \-s
XThe contents of a file are not compared to itself (\-s = not self).
X.PP
XThe matching process uses a hash table so that tens of thousands of
Xlines are processed in a few minutes; if, however, there is not
Xenough memory for the table, the matching process uses sequential
Xsearch, which can take hours.
X.SH AUTHOR
XDick Grune, Vrije Universiteit, Amsterdam.
X.SH BUGS
XStrong periodicity in the input text (like a table of
X.I N
Xalmost identical lines) causes problems.
X.I Sim
Xtries to cope with this but cannot avoid giving appr.
X.I log N
Xmessages about it.  The best advice is still to take the offending
Xfiles out of the game.
+ END-OF-FILE sim.1
chmod 'u=rw,g=r,o=r' \s\i\m\.\1
set `sum \s\i\m\.\1`
sum=$1
case $sum in
56000)	:;;
*)	echo 'Bad sum in '\s\i\m\.\1 >&2
esac
echo Extracting \s\t\r\e\a\m\.\h
sed 's/^X//' > \s\t\r\e\a\m\.\h << '+ END-OF-FILE '\s\t\r\e\a\m\.\h
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X/*
X	Interface between the language-dependent lex module and
X	the stream module.
X*/
X
X/* communication variables */
Xextern int lex_no;	/* pass 1: return stream of C condensed chars */
X			/* pass 2: return C-char/ASCII-char position
X				pairs at each \n */
Xextern int lex_ch;	/* condensed C-char produced by pass 1 */
Xextern unsigned int lex_ch_cnt;
X			/* C-char position reported at each \n by pass 2 */
Xextern long lex_ls_pos;	/* lseek position reported at each \n by pass 2 */
X
X/* #defines for the lex module */
X#define	cput(ch)	if (lex_ch_cnt++, lex_no == 1) \
X				{lex_ch = ch; return 1;} else
X#define	c_eol()		if (lex_no == 2) return 1; else
X#define	count()		lex_ls_pos += yyleng
+ END-OF-FILE stream.h
chmod 'u=rw,g=r,o=r' \s\t\r\e\a\m\.\h
set `sum \s\t\r\e\a\m\.\h`
sum=$1
case $sum in
61866)	:;;
*)	echo 'Bad sum in '\s\t\r\e\a\m\.\h >&2
esac
echo Extracting \i\d\f\.\h
sed 's/^X//' > \i\d\f\.\h << '+ END-OF-FILE '\i\d\f\.\h
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X/* the struct for keywords etc. */
Xstruct idf	{
X	char *id_tag;	/* an interesting identifier */
X	char id_tr;	/* with its one-character translation */
X};
X
X#define idf2char(s,l)	findidf(s, l, sizeof l / sizeof l[0])
+ END-OF-FILE idf.h
chmod 'u=rw,g=r,o=r' \i\d\f\.\h
set `sum \i\d\f\.\h`
sum=$1
case $sum in
29562)	:;;
*)	echo 'Bad sum in '\i\d\f\.\h >&2
esac
echo Extracting \b\u\f\f\.\h
sed 's/^X//' > \b\u\f\f\.\h << '+ END-OF-FILE '\b\u\f\f\.\h
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
Xextern char *buff;
Xextern unsigned int text_length();
+ END-OF-FILE buff.h
chmod 'u=rw,g=r,o=r' \b\u\f\f\.\h
set `sum \b\u\f\f\.\h`
sum=$1
case $sum in
20348)	:;;
*)	echo 'Bad sum in '\b\u\f\f\.\h >&2
esac
echo Extracting \s\t\r\e\a\m\.\c
sed 's/^X//' > \s\t\r\e\a\m\.\c << '+ END-OF-FILE '\s\t\r\e\a\m\.\c
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#include	<stdio.h>
X#include	"stream.h"
X
X/* imports from the lex module */
Xextern int yylex();
Xextern yystart();
Xextern FILE *yyin;
X
Xint lex_no;		/* pass 1: return stream of C condensed chars */
X			/* pass 2: return C-char/ASCII-char position
X				pairs at each \n
X			*/
X
Xint lex_ch;		/* condensed C-char produced by pass 1 */
Xunsigned int lex_ch_cnt;/* C-char position reported at each \n by pass 2 */
Xlong lex_ls_pos;	/* lseek position reported at each \n by pass 2 */
X
Xint
XOpenStream(pass, fname)
X	char *fname;
X{
X	lex_no = pass;
X	lex_ch_cnt = 0;
X	lex_ls_pos = 0L;
X	
X	/* start the lex machine */
X	yyin = fopen(fname, "r");
X	yystart();
X	return yyin != NULL;
X}
X
Xint
XNextChar(cp)		/* lex_no must be 1 */
X	char *cp;
X{
X	if (!yylex())
X		return -1;
X	*cp = lex_ch;
X	return 0;
X}
X
Xint
XNextPair(ccp, lsp)	/* lex_no must be 2 */
X	unsigned int *ccp;
X	long *lsp;
X{
X	if (!yylex())
X		return -1;
X	*ccp = lex_ch_cnt;
X	*lsp = lex_ls_pos;
X	return 0;
X}
X
XCloseStream()	{
X	fclose(yyin);
X}
+ END-OF-FILE stream.c
chmod 'u=rw,g=r,o=r' \s\t\r\e\a\m\.\c
set `sum \s\t\r\e\a\m\.\c`
sum=$1
case $sum in
24424)	:;;
*)	echo 'Bad sum in '\s\t\r\e\a\m\.\c >&2
esac
echo Extracting \i\d\f\.\c
sed 's/^X//' > \i\d\f\.\c << '+ END-OF-FILE '\i\d\f\.\c
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#include	"idf.h"
X
Xint
Xfindidf(str, list, size)
X	char *str;
X	struct idf list[];
X{
X	int first = 0;
X	int last = size - 1;
X
X	while (first < last)	{
X		int middle = (first + last) / 2;
X
X		if (strcmp(str, list[middle].id_tag) > 0)
X			first = middle + 1;
X		else	last = middle;
X	}
X	return strcmp(str, list[first].id_tag) == 0 ? list[first].id_tr : -1;
X}
+ END-OF-FILE idf.c
chmod 'u=rw,g=r,o=r' \i\d\f\.\c
set `sum \i\d\f\.\c`
sum=$1
case $sum in
60560)	:;;
*)	echo 'Bad sum in '\i\d\f\.\c >&2
esac
echo Extracting \b\u\f\f\.\c
sed 's/^X//' > \b\u\f\f\.\c << '+ END-OF-FILE '\b\u\f\f\.\c
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#include	"buff.h"
X
X#define	BFSIZE		10000
X
Xextern char *malloc(), *realloc(), *calloc();
X
Xchar *buff;			/* to be filled by malloc */
Xstatic unsigned int buff_size;	/* size of buffer at this moment */
Xstatic unsigned int bfree;	/* next free position in array buff[] */
X
Xinit_buff()	{
X	/* Allocate the text buffer */
X	buff = malloc(buff_size = BFSIZE);
X	if (!buff)
X		error("out of space");
X	bfree = 1;				/* don't use position 0 */
X}
X
Xstore(ch)	{
X	if (bfree == buff_size)	{
X		buff = realloc(buff, buff_size += BFSIZE);
X		if (!buff || buff_size < bfree)	{
X			/* overflow */
X			error("out of space");
X		}
X	}
X	buff[bfree++] = ch;
X}
X
Xunsigned int
Xtext_length()	{
X	return bfree;
X}
+ END-OF-FILE buff.c
chmod 'u=rw,g=r,o=r' \b\u\f\f\.\c
set `sum \b\u\f\f\.\c`
sum=$1
case $sum in
14186)	:;;
*)	echo 'Bad sum in '\b\u\f\f\.\c >&2
esac
echo Extracting \e\r\r\o\r\.\c
sed 's/^X//' > \e\r\r\o\r\.\c << '+ END-OF-FILE '\e\r\r\o\r\.\c
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#include	<stdio.h>
X
Xerror(msg)
X	char *msg;
X{
X	fprintf(stderr, "sim: %s\n", msg);
X	exit(1);
X}
+ END-OF-FILE error.c
chmod 'u=rw,g=r,o=r' \e\r\r\o\r\.\c
set `sum \e\r\r\o\r\.\c`
sum=$1
case $sum in
10416)	:;;
*)	echo 'Bad sum in '\e\r\r\o\r\.\c >&2
esac
echo Extracting \t\o\p\.\p
sed 's/^X//' > \t\o\p\.\p << '+ END-OF-FILE '\t\o\p\.\p
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X/*	These are the parameters with which to instantiate top.g
X*/
X#define		TTSIZE		100		/* the TTSIZE best objects */
X#define		TTTYPE		struct run	/* the type of the object */
X#define		TTBETTER	longer		/* how to compare objects */
+ END-OF-FILE top.p
chmod 'u=rw,g=r,o=r' \t\o\p\.\p
set `sum \t\o\p\.\p`
sum=$1
case $sum in
20036)	:;;
*)	echo 'Bad sum in '\t\o\p\.\p >&2
esac
echo Extracting \t\o\p\.\g
sed 's/^X//' > \t\o\p\.\g << '+ END-OF-FILE '\t\o\p\.\g
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X/* This is a generic package for keeping a top-10 list of objects of
X/* formal type.
X/*
X/* Specification in Ada-style:
X/* generic				-- 3 formals,supplied by #define
X/*	TTSIZE: int;			-- the size of the top-N list
X/*	type TTTYPE is private;		-- the type of the objects
X/*	with function int TTBETTER(ip, jp) TTTYPE *ip, *jp;
X/*					-- 1 if object *ip better than
X/*					-- object *jp, 0 otherwise
X/* package TOP is			-- reflected in top.h
X/*	function InitTop();		-- clears the list
X/*	function InsertTop(obj) TTTYPE *obj;
X/*					-- accepts a (pointer to) an object
X/*	-- a generator yields the objects in best-to-worst order:
X/*	type TopGen is private;		-- to declare the generator
X/*	function OpenTop(tg) TopGen *tg;-- starts the generator
X/*	function TTTYPE *NextTop(tg) TopGen *tg;
X/*					-- yields next object, moves generator
X/*	NoObject: constant (TTTYPE*);	-- yielded at end-of-list
X/*	function CloseTop(tg) TopGen *tg;-- stops the generator
X/* end TOP				-- */
X/* The application of this file must be preceded by
X/* #include "top.h", which defines the interface, and by
X/* #include "top.p", which defines the parameters of the instantiation.
X
X/* package body TOP is			-- */
Xstatic TTTYPE val[TTSIZE];
Xstatic TTTYPE *list[TTSIZE];
Xstatic int cnt;
X
XInitTop()	{
X	cnt = 0;
X}
X
XInsertTop(obj) register TTTYPE *obj;	{
X	register int i;
X
X	if (cnt < TTSIZE)	{	/* there is still room */
X		list[cnt] = &val[cnt];
X		val[cnt++] = *obj;
X	}
X	else
X	if (TTBETTER(obj, list[TTSIZE-1]))	{
X		/* preferable to worst in set */
X		*list[TTSIZE-1] = *obj;
X	}
X	else	return;			/* we're not interested */
X
X	for (i = cnt-2; i >= 0 && TTBETTER(list[i+1], list[i]); i--)	{
X		register TTTYPE *jp = list[i+1];
X		list[i+1] = list[i];
X		list[i] = jp;
X	}
X}
X
XOpenTop(tg) register TopGen *tg;	{
X	*tg = 0;
X}
X
XTTTYPE *
XNextTop(tg) register TopGen *tg;	{
X	return *tg >= cnt ? NoObject : list[(*tg)++];
X}
X
XCloseTop(tg) register TopGen *tg;	{
X	*tg = TTSIZE;
X}
X/* end TOP				-- */
+ END-OF-FILE top.g
chmod 'u=rw,g=r,o=r' \t\o\p\.\g
set `sum \t\o\p\.\g`
sum=$1
case $sum in
62947)	:;;
*)	echo 'Bad sum in '\t\o\p\.\g >&2
esac
echo Extracting \t\o\p\.\h
sed 's/^X//' > \t\o\p\.\h << '+ END-OF-FILE '\t\o\p\.\h
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X/* This is the public interface of top.g, a generic package for keeping
X/* a top-10 list of objects of formal type.
X/*
X/* The application of this file must be preceded by
X/* #include "top.p", which defines the parameters of the instantiation.
X*/
X
Xextern InitTop();
Xextern InsertTop();
Xtypedef int TopGen;
Xextern OpenTop();
Xextern TTTYPE *NextTop();
X#define	NoObject	((TTTYPE*)0)
Xextern CloseTop();
+ END-OF-FILE top.h
chmod 'u=rw,g=r,o=r' \t\o\p\.\h
set `sum \t\o\p\.\h`
sum=$1
case $sum in
48552)	:;;
*)	echo 'Bad sum in '\t\o\p\.\h >&2
esac
echo Extracting \t\o\p\.\c
sed 's/^X//' > \t\o\p\.\c << '+ END-OF-FILE '\t\o\p\.\c
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#include	"text.h"
X#include	"top.p"
X#include	"top.h"
X
X#ifndef	lint		/* lint won't take this define ?!?!?! */
X#define	longer(r0,r1)	(r0->rn_quality > r1->rn_quality)
X
X#else
Xstatic int
Xlonger(r0, r1) struct run *r0, *r1;	{
X	return r0->rn_quality > r1->rn_quality;
X}
X#endif	lint
X
X/* Instantiate top.g  */
X#include	"top.g"
+ END-OF-FILE top.c
chmod 'u=rw,g=r,o=r' \t\o\p\.\c
set `sum \t\o\p\.\c`
sum=$1
case $sum in
10035)	:;;
*)	echo 'Bad sum in '\t\o\p\.\c >&2
esac
echo Extracting \p\a\r\a\m\s\.\h
sed 's/^X//' > \p\a\r\a\m\s\.\h << '+ END-OF-FILE '\p\a\r\a\m\s\.\h
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#define	MIN_RUN		24		/*	default minimum size
X						of interesting run
X					*/
X#define	PAGE_WIDTH	80
+ END-OF-FILE params.h
chmod 'u=rw,g=r,o=r' \p\a\r\a\m\s\.\h
set `sum \p\a\r\a\m\s\.\h`
sum=$1
case $sum in
36700)	:;;
*)	echo 'Bad sum in '\p\a\r\a\m\s\.\h >&2
esac
echo Extracting \t\e\x\t\.\h
sed 's/^X//' > \t\e\x\t\.\h << '+ END-OF-FILE '\t\e\x\t\.\h
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
Xstruct text	{
X	char *tx_fname;		/* the file name */
X	int tx_needed;		/* set if file plays a role in final output */
X	unsigned int tx_start;	/* positions in buff for the text */
X	unsigned int tx_limit;
X};
X
Xstruct chunk	{
X	/* a chunk of text in various representations */
X	struct text *ch_text;	/* a pointer to the file text */
X	unsigned int ch_st_ch;	/* first in chunk, counted in C-chars */
X	unsigned int ch_lm_ch;	/* first not in chunk */
X	long ch_st_ls;		/* same in lseek positions */
X	long ch_lm_ls;
X	unsigned int ch_st_nl;	/* same in line numbers */
X	unsigned int ch_lm_nl;
X};
X
Xstruct run	{		/* a 'run' of coincident chars */
X	struct chunk rn_cn0;	/* chunk in left file */
X	struct chunk rn_cn1;	/* chunk in right file */
X	unsigned int rn_quality;
X};
+ END-OF-FILE text.h
chmod 'u=rw,g=r,o=r' \t\e\x\t\.\h
set `sum \t\e\x\t\.\h`
sum=$1
case $sum in
43449)	:;;
*)	echo 'Bad sum in '\t\e\x\t\.\h >&2
esac
echo Extracting \d\e\b\u\g\.\h
sed 's/^X//' > \d\e\b\u\g\.\h << '+ END-OF-FILE '\d\e\b\u\g\.\h
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#define	DEBUG	0
+ END-OF-FILE debug.h
chmod 'u=rw,g=r,o=r' \d\e\b\u\g\.\h
set `sum \d\e\b\u\g\.\h`
sum=$1
case $sum in
64324)	:;;
*)	echo 'Bad sum in '\d\e\b\u\g\.\h >&2
esac
echo Extracting \s\i\m\.\c
sed 's/^X//' > \s\i\m\.\c << '+ END-OF-FILE '\s\i\m\.\c
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#include	"params.h"
X
Xint min_run_size = MIN_RUN;
X
Xchar options[128];			/* for various, extensible flags */
X
Xmain(argc, argv)
X	char *argv[];
X{
X	argv++, argc--;			/* skip program name */
X
X	while (argc > 0 && argv[0][0] == '-')	{
X		char *par = &argv[0][1];
X		
X		while (*par)	{
X			switch (*par)	{
X			case 'r':
X				min_run_size = atoi(argv[1]);
X				argc--, argv++;
X				break;
X			default:
X				options[*par]++;
X				break;
X			}
X			par++;
X		}
X		argc--, argv++;
X	}
X	if (min_run_size == 0)
X		error("Minimum run size equals 0");
X	
X	init_buff();
X	
X	/* Read the files */
X	pass1(argv, argc);
X	
X	/* Set up the hash table */
X	make_hash();
X	
X	/* Compare various files */
X	compare();
X	
X	/* Delete hash table */
X	free_hash();
X	
X	/* Find positions of found similarities */
X	pass2();
X
X	/* Print the similarities */
X	pass3();
X	return 0;
X}
+ END-OF-FILE sim.c
chmod 'u=rw,g=r,o=r' \s\i\m\.\c
set `sum \s\i\m\.\c`
sum=$1
case $sum in
42365)	:;;
*)	echo 'Bad sum in '\s\i\m\.\c >&2
esac
echo Extracting \p\a\s\s\1\.\c
sed 's/^X//' > \p\a\s\s\1\.\c << '+ END-OF-FILE '\p\a\s\s\1\.\c
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#include	"buff.h"
X#include	"text.h"
X
Xextern char *calloc();
X
Xstruct text *text;		/* to be filled in by calloc */
Xint ntexts;			/* number of text records */
X
Xpass1(argv, argc)
X	char *argv[];
X{
X	int n;
X	
X	/* allocate the array of text descriptors */
X	ntexts = argc;
X	text = (struct text *)calloc((unsigned)ntexts, sizeof (struct text));
X	if (!text)
X		error("Too many files");
X
X	/* read the files */
X	for (n = 0; n < ntexts; n++)	{
X		char *fname = argv[n];
X		struct text *txt = &text[n];
X		char ch;
X	
X		printf("File %s: ", fname);
X	
X		txt->tx_fname = fname;
X		txt->tx_start = txt->tx_limit = text_length();
X		if (!OpenStream(1, txt->tx_fname))	{
X			printf("cannot open\n");
X			OpenStream(1, "/dev/null");
X		}
X		while (NextChar(&ch) == 0)
X			store(ch);
X		CloseStream();
X	
X		txt->tx_limit = text_length();
X		printf("%u C-units\n", txt->tx_limit - txt->tx_start);
X	}
X	printf("Total: %u C-units\n", text_length() - 1);
X	printf("\n");
X}
+ END-OF-FILE pass1.c
chmod 'u=rw,g=r,o=r' \p\a\s\s\1\.\c
set `sum \p\a\s\s\1\.\c`
sum=$1
case $sum in
58561)	:;;
*)	echo 'Bad sum in '\p\a\s\s\1\.\c >&2
esac
echo Extracting \h\a\s\h\.\c
sed 's/^X//' > \h\a\s\h\.\c << '+ END-OF-FILE '\h\a\s\h\.\c
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#include	"buff.h"
X#include	"text.h"
X
Xextern char *calloc();
X
Xextern char options[];
Xextern int ntexts;
Xextern struct text *text;
Xextern int min_run_size;
X
Xstatic int hash_code();
Xstatic print_hash();
X
X#define	N_HASH	10639			/* any suitable prime */
X
Xunsigned int *hash_table;		/* to be filled by malloc() */
X
X/* to judge the quality of the hash code */
Xstatic tally_right = 0, tally_wrong = 0;
Xstatic tally_hash(), print_tally();
X
Xmake_hash()	{
X	unsigned int last[N_HASH];
X	/*	last[i] is the index of the latest char with hash_code i,
X		or 0 if there is none.
X	*/
X	int n;
X	
X	for (n = 0; n < N_HASH; n++)
X		last[n] = 0;
X	
X	hash_table = (unsigned int *)
X			calloc(text_length(), sizeof (unsigned int));
X	if (options['x'])
X		hash_table = 0;
X	if (!hash_table)	{
X		printf(">>> Not enough memory for the hash table, ");
X		printf("this is going to take time!\n\n");
X		return;
X	}
X	
X	for (n = 0; n < ntexts; n++)	{
X		struct text *txt = &text[n];
X		unsigned int j;
X		
X		for (
X			j = txt->tx_start;
X			j < txt->tx_limit - min_run_size + 1;
X			j++
X		)	{
X			int h = hash_code(&buff[j]);
X			
X			if (last[h])	{
X				hash_table[last[h]] = j;
X				if (options['h'])
X					tally_hash(last[h], j);
X			}
X			last[h] = j;
X		}
X	}
X	if (options['h'])
X		print_tally();
X	if (options['H'])
X		print_hash();
X}
X
Xstatic int
Xhash_code(p)
X	char *p;
X{
X	/*	hash_code(p) returns the hash code of the min_run_size first
X		characters starting at p; caller guarantees that there
X		are at least min_run_size chars.
X	*/
X	int h = 0;
X	int i;
X	
X	for (i = 0; i < min_run_size; i++)
X		h = ((h << 1) + *p++) % N_HASH;
X	return h;
X}
X
Xstatic
Xprint_hash()
X{
X	/* will not be called if hash_table == 0 */
X	unsigned int i;
X	
X	for (i = 1; i < text_length(); i++)	{
X		printf("%d: %c: ", i, buff[i]);
X		printf("%u\n", hash_table[i]);
X	}
X}
X
Xstatic
Xtally_hash(i0, i1)
X	unsigned int i0, i1;
X{
X	int i;
X	
X	for (i = 0; i < min_run_size; i++)	{
X		if (buff[i0++] != buff[i1++])	{
X			tally_wrong++;
X			return;
X		}
X	}
X	tally_right++;
X}
X
Xstatic
Xprint_tally()
X{
X	printf("Tally_right = %d, tally_wrong = %d, ",
X		tally_right, tally_wrong);
X	printf("hash code efficiency = %d%%\n",
X		100 * tally_right / (tally_right + tally_wrong));
X}
X
Xfree_hash()	{
X	if (hash_table)
X		free((char *)hash_table);
X}
+ END-OF-FILE hash.c
chmod 'u=rw,g=r,o=r' \h\a\s\h\.\c
set `sum \h\a\s\h\.\c`
sum=$1
case $sum in
22444)	:;;
*)	echo 'Bad sum in '\h\a\s\h\.\c >&2
esac
echo Extracting \c\o\m\p\a\r\e\.\c
sed 's/^X//' > \c\o\m\p\a\r\e\.\c << '+ END-OF-FILE '\c\o\m\p\a\r\e\.\c
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#include	"buff.h"
X#include	"text.h"
X#include	"top.p"
X#include	"top.h"
X
X/* from the Language Department: */
Xextern int MayBeStartOfRun();
Xextern unsigned int CheckRun();
X
Xextern char options[];
Xextern int ntexts;
Xextern struct text *text;
Xextern int min_run_size;
Xextern unsigned int *hash_table;
Xextern add_run();
X
Xstatic struct text *txt_at();
Xstatic unsigned int lcs();
X
Xcompare()
X{
X	int n;
X	
X	InitTop();
X	for (n = 0; n < ntexts; n++)	{
X		struct text *txt0 = &text[n];
X		unsigned int i0 = txt0->tx_start;
X		
X		while (i0 < txt0->tx_limit - min_run_size + 1)	{
X			i0 += lcs(txt0, i0);
X		}
X	}
X}
X
Xstatic unsigned int
Xlcs(txt0, i0)
X	struct text *txt0;
X	unsigned int i0;
X{
X	/*	find the longest common substring in:
X			txt0, starting precisely at i0 and
X			the rest of the text
X	*/
X	struct text *txt1 = txt0;
X	unsigned int i1 = i0;
X	struct text *txt_best;
X	unsigned int i_best;
X	unsigned int size_best = 0;
X	
X	if (!MayBeStartOfRun(buff[i0]))	{
X		return 1;
X	}
X	
X	while(
X		i1 = hash_table ? hash_table[i1] : i1 + 1,
X		txt1 = txt_at(txt1, i1)
X	)	{
X		
X		if (	/* we don't want to compare a file to itself */
X			options['s'] && i1 < txt0->tx_limit
X		)	{
X			/* skip this possibility */
X		}
X		else
X		if (	/* we are looking at the middle of a run */
X			i0 != txt0->tx_start && i1 != txt1->tx_start &&
X			buff[i0-1] == buff[i1-1]
X		)	{
X			/* skip this possibility */
X		}
X		else	{
X			/* see how far we can get */
X			unsigned int j0 = i0, j1 = i1;
X			unsigned int size = 0;
X			unsigned int limit0 = txt0->tx_limit;
X			unsigned int limit1 = txt1->tx_limit;
X			
X			while (	size < j1 - j0 &&
X				j0 < limit0 && j1 < limit1 &&
X				buff[j0] == buff[j1]
X			)	{
X				j0++, j1++, size++;
X			}
X			
X			if (size >= min_run_size)	{
X				/*	offer the run to the
X					Language Department
X				*/
X				size = CheckRun(&buff[i0], size);
X			}
X			
X			if (	/* we still have something better */
X				size >= min_run_size && size > size_best
X			)	{
X				/* record it */
X				txt_best = txt1;
X				i_best = i1;
X				size_best = size;
X			}
X		}
X	}
X	if (size_best)	{
X		add_run(txt0, i0, txt_best, i_best, size_best);
X		return size_best;
X	}
X	else
X		return 1;
X}
X
Xstatic struct text *
Xtxt_at(txt, i)
X	struct text *txt;
X	unsigned int i;
X{
X	if (i == 0 || i >= text_length())
X		return 0;
X	while (i >= txt->tx_limit)
X		txt++;
X	return txt;
X}
+ END-OF-FILE compare.c
chmod 'u=rw,g=r,o=r' \c\o\m\p\a\r\e\.\c
set `sum \c\o\m\p\a\r\e\.\c`
sum=$1
case $sum in
22558)	:;;
*)	echo 'Bad sum in '\c\o\m\p\a\r\e\.\c >&2
esac
echo Extracting \a\d\d\_\r\u\n\.\c
sed 's/^X//' > \a\d\d\_\r\u\n\.\c << '+ END-OF-FILE '\a\d\d\_\r\u\n\.\c
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#include	"buff.h"
X#include	"text.h"
X#include	"top.p"
X#include	"top.h"
X
Xstatic set_chunk();
X
Xadd_run(txt0, i0, txt1, i1, size)
X	struct text *txt0, *txt1;
X	unsigned int i0, i1;
X	unsigned int size;
X{
X	/*	Adds the run of given size to our collection.
X	*/
X	struct run r;
X	
X	set_chunk(&r.rn_cn0, txt0, i0 - txt0->tx_start, size);
X	set_chunk(&r.rn_cn1, txt1, i1 - txt1->tx_start, size);
X	r.rn_quality = size;
X	
X	InsertTop(&r);
X}
X
Xstatic
Xset_chunk(cnk, txt, index, size)
X	struct chunk *cnk;
X	struct text *txt;
X	unsigned int index;
X	unsigned int size;
X{
X	/*	Fill the chunk *cnk with info about the piece of text
X		in txt starting at index extending over size characters.
X	*/
X	txt->tx_needed = 1;
X	cnk->ch_text = txt;
X	cnk->ch_st_ch = index;
X	cnk->ch_lm_ch = index + size;
X	cnk->ch_st_ls = cnk->ch_lm_ls = 0;
X	cnk->ch_st_nl = cnk->ch_lm_nl = 1;
X}
+ END-OF-FILE add_run.c
chmod 'u=rw,g=r,o=r' \a\d\d\_\r\u\n\.\c
set `sum \a\d\d\_\r\u\n\.\c`
sum=$1
case $sum in
53616)	:;;
*)	echo 'Bad sum in '\a\d\d\_\r\u\n\.\c >&2
esac
echo Extracting \p\a\s\s\2\.\c
sed 's/^X//' > \p\a\s\s\2\.\c << '+ END-OF-FILE '\p\a\s\s\2\.\c
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#include	"text.h"
X#include	"top.p"
X#include	"top.h"
X#include	"debug.h"
X
Xextern int ntexts;
Xextern struct text *text;
X
Xstatic upd_top(), upd_chunk();
X
Xpass2()	{
X	int n;
X	
X	for (n = 0; n < ntexts; n++)	{
X		struct text *txt = &text[n];
X		unsigned int ch_cnt;
X		long ls_pos;
X		unsigned int nl_cnt = 1;
X	
X		if (!txt->tx_needed)		/* an optimization */
X			continue;
X	
X		if (!OpenStream(2, txt->tx_fname))	{
X			printf("*** File %s disappeared\n", txt->tx_fname);
X			OpenStream(2, "/dev/null");
X		}
X	
X		while (NextPair(&ch_cnt, &ls_pos) == 0)	{
X			/*	fill in the lseek and line positions
X				in the collected runs
X			*/
X			nl_cnt++;
X#if	DEBUG == 1
X			printf("pass2 on %s: ch_cnt = %u, ls_pos = %ld\n",
X					txt->tx_fname, ch_cnt, ls_pos);
X#endif	DEBUG == 1
X			upd_top(txt, ch_cnt, ls_pos, nl_cnt);
X		}
X		CloseStream();
X	}
X}
X
Xstatic
Xupd_top(txt, ch_cnt, ls_pos, nl_cnt)
X	struct text *txt;
X	unsigned int ch_cnt;
X	long ls_pos;
X	unsigned int nl_cnt;
X{
X	TopGen tp;
X	struct run *run;
X	
X	OpenTop(&tp);
X	while ((run = NextTop(&tp)), run != NoObject)	{
X		struct chunk *cnk0 = &run->rn_cn0;
X		struct chunk *cnk1 = &run->rn_cn1;
X		
X		if (cnk0->ch_text == txt)
X			upd_chunk(cnk0, ch_cnt, ls_pos, nl_cnt);
X		if (cnk1->ch_text == txt)
X			upd_chunk(cnk1, ch_cnt, ls_pos, nl_cnt);
X	}
X	CloseTop(&tp);
X}
X
Xstatic
Xupd_chunk(cnk, ch_cnt, ls_pos, nl_cnt)
X	struct chunk *cnk;
X	unsigned int ch_cnt;
X	long ls_pos;
X	unsigned int nl_cnt;
X{
X	if (ch_cnt <= cnk->ch_st_ch)	{
X		cnk->ch_st_ls = ls_pos;
X		cnk->ch_st_nl = nl_cnt;
X	}
X	if (cnk->ch_lm_ls == 0 && cnk->ch_lm_ch <= ch_cnt)	{
X		cnk->ch_lm_ls = ls_pos;
X		cnk->ch_lm_nl = nl_cnt;
X	}
X}
+ END-OF-FILE pass2.c
chmod 'u=rw,g=r,o=r' \p\a\s\s\2\.\c
set `sum \p\a\s\s\2\.\c`
sum=$1
case $sum in
15182)	:;;
*)	echo 'Bad sum in '\p\a\s\s\2\.\c >&2
esac
echo Extracting \p\a\s\s\3\.\c
sed 's/^X//' > \p\a\s\s\3\.\c << '+ END-OF-FILE '\p\a\s\s\3\.\c
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X#include	<stdio.h>
X#include	"params.h"
X#include	"text.h"
X#include	"top.p"
X#include	"top.h"
X#include	"debug.h"
X
Xextern char options[];
X
Xstatic FILE *chunk_open();
Xstatic unsigned int fill_line();
Xstatic show_chunk(), show_line(), clear_line(), print_run();
X
X#define	MAXLINE		(PAGE_WIDTH/2-2)
X
Xpass3()	{
X	TopGen tp;
X	struct run *run;
X
X	OpenTop(&tp);
X	while ((run = NextTop(&tp)), run != NoObject)	{
X		print_run(run);
X		show_chunk(run);
X		printf("\n");
X	}
X	CloseTop(&tp);
X}
X
Xstatic
Xprint_run(run)
X	struct run *run;
X{
X#if	DEBUG == 1
X#include	"buff.h"
X	unsigned int i;
X	struct chunk *cnk0 = &run->rn_cn0;
X	struct chunk *cnk1 = &run->rn_cn1;
X	
X	printf("File %s vs. file %s:\n",
X		cnk0->ch_text->tx_fname,
X		cnk1->ch_text->tx_fname
X	);
X	printf("from C-char %d,%d to %d,%d:",
X		cnk0->ch_st_ch, cnk1->ch_st_ch,
X		cnk0->ch_lm_ch, cnk1->ch_lm_ch
X	);
X	printf(" from ASCII-char %d,%d to %d,%d:",
X		cnk0->ch_st_ls, cnk1->ch_st_ls,
X		cnk0->ch_lm_ls, cnk1->ch_lm_ls
X	);
X	printf(" from lines %d,%d to %d,%d:",
X		cnk0->ch_st_nl, cnk1->ch_st_nl,
X		cnk0->ch_lm_nl - 1, cnk1->ch_lm_nl - 1
X	);
X	printf(" %d C-chars\n", run->rn_quality);
X	
X	/* show C-chars, with a one-char margin */
X	for (	i = cnk0->ch_st_ch - 1;
X		i < cnk0->ch_lm_ch + 1;
X		i++
X	)	{
X		putchar(buff[cnk0->ch_text->tx_start + i]);
X	}
X	printf("\n");
X	for (	i = cnk1->ch_st_ch - 1;
X		i < cnk1->ch_lm_ch + 1;
X		i++
X	)	{
X		putchar(buff[cnk1->ch_text->tx_start + i]);
X	}
X	printf("\n");
X#endif	DEBUG == 1
X
X#ifdef	lint
X	run = run;
X#endif	lint
X}
X
X
Xstatic
Xshow_chunk(run)
X	struct run *run;
X{
X	/* The animals came in two by two ... */
X	struct chunk *cnk0 = &run->rn_cn0;
X	struct chunk *cnk1 = &run->rn_cn1;
X	unsigned int nl_cnt0 = cnk0->ch_lm_nl - cnk0->ch_st_nl;
X	unsigned int nl_cnt1 = cnk1->ch_lm_nl - cnk1->ch_st_nl;
X	FILE *f0;
X	FILE *f1;
X	char line0[MAXLINE + 1];
X	char line1[MAXLINE + 1];
X	extern char *sprintf();
X	
X	sprintf(line0, "%s: line %d-%d",
X		cnk0->ch_text->tx_fname,
X		cnk0->ch_st_nl, cnk0->ch_lm_nl - 1, run->rn_quality);
X	sprintf(line1, "%s: line %d-%d",
X		cnk1->ch_text->tx_fname,
X		cnk1->ch_st_nl, cnk1->ch_lm_nl - 1, run->rn_quality);
X	show_line(line0, line1);
X	if (options['n'])
X		return;			/* ... had enough so soon ... */
X
X	f0 = chunk_open(cnk0);
X	f1 = chunk_open(cnk1);
X	
X	/* fill lines and print them */
X	while (nl_cnt0 != 0 || nl_cnt1 != 0)	{
X		if (nl_cnt0)	{
X			fill_line(f0, line0);
X			nl_cnt0--;
X		}
X		else	clear_line(line0);
X		if (nl_cnt1)	{
X			fill_line(f1, line1);
X			nl_cnt1--;
X		}
X		else	clear_line(line1);
X		show_line(line0, line1);
X	}
X	
X	fclose(f0);
X	fclose(f1);
X}
X
Xstatic FILE *
Xchunk_open(cnk)
X	struct chunk *cnk;
X{
X	/*	opens the file in which the chunk resides and positions
X		the file at the beginning of the chunk
X	*/
X	char *fname = cnk->ch_text->tx_fname;
X	FILE *f = fopen(fname, "r");
X	
X	if (f == NULL)	{
X		printf("*** File %s disappeared\n", fname);
X		f = fopen("/dev/null", "r");
X	}
X	fseek(f, cnk->ch_st_ls, 0);
X	return f;
X}
X
Xstatic unsigned int
Xfill_line(f, ln)
X	FILE *f;
X	char ln[];
X{
X	/*	Reads one line from f and puts it in condensed form in ln.
X	*/
X	int ch;
X	int indent = 0, lpos = 0;
X	
X	/* condense and skip initial blank */
X	while ((ch = getc(f)), ch == ' ' || ch == '\t')	{
X		if (ch == '\t')
X			indent = 8;
X		else
X			indent++;
X		if (indent == 8)	{
X			/* every eight blanks give one blank */
X			if (lpos < MAXLINE)
X				ln[lpos++] = ' ';
X			indent = 0;
X		}
X	}
X	
X	/* store the rest */
X	while (ch >= 0 && ch != '\n')	{
X		if (ch == '\t')		/* replace tabs by blanks */
X			ch = ' ';
X		if (lpos < MAXLINE)
X			ln[lpos++] = ch;
X		ch = getc(f);
X	}
X	ln[lpos] = '\0';		/* always room for this one */
X}
X
Xstatic
Xclear_line(ln)
X	char ln[];
X{
X	/* a simple null byte will suffice */
X	ln[0] = '\0';
X}
X
Xstatic
Xshow_line(ln0, ln1)
X	char ln0[], ln1[];
X{
X	int i;
X	
X	for (i = 0; i < MAXLINE && ln0[i] != '\0'; i++)
X		putchar(ln0[i]);
X	for (; i < MAXLINE; i++)
X		putchar(' ');
X	printf(" |");
X
X	for (i = 0; i < MAXLINE && ln1[i] != '\0'; i++)
X			putchar(ln1[i]);
X	printf("\n");
X}
+ END-OF-FILE pass3.c
chmod 'u=rw,g=r,o=r' \p\a\s\s\3\.\c
set `sum \p\a\s\s\3\.\c`
sum=$1
case $sum in
35264)	:;;
*)	echo 'Bad sum in '\p\a\s\s\3\.\c >&2
esac
echo Extracting \m\a\i\n\.\c
sed 's/^X//' > \m\a\i\n\.\c << '+ END-OF-FILE '\m\a\i\n\.\c
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X/*
X	This is a service program for the similarity tester.
X	A call of 'cstream -1 inp.c' yields the tokens of inp.c as
X	single characters, as used by pass1 of sim.
X	A call of 'cstream -2 inp.c' yields a list of <token_number,
X	character_number> pairs, one for each line.  This is used by
X	pass3.
X*/
X
X#include	<stdio.h>
X
Xchar options[128];
X
Xmain(argc, argv)
X	char *argv[];
X{
X	if (argc != 3)	{
X		fprintf(stderr, "Call is: %s -[12] inp.c\n", argv[0]);
X		return 1;
X	}
X
X	if (!OpenStream(argv[1][1] - '0', argv[2]))	{
X		fprintf(stderr, "%s: cannot open\n", argv[2]);
X		return 1;
X	}
X
X	if (argv[1][1] == '1')	{
X		char ch;
X
X		while (NextChar(&ch) == 0)
X			putchar(ch);
X	}
X	else
X	if (argv[1][1] == '2')	{
X		unsigned int ch_cnt;
X		long ls_pos;
X
X		while (NextPair(&ch_cnt, &ls_pos) == 0)
X			printf("%ld,%ld\n", ch_cnt, ls_pos);
X	}
X	return 0;
X}
+ END-OF-FILE main.c
chmod 'u=rw,g=r,o=r' \m\a\i\n\.\c
set `sum \m\a\i\n\.\c`
sum=$1
case $sum in
07408)	:;;
*)	echo 'Bad sum in '\m\a\i\n\.\c >&2
esac
echo Extracting \c\l\a\n\g\.\l
sed 's/^X//' > \c\l\a\n\g\.\l << '+ END-OF-FILE '\c\l\a\n\g\.\l
X%{
X/*	This file is part of the software similarity tester SIM.
X	Written by Dick Grune, Vrije Universiteit, Amsterdam.
X*/
X
X/*
X	C language front end for the similarity tester.
X*/
X
X/* Language-dependent Code */
X#include	"idf.h"
X
Xstatic struct idf ppcmd[] =	{
X	"define",	'D',
X	"else",		'E',
X	"endif",	'Z',
X	"if",		'F',
X	"ifdef",	'Y',
X	"ifndef",	'N',
X	"include",	'I',
X	"line",		'L',
X	"undef",	'U'
X};
X
Xstatic struct idf reserved[] =	{
X	"auto",		'a',
X	"break",	'b',
X	"case",		'k',
X	"char",		'c',
X	"continue",	'z',
X	"default",	'_',
X	"do",		'd',
X	"double",	'm',
X	"else",		'e',
X	"enum",		'n',
X	"extern",	'q',
X	"float",	'y',
X	"for",		'f',
X	"goto",		'g',
X	"if",		'i',
X	"int",		'j',
X	"long",		'l',
X	"register",	'\0',		/* ignore */
X	"return",	'r',
X	"short",	'h',
X	"sizeof",	'o',
X	"static",	'p',
X	"struct",	's',
X	"switch",	'x',
X	"typedef",	't',
X	"union",	'u',
X	"unsigned",	'v',
X	"void",		'\0',		/* ignore */
X	"while",	'w'
X};
X
Xstatic int
Xis_trailer(ch)	{
X	return ch == ')' || ch == '}' || ch == ';';
X}
X
Xint
XMayBeStartOfRun(ch)	{
X	return !is_trailer(ch);
X}
X
Xunsigned int
XCheckRun(str, size) char *str; unsigned int size;	{
X	/*	Checks the run starting at str with length size for
X		acceptability in the language.  Cuts from the end if
X		necessary and returns the accepted length (which may
X		be zero).
X	*/
X	extern char options[];
X	
X	if (options['f'])	{	/* function-like forms only */
X		unsigned int pos;
X		unsigned int lb_pos = 0;/* latest balancing position */
X		int braces = 0;
X		int parens = 0;
X		int brackets = 0;
X		
X		for (pos = 0; pos < size; pos++)	{
X			switch (str[pos])	{
X			case '{': braces++; break;
X			case '}': braces--; break;
X			case '(': parens++; break;
X			case ')': parens--; break;
X			case '[': brackets++; break;
X			case ']': brackets--; break;
X			}
X			if (	/* this was one closer too many */
X				braces < 0 || parens < 0 || brackets < 0
X			)	{
X				break;
X			}
X			if (	/* it happens to balance here */
X				braces == 0 && parens == 0 && brackets == 0
X			)	{
X				lb_pos = pos + 1;
X			}
X		}
X		size = lb_pos;			/* cut to size */
X	}
X	else	{
X		while (	/* there is trailing garbage */
X			size != 0 &&
X			(str[size - 1] == '@' || is_trailer(str[size - 1]))
X		)	{
X			/* remove it */
X			size--;
X		}
X	}
X	return size;
X}
X
X/* Language-INdependent Code */
X#include	"stream.h"
X
Xyystart()	{
X	BEGIN INITIAL;
X}
X
Xstatic int
Xyywrap()	{
X	return 1;
X}
X
X%}
X
X%Start	Comment
X
XAnyQuoted	(\\.)
XStrChar		([^"\n\\]|{AnyQuoted})
XChrChar		([^'\\]|{AnyQuoted})
XComChar		([^*\n]|(\*[^/]))
XIdf		([A-Za-z][A-Za-z0-9_]*)
X
X%%
X
X\"{StrChar}*\"	{	/* strings */
X		cput('"');
X		count();
X	}
X
X\'{ChrChar}\'	{	/* characters */
X		cput('\'');
X		count();
X	}
X
X"/*"	{	/*	We cannot have one pattern for a comment
X			(although one can be written), since the matched
X			string would overflow lex-internal buffers like
X			yysbuf and yytext. So we have to break up the string
X			into lines and keep track of where we are in a start
X			condition <Comment>.
X		*/
X		BEGIN Comment;
X		count();
X	}
X
X<Comment>{ComChar}*	{	/* comment up to \n or end-of-comment */
X		count();
X	}
X
X<Comment>"*/"	{		/* end-of-comment */
X		BEGIN INITIAL;
X		count();
X	}
X
X#[ \t]*include.*	{	/* skip #include line */
X		count();
X	}
X
X#[ \t]*{Idf}	{	/* a preprocessor line */
X		char *n = yytext+1;
X		int ch;
X
X		/* skip layout in front of preprocessor identifier */
X		while (*n == ' ' || *n == '\t')
X			n++;
X		ch = idf2char(n, ppcmd);
X		if (ch < 0)	{
X			cput('#');
X		}
X		else
X			cput(ch);
X		count();
X	}
X
X{Idf}	{
X		int ch = idf2char(yytext, reserved);
X
X		if (ch < 0)
X			cput('@');
X		else
X		if (ch > 0)
X			cput(ch);
X		count();
X	}
X
X[ \t]	{	/* layout */
X		count();
X	}
X
X\n	{	/* count newlines */
X		count();
X		c_eol();
X	}
X
X.	{	/* copy other text */
X		cput(yytext[0]);
X		count();
X	}
X
X%%
+ END-OF-FILE clang.l
chmod 'u=rw,g=r,o=r' \c\l\a\n\g\.\l
set `sum \c\l\a\n\g\.\l`
sum=$1
case $sum in
21016)	:;;
*)	echo 'Bad sum in '\c\l\a\n\g\.\l >&2
esac
exit 0

sources-request@panda.UUCP (03/03/86)

Mod.sources:  Volume 4, Issue 12

[ Contained here are two updates to "sim", the software similarity
  tester that was submitted two weeks ago.  One provides for compiling
  the program on a Gould, and the other provides more ambitious changes
  including a new option to specify line width, as well as changes
  specific to the HP-UX environment.

    - John P. Nelson, Moderator, mod.sources
]
  
-------------------------------------------------------------------
Submitted by: tektronix!tekig4!georgew (George Walker)

This is a quick fix to allow sim to run on a Gould (which has 64K segment
problems).  It prevents stack overflow.

Change is to hash.c (I have moved the declaration out of the function and made
it a static):

20a21,25
> /*NOBASE*/				/* <- required by Gould compiler */
> static unsigned int last[N_HASH];
> /*	last[i] is the index of the latest char with hash_code i,
> 	or 0 if there is none.
> */
27,30d31
< 	unsigned int last[N_HASH];
< 	/*	last[i] is the index of the latest char with hash_code i,
< 		or 0 if there is none.
< 	*/

George Walker

(UUCP)	   {ucbvax,decvax,chico,pur-ee,cbosg,ihnss}!tektronix!tekig4!georgew
(CSNET)	   tekig4!georgew@tek
(ARPANET)  tekig4!georgew.tek@rand-relay


-------------------------------------------------------------------
Submitted by: hplabs!hp-lsd!paul (Paul Bame)


Having a large-screen display, I decided to change sim(1) to accept a command 
line argument allowing specification of line width.   The  following  patches
(output  of  diff  -c)  patch  the  files  sim.c,  sim.1 (yea, even fixed the
manual), pass3.c, and params.h to accommodate the  change.   I  attempted  to
make the changes in the same style as the original software.  

Patches to Makefile and hash.c are suitable for HP-UX installations  and  may
be  safely  removed  if you're not running HP-UX.  The local symbol space was
too large in a function in hash.c so I moved the offending array outside  the
function.  The MANDIR and BINDIR variables were changed in Makefile.  

Many thanks to Dick Grune for this software.

		--Paul Bame
		UUCP: {hplabs,ihnp4!hpfcla}!hp-lsd!paul
		CSNET: hp-lsd!paul@hp-labs
		ARPA: hp-lsd!paul%hp-labs@csnet-relay.arpa


*** /tmp/,RCSt1a03232	Thu Feb 20 13:53:49 1986
--- Makefile	Thu Feb 20 13:52:04 1986
***************
*** 2,9
  #	Written by Dick Grune, Vrije Universiteit, Amsterdam.
  #
  
! BINDIR = /user1/dick/bin#		# where to put the binary (sim)
! MANDIR = /user1/dick/man#		# where to put the manual page (sim.1)
  
  #
  #	Each module (set of programs that together perform some function)

--- 2,9 -----
  #	Written by Dick Grune, Vrije Universiteit, Amsterdam.
  #
  
! BINDIR = /usr/local/bin#		# where to put the binary (sim)
! MANDIR = /usr/local/man/man1#		# where to put the manual page (sim.1)
  
  #
  #	Each module (set of programs that together perform some function)
*** /tmp/,RCSt1a03237	Thu Feb 20 13:53:56 1986
--- hash.c	Thu Feb 20 13:52:12 1986
***************
*** 23,28
  static tally_right = 0, tally_wrong = 0;
  static tally_hash(), print_tally();
  
  make_hash()	{
  	unsigned int last[N_HASH];
  	/*	last[i] is the index of the latest char with hash_code i,

--- 23,31 -----
  static tally_right = 0, tally_wrong = 0;
  static tally_hash(), print_tally();
  
+ /* moved last[] out of make_hash() because too large local data area */
+ /* according to HPUX C compiler  - Paul Bame */
+ static unsigned int last[N_HASH];
  make_hash()	{
  	/*	last[i] is the index of the latest char with hash_code i,
  		or 0 if there is none.
***************
*** 24,30
  static tally_hash(), print_tally();
  
  make_hash()	{
- 	unsigned int last[N_HASH];
  	/*	last[i] is the index of the latest char with hash_code i,
  		or 0 if there is none.
  	*/

--- 27,32 -----
  /* according to HPUX C compiler  - Paul Bame */
  static unsigned int last[N_HASH];
  make_hash()	{
  	/*	last[i] is the index of the latest char with hash_code i,
  		or 0 if there is none.
  	*/
*** /tmp/,RCSt1a03242	Thu Feb 20 13:54:02 1986
--- params.h	Thu Feb 20 13:52:28 1986
***************
*** 5,8
  #define	MIN_RUN		24		/*	default minimum size
  						of interesting run
  					*/
! #define	PAGE_WIDTH	80

--- 5,9 -----
  #define	MIN_RUN		24		/*	default minimum size
  						of interesting run
  					*/
! #define	DEFAULT_PAGE_WIDTH	80	/* default page width */
! #define	MAX_PAGE_WIDTH		160	/* maximum possible page width */
*** /tmp/,RCSt1a03247	Thu Feb 20 13:54:08 1986
--- pass3.c	Thu Feb 20 13:52:36 1986
***************
*** 10,15
  #include	"debug.h"
  
  extern char options[];
  
  static FILE *chunk_open();
  static unsigned int fill_line();

--- 10,16 -----
  #include	"debug.h"
  
  extern char options[];
+ extern int page_width;
  
  static FILE *chunk_open();
  static unsigned int fill_line();
***************
*** 14,19
  static FILE *chunk_open();
  static unsigned int fill_line();
  static show_chunk(), show_line(), clear_line(), print_run();
  
  #define	MAXLINE		(PAGE_WIDTH/2-2)
  

--- 15,21 -----
  static FILE *chunk_open();
  static unsigned int fill_line();
  static show_chunk(), show_line(), clear_line(), print_run();
+ static int maxline;	/* Actual maxline */
  
  #define	MAXLINE		(MAX_PAGE_WIDTH/2-2)
  
***************
*** 15,21
  static unsigned int fill_line();
  static show_chunk(), show_line(), clear_line(), print_run();
  
! #define	MAXLINE		(PAGE_WIDTH/2-2)
  
  pass3()	{
  	TopGen tp;

--- 17,23 -----
  static show_chunk(), show_line(), clear_line(), print_run();
  static int maxline;	/* Actual maxline */
  
! #define	MAXLINE		(MAX_PAGE_WIDTH/2-2)
  
  pass3()	{
  	TopGen tp;
***************
*** 21,26
  	TopGen tp;
  	struct run *run;
  
  	OpenTop(&tp);
  	while ((run = NextTop(&tp)), run != NoObject)	{
  		print_run(run);

--- 23,30 -----
  	TopGen tp;
  	struct run *run;
  
+ 	maxline = page_width / 2 - 2;
+ 
  	OpenTop(&tp);
  	while ((run = NextTop(&tp)), run != NoObject)	{
  		print_run(run);
***************
*** 164,170
  			indent++;
  		if (indent == 8)	{
  			/* every eight blanks give one blank */
! 			if (lpos < MAXLINE)
  				ln[lpos++] = ' ';
  			indent = 0;
  		}

--- 168,174 -----
  			indent++;
  		if (indent == 8)	{
  			/* every eight blanks give one blank */
! 			if (lpos < maxline)
  				ln[lpos++] = ' ';
  			indent = 0;
  		}
***************
*** 174,180
  	while (ch >= 0 && ch != '\n')	{
  		if (ch == '\t')		/* replace tabs by blanks */
  			ch = ' ';
! 		if (lpos < MAXLINE)
  			ln[lpos++] = ch;
  		ch = getc(f);
  	}

--- 178,184 -----
  	while (ch >= 0 && ch != '\n')	{
  		if (ch == '\t')		/* replace tabs by blanks */
  			ch = ' ';
! 		if (lpos < maxline)
  			ln[lpos++] = ch;
  		ch = getc(f);
  	}
***************
*** 195,201
  {
  	int i;
  	
! 	for (i = 0; i < MAXLINE && ln0[i] != '\0'; i++)
  		putchar(ln0[i]);
  	for (; i < MAXLINE; i++)
  		putchar(' ');

--- 199,205 -----
  {
  	int i;
  	
! 	for (i = 0; i < maxline && ln0[i] != '\0'; i++)
  		putchar(ln0[i]);
  	for (; i < maxline; i++)
  		putchar(' ');
***************
*** 197,203
  	
  	for (i = 0; i < MAXLINE && ln0[i] != '\0'; i++)
  		putchar(ln0[i]);
! 	for (; i < MAXLINE; i++)
  		putchar(' ');
  	printf(" |");
  

--- 201,207 -----
  	
  	for (i = 0; i < maxline && ln0[i] != '\0'; i++)
  		putchar(ln0[i]);
! 	for (; i < maxline; i++)
  		putchar(' ');
  	printf(" |");
  
***************
*** 201,207
  		putchar(' ');
  	printf(" |");
  
! 	for (i = 0; i < MAXLINE && ln1[i] != '\0'; i++)
  			putchar(ln1[i]);
  	printf("\n");
  }

--- 205,211 -----
  		putchar(' ');
  	printf(" |");
  
! 	for (i = 0; i < maxline && ln1[i] != '\0'; i++)
  			putchar(ln1[i]);
  	printf("\n");
  }
*** /tmp/,RCSt1a03252	Thu Feb 20 13:54:16 1986
--- sim.1	Thu Feb 20 13:52:40 1986
***************
*** 9,14
  [
  .B \-[fns]
  .BI \-r N
  ]
  file ...
  .SH DESCRIPTION

--- 9,15 -----
  [
  .B \-[fns]
  .BI \-r N
+ .BI \-w N
  ]
  file ...
  .SH DESCRIPTION
***************
*** 22,27
  are found, they are reported on standard output; the default length
  minimum is 24, but can be reset by the
  .BR \-r -parameter.
  .PP
  The program can be used for finding copied pieces of code in
  purportedly unrelated programs (with the

--- 23,33 -----
  are found, they are reported on standard output; the default length
  minimum is 24, but can be reset by the
  .BR \-r -parameter.
+ .PP
+ The page width used may be changed from the default of 80 columns
+ with the
+ .BR \-w -parameter.
+ Maximum page width is 160 columns.
  .PP
  The program can be used for finding copied pieces of code in
  purportedly unrelated programs (with the
*** /tmp/,RCSt1a03257	Thu Feb 20 13:54:22 1986
--- sim.c	Thu Feb 20 13:52:47 1986
***************
*** 5,10
  #include	"params.h"
  
  int min_run_size = MIN_RUN;
  
  char options[128];			/* for various, extensible flags */
  

--- 5,11 -----
  #include	"params.h"
  
  int min_run_size = MIN_RUN;
+ int page_width   = DEFAULT_PAGE_WIDTH;
  
  char options[128];			/* for various, extensible flags */
  
***************
*** 20,25
  			switch (*par)	{
  			case 'r':
  				min_run_size = atoi(argv[1]);
  				argc--, argv++;
  				break;
  			default:

--- 21,30 -----
  			switch (*par)	{
  			case 'r':
  				min_run_size = atoi(argv[1]);
+ 				argc--, argv++;
+ 				break;
+ 			case 'w':
+ 				page_width = atoi(argv[1]);
  				argc--, argv++;
  				break;
  			default: