[net.sources] clash: discover indistinct identifiers

hugh@hcrvx1.UUCP (Hugh Redelmeier) (01/11/85)

There has been much discussion recently about how the C standard
may allow conforming compilers to ignore all but six characters
of an identifier, and to ignore case.  Also, many Unices (is that
a trade mark?) care about all of a long identifier (System V release 2
and 4.2BSD) while many others don't.  Other net discussions claim it
is sinful to distinguish identifiers by case.  So I am distributing "clash",
a program soon to celebrate its tenth birthday.  I wrote it under
5th edition Unix, when that was current.  I had hoped clash would become
obsolete in short order by improvements in cc/as/ld.  Then when lint
was first described, I had thought surely it would take over the
job.  Oh well.


# This is a shell archive.  Remove anything before this line, then
# unpack it by saving it in a file and typing "sh file".  (Files
# unpacked will be owned by you and have default permissions.)
#
# This archive contains:
# clash.1 clash.c

echo x - clash.1
cat > "clash.1" << '//E*O*F clash.1//'
.TH CLASH 1 Local
.SH NAME
clash \- find indistinguishable or long identifiers
.SH SYNOPSIS
.B clash
[
\fB\-actdlsm\fIn\fR
] [ file ] ...
.SH DESCRIPTION
.IR Clash
finds identifiers that are not distinct in the first
.I numSigChars
characters,
or finds identifiers that are longer than
.I numSigChars
characters.
It lexically analyzes its input, ignoring comments.
It does not parse, so it does not understand scoping.
Some restrictions that
.I clash
might help detect:
.IP -
Most Unix file systems consider file names (components of pathnames) identical
if their first 14 characters are identical.
.IP -
Many Unix assemblers and loaders consider only the first eight characters
of an identifier.
.IP -
Many C compilers treat identifiers as identical if their first seven
characters are the same (eight for identifiers that are not external).
In fact, the ANSI C standard will probably make it legal for
conforming compilers to ignore all but the first six characters
and to ignore case distinctions.
.IP -
Yacc terminals become C preprocessor symbols, and should therefore
differ within the first eight characters.
.PP
The argument list is a sequence of input file
names and flags.
If no input file name is given, the standard input is processed.
.PP
A flag operand starts with ``\-'' and continues with any number
of option names.
Flags d, l, s, and m toggle a corresponding switch.
.TP
.B \-a
the input is a PDP-11 assembler program
.TP
.B \-c
the input is a C program (default)
.TP
.B \-t
the input is some other language (``text'')
.TP
.B \-d
dump on error
.TP
.B \-l
print long identifiers
.TP
.B \-s
separate: process each file independently
.TP
.B \-m
monocase: case distinctions don't count
.TP
.BI \- n
sets
.I numSigChars
(default is 7)
.SH "LOCAL INFO"
Written at the University of Toronto by D. Hugh Redelmeier.
.SH BUGS
Understands neither libraries nor #include commands: all
relevant files must be explicitly scanned.
.br
The maximum number of symbols and
the maximum number of characters in them
are fixed.
//E*O*F clash.1//

echo x - clash.c
cat > "clash.c" << '//E*O*F clash.c//'
/* Clash: find indistinct or long identifiers
 *
 * Synopsis:
 * 	clash [-actdlsm<number>] file ...
 *
 * Description:
 *
 * This program finds identifiers that are longer than 'numSigChars',
 * and those that are not distinct in the first 'numSigChars' chars.
 * The user can specify whether case distinctions are significant.
 *
 * 	-a means Unix PDP-11 Assembler input
 * 	-c means input is c program (default)
 * 	-t means text (scan comments and strings)
 * 	-d means dump on error
 * 	-l means print long identifiers
 * 	-s means separate: process each file independently
 *	-m means monocase: case distinctions don't count
 * 	-<number> sets 'numSigChars' (default is 7)
 *
 * Flags d, l, s, and m toggle a corresponding switch.
 *
 * Bugs:
 *	Does not understand libraries and include files: all
 *	relevant files must be explicitly scanned.
 *	The maximum number of symbols (sizeSymTab) and
 *	the maximum number of characters in them (sizeStrTab)
 *	are fixed at compile time.
 *
 * D. Hugh Redelmeier 75.06.11
 *
 * 1979?: Modified for Version 6 "typesetting tape" C.
 * 1984 Nov 29: Modified for System V C.
 * 1984 Dec 4: Added -m flag.
 */

#define UNINIT
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Boolean values used for the option switches and for function results. */
#define TRUE 1
#define FALSE 0

/* Option switches; main() sets them from the command-line flags. */
int asmSw = FALSE;	/* -a: scan input as assembler; idHead() then lets '.' start an identifier */
int dumpOnError = FALSE;	/* -d: a fatal error calls abort() instead of exit(1) */
int iLong = FALSE;	/* -l: report identifiers longer than numSigChars */
int separateSw = FALSE;	/* -s: tabulate and report each input file on its own */
int textSw = FALSE;	/* -t: plain text; comments and strings are scanned like anything else */
int caseMask = 0177;	/* ANDed with each char before it is compared or hashed;
			 * -m flips the 'a'^'A' bit so that case is ignored */
int numSigChars = 7;	/* number of leading chars in which identifiers must differ;
			 * set by -<number> */

/* Maximum number of distinct identifiers that can be recorded. */
#define sizeSymTab 1000
/* One recorded identifier. */
struct symCell {
		/* first character of the identifier's text inside strTab;
		 * the text is not NUL-terminated, use cIdLen */
		char *cIdPtr;
		/* number of characters in the identifier */
		int cIdLen;
		/* an earlier identifier that this one is not distinct from,
		 * or NULL; printClash() follows these links */
		struct symCell *cClashPtr;
		/* next cell in the same hashTab bucket, or NULL */
		struct symCell *cNext;
		};

/* Pool of cells, handed out in order through freeSymCell. */
struct symCell symTab[sizeSymTab] UNINIT;

int eLong UNINIT;
int eClash UNINIT;

struct symCell *freeSymCell UNINIT;

#define sizeHashTab 373
struct symCell *hashTab[sizeHashTab] UNINIT;

#define sizeStrTab 10000
char strTab[sizeStrTab] UNINIT;

char *freeChar UNINIT;
char *idPtr UNINIT;
unsigned idHash UNINIT;
int idLen UNINIT;

char lastInChar UNINIT;

int lineNo UNINIT;
char **argCursor UNINIT;
char *mIn UNINIT, *mFile UNINIT;	/* message about input file */

/* assert: fatal-error check.  This is the program's own two-argument
 * function, not the <assert.h> macro, so <assert.h> must not be
 * included in this file.
 *
 *	p	condition that must hold (nonzero)
 *	s	message to print when it does not
 *
 * When p is zero, prints "clash(<argument>) in line <n>: <s>." on the
 * standard output (the line part only when lineNo is nonzero) and then
 * terminates: abort() if dumpOnError is set, otherwise exit(1).
 * Does nothing when p is nonzero.
 */
assert(p,s) register int p; register char *s; {
	register char *where;

	if (!p) {
		/* While standard input is being read argCursor has stepped
		 * onto the NULL that ends argv; never hand that to %s.
		 */
		where = "stdin";
		if (argCursor!=NULL && *argCursor!=NULL)
			where = *argCursor;
		printf("\nclash(%s)",where);
		if (lineNo)
			printf(" in line %d",lineNo);
		printf(": %s.\n",s);
		/* abort() need not flush stdio buffers: push the message
		 * (and any report already printed) out first.
		 */
		fflush(stdout);
		if (dumpOnError)
			abort();
		/*else*/
			exit(1);
		}
	}

/* init: forget everything recorded so far and get ready for fresh input. */
init() {
	register int slot;

	/* empty every hash bucket */
	for (slot=0; slot<sizeHashTab; slot++)
		hashTab[slot] = NULL;

	/* hand back all symbol cells and all identifier text */
	freeSymCell = symTab;
	freeChar = strTab;

	/* nothing long and nothing clashing has been seen */
	eLong = FALSE;
	eClash = FALSE;

	/* getId() resumes from lastInChar; starting it on a newline makes
	 * the first call count line 1 and then read the first character
	 */
	lastInChar = '\n';
	}

/* printAndInit: report on what has been collected, then start afresh.
 *
 * Does nothing when no identifier text is stored (freeChar==strTab).
 * Otherwise prints the long-identifier list when -l is on and a long
 * identifier was seen, prints the clash list when a clash was seen,
 * and finally calls init() to empty the tables.
 */
printAndInit() {
	/* Both are defined further down the file; declare them here so
	 * the calls below do not rely on implicit declarations.
	 */
	int printLong(), printClash();

	if (freeChar!=strTab) {
		if (iLong && eLong)
			printLong();
		if (eClash)
			printClash();
		init();
		}
	}

/* strSimilar: do p1 and p2 agree in their leading characters?
 *
 * The number of characters compared is the smaller of idLen (the
 * length of the identifier most recently read) and numSigChars.  Each
 * pair is compared after masking with caseMask.  Returns 1 when every
 * compared pair agrees, 0 at the first pair that differs.
 */
int
strSimilar(p1,p2) register char *p1, *p2; {
	register int left;

	left = numSigChars;
	if (idLen < left)
		left = idLen;
	while (left-- > 0)
		if (((*p1++ ^ *p2++) & caseMask) != 0)
			return 0;
	return 1;
	}

/* printSym: write the identifier held in cell p to the standard output.
 * The text is not NUL-terminated, so exactly cIdLen characters are
 * written one at a time.
 */
printSym(p) register struct symCell *p; {
	register char *ch;
	register int left;

	ch = p->cIdPtr;
	left = p->cIdLen;
	while (left != 0) {
		putchar(*ch++);
		left--;
		}
	}

/* idHead: may character c begin an identifier?
 * Letters and '_' always may; '.' may as well when asmSw is on.
 * Returns 1 or 0.
 */
int
idHead(c) register char c; {
	if ('a'<=c && c<='z')
		return 1;
	if ('A'<=c && c<='Z')
		return 1;
	if (c=='_')
		return 1;
	/* only the assembler lets a name start with a dot */
	return asmSw && c=='.';
	}

/* getId: read the next identifier from the standard input.
 *
 * Scanning resumes from lastInChar, the character left over by the
 * previous call (init() sets it to a newline).  Characters that cannot
 * start an identifier (see idHead) are skipped according to the mode:
 *	textSw	nothing is special; every character is looked at
 *	asmSw	'/' comments to end of line, and the operands that follow
 *		", ', < and \ are skipped
 *	neither	C: slash-star and // comments, string literals and
 *		character constants are skipped
 * lineNo is incremented as newlines are passed.
 *
 * The identifier's characters are appended to strTab at freeChar.  On
 * a TRUE return idPtr points at them, idLen is their count, idHash is
 * the hashTab bucket number (built from at most the first numSigChars
 * characters, each masked with caseMask), eLong has been set if idLen
 * exceeds numSigChars, and lastInChar holds the character that ended
 * the identifier.
 *
 * Returns FALSE when end of file is met directly after a newline.
 * End of file anywhere else, a newline inside a string, and running
 * out of room in strTab are fatal errors reported through assert().
 */
int
getId() {
	register int c;
	register char delim;

	/* resume with the character that ended the previous identifier */
	c=lastInChar;
	if (textSw)
		/* text: just look for the next character that can start a name */
		while (!idHead(c)) {
			if (c == '\n') {
				lineNo++;
				c=getchar();
				/* EOF right after a newline is the normal end of input */
				if (c ==EOF)
return FALSE;
				}
			else {
				assert(c!=EOF,"unexpected EOF");
				c=getchar();
				}
			}
	else if (asmSw)
		while (!idHead(c)) {
			switch (c) {
			case '\n':
				lineNo++;
				c=getchar();
				/* EOF right after a newline is the normal end of input */
				if (c==EOF)
return FALSE;
			continue;
			case '/':
				/* comment: discard the rest of the line; the newline
				 * (or EOF) it stops on is examined by the next pass
				 */
				do c=getchar();
					while (c!='\n' && c!=EOF);
			continue;
			case '\"':
				/* a double quote is followed by two characters, each
				 * of which may be written with a backslash; take the
				 * first here and the second in the ' case below
				 */
				c=getchar();
				if (c=='\\')
					c=getchar();
				/* leave the switch so the asserts below report it */
				if (c==EOF || c=='\n')
			break;
				/* fall through */
			case '\'':
				/* one character, possibly written with a backslash */
				c=getchar();
				if (c=='\\')
					c=getchar();
				break;
			case '<':
				/* string: run on to the closing '>'; a backslash
				 * protects the character after it
				 */
				do {
					if (c=='\\')
						c=getchar();
					assert((c!='\n')&(c!=EOF),
						"bad string");
					c=getchar();
					}
				while (c!='>');
				break;
			case '\\':
				/* a backslash protects the character after it */
				c=getchar();
				}
			/* c is now the last character of whatever was skipped
			 * (or some character of no interest): check it, drop it
			 */
			assert(c!='\n',"unexpected newline");
			assert(c!=EOF,"unexpected EOF");
			c=getchar();
			}
	else	/* must be c */
		while (!idHead(c)) {
			switch (c) {
			case '\n':
				lineNo++;
				c=getchar();
				/* EOF right after a newline is the normal end of input */
				if (c==EOF)
return FALSE;
			continue;
			case '/':
				c=getchar();
				if (c=='/') {
					/* comment to end of line; the newline (or EOF)
					 * it stops on is examined by the next pass
					 */
					do c=getchar();
						while (c!='\n' && c!=EOF);
			continue;
					}
				else if (c=='*') {
					/* block comment: hunt for a '*', then see whether
					 * a '/' follows it; if not, carry on hunting from
					 * the character that did follow
					 */
					c=getchar();
					do {
						while (c!='*') {
							if (c=='\n')
								lineNo++;
							else
								assert(c!=EOF,"unending comment");
							c=getchar();
							}
						c=getchar();
						} while (c!='/');
				break;
					}
				/* a lone '/': c already holds the character after it,
				 * so go round again without reading
				 */
				else
			continue;
			case '\"':
			case '\'':
				/* string literal or character constant: run on to the
				 * matching quote; a backslash protects the character
				 * after it
				 */
				delim=c;
				c=getchar();
				while (c!=delim) {
					if (c=='\\')
						c=getchar();
					assert((c!='\n') && (c!=EOF),
						"bad string");
					c=getchar();
					}
				}
		/* c is now the last character of whatever was skipped (or some
		 * character of no interest): check it, drop it
		 */
		assert(c!=EOF,"unexpected EOF");
		c=getchar();
		}
	/* c starts an identifier: collect it at the free end of strTab */
	idPtr=freeChar;
	idHash=0;
	idLen=0;
	do {
		assert(freeChar<&strTab[sizeStrTab],
			"char space");
		*freeChar++ = c;
		/* only the significant characters feed the hash, so that
		 * indistinct identifiers share a bucket; weighting each by
		 * its position makes the sum depend on their order
		 */
		if (++idLen <= numSigChars)
			idHash += (c&caseMask) * idLen;
		c=getchar();
		} while ('0'<=c && c<='9' || idHead(c));
	idHash %= sizeHashTab;
	if (idLen>numSigChars)
		eLong=TRUE;
	/* keep the character that ended the identifier for the next call */
	lastInChar=c;
	return TRUE;
	}

/* buildTables: read identifiers from the standard input until getId()
 * reports the end, entering each distinct one in symTab.
 *
 * A new identifier is linked onto the front of its hash bucket
 * (hashTab[idHash]).  If the bucket already holds an identifier that it
 * is not distinct from, the new cell's cClashPtr is pointed at the
 * first such identifier met and eClash is set.  The text of an
 * identifier that is not entered (too short to matter, or already
 * present) is dropped by backing freeChar up to idPtr.
 *
 * Two identifiers are not distinct when, each cut to numSigChars
 * characters, they have the same length and agree in every character
 * under caseMask.  The length test is made here because strSimilar()
 * knows only the new identifier's length: without it a short
 * identifier could be matched against the front of a longer one, and
 * strSimilar() could read past the end of a shorter stored one.
 *
 * Running out of symTab cells is a fatal error ("sym space").
 */
buildTables() {
	register struct symCell *s;
	register struct symCell *cp;
	register int newSig;	/* significant length of the new identifier */
	register int oldSig;	/* significant length of the stored identifier */

	while (getId())
		if (idLen<numSigChars && (caseMask&('a'^'A'))!=0)
			/* case counts and the name is shorter than the
			 * significant length: it cannot clash with anything
			 */
			freeChar=idPtr;
		else {
			newSig = idLen<numSigChars? idLen : numSigChars;
			cp=NULL;
			for (s=hashTab[idHash]; ; s=s->cNext) {
				if (s == NULL) {
					/* identifier not found: add it */
					assert(freeSymCell<&symTab[sizeSymTab],
						"sym space");
					s=freeSymCell++;
					s->cIdPtr = idPtr;
					s->cIdLen = idLen;
					s->cClashPtr = cp;
					s->cNext = hashTab[idHash];
					hashTab[idHash] = s;
			break;
					}
				if ((s->cIdLen == idLen) &&
				    strncmp(s->cIdPtr,idPtr,idLen)==0) {
					/* identifier found: done */
					freeChar=idPtr;
			break;
					}
				if (cp == NULL) {
					oldSig = s->cIdLen<numSigChars?
						s->cIdLen : numSigChars;
					if (oldSig == newSig &&
					    strSimilar(s->cIdPtr,idPtr)) {
						/* similar id found: remember */
						cp=s;
						eClash=TRUE;
						}
					}
				}
			}
	}

/* printLong: list, one per line and in the order they were first met,
 * the recorded identifiers that are longer than numSigChars.  The
 * heading ends with mIn and mFile, which main() sets.
 */
printLong() {
	register struct symCell *cell;

	printf("Symbols longer than %d chars%s%s:\n",numSigChars,mIn,mFile);
	cell = symTab;
	while (cell < freeSymCell) {
		if (cell->cIdLen > numSigChars) {
			printSym(cell);
			putchar('\n');
			}
		cell++;
		}
	}

/* printClash: list each group of indistinct identifiers on one line,
 * its members separated by '/'.
 *
 * The cells are visited from the newest back to the oldest.  A cell
 * that still has a clash link starts a line; the chain of cClashPtr
 * links is then followed, printing each member and clearing each link
 * on the way, so that no member starts a line of its own later.  The
 * links are therefore used up by this call.
 */
printClash() {
	register struct symCell *hp;
	register struct symCell *member;
	register struct symCell *older;

	printf("Identifiers not distinct in %d%s chars%s%s:\n",
		numSigChars, (caseMask&('a'^'A'))==0?" monocase" : "",
		mIn, mFile);
	hp = freeSymCell;
	while (hp > &symTab[0]) {
		--hp;
		if (hp->cClashPtr == NULL)
			continue;	/* no link (or already printed) */
		member = hp;
		printSym(member);
		while ((older = member->cClashPtr) != NULL) {
			member->cClashPtr = NULL;
			putchar('/');
			member = older;
			printSym(member);
			}
		putchar('\n');
		}
	}


int
main(argc,argv) int argc; char *argv[]; {
	register char *p;
	register int worked;

	init();
	worked = FALSE;
	for (argCursor=argv; --argc>0 || !worked;) {
		p = *++argCursor;
		if (argc > 0 && *p=='-') {
			while (*++p!='\0')
				switch (*p) {
				case 't':
					textSw=TRUE;
					break;
				case 'a':
					textSw=FALSE;
					asmSw=TRUE;
					break;
				case 'c':
					asmSw=textSw=FALSE;
					break;
				case 'd':
					dumpOnError=!dumpOnError;
					break;
				case 'l':
					printAndInit();
					iLong=!iLong;
					break;
				case 's':
					printAndInit();
					separateSw=TRUE;
					break;
				case 'm':
					printAndInit();
					caseMask ^= 'a'^'A';
					break;
				default:
					assert(('0' <= *p) && (*p <= '9'),"funny option");
					printAndInit();
					for (numSigChars=0; '0'<=*p && *p<='9'; p++)
						numSigChars = numSigChars*10+(*p-'0');
					p--;
					assert(numSigChars>0,"silly number");
					}
			}
		else {
			mIn="";
			mFile="";
			if (argc>0) {
				if (separateSw) {
					mIn=" in ";
					mFile = p;
					}
				assert(freopen(p,"r",stdin)!=NULL,
					"no such file");
				}
			if (separateSw)
				init();
			buildTables();
			if (separateSw || argc<=1)
				printAndInit();
			fclose(stdin);
			worked = TRUE;
			}
		}
	return 0;
	}
//E*O*F clash.c//

exit 0