[comp.sources.misc] v02i008: subst - substitute strings for strings

aeb@cwi.nl (Andries Brouwer) (01/20/88)

Comp.sources.misc: Volume 2, Issue 9
Submitted-By: Andries Brouwer <aeb@cwi.nl>
Archive-Name: subst

[The next few postings are from rs's slush pile.  At least we know he's still
alive.  ;-)  Also -- you will have noticed that there are occasional irregular
headers -- I am adding the compatible headers manually while trying to finish
the generalized posting program, and I occasionally mess up.  Sorry!  ++bsa]

This program does unlimited string substitution.
I needed it because of the limits built into sed
(this program has no limits on string sizes or
number of strings), and the awkwardness of worrying
about the special characters of sed.
Maybe it is useful to others as well.

The program header documents it.

---------------------------------------------------
/* subst: substitute fixed strings for other fixed strings - aeb@cwi.nl */
/* written 11 Nov. 1987 - placed in public domain - do not delete header */
/*
 * Call: subst [-acs] sfile [ifile]
 * Here sfile is the file with descriptions of the substitutions
 * to be performed and ifile is the input file.
 * If ifile is not given, then stdin is read.
 * If the option -c is given, then the following argument is itself the
 * substitution description.
 * The description has the format:
 *	<old><tab><new>
 * and different description lines are separated by newlines.
 * In cases where <old> might contain tabs or <new> might contain
 * newlines, one can use subst -s .
 * Now description lines have the format
 *	<sep><old><sep><tab><sep2><new><sep2>
 * and again different description lines are separated by newlines.
 * (Here <sep> and <sep2> denote arbitrary single characters.)
 * By default, only the places where <old> occurs as "keyword",
 * i.e., not preceded or followed by a letter or digit, are substituted,
 * but the option -a causes substitution for all occurrences.
 */

/*
   Why not use sed or /lib/cpp or m4 or ... ? Well, m4 and /lib/cpp
   react to special characters in the file, but I want to leave the
   file as it is, except for these substitutions. What about sed?
   This is not so bad, but requires some preprocessing of sfile
   in case the strings may contain . or & etc. When both sfile and
   ifile are computer generated, this is a hassle, and the present
   solution is much cleaner. Moreover, subst has no built-in limits.
   Note: this program works reasonably well when the number of
   substitution strings is not too large. It uses a linear list, and
   this becomes very slow when there are thousands of substitution strings.
*/

#include <stdio.h>
extern char *malloc(), *realloc(), *strcpy(), *grow(), *alloc(), *input();

/* option counters: main() increments one of these per -a, -c, -s letter */
int opta, optc, opts;
/* inf: stream the text is read from (a named file or stdin);
   sf: substitution description file, opened only when -c is absent */
FILE *inf, *sf;
/* sin: substitution description taken from the command line (-c);
   iname, sname: names of the two inputs, used in error messages */
char *sin, *iname, *sname;
/* made nonzero by input() when a read stops short at end of file */
int eoi;
/* one substitution rule; getsfile() links the rules into a list that
   starts at replhead and ends at repltail */
struct repl {
	/* string to look for */
	char *in;
	/* string written in its place */
	char *out;
	/* following rule, NULL for the last one */
	struct repl *next;
} *replhead, *repltail;

/* indexed by character value: do_it() sets the entry for every character
   that starts some in-string, so other positions can be skipped quickly */
char speedup[256];	/* let us hope that characters have 8 bits ... */

/*
 * usage: print the command synopsis on stderr and terminate the program
 * with exit status 2.  Does not return.
 * The synopsis lists all three option letters accepted by main().
 */
usage(){
	fprintf(stderr, "subst: Usage is  subst [-acs] sfile [ifile]\n");
	exit(2);
}

/*
 * main: parse the command line, load the substitutions and filter the
 * input onto stdout.
 *
 * Leading arguments that start with '-' are option words; each may bundle
 * several of the letters a, c and s.  After the options come one or two
 * operands: the substitution file (or, with -c, the substitution text
 * itself) and optionally the input file; without the latter stdin is read.
 *
 * Exit status: 0 on success, 1 when a file cannot be opened or output
 * cannot be written, 2 on a usage error.
 */
main(argc,argv) int argc; char **argv; {
	while(argc > 1 && argv[1][0] == '-') {
		do {
			switch(argv[1][1]) {
			case 's':
				opts++;
				break;
			case 'c':
				optc++;
				break;
			case 'a':
				opta++;
				break;
			default:
				/* unknown letter, or a bare "-"; usage() exits */
				usage();
			}
			/* slide the word one character so that [1] is the next letter */
			argv[1]++;
		} while(argv[1][1]);
		argv++;
		argc--;
	}
	if(argc > 3 || argc < 2)
		usage();
	if(argc == 3) {
		iname = argv[2];
		inf = fopen(iname,"r");
		if(inf == NULL) {
			perror(iname);
			exit(1);
		}
	} else {
		inf = stdin;
		iname = "<stdin>";
	}
	if(!optc) {
		sname = argv[1];
		sf = fopen(sname,"r");
		if(sf == NULL) {
			perror(sname);
			exit(1);
		}
	} else {
		/* -c: the operand is the description text, not a file name */
		sin = argv[1];
		sname = "<argin>";
	}
	getsfile();
	do_it();
	/* stdout is buffered, so a full disk or a closed pipe may only be
	   noticed when the buffer is flushed: flush here and report failure
	   instead of exiting with status 0 */
	if(fflush(stdout) == EOF || ferror(stdout)) {
		perror("subst: write error");
		exit(1);
	}
	return(0);
}

/* longest in-string and longest out-string stored by getsfile(), each
   counted including its terminating NUL */
unsigned maxilth, maxolth;

/*
 * getsfile: parse the substitution descriptions into the list
 * replhead .. repltail and record the longest in- and out-string
 * lengths in maxilth and maxolth.
 *
 * With optc set the text is the string sin; otherwise it is read from the
 * stream sf in pieces of at most LSIZ-1 characters.  In both cases one
 * newline is supplied after the end of the text, so that a last
 * description that is not newline-terminated is still entered in the list.
 *
 * The parser handles one character per loop pass and is in one of
 * these states:
 *	0  start of a description (with opts: this character is <sep>)
 *	1  collecting <old> up to the separator
 *	2  (opts only) skipping up to and including the tab
 *	3  start of <new> (with opts: this character is <sep2>)
 *	4  collecting <new> up to the separator
 *	5  (opts only) skipping up to and including the newline
 * Without opts the separators are a tab after <old> and a newline
 * after <new>.
 *
 * A read error on sf is reported with perror() and ends the program with
 * status 1.
 */
getsfile(){

#define	LSIZ	4

/* append c to the scratch string, enlarging it by LSIZ bytes when full */
#define put_in_buf(c) {\
	if(bufp >= buf + bsz) {\
		buf = grow(buf, bsz + LSIZ);\
		bufp = buf + bsz;\
		bsz += LSIZ;\
	}\
	*bufp++ = c;\
}

/* store a copy of the scratch string in replp->inout, tracking the maximum */
#define put_in_repl(inout,max) {\
	register unsigned lth = strlen(buf) + 1;\
	if(lth > max) max = lth;\
	replp->inout = alloc(lth);\
	(void) strcpy(replp->inout,buf);\
}

/* append the finished rule to the list and start a fresh one */
#define put_in_chain	{\
	replp->next = NULL;\
	if(replhead == NULL)\
		replhead = replp;\
	else\
		repltail->next = replp;\
	repltail = replp;\
	replp = (struct repl *) alloc(sizeof(struct repl));\
}

	char line[LSIZ],  *buf;
	register char *lp, *bufp;
	register int state = 0, bsz = 0, eos = 0;
	register struct repl *replp;
	char sep;

	if(optc)
		lp = sin;
	else {
		/* empty piece: forces a read on the first loop pass */
		lp = line;
		line[0] = 0;
	}

	replp = (struct repl *) alloc(sizeof(struct repl));

	buf = alloc(LSIZ);
	bufp = buf;
	bsz = LSIZ;

	while(!eos) {
		if(!*lp) {
			/* current piece used up */
			if(optc) {
				lp = "\n";
				eos++;
			} else if(fgets(line, sizeof(line), sf) == NULL) {
				if(ferror(sf)) {
					perror(sname);
					exit(1);
				}
				/* end of file: supply the closing newline, exactly
				   as is done for -c, so that an unterminated last
				   line is not silently dropped */
				lp = "\n";
				eos++;
			} else {
				lp = line;
				if(!*lp) return;	/* strange ... */
			}
		}
		switch(state) {
		case 0:		/* before in */
			state = 1;
			if(opts) {
				sep = *lp++;
				continue;
			}
			sep = '\t';
			/* fall through */
		case 1:		/* reading in */
			if(*lp != sep) {
				put_in_buf(*lp++);
				continue;
			}
			lp++;
			put_in_buf(0);
			put_in_repl(in,maxilth);
			bufp = buf;
			state = (opts ? 2 : 3);
			continue;
		case 2:		/* waiting for tab */
			if(*lp++ == '\t') state = 3;
			continue;
		case 3:		/* before out */
			state = 4;
			if(opts) {
				sep = *lp++;
				continue;
			}
			sep = '\n';
			/* fall through */
		case 4:		/* reading out */
			if(*lp != sep) {
				put_in_buf(*lp++);
				continue;
			}
			lp++;
			put_in_buf(0);
			put_in_repl(out,maxolth);
			bufp = buf;
			put_in_chain;
			state = (opts ? 5 : 0);
			continue;
		case 5:		/* waiting for newline */
			if(*lp++ == '\n') state = 0;
			continue;
		}
	}

	/* in states 2..4 the unfinished rule already owns its in-string */
	if(state >= 2 && state <= 4)
		free(replp->in);
	free((char *) replp);
	free(buf);
}


/*
 * do_it: copy the input stream to stdout, writing the out-string of a rule
 * wherever its in-string is found.  At each position the rules are tried
 * in list order and the first one that matches is used.
 *
 * Unless opta is set a match only counts as a keyword: the character
 * before it and the character after it must not be an ASCII letter or
 * digit.  The start and the end of the input count as such boundaries.
 *
 * The input is handled in a buffer of at least ISIZ bytes and at least
 * twice the longest in-string:
 *	ibuf0	start of the buffer
 *	ibuf	first byte not yet written to stdout
 *	ibufp	position currently examined
 *	ibuf1	end of the valid data
 *	ip	look-ahead position while comparing against an in-string
 * When the look-ahead reaches ibuf1 the bytes before ibufp are written
 * out, the rest is moved to the start of the buffer and more input is
 * read behind it.
 *
 * Rules with an empty in-string are ignored; replacing the empty string
 * would never advance and so would loop forever.
 */
do_it(){

#define	ISIZ	16384

/* true when c is not an ASCII letter or digit */
#define is_ok(c) ((c) < '0' || ((c) < 'A' && (c) > '9') || ((c) < 'a' && (c) > 'Z') || (c) > 'z')

/* flush what precedes ibufp, move the tail to the buffer start, read more */
#define	refill_ip	{\
	register char *tp;\
	output(ibuf, (int) (ibufp-ibuf));\
	tp = ibufp;\
	ip = ibufp = ibuf = ibuf0;\
	while(tp < ibuf1)\
		*ip++ = *tp++;\
	ibuf1 = input(ip, (int) (ibuf1-ip));\
}

/* make *ip readable, or give up on the current rule at end of input */
#define	assure_ip	if(ip == ibuf1) {\
	if(eoi)\
		goto nxt;\
	refill_ip;\
	if(ip == ibuf1)\
		goto nxt;\
}

	register struct repl *replp;
	register char *ibuf, *ibuf0, *ibuf1, *ibufp, *cp, *ip;
	register unsigned ilth;
	int prevc_is_ok = 1;

	/* small speedup: remember first char of all in-strings */
	/* the index is converted to unsigned char: a plain char may be
	   negative for bytes above 127 and would index before the table */
	for(replp = replhead; replp; replp = replp->next)
		if(replp->in[0])
			speedup[(unsigned char) replp->in[0]] = 1;

	ilth = 2*maxilth;
	if(ISIZ > ilth)
		ilth = ISIZ;
	ibuf0 = alloc(ilth);
	/* start with an "exhausted" buffer so the first pass reads input */
	ibufp = ibuf = ibuf1 = ibuf0 + ilth;

	while(1) {

		if(ibufp == ibuf1) {
			output(ibuf, (int) (ibufp-ibuf));
			if(eoi)
				return;
			ibufp = ibuf = ibuf0;
			ibuf1 = input(ibuf, (int) (ibuf1-ibuf));
			if(ibuf == ibuf1)
				return;
		}

		if(prevc_is_ok && speedup[(unsigned char) *ibufp])
		for(replp = replhead; replp; replp = replp->next) {
			cp = replp->in;
			if(!*cp)
				goto nxt;	/* never match the empty string */
			ip = ibufp;
			while (*cp) {
				assure_ip;
				if(*cp++ != *ip++)
					goto nxt;
			}
			/* found a match! */
			if(!opta) {
				/* it must be followed by a non-alphanumeric
				   character or by the end of the input */
				if(ip == ibuf1 && !eoi) {
					refill_ip;
					if(ip == ibuf1 && !eoi)
						goto nxt;	/* no room to look ahead */
				}
				if(ip != ibuf1 && !is_ok(*ip))
					goto nxt;
			}
			output(ibuf, (int) (ibufp-ibuf));
			if(fputs(replp->out, stdout) == EOF) {
				perror("subst: write error");
				exit(1);
			}
			ibufp = ibuf = ip;
			goto nxt2;
		nxt:	;
		}
		if(!opta)
			prevc_is_ok = is_ok (*ibufp);
		ibufp++;
	nxt2:	;
	}
}

/*
 * input: read up to n bytes from the stream inf into ibuf.
 * A read that stops short at end of file makes the global eoi nonzero;
 * a read that stops short for any other reason is reported on stderr and
 * ends the program with status 1.
 * Returns a pointer just past the last byte stored.
 */
char *
input(ibuf,n) char *ibuf; register int n; {
	register int nn = fread(ibuf, sizeof(char), n, inf);
	if(nn < n) {
		if(feof(inf))
			eoi++;
		else {
			/* perror() adds ": " itself, so the prefix has none */
			perror("subst: input error");
			exit(1);
		}
	}
	return(ibuf + nn);
}

/*
 * output: write the n bytes at obuf to stdout; nothing is done when n
 * is zero or negative.  When fewer than n bytes are accepted the error is
 * reported on stderr and the program ends with status 1.
 */
output(obuf,n) char *obuf; register int n; {
	if(n > 0) {
		if(fwrite(obuf, sizeof(char), n, stdout) != n) {
			/* perror() adds ": " itself, so the prefix has none */
			perror("subst: write error");
			exit(1);	/* probably file system full? */
		}
	}
}

/*
 * alloc: obtain n bytes from malloc().
 * Never returns NULL: when the request cannot be met a message is written
 * to stderr and the program ends with status 1.
 */
char *
alloc(n) unsigned n; {
	register char *mem;

	mem = malloc(n);
	if(mem != NULL)
		return(mem);

	fprintf(stderr, "subst: out of memory\n");
	exit(1);
	/* NOTREACHED */
}

/*
 * grow: resize the block a to n bytes with realloc() and return the
 * resulting block, which keeps the old contents.
 * Never returns NULL: on failure a message is written to stderr and the
 * program ends with status 1.
 */
char *
grow(a,n) register char *a; register int n; {
	register char *bigger = realloc(a, (unsigned) n);

	if(bigger != NULL)
		return(bigger);

	fprintf(stderr, "subst: realloc failed\n");
	exit(1);
	/* NOTREACHED */
}
-- 
      Andries Brouwer -- CWI, Amsterdam -- uunet!mcvax!aeb -- aeb@cwi.nl