[net.sources] Choose - select random lines from a file

sjoerd@tjalk.UUCP (Sjoerd Mullender) (08/28/85)

A few days ago choose was posted in mod.sources.  It chooses random lines
from its input files.  This was done by first copying the input files to
a temporary file while counting the lines.  After that it chooses the lines
from this temporary file.  This means it has to read the input twice.
Here is a version that will only read the input once.  It does not use a
temporary file and it does not need a huge amount of memory.
Here is some timing information:

input file with 24473 lines and 201032 characters
		  real	  user	 sys
old version	  49.0	  10.5	 3.4
new version	  14.0	   6.4	 1.4

input file with 141487 lines and 1596728 characters
		  real	  user	 sys
old version	2:38.0	1:20.1	24.1
new version	  59.0	  45.3	 9.9

	Sjoerd Mullender
	...!{decvax,philabs,seismo}!mcvax!vu44!sjoerd

This is NOT a shell archive, so leave the shell home this time.
----------------------CUT HERE---------------------------
/*
 * Choose - select random lines from a file.
 *
 * Usage:
 *	choose [ -n ] [ file ] ...
 *
 * Options:
 *	-n	# of lines to choose (default 1)
 *
 * Author:
 *	Sjoerd Mullender (sjoerd@tjalk.UUCP)
 *
 * Choose selects random lines from the concatenation of the input files
 * (standard input default).
 */
#include <stdio.h>

/*
 * Size of every line buffer (thisline and each chosen-line slot),
 * including the terminating NUL.  Input lines holding more than
 * MAXLENGTH - 1 characters are silently truncated by the reader in main().
 */
#define MAXLENGTH	2048

/*
 * Pre-ANSI style declaration of malloc(); the only header included here
 * is <stdio.h>, which does not declare it.
 */
extern char *malloc();

/* The input line currently being assembled, one character at a time. */
char thisline[MAXLENGTH];

/*
 * efopen - fopen() wrapper that only ever returns a usable stream.
 *
 * Attempts to open the file `name' using the stdio mode string `mode'
 * and returns the resulting stream.  When fopen() fails, a diagnostic
 * naming the file is written to stderr and the whole program terminates
 * with exit status 1, so callers never need to test the result.
 */
FILE *efopen(name, mode)
char *name, *mode;
{
	FILE *stream = fopen(name, mode);

	if (stream == NULL) {
		/* Nothing sensible can be done without the input: give up. */
		fprintf(stderr, "choose: cannot open %s\n", name);
		exit(1);
	}

	return stream;
}

main(argc, argv)
char *argv[];
{
	register c;
	register char *p;
	register nlines;		/* # of lines in input so far */
	register FILE *f;
	register char (*chosenline)[MAXLENGTH];	/* the lines chosen */
	register nchoose;		/* # of lines to choose */
	long t;

	/* initlialisations */
	time(&t);
	srand(getpid() + (int)((t >> 16) + t));
	p = thisline;

	argv++;
	if (*argv && **argv == '-') {
		nchoose = atoi(&argv[0][1]);
		argv++;
	} else
		nchoose = 1;

	chosenline = (char (*)[MAXLENGTH]) malloc(nchoose * sizeof *chosenline);
	nlines = 0;

	if (*argv)
		f = efopen(*argv++, "r");
	else
		f = stdin;
	for (;;) {
		while ((c = getc(f)) == EOF) {
			fclose(f);
			if (*argv)
				f = efopen(*argv++, "r");
			else
				break;
		}
		if (c == EOF)
			break;
		if (c == '\n') {
			nlines++;
			*p++ = 0;
			if (nlines < nchoose)
				strcpy(chosenline[nlines-1], thisline);
			else if (rand() % nlines < nchoose)
				strcpy(chosenline[rand() % nchoose], thisline);
			p = thisline;
		} else if (p < &thisline[MAXLENGTH - 1])
			*p++ = c;
	}
	if (nlines >= nchoose)
		for (nlines = 0; nlines < nchoose; nlines++)
			puts(chosenline[nlines]);
	exit(nlines < nchoose);
}