sjoerd@tjalk.UUCP (Sjoerd Mullender) (08/28/85)
A few days ago choose was posted in mod.sources. It chooses random lines from its input files. This was done by first copying the input files to a temporary file while counting the lines. After that it chooses the lines from this temporary file. This means it has to read the input twice. Here is a version that will only read the input once. It does not use a temporary file and it does not need a huge amount of memory. Here is some timing information: input file with 24473 lines and 201032 characters real user sys old version 49.0 10.5 3.4 new version 14.0 6.4 1.4 input file with 141487 lines and 1596728 characters real user sys old version 2:38.0 1:20.1 24.1 new version 59.0 45.3 9.9 Sjoerd Mullender ...!{decvax,philabs,seismo}!mcvax!vu44!sjoerd This is NOT a shell archive, so leave the shell home this time. ----------------------CUT HERE--------------------------- /* * Choose - select random lines from a file. * * Usage: * choose [ -n ] [ file ] ... * * Options: * -n # of lines to choose (default 1) * * Author: * Sjoerd Mullender (sjoerd@tjalk.UUCP) * * Choose selects random lines from the concatenation of the input files * (standard input default). */ #include <stdio.h> #define MAXLENGTH 2048 extern char *malloc(); char thisline[MAXLENGTH]; /* * Open the file `name' according to `mode.' * If the open fails, exit with an error message. */ FILE *efopen(name, mode) char *name, *mode; { register FILE *f; if ((f = fopen(name, mode)) == 0) { fprintf(stderr, "choose: cannot open %s\n", name); exit(1); } return f; } main(argc, argv) char *argv[]; { register c; register char *p; register nlines; /* # of lines in input so far */ register FILE *f; register char (*chosenline)[MAXLENGTH]; /* the lines chosen */ register nchoose; /* # of lines to choose */ long t; /* initlialisations */ time(&t); srand(getpid() + (int)((t >> 16) + t)); p = thisline; argv++; if (*argv && **argv == '-') { nchoose = atoi(&argv[0][1]); argv++; } else nchoose = 1; chosenline = (char (*)[MAXLENGTH]) malloc(nchoose * sizeof *chosenline); nlines = 0; if (*argv) f = efopen(*argv++, "r"); else f = stdin; for (;;) { while ((c = getc(f)) == EOF) { fclose(f); if (*argv) f = efopen(*argv++, "r"); else break; } if (c == EOF) break; if (c == '\n') { nlines++; *p++ = 0; if (nlines < nchoose) strcpy(chosenline[nlines-1], thisline); else if (rand() % nlines < nchoose) strcpy(chosenline[rand() % nchoose], thisline); p = thisline; } else if (p < &thisline[MAXLENGTH - 1]) *p++ = c; } if (nlines >= nchoose) for (nlines = 0; nlines < nchoose; nlines++) puts(chosenline[nlines]); exit(nlines < nchoose); }