ok@SUN.COM@@UUNET.UU.NET:quintus.UUCP (12/09/87)
This is a UNIX utility which copies a random sample of the lines in its standard input to its standard output (in random order). It was originally written to generate test data for sorting utilities, but is generally useful. This version is 4.2ish; I find drand48() intimidating. #!/bin/sh cat >sample.1 <<'------ EOF ------' .TH SAMPLE 1 .\"/*%nroff -man % .SH NAME sample \*- selects a random sample of input .SH SYNOPSIS .B sample [ -# ] [ -w# ] [ -r# ] <in >out .SH DESCRIPTION .B Sample selects a random sample of the lines in its standard input and writes them in random order to its standard output. All samples are equally likely, and all permutations of any given sample are equally likely. .PP This program does .I not accept file names as arguments. To process more than one input file, pipe the output of cat(1) into .B sample. .PP The program does not need to know how many lines there are in the standard input, nor does it need to read the standard input more than once. However, the sample must fit into memory. .SH OPTIONS .IP -# 5 This specifies the number of lines in the sample. The default is 10. The number may be in decimal, octal, or hexadecimal, using the conventions of strtol(3). .IP -w# 5 This specifies the maximum Width of an input line (exclusive of new-lines). If any input line exceeds this limit, an error message will be printed and the program will stop without producing any output. The default is 256. Only one record of this size is allocated. .IP -r# 5 This specifies the seed for the Random number generator. If you want to be able the same sample again later, you must specify this option. The default is to use the current value of time(3) as the seed. .SH EXAMPLES .IP % 3 sample </usr/dict/words .PP outputs 10 words selected at random from /usr/dict/words .IP % 3 ypcat passwd | sample -020 .PP outputs 16 (020 is an octal number) random users from the Yellow Pages. .IP % 3 ls *.o | sample -w14 -30 .PP outputs 30 randomly selected files in the current directory whose names match the pattern "*.o", but if any such file has a name exceeding 14 characters (not just the ones selected) it prints an error message and stops without outputting any file names. .IP % 3 cat foo baz | sample -100 .PP outputs 100 lines drawn at random from foo and baz. It is possible that all the lines might come from foo or all of them from baz. .IP % 3 ( sample -50 <foo ; sample -50 <baz) | sample -100 .PP outputs 100 lines in random order with 50 of them coming from foo and 50 coming from baz, but randomly interleaved. .SH DIAGNOSTICS An error message is printed on stderr and the program halts without producing any output if .PP an input line is too long (-w option), or .PP the program is unable to obtain enough memory using malloc (if you want N lines each of which can contain W characters, you may need as much as (N+2)*(W+6) bytes), or .PP the command line is botched. .PP All error messages indicate the actual name the program was invoked by. If an error message is produced, the rest of the standard input is not read, so you are likely to get "Broken pipe" messages from upstream processes. .SH "SEE ALSO" head(1), awk(1), random(3) .SH AUTHOR Richard A. O'Keefe, Quintus Computer Systems, Inc. ------ EOF ------ ls -l sample.1 cat >sample.c <<'------ EOF ------' /* File : sample.c Author : Richard A. O'Keefe Updated: %G% Purpose: select a random sample from a file of lines SYNOPSIS cat file... | sample -# >output WHAT IT DOES sample -N selects a random sample of N lines from its standard input and writes them to its standard output. Although this has the form of a filter, no output can be generated until all of the input has been consumed. The sample is written out in random order. All samples are generated with the same probability, as are all permutations of each sample. OPTIONS It is a general policy of mine to make options insensitive to case, and to reserve the options -silent [as in many System V utilities] -interactive [as in rm(1)] -file [for command files as in awk(1) or sed(1)] -e<C> [for an escape character] -t<C> [for a field delimiter] -width N [for the input line width limit] -output F [as in cc(1) or sort(1)] Other things being equal, my policy on file names is to treat them as if there were an implicit "cat". This program does not accept file names in its command line; it only works from the standard input. The only relevant "standard" option is -width. If your records consist of more than one line, use paste -s -d'\n' file... | sample | tr '' '\n' with n-1 ^As in the paste argument to glue n lines together. This will be handled by a -group option in a later version. If you specify a sample size it is an error for the input to contain fewer lines. You can get a random permutation of all the input lines by writing cat file ... >#scratch sample -`wc -l #scratch` <#scratch >output In addition to the sample size, you can specify the maximum size of an input line by writing -width # -width=# -w# or any obvious variant. Normally, a different sample will be generated every time the program is run. To get the same sample, specify -randomise # -randomise=# -r# or any obvious variant. All numbers in the command line are processed with strtol(3) so decimal, octal, and hex can be used. DEFAULTS -width 1024 -10 SEE ALSO Knuth, The Art of Computer Programming, Vol 2, page 138. Knuth This program n sample_size t records_read_so_far+1 M random_choice+1 */ #include <stdio.h> extern long strtol(/* char*, char**, int */); extern char *malloc(/* unsigned */); extern char *fgets(/* char*, unsigned, FILE* */); extern char *strcpy(/* char*, char* */); extern /*void*/ srandom(/* long */); extern long random(); extern long time(/* long* */); #define IsDigit(c) ((unsigned)((c) - '0') < 10) #define Random(n) ((unsigned)(random()) % (unsigned)(n)) #define DFLT_SAMPLE_SIZE 10 #define DFLT_MAX_LINE_WIDTH 256 char *program_name; /* For error messages */ int max_line_width; /* -width : bound on length of input lines */ char *input_line; /* Dynamically allocated buffer */ int sample_size; /* -# : the number of lines to output */ int records_read_so_far; /* just what it says */ void OutputError() { (void) fprintf(stderr, "%s: I/O error near byte %ld of output\n", program_name, ftell(stdout)); exit(1); } /* estrtol(input, option, must_be_positive) converts a string (input) to binary. If must_be_positive is true, the result should be > 0. If anything goes wrong, an error message is written to the standard error stream and the program exits. This should only be used for converting numeric program arguments. Any integer value acceptable to C is accepted. Note that both the strtol() and the sscanf() version will accept leading white-space characters; I don't really want this, but it seems harmless. The sign check is actually useless in this program, because a negative sign will be ineffective (-r-3) or rejected (-r -3) no matter what. */ long estrtol(input, option, must_be_positive) char *input; /* to be converted */ char *option; /* for error message */ int must_be_positive; { long result; #ifdef NOSTRTOL int nmatched; char c; if (input[0] != '0') { /* decimal */ nmatched = sscanf(input, "%ld%c", &result, &c); } else if (input[1] == 'x' || input[1] == 'X') { /* 0x: hexadecimal */ nmatched = sscanf(input+2, "%lx%c", &result, &c); } else { /* 0: octal */ nmatched = sscanf(input, "%lo%c", &result, &c); } if (nmatched != 1 || must_be_positive && result <= 0) { #else char *after; result = strtol(input, &after, 0); if (after == input || *after || must_be_positive && result <= 0) { #endif (void) fprintf(stderr, "%s: ill-formed %s value: %s\n", program_name, option, input); exit(1); } return result; } /* read_input_line() reads one complete line (terminated by a new-line character or the end of the stream) from the standard input stream. If this line contains at most max_line_width characters, including the new-line character and a terminating NUL, those characters are stored in input_line and a normal return is done. If the end of the stream is encountered without any characters being read, NULL is returned and input_line is undefined. If the line is too long, an error message is printed and the program terminates. Note that the user-supplied line width is incremented by 2 to allow for the '\n' and '\0'. */ char *read_input_line() { int L; if (!fgets(input_line, max_line_width, stdin)) return NULL; L = strlen(input_line); if (input_line[L-1] != '\n') { /* the input line is too long; fgets has left the */ /* rest of it unread in the standard input */ while (fgets(input_line, max_line_width, stdin)) { int d = strlen(input_line); L += d; if (input_line[d-1] == '\n') break; } (void) fprintf(stderr, "%s: line %d of the input has %d characters, \ which exceeds %d\n", program_name, records_read_so_far+1, L-1, max_line_width-2); exit(1); } return input_line; } /* copy_input_line() copies the significant part of input_line into a newly allocated block and returns that block. If more space cannot be allocated, an error message is printed and the program terminates. Note that the max_line_width has no effect on the size of block which is allocated: only the actual length of the line matters. */ char *copy_input_line() { int L; char *result; L = strlen(input_line); result = malloc((unsigned)(L+1)); if (!result) { (void) fprintf(stderr, "%s: unable to allocate %d bytes for line %d of the input\n", program_name, L+1, records_read_so_far+1); exit(1); } return strcpy(result, input_line); } /* base_part(filename) takes a file name of the form xxx/xxx/xxx/basepart and returns a pointer to the beginning of the basepart. That is, to just after the last / if there is one, or to the beginning of the filename is there is none. In this program, this is just used in error messages. */ char *base_part(filename) char *filename; { char *b; for (b = filename; *b; b++) if (*b == '/') filename = b+1; return filename; } /* crack(argc, argv) analyses the arguments. It has to set program_name from argv[0] max_line_width from -w[idth][ ]N random seed from -r[andomise][ ]N sample_size from -# */ void crack(argc, argv) int argc; char **argv; { int random_seed; char **ap; char *p; program_name = "sample"; max_line_width = DFLT_MAX_LINE_WIDTH; random_seed = (int)time((long*)0); sample_size = DFLT_SAMPLE_SIZE; if (argc == 0) return; /* can this ever happen? */ program_name = argv[0]; for (ap = argv; p = *++ap; ) { if (*p != '-') { (void) fprintf(stderr, "%s: illegal argument %s\n", program_name, p); exit(1); } switch (*++p) { case 'w': case 'W': /* -width */ while (*p && !IsDigit(*p)) p++; if (!*p) { /* The input is -w # as two arguments */ p = *++ap; if (!p || !IsDigit(*p)) { (void) fprintf(stderr, "%s: no value for %s option\n", program_name, ap[-1]); exit(1); } } /* Input was -wxxx|p or -wxxx |p */ max_line_width = estrtol(p, "-width", 1); break; case 'r': case 'R': /* -randomise */ while (*p && !IsDigit(*p)) p++; if (!*p) { /* The input is -r # as two arguments */ p = *++ap; if (!p || !IsDigit(*p)) { (void) fprintf(stderr, "%s: no value for %s option\n", program_name, ap[-1]); exit(1); } } /* Input was -rxxx|p or -rxxx |p */ random_seed = (int)estrtol(p, "-randomise", 0); break; default: if (!IsDigit(*p)) { (void) fprintf(stderr, "%s: unknown option: %s\n", program_name, ap[0]); exit(1); } sample_size = estrtol(p, "-samplesize", 1); break; } } srandom(random_seed); max_line_width += 2; /* to allow for "\n\0" at the end */ } main(argc, argv) int argc; char **argv; { int random_choice; int n; char **reservoir; /* Analyse the arguments */ crack(argc, argv); /* Try to allocate the line buffer */ input_line = malloc((unsigned)max_line_width); if (!input_line) { (void) fprintf(stderr, "%s: unable to allocate %d bytes for line buffer\n", program_name, max_line_width); exit(1); } /* Try to allocate the reservoir */ reservoir = (char**) malloc((unsigned)sample_size * sizeof *reservoir); if (!reservoir) { (void) fprintf(stderr, "%s: unable to allocate space for %d pointers\n", program_name, sample_size); exit(1); } /* Now fill the reservoir */ for (records_read_so_far = 0; /*while*/ records_read_so_far < sample_size; /*doing*/ records_read_so_far++) { if (!read_input_line()) { (void) fprintf(stderr, "%s: input has only %d lines: %s %s\n", program_name, records_read_so_far, base_part(program_name), argv[1]); exit(1); } reservoir[records_read_so_far] = copy_input_line(); } /* read the rest of the input */ /* replacing reservoir entries at random */ while (read_input_line()) { records_read_so_far++; random_choice = Random(records_read_so_far); if (random_choice < sample_size) { (void)free(reservoir[random_choice]); reservoir[random_choice] = copy_input_line(); } } /* Finally, copy the reservoir to the standard output */ /* We scramble it as we go */ for (n = 0; n < sample_size; n++) { random_choice = Random(sample_size-n) + n; if (fputs(reservoir[random_choice], stdout) < 0) OutputError(); reservoir[random_choice] = reservoir[n]; } exit(0); } ------ EOF ------ ls -l sample.c