allbery@uunet.UU.NET (Brandon S. Allbery - comp.sources.misc) (10/08/89)
Posting-number: Volume 8, Issue 92 Submitted-by: ok@cs.mu.oz.au.UUCP (Richard O'Keefe) Archive-name: nsort #!/bin/sh # This is a shell archive, meaning: # 1. Remove everything above the #!/bin/sh line. # 2. Save the resulting text in a file. # 3. Execute the file with /bin/sh (not csh) to create the files: # README # makefile # nsort.1 # nsort.c cat >README <<'------ EOF ------' nsort is a totally stripped down sort(1); it reads a file into memory, sorts it using merge sort, and writes the result out. I wrote it to show that merge sort is fast -- sort(1) is documented as using a version of quicksort internally, and nsort beats it easily even when sort(1) is given enough memory to sort in memory. To compile it, just run make(1). ------ EOF ------ ls -l README cat >makefile <<'------ EOF ------' nsort: cc -o nsort -O nsort.c ------ EOF ------ ls -l makefile cat >nsort.1 <<'------ EOF ------' .TH NSORT 1 .\"/*%nroff -man % .SH NAME nsort \*- fast and simple sorting program .SH SYNOPSIS .B nsort <in >out .SH DESCRIPTION The .B nsort command reads lines from its standard input, sorts them, and writes the sorted lines to its standard output. It behaves just like sort(1) when the latter is given no arguments. The only reason for using .B nsort is speed: when it can be used it is up to twice as fast as sort(1). .PP This program does .I not accept file names as arguments. To process more than one input file, pipe the output of cat(1) into .B nsort. .PP The program does not need to know how many lines there are in the standard input, nor does it need to read the standard input more than once. However, the data must fit into memory. .SH OPTIONS The .B nsort command has no options. .SH EXAMPLES .IP % 3 ls -f . | nsort .PP has the same effect as a plain `ls'. .SH DIAGNOSTICS An error message is printed on stderr and the program halts without producing any output if .PP there are any arguments on the command line (status EX_USAGE is returned in this case), or .PP the program is unable to obtain enough memory using malloc (if the file has N lines and C characters, roughly 8N+C bytes are needed) (status EX_SOFTWARE is returned in this case), or .PP something goes wrong while reading standard input (status EX_IOERR is returned in this case). .PP All error messages indicate the actual name the program was invoked by. If an error message is produced, the rest of the standard input is not read, so you are likely to get "Broken pipe" messages from upstream processes. .SH BUGS Input lines which contain NUL characters are quietly truncated. sort(1) cannot handle such lines either, but complains. .SH "SEE ALSO" sort(1) .SH AUTHOR Richard A. O'Keefe ------ EOF ------ ls -l nsort.1 cat >nsort.c <<'------ EOF ------' /* File : nsort.c Author : Richard A. O'Keefe Updated: 1988 Purpose: Fast sorting command for smallish files. This program has no copyright notice. You can do anything you please with it, except blame me if it doesn't work. */ #ifndef lint static char SCCSid[] = "%Z%%E% %M% %I%"; #endif/*lint*/ /* nsort <input >output is a very simple sorting program which has been written to be as fast as possible. It is equivalent to sort with no arguments. That is, it sorts its standard input to its standard output, and compares entire lines. It uses a natural merge, so if the input is already in order it takes linear time, and it reads the entire file into memory, using a single read() if the standard input is a regular file. Sorting random permutations of /usr/dict/words on a Sun-3/50: sort(1) : 15 seconds nsort : 8 seconds (with SCSI disc) 17 seconds 11 seconds (NFS over Ethernet) */ #include <stdio.h> /* The following values are taken from <sysexits.h>, but are copied here in case you lack that BSD header file. */ #define EX_OK 0 /* nothing went wrong */ #define EX_USAGE 64 /* something wrong with the command line */ #define EX_SOFTWARE 70 /* internal error (here, memory ran out) */ #define EX_IOERR 71 /* transput error (here, read() trouble) */ #define STDIN 0 extern char * malloc(); extern char * strcpy(); extern int strcmp(); extern int strlen(); extern void perror(); extern int lseek(); extern int read(); typedef struct item_rec *item_ptr; struct item_rec { item_ptr next; char* data; }; #define IN_ORDER(x, y) (strcmp((x)->data, (y)->data) <= 0) item_ptr nat_sort(list) item_ptr list; { item_ptr stack[30]; item_ptr *sp = stack; int runs = 0; /* total number of runs processed */ register item_ptr p, q, r; struct item_rec header; int k; while (p = list) { /* pick up a run from the front of list, setting */ /* p = (pointer to beginning of run), list = (rest of list) */ if (q = p->next) { list = q->next; if (!IN_ORDER(p, q)) r = q, q = p, p = r; p->next = q; for (r = list; r && IN_ORDER(q, r); r = r->next) q->next = r, q = r; q->next = (item_ptr)0; list = r; } else { list = (item_ptr)0; } runs++; /* merge this run with 0 or more runs off the top of the stack */ for (k = runs; 1 &~ k; k >>= 1) { q = *--sp; r = &header; while (q && p) if (IN_ORDER(q, p)) { r->next = q; r = q, q = q->next; } else { r->next = p; r = p, p = p->next; } r->next = q ? q : p; p = header.next; } /* push the merged run onto the stack */ *sp++ = p; } if (sp == stack) return (item_ptr)0; /* merge all the runs on the stack */ p = *--sp; while (sp != stack) { q = *--sp; r = &header; while (q && p) if (IN_ORDER(q, p)) { r->next = q; r = q, q = q->next; } else { r->next = p; r = p, p = p->next; } r->next = q ? q : p; p = header.next; } return p; } item_ptr alloc_item(data) char *data; { register item_ptr result; result = (item_ptr)malloc(sizeof *result + strlen(data) + 1); if (result) { result->next = (item_ptr)0; result->data = strcpy((char*)result + sizeof *result, data); } return result; } /* What we really want to do is to slurp the entire file in with one call to read(). In order to do that, we need to know how much there is. A file's size can be determined either by calling fstat() or by using an lseek() to the end. The number you get from fstat() doesn't mean much for pipes and terminals, so lseek() appears to be the better approach. Note that even when stdin is connected to a file, part of it may have been read already. In that case, we should not include the part which has been read. So we do i = lseek(STDIN, 0, 1) to discover the current position (and simultaneously to find out if we can use lseek() at all on this file descriptor). One problem remains; the size of the file may change while we are reading it. In that case we'll stop early if we have to, but if the file grows we'll miss stuff added at the end. */ main(argc, argv) int argc; char **argv; { struct item_rec header; item_ptr p, q; char *progname = argc >= 1 ? argv[0] : "nsort"; int i; if (argc != 1) { fprintf(stderr, "Usage: %s <unordered >sorted\n", progname); exit(EX_USAGE); } i = lseek(STDIN, 0, 1); /* current position in stream */ if (i < 0) { /* can't find out the size by seeking */ char buffer[1024]; for (p = &header ; fgets(buffer, sizeof buffer, stdin) ; p->next = q, p = q) { i = strlen(buffer); if (i > 0 && buffer[i-1] == '\n') buffer[i-1] = '\0'; if (!(q = alloc_item(buffer))) { fprintf(stderr, "%s: ran out of memory\n", progname); exit(EX_SOFTWARE); } } else { register char *b, *c; register int n; n = lseek(STDIN, 0, 2) - i; /* part of stdin may have been read */ (void) lseek(STDIN, i, 0); /* go back to original position */ if (!(b = malloc(n+1))) { fprintf(stderr, "%s: ran out of memory\n", progname); exit(EX_SOFTWARE); } for (c = b; (i = n-(c-b)) > 0; c += i) { i = read(STDIN, c, i); if (i < 0) { perror(progname); exit(EX_IOERR); } else if (i == 0) { break; } } /* it is possible that the file may have been truncated */ /* while we were reading it. */ n = c-b; for (p = &header; n > 0; b = c, p->next = q, p = q) { for (c = b; --n >= 0; c++) if (*c == '\n') { *c++ = '\0'; break; } if (n < 0) *c = '\0'; if (!(q = (item_ptr) malloc(sizeof *q))) { fprintf(stderr, "nsort: ran out of memory\n"); exit(EX_SOFTWARE); } q->data = b; } } p->next = (item_ptr)0; p = header.next; for (p = nat_sort(p); p; p = p->next) puts(p->data); exit(EX_OK); } ------ EOF ------ ls -l nsort.c exit 0