[comp.sources.misc] v08i092: fast & simple sorting program

allbery@uunet.UU.NET (Brandon S. Allbery - comp.sources.misc) (10/08/89)
Posting-number: Volume 8, Issue 92
Submitted-by: ok@cs.mu.oz.au.UUCP (Richard O'Keefe)
Archive-name: nsort

#!/bin/sh
# This is a shell archive, meaning:                              
# 1. Remove everything above the #!/bin/sh line.                
# 2. Save the resulting text in a file.                          
# 3. Execute the file with /bin/sh (not csh) to create the files:
#	README
#	makefile
#	nsort.1
#	nsort.c
cat >README <<'------ EOF ------'
nsort is a totally stripped down sort(1); it reads a file into memory,
sorts it using merge sort, and writes the result out.  I wrote it to
show that merge sort is fast -- sort(1) is documented as using a
version of quicksort internally, and nsort beats it easily even when
sort(1) is given enough memory to sort in memory.
To compile it, just run make(1).
------ EOF ------
ls -l README
cat >makefile <<'------ EOF ------'
nsort:
	cc -o nsort -O nsort.c
------ EOF ------
ls -l makefile
cat >nsort.1 <<'------ EOF ------'
.TH NSORT 1
.\"/*%nroff -man %
.SH NAME
nsort \*- fast and simple sorting program
.SH SYNOPSIS
.B nsort
<in >out
.SH DESCRIPTION
The
.B nsort
command
reads lines from its standard input, sorts them, and writes the
sorted lines to its standard output.  It behaves just like
sort(1) when the latter is given no arguments.  The only reason
for using
.B nsort
is speed:  when it can be used it is up to twice as fast as sort(1).
.PP
This program does
.I not
accept file names as arguments.  To process more than
one input file, pipe the output of cat(1) into
.B nsort.
.PP
The program does not need to know how many lines there
are in the standard input, nor does it need to read the
standard input more than once.  However, the data must
fit into memory.
.SH OPTIONS
The
.B nsort
command has no options.
.SH EXAMPLES
.IP % 3
ls -f . | nsort
.PP
has the same effect as a plain `ls'.
.SH DIAGNOSTICS
An error message is printed on stderr and the program halts without
producing any output if
.PP
there are any arguments on the command line (status EX_USAGE is
returned in this case), or
.PP
the program is unable to obtain enough memory using malloc (if the
file has N lines and C characters, roughly 8N+C bytes are needed)
(status EX_SOFTWARE is returned in this case), or
.PP
something goes wrong while reading standard input
(status EX_IOERR is returned in this case).
.PP
All error messages indicate the actual name the program was invoked by.
If an error message is produced, the rest of the standard input is not
read, so you are likely to get "Broken pipe" messages from upstream
processes.
.SH BUGS
Input lines which contain NUL characters are quietly truncated.
sort(1) cannot handle such lines either, but complains.
.SH "SEE ALSO"
sort(1)
.SH AUTHOR
Richard A. O'Keefe
------ EOF ------
ls -l nsort.1
cat >nsort.c <<'------ EOF ------'
/*  File   : nsort.c
    Author : Richard A. O'Keefe
    Updated: 1988
    Purpose: Fast sorting command for smallish files.

    This program has no copyright notice.  You can do anything you please
    with it, except blame me if it doesn't work.
*/

#ifndef	lint
static	char SCCSid[] = "%Z%%E% %M%	%I%";
#endif/*lint*/

/*  nsort <input >output

    is a very simple sorting program which has been written to be as fast
    as possible.  It is equivalent to sort with no arguments.  That is,
    it sorts its standard input to its standard output, and compares
    entire lines.

    It uses a natural merge, so if the input is already in order it takes
    linear time, and it reads the entire file into memory, using a single
    read() if the standard input is a regular file.

    Sorting random permutations of /usr/dict/words on a Sun-3/50:
	sort(1) : 15 seconds  nsort :  8 seconds  (with SCSI disc)
		  17 seconds	      11 seconds  (NFS over Ethernet)
*/

#include <stdio.h>

/*  The following values are taken from <sysexits.h>, but are copied here
    in case you lack that BSD header file.
*/
#define	EX_OK		 0	/* nothing went wrong */
#define	EX_USAGE	64	/* something wrong with the command line */
#define	EX_SOFTWARE	70	/* internal error (here, memory ran out) */
#define	EX_IOERR	71	/* transput error (here, read() trouble) */

#define	STDIN		 0

extern	char *	malloc();
extern	char *	strcpy();
extern	int	strcmp();
extern	int	strlen();
extern	void	perror();
extern	int	lseek();
extern	int	read();

typedef struct item_rec *item_ptr;

struct item_rec
    {    
	item_ptr	next;
	char* 		data;
    };

#define	IN_ORDER(x, y) (strcmp((x)->data, (y)->data) <= 0)


item_ptr nat_sort(list)
    item_ptr list;
    {
	item_ptr stack[30];
	item_ptr *sp = stack;
	int runs = 0;			/* total number of runs processed */
	register item_ptr p, q, r;
	struct item_rec header;
	int k;

	while (p = list) {
	    /* pick up a run from the front of list, setting */
	    /* p = (pointer to beginning of run), list = (rest of list) */
	    if (q = p->next) {
		list = q->next;
		if (!IN_ORDER(p, q)) r = q, q = p, p = r;
		p->next = q;
		for (r = list; r && IN_ORDER(q, r); r = r->next)
		    q->next = r, q = r;
		q->next = (item_ptr)0;
		list = r;
	    } else {
		list = (item_ptr)0;
	    }
	    runs++;
	    /* merge this run with 0 or more runs off the top of the stack */
	    for (k = runs; 1 &~ k; k >>= 1) {
		q = *--sp;
		r = &header;
		while (q && p)
		    if (IN_ORDER(q, p)) {
			r->next = q;
			r = q, q = q->next;
		    } else {
			r->next = p;
			r = p, p = p->next;
		    }
		r->next = q ? q : p;
		p = header.next;
	    }
	    /* push the merged run onto the stack */
	    *sp++ = p;
	}
	if (sp == stack) return (item_ptr)0;
	/* merge all the runs on the stack */
	p = *--sp;
	while (sp != stack) {
	    q = *--sp;
	    r = &header;
	    while (q && p)
		if (IN_ORDER(q, p)) {
		    r->next = q;
		    r = q, q = q->next;
		} else {
		    r->next = p;
		    r = p, p = p->next;
		}
	    r->next = q ? q : p;
	    p = header.next;
	}
	return p;
    }

item_ptr alloc_item(data)
    char *data;
    {
	register item_ptr result;

	result = (item_ptr)malloc(sizeof *result + strlen(data) + 1);
	if (result) {
	    result->next = (item_ptr)0;
	    result->data = strcpy((char*)result + sizeof *result, data);
	}
	return result;
    }

/*  What we really want to do is to slurp the entire file in with one call
    to read().  In order to do that, we need to know how much there is.  A
    file's size can be determined either by calling fstat() or by using an
    lseek() to the end.  The number you get from fstat() doesn't mean much
    for pipes and terminals, so lseek() appears to be the better approach.
    Note that even when stdin is connected to a file, part of it may have
    been read already.  In that case, we should not include the part which
    has been read.  So we do
	i = lseek(STDIN, 0, 1)
    to discover the current position (and simultaneously to find out if we
    can use lseek() at all on this file descriptor).  One problem remains;
    the size of the file may change while we are reading it.  In that case
    we'll stop early if we have to, but if the file grows we'll miss stuff
    added at the end.
*/
main(argc, argv)
    int argc;
    char **argv;
    {
	struct item_rec header;
	item_ptr p, q;
	char *progname = argc >= 1 ? argv[0] : "nsort";
	int i;

	if (argc != 1) {
	    fprintf(stderr, "Usage: %s <unordered >sorted\n", progname);
	    exit(EX_USAGE);
	}
	i = lseek(STDIN, 0, 1);	/* current position in stream */
	if (i < 0) {		/* can't find out the size by seeking */
	    char buffer[1024];

	    for (p = &header
		; fgets(buffer, sizeof buffer, stdin)
		; p->next = q, p = q) {
		i = strlen(buffer);
		if (i > 0 && buffer[i-1] == '\n') buffer[i-1] = '\0';
		if (!(q = alloc_item(buffer))) {
		    fprintf(stderr, "%s: ran out of memory\n", progname);
		    exit(EX_SOFTWARE);
		}
	} else {
	    register char *b, *c;
	    register int n;

	    n = lseek(STDIN, 0, 2) - i;	/* part of stdin may have been read */
	    (void) lseek(STDIN, i, 0);	/* go back to original position */
	    if (!(b = malloc(n+1))) {
		fprintf(stderr, "%s: ran out of memory\n", progname);
		exit(EX_SOFTWARE);
	    }
	    for (c = b; (i = n-(c-b)) > 0; c += i) {
		i = read(STDIN, c, i);
		if (i < 0) {
		    perror(progname);
		    exit(EX_IOERR);
		} else 
		if (i == 0) {
		    break;
		}
	    }
	    /* it is possible that the file may have been truncated */
	    /* while we were reading it. */
	    n = c-b;
	    for (p = &header; n > 0; b = c, p->next = q, p = q) {
		for (c = b; --n >= 0; c++)
		    if (*c == '\n') {
			*c++ = '\0';
			break;
		    }
		if (n < 0) *c = '\0';
		if (!(q = (item_ptr) malloc(sizeof *q))) {
		    fprintf(stderr, "nsort: ran out of memory\n");
		    exit(EX_SOFTWARE);
		}
		q->data = b;
	    }
	}
	p->next = (item_ptr)0;
	p = header.next;
	for (p = nat_sort(p); p; p = p->next)
	    puts(p->data);
	exit(EX_OK);
    }

------ EOF ------
ls -l nsort.c
exit 0