broehl@watale.UUCP (Bernie Roehl) (08/24/86)
This is a really good sorter for IBM PCs. It is originally from the Dr. Dobbs Magazine. Have Fun !!!!! # This is a shell archive. Remove anything before this line, # then unpack it by saving it in a file and typing "sh file". # # Wrapped by watale!broehl on Sat Aug 23 19:31:42 EDT 1986 # Contents: read.me sort.c ssort.c echo x - read.me sed 's/^@//' > "read.me" <<'@//E*O*F read.me//' These files make up an expanded version of the DOS utility SORT. This posting also needs a subroutine out of the LIBRARY posting. @//E*O*F read.me// chmod u=rw,g=r,o=r read.me echo x - sort.c sed 's/^@//' > "sort.c" <<'@//E*O*F sort.c//' /* * SORT.C * * Copyright (c) 1986 Allen I. Holub * All rights reserved. */ #define LINT_ARGS #include <stdio.h> #include <ctype.h> #include <string.h> #include <malloc.h> #include "getargs.h" /*---------------------------------------------------------------------- * General purpose #defines. */ #define MAXBUF (132 + 1) /* Maximum input line length +1 */ #define MAXLINEC 1024 /* Maximum number of lines in */ /* an input file before merge */ /* files start to be created */ #define MAXTMP 18 /* The maximum number of temp- */ /* orary files that will be */ /* created. Two FILE ptrs. are */ /* needed for stdout, and */ /* stderr during output */ #define isnum(c1) ( isdigit(c1) || (c1) == '-' ) /*---------------------------------------------------------------------- * Variables for getargs. The immediately following variables will * be modified by getargs() according to what flags it finds on the * command line. */ static int Debug = 0 ; /* Debug Mode */ static int Verbose = 0 ; /* Say what you're doing */ static int Noblanks = 0 ; /* Blanks don't count */ static int Numeric = 0 ; /* Sort numbers by val */ static int Primary = 0 ; /* Primary sort key */ static int Secondary = 0 ; /* Secondary sort key */ static int Dictorder = 0 ; /* Use dictionary order */ static int Foldupper = 0 ; /* Fold upper case */ static int Reverse = 0 ; /* Sort in reverse order */ static int Delim = 32 ; /* Field separator */ static char *Mdir = "" ; /* Put temp files here */ static int Nodups = 0 ; /* Don't print duplicate */ /* lines. */ ARG Argtab[] = { { 'b' , ARG_FBOOLEAN, &Noblanks, "ignore leading whitespace (Blanks)"}, { 'd' , ARG_FBOOLEAN, &Dictorder, "sort in Dictionary order" }, { 'f' , ARG_FBOOLEAN, &Foldupper, "Fold upper into lower case" }, { 'n' , ARG_FBOOLEAN, &Numeric, "sort Numbers by numeric value" }, { 'p' , ARG_INTEGER, &Primary, "use field <num> as Primary key" }, { 'r' , ARG_FBOOLEAN, &Reverse, "do a reverse sort" }, { 's' , ARG_INTEGER, &Secondary, "use field <num> as Secondary key" }, { 't' , ARG_CHARACTER, &Delim, "use <C> to separate fields" }, { 'u' , ARG_FBOOLEAN, &Nodups, "delete duplicate lines in output" }, { 'v' , ARG_FBOOLEAN, &Verbose, "(Verbose) diagnostics to stderr" }, { 'w' , ARG_STRING, (int *)&Mdir, "prepend <str> to Temp file names" }, { 'z' , ARG_FBOOLEAN, &Debug, "Debug Mode" } }; #define NUMARGS (sizeof(Argtab) / sizeof(ARG)) /*---------------------------------------------------------------------- * Global variables. The Lines array is used for the initial * sorting. */ static int Options; /* Set by main if any options set */ static char *Lines[MAXLINEC]; /* Holds arrays of input lines */ static int Linec; /* # of items in Lines */ static char **Argv; /* Copies of argv and argc */ static int Argc; /*---------------------------------------------------------------------- * The heap used in the merge process: */ typedef struct { char string[MAXBUF]; /* One line from the merge file */ FILE *file; /* Pointer to input file */ } HEAP; HEAP *Heap[ MAXTMP ]; /* The heap itself */ /*----------------------------------------------------------------------*/ pheap( str, n ) char *str; { int i; printf("+--------------------------\n"); printf("| %s, heap is:\n", str); for(i = 0; i < n ; i++ ) { printf("|%02d: %s", i, *(Heap[i]->string) ? Heap[i]->string : "(null)\n" ); } printf("+--------------------------\n"); } /*----------------------------------------------------------------------*/ int stoi(instr) register char **instr; { /* Convert string to integer updating *instr to point * past the number. Return the integer value represented * by the string. */ register int num = 0 ; register char *str ; int sign = 1 ; str = *instr; if( *str == '-' ) { sign = -1 ; str++; } while( '0' <= *str && *str <= '9' ) num = (num * 10) + (*str++ - '0') ; *instr = str; return( num * sign ); } /*----------------------------------------------------------------------*/ int dedupe(argc, argv) int argc; char **argv; { /* Compress an argv-like array of pointers to strings so that * adjacent duplicate lines are eliminated. Return the * argument count after the compression. */ register int i ; int nargc ; char **dp ; nargc = 1; dp = &argv[1]; for ( i=1 ; i < argc ; i++ ) { if( strcmp(argv[i-1], argv[i]) != 0 ) { *dp++ = argv[i]; nargc++; } } return( nargc ); } /*----------------------------------------------------------------------*/ static char *skip_field(n, str) int n; char *str; { /* Skip over n fields. The delimiter is in the global * variable Delim. Return a pointer to either the character * to the right of the delimiter, or to the '\0'. */ while( n > 0 && *str ) { if( *str++ == Delim) --n; } return(str); } /*---------------------------------------------------------------------- * Comparison functions needed for sorting. * * ssort() will call either argvcmp or qcmp, passing them pointers * to linev entries. qcmp() calls two workhorse functions, qcmp1() * and qcmp2(). The workhorse functions will also be called by the * reheap() subroutine used to manipulate merge files. */ static int argvcmp( s1p, s2p ) char **s1p, **s2p; { return( strcmp( *s1p, *s2p ) ); } /*----------------------------------------------------------------------*/ static int qcmp( str1p, str2p ) char **str1p; char **str2p; { /* Takes care of all the sorting of fields, calling qcmp1 * to do the actual comparisons. Assuming here that * Secondary won't be set unless Primary is set too. */ return( qcmp1( *str1p, *str2p ) ); } /*----------------------------------------------------------------------*/ static int qcmp1( str1, str2 ) char *str1, *str2; { /* Workhorse comparison function. Takes care of sorting * fields. If a primary sort field is specified then * qcmp1() skips to that field and calls qcmp2 to do the * actual comparison. If the primary fields are equal, then * the secondary fields are compared in the same way. */ int rval; if( !Primary ) return qcmp2( str1, str2 ); else { rval = qcmp2( skip_field(Primary - 1, str1), skip_field(Primary - 1, str2) ); if( !rval && Secondary ) { /* The two primary keys are equal, search the */ /* secondary keys if one is specified */ rval = qcmp2( skip_field(Secondary - 1, str1), skip_field(Secondary - 1, str2) ); } return rval; } } /*----------------------------------------------------------------------*/ static int qcmp2( str1, str2 ) char *str1; char *str2; { /* Workhorse comparison function. Deals with all command line * options except fields. Returns * * 0 if str1 == str2 * positive if str1 > str2 * negative if str1 < str2 * * This is a general purpose (and therefore relatively slow) * routine. Use strcmp() if you need a fast compare. * Comparison stops on reaching end of string or on encountering * a Delim character (if one exists). So make sure Delim is * set to '\0' if you're not sorting by fields. */ register unsigned int c1, c2; if( Noblanks ) { /* * Skip past leading whitespace in both strings */ while( isspace(*str1) ) str1++; while( isspace(*str2) ) str2++; } do { if( Numeric && isnum(*str1) && isnum(*str2) ) { /* Add 0xff to the two numeric values so that * c1 and c2 can't be confused with a Delim * character later on. */ c1 = stoi( &str1 ) + 0xff ; c2 = stoi( &str2 ) + 0xff ; if( c1 == c2 ) continue; else break; } c1 = *str1++; c2 = *str2++; if(Dictorder) { /* Skip past any non-alphanumeric or blank * characters. */ while( c1 && !isalnum(c1) ) c1 = *str1++ ; while( c2 && !isalnum(c2) ) c2 = *str2++ ; } if(Foldupper) { /* Map c1 and c2 to upper case */ c1 = toupper( c1 ); c2 = toupper( c2 ); } /* Keep processing while the characters are the same * unless we've reached end of string or a delimiter. */ } while( (c1==c2) && c1 && c1 != Delim ); if( Delim ) /* If we're sorting on a field */ { /* and we've found a delimiter */ if(c1 == Delim) /* then map the delimiter to a */ c1 = 0; /* zero for purposes of */ /* comparison. */ if(c2 == Delim) c2 = 0; } return( Reverse ? (c2 - c1) : (c1 - c2) ); } /*----------------------------------------------------------------------*/ FILE *nextfile() { /* Return a FILE pointer for the next input file or NULL * if no more input files exist (ie. all of the files * on the command line have been processed) or a file * can't be opened. In this last case print an error message. * If Argc == 1 the first time we're called, then standard * input is returned (the first time only , NULL is returned * on subsequent calls). */ FILE *fp; static int first_time = 1 ; if( first_time ) { first_time = 0; if( Argc == 1 ) return stdin; } if( Argc-- > 1 ) { if( !(fp = fopen(*++Argv, "r")) ) fprintf(stderr, "sort: can't open %s for read\n", *Argv ); else if( Verbose ) fprintf(stderr, "sort: reading: %s\n", *Argv ); return fp; } return NULL; } /*----------------------------------------------------------------------*/ gtext() { /* Get text from the appropriate input source and put * the lines into Linev, updating Linec. Return non-zero * if more input remains. Note that non-zero will * be returned if there are exactly MAXLINEC lines in * the input, even though there isn't any more actual * input available. If malloc can't get space for the line, * we'll remember the line in buf and return 1. */ register int c; static FILE *fp = 0; static char buf[ MAXBUF ] = {0}; /* Input buffer */ int maxcount; char **lv; if( !fp ) /* This is only true the first */ fp = nextfile(); /* time we're called. */ lv = Lines ; Linec = 0 ; for( maxcount = MAXLINEC; --maxcount >= 0 ;) { if( !*buf ) while( fgets(buf, MAXBUF, fp) == NULL ) if( !(fp = nextfile()) ) return( 0 ); /* No more input */ if( *lv = malloc(strlen(buf) + 1) ) { strcpy( *lv++, buf ); *buf = '\0'; Linec++; } else return 1; } return( maxcount < 0 ); /* Return 1 if there's more input to get */ } /*----------------------------------------------------------------------*/ char *fname( num ) { /* Return a merge file name for the indicated merge pass. */ static char name[ 16 ]; if( num > MAXTMP ) { fprintf(stderr,"sort: input file too large\n" ); exit(1); } sprintf(name, "%smerge.%d", Mdir, num ); return( name ); } /*----------------------------------------------------------------------*/ outtext( passnum, more_to_go ) { /* Print out all the strings in the Lines array and free all * the memory that they use. Output is sent to standard * output if this is pass 1 and there's no more input * to process, otherwise output is sent to a merge file. */ register char **lv; register FILE *fp; if( passnum == 1 && !more_to_go ) fp = stdout; else if( !(fp = fopen( fname(passnum), "w" )) ) { fprintf(stderr,"Can't open merge file %s for write\n", fname(passnum)); exit(1); } else if( Verbose ) fprintf(stderr,"sort: creating: %s\n", fname(passnum)); for( lv = Lines ; --Linec >= 0; ) { fputs( *lv, fp ); free ( *lv++ ); } fclose( fp ); } /*----------------------------------------------------------------------*/ open_mergefiles( nfiles ) { /* Open all the merge files and create the heap. "nfiles" * merge-files exist and the heap will have that many * elements in it. The heap is unsorted on exit. */ HEAP **hp; int i; for( hp = Heap, i = nfiles; i > 0; hp++, --i ) { if( ! (*hp = (HEAP *) malloc(sizeof(HEAP))) ) { fprintf( stderr, "sort: out of memory!" ); exit( 1 ); } if( !( (*hp)->file = fopen( fname(i), "r") )) { fprintf(stderr,"sort: can't open %s for read", fname(i) ); exit( 1 ); } if( !fgets( (*hp)->string, MAXBUF, (*hp)->file ) ) { fprintf(stderr,"sort: merge file %s is empty", fname(i) ); exit( 1 ); } if( Verbose ) fprintf(stderr, "sort: merging: %s\n", fname(i) ); } } /*----------------------------------------------------------------------*/ mcmp( hpp1, hpp2 ) HEAP **hpp1, **hpp2; { /* Comparison routine for sorting the heap. Is passed * two pointers to HEAP pointers and compares the * string fields of these using the same workhorse * functions used in the initial sorting phase. */ return Options ? qcmp1 ((*hpp1)->string, (*hpp2)->string) : strcmp ((*hpp1)->string, (*hpp2)->string) ; } /*----------------------------------------------------------------------*/ reheap( nfiles ) { /* Reheap the Heap, assume that the first element (**Heap) * is the newly added one. */ register int parent, child; HEAP *tmp; for( parent = 0, child = 1; child < nfiles; ) { /* Find the smaller child. Then if the parent is less * than the smaller child, we're done. Otherwise * swap the parent and child, and continue the * reheaping process with a new parent. */ if( child+1 < nfiles ) /* if child+1 is in the heap */ if( mcmp(&Heap[child], &Heap[child+1]) > 0) child++; if( mcmp( &Heap[parent], &Heap[child]) <= 0) break; tmp = Heap[parent]; /* Exchange */ Heap[parent] = Heap[child ]; Heap[child ] = tmp; parent = child; child = parent << 1; /* child = parent * 2 */ } } /*----------------------------------------------------------------------*/ merge( nfiles ) int nfiles; /* Number of merge files */ { open_mergefiles( nfiles ); ssort( Heap, nfiles, sizeof(Heap[0]), mcmp ); while( nfiles > 0 ) { if( Debug ) pheap( "Merge: top of while loop", nfiles ); fputs( (*Heap)->string, stdout ); if( !fgets((*Heap)->string, MAXBUF, (*Heap)->file) ) { /* This input stream is exhausted. Reduce the * heap size to compensate. Note that Heap+1 * is the same as &Heap[1]; */ fclose( (*Heap)->file ); if( --nfiles ) memcpy(Heap, Heap+1, nfiles * sizeof(HEAP)); } reheap( nfiles ); } } /*----------------------------------------------------------------------*/ adjust_args() { /* Adjust various default arguments to fix mistakes made * on the command line. In particular Delim is always 0 * unless either Primary or Secondary was set. * If a secondary field is specified without a Primary, then * 1 is assumed for the primary. If no Delim is specified * then tab (\t) is assumed. "Options" is true if any of * the options that affect the sort order were specified * on the command line. */ if( !(Primary || Secondary) ) Delim = 0; else { /* if( !Delim ) Delim = ' ';*/ if( !Primary ) Primary = 1; } Options = Noblanks || Numeric || Dictorder || Foldupper || Reverse || Delim; } /*----------------------------------------------------------------------*/ void usage() { fprintf( stderr, "Usage: sort [%c[bdfnruv][p<num>|s<num>][t<c>][w<str>]] [files]...\n", ARG_Switch ); fprintf( stderr, "\n"); fprintf( stderr, "Set the environment variable SWITCHAR to the character\n"); fprintf( stderr, "you wish to use for the Switch Character\n"); fprintf( stderr, "\n" ); fprintf( stderr, "Case of the command line switches %s important\n", ARG_ICase ? "is not" : "is" ); fprintf( stderr, "\n" ); } /*----------------------------------------------------------------------*/ main(argc, argv) int argc; char **argv; { int numpasses = 0; /* Number of merge files used */ int more_input; /* True if input isn't exhausted */ ctlc(); ARG_ICase = 1; Argc = getargs( argc, argv, Argtab, NUMARGS ); Argv = argv; adjust_args(); do{ more_input = gtext(); if( Linec ) { ssort(Lines, Linec, sizeof(*Lines), Options ? qcmp : argvcmp); if( Nodups ) Linec = dedupe(Linec, Lines); outtext( ++numpasses, more_input ); } } while( more_input ); if( numpasses > 1 ) /* merge files were created */ { fclose( stdin ); /* Free up default file des- */ fclose( stdaux ); /* criptors for unused streams */ fclose( stdprn ); /* so that they can be used for */ /* merge files. */ merge( numpasses ); for(; numpasses > 0 ; --numpasses ) { unlink( fname(numpasses) ); if( Verbose ) fprintf(stderr, "sort: deleted: %s\n", fname(numpasses)); } } exit(0); } @//E*O*F sort.c// chmod u=rw,g=r,o=r sort.c echo x - ssort.c sed 's/^@//' > "ssort.c" <<'@//E*O*F ssort.c//' /* SSORT.C Works just like qsort() except that a shell * sort, rather than a quick sort, is used. This * is more efficient than quicksort for small numbers of elements * and it's not recursive so will use much less stack space. * This routine started out as the one in K&R. * * 12/13/85: Modified to select a gap from the series * 1, 4, 13, 40, 121 ... as per Knuth. * * Copyright (C) 1985, Allen I. Holub. All rights reserved */ void ssort( base, nel, width, cmp ) char *base; int nel, width; int (*cmp)(); { register int i, j; int gap, k, tmp ; char *p1, *p2; for( gap=1; gap <= nel; gap = 3*gap + 1 ) ; for( gap /= 3; gap > 0 ; gap /= 3 ) for( i = gap; i < nel; i++ ) for( j = i-gap; j >= 0 ; j -= gap ) { p1 = base + ( j * width); p2 = base + ((j+gap) * width); if( (*cmp)( p1, p2 ) <= 0 ) break; for( k = width; --k >= 0 ;) { tmp = *p1; *p1++ = *p2; *p2++ = tmp; } } } #ifdef DEBUG cmp( cpp1, cpp2 ) char **cpp1, **cpp2; { register int rval; printf("comparing %s to %s ", *cpp1, *cpp2 ); rval = strcmp( *cpp1, *cpp2 ); printf("returning %d\n", rval ); return rval; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ main( argc, argv ) int argc; char **argv; { ssort( ++argv, --argc, sizeof(*argv), cmp ); while( --argc >= 0 ) printf("%s\n", *argv++ ); } #endif @//E*O*F ssort.c// chmod u=rw,g=r,o=r ssort.c echo Inspecting for damage in transit... temp=/tmp/shar$$; dtemp=/tmp/.shar$$ trap "rm -f $temp $dtemp; exit" 0 1 2 3 15 cat > $temp <<\!!! 4 23 132 read.me 697 2600 16690 sort.c 71 300 1409 ssort.c 772 2923 18231 total !!! wc read.me sort.c ssort.c | sed 's=[^ ]*/==' | diff -b $temp - >$dtemp if [ -s $dtemp ] then echo "Ouch [diff of wc output]:" ; cat $dtemp else echo "No problems found." fi exit 0 -- Michael A. Shiels clyde-\ decvax-\\ ihnp4-\\\ +++-----> watmath!watale!broehl tektronix-/// ubc-vision-// utzoo-/