[comp.sys.mac.programmer] fastqsort.c -- 3 times faster than THINK C's qsort

huntley@copper.ucs.indiana.edu (Haydn Huntley) (09/25/90)

/*	fastQSort() is a replacement for qsort().  Compared with the THINK C library
	version of qsort, fastQSort is about 3 times faster (it takes 1/3 the time), 
	and its speed isn't dependent on the data being sorted.
	
	One problem with qsort is that when the data is not in random order -- for 
	example when it's already ordered, in reverse order, or almost sorted -- then 
	qsort exhibits worst case behavior of N*N/2 operations.  For N=1024 this can 
	take about 30 seconds, instead of the usual 0.8 seconds.  The fastQSort 
	algorithm doesn't have this problem, because it randomly selects the pivot 
	element, and so for N=1024 it requires about 0.26 secs +/- 0.02 secs.
	
	Also, because fastQSort doesn't exhibit worst case behavior, it doesn't 
	require as much stack space, even though it has 12 bytes more of local
	variables.
	
	The things that make fastQSort so much faster than qsort are:
	
	1)	fastQSort picks the pivot element randomly, so it doesn't display worst
		case behavior.
		
	2)	fastQSort uses pointers and pointer arithmetic, avoiding multiplication
		whenever possible.
	
	3)	qsort uses std_swap() to swap the data between two locations, and 
		std_swap loops through the data 3 times to perform the exchange!  In
		comparison, memSwap() loops through the data just once.
		
	4)	fastQSort uses register variables whenever it makes good sense to do so.
	
	
	Researched and written by:
	
		Haydn Huntley
		Haydn's Consulting
		203 West Stone
		Fairfield, Iowa  52556
		(515) 472-7025
		
	During the school year, I may be reached at the following address:
	
		Haydn Huntley
		Eigenmann Hall #289
		Indiana University
		Bloomington, IN  47406
		(812) 857-8620
		
	E-mail:  huntley@copper.ucs.indiana.edu
*/


#include <stdio.h>
#include <stdlib.h>
#include "fastqsort.h"


#define	PIVOT_CUTOFF	25
#define	SEED			TickCount ()
#define FASTER			0				/* when set to 1, fastQSort() uses the median	*/
										/* of the first, middle, and a random 		*/
										/* element, but this requires more code		*/
										/* than simply always choosing an element 	*/
										/* at random.  612 bytes vs. 356 bytes		*/
										/* The median version does about 10% fewer	*/
										/* compares, and the speed difference is 	*/
										/* around 10%. For N=50,000 the savings is	*/
										/* 1.85 secs, and for N=1024 the savings	*/
										/* is only 0.025 secs.						*/

static size_t	gSize;
static int		(*gCmpFn) (void *, void *);


/* functions declarations */
static void 	doFastQSort (char *first, char *last);
static void 	chooseMedian (void *pDest, void *pA, void *pB, void *pC);
static void 	memSwap (void *a, void *b, size_t qSize);


void fastQSort (void *base, size_t n, size_t size, int (*cmpFn) (void *, void *))
{
	/* setup the global variables used by fastQSort() */
	gSize = size;
	gCmpFn = cmpFn;
	srand (SEED);
		
	/* the args for fastQSort () must be scaled by gSize */
	doFastQSort (base, (char *) base + n * gSize);
}


static void doFastQSort (register char *first, char *last)
{
	register char	*i, *j;
	register size_t	n, qSize = gSize;
	
	/* first, last, i, and j are scaled by gSize in this function */
	 	
	while ((n = (last - first) / qSize) > 1)
	{
		#if FASTER
			if (n < PIVOT_CUTOFF)
				/* use a random pivot */
				memSwap (first + (rand () % n) * qSize, first, qSize);
			else
				/* use the median of first, middle, and a random volunteer */
				chooseMedian (first, first,
						first + (n / 2) * qSize,
						first + (rand () % n) * qSize);
		#else
			/* use a random pivot */
			memSwap (first + (rand () % n) * qSize, first, qSize);
		#endif /* FASTER */
		
		i = first;
		j = last;
		while (1)
		{
			i += qSize;
			while (i < last && gCmpFn (i, first) < 0)
				i += qSize;
				
			j -= qSize;
			while (j > first && gCmpFn (j, first) > 0)
				j -= qSize;
				
				
			if (i >= j)
 				break;
 				
			memSwap (i, j, qSize);
		}
		
		if (j == first)
		{
			first += qSize;
			continue;
		}
		memSwap (first, j, qSize);
		if (j - first < last - (j + qSize))
		{
			doFastQSort (first, j);
			
			first = j + qSize;
			/* equivalent to doFastQSort (j + qSize, last);		*/
			/* to save stack space								*/
		}
		else
		{
			doFastQSort (j + qSize, last);
			
			last = j;
			/* equivalent to doFastQSort (first, j);			*/
			/* to save stack space								*/
		}
	}
}


#if FASTER
static void chooseMedian (void *pDest, void *pA, void *pB, void *pC)
{
	void	*pMedian;
	
	if (gCmpFn (pA, pB) > 0)						/* pA > pB, what is pC?			*/
		if (gCmpFn (pA, pC) > 0)					/* pA > pB, pA > pC				*/
			if (gCmpFn (pB, pC) > 0)
				pMedian = pB;						/* pA > pB > pC					*/
			else
				pMedian = pC;						/* pA > pC > pB					*/
		else
			pMedian = pA;							/* pC > pA > pB					*/
	else											/* pB > pA, what is pC?			*/
		if (gCmpFn (pA, pC) > 0)
			pMedian = pA;							/* pB > pA > pC					*/
		else										/* pB > pA, pC > pA				*/
			if (gCmpFn (pB, pC) > 0)
				pMedian = pC;						/* pB > pC > pA					*/
			else
				pMedian = pB;						/* pC > pB > pA					*/

	if (pDest != pMedian)
		memSwap (pDest, pMedian, gSize);
}
#endif /* FASTER */


static void memSwap (register void *p, register void *q, size_t qSize)
{
	register char	tmp;

	asm {
		move.l	qSize,d0	/*	while (qSize)		*/
		bra.s	@2			/*	{					*/
@1		move.b	(p),tmp		/*		tmp = *p;		*/
		move.b	(q),(p)+	/*		*p++ = *q;		*/
		move.b	tmp,(q)+	/*		*q++ = tmp;		*/
		subq.l	#1,d0		/*		qSize--;		*/
@2		bne.s	@1			/*	}					*/
	}
}

;;  *****************************************************
;;  *  Haydn Huntley    huntley@copper.ucs.indiana.edu  *
;;  *****************************************************

kaufman@Neon.Stanford.EDU (Marc T. Kaufman) (09/25/90)

In article <60123@iuvax.cs.indiana.edu> huntley@copper.ucs.indiana.edu (Haydn Huntley) writes:
	
>One problem with qsort is that when the data is not in random order -- for 
>example when it's already ordered, in reverse order, or almost sorted -- then 
>qsort exhibits worst case behavior of N*N/2 operations.  For N=1024 this can 
>take about 30 seconds, instead of the usual 0.8 seconds.  The fastQSort 
>algorithm doesn't have this problem, because it randomly selects the pivot 
>element, and so for N=1024 it requires about 0.26 secs +/- 0.02 secs.

Without considering the other features of fastQSort, I really have to object
to the comment that fastQSort doesn't have a "worst case" problem because of
"random" selection.  Just because it may not be predictable doesn't mean it
doesn't exist -- only that you will be bitten when you least expect it.

Using "random numbers" is not a substitute for analysis of the algorithm.

Marc Kaufman (kaufman@Neon.stanford.edu)

lim@iris.ucdavis.edu (Lloyd Lim) (09/25/90)

In article <1990Sep25.044455.3039@Neon.Stanford.EDU> kaufman@Neon.Stanford.EDU (Marc T. Kaufman) writes:
>In article <60123@iuvax.cs.indiana.edu> huntley@copper.ucs.indiana.edu (Haydn Huntley) writes:
>	
>>One problem with qsort is that when the data is not in random order -- for 
>>example when it's already ordered, in reverse order, or almost sorted -- then 
>>qsort exhibits worst case behavior of N*N/2 operations.  For N=1024 this can 
>>take about 30 seconds, instead of the usual 0.8 seconds.  The fastQSort 
>>algorithm doesn't have this problem, because it randomly selects the pivot 
>>element, and so for N=1024 it requires about 0.26 secs +/- 0.02 secs.
>
>Without considering the other features of fastQSort, I really have to object
>to the comment that fastQSort doesn't have a "worst case" problem because of
>"random" selection.  Just because it may not be predictable doesn't mean it
>doesn't exist -- only that you will be bitten when you least expect it.
>
>Using "random numbers" is not a substitute for analysis of the algorithm.

Marc is correct.  When you select a random pivot, you still have the same
worst case running time.  If all permutations are equally likely, a random
pivot does not help you at all.  In practice, however, reverse order
arrangements are somewhat more probable than totally disordered arrangements.
Thus a random pivot helps a little in practice.

A better way is to pick the median of three elements (the first, middle, and
last elements).  This method is fast and it does reduce the probability of
encountering the worst case - both in theory and practice.  The worst case is
still there, of course.

+++
Lloyd Lim     Internet: lim@iris.ucdavis.edu (128.120.57.20)
              Compuserve: 72647,660
              US Mail: 146 Lysle Leach Hall, U.C. Davis, Davis, CA 95616

huntley@copper.ucs.indiana.edu (Haydn Huntley) (09/26/90)

In article <7739@ucdavis.ucdavis.edu> lim@iris.ucdavis.edu (Lloyd Lim) writes:
| >Using "random numbers" is not a substitute for analysis of the algorithm.
| 
| Marc is correct.  When you select a random pivot, you still have the same
| worst case running time.  If all permutations are equally likely, a random
| pivot does not help you at all.  In practice, however, reverse order
| arrangements are somewhat more probable than totally disordered arrangements.
| Thus a random pivot helps a little in practice.
| 
| A better way is to pick the median of three elements (the first, middle, and
| last elements).  This method is fast and it does reduce the probability of
| encountering the worst case - both in theory and practice.  The worst case is
| still there, of course.
| 

Actually, I implemented 15 different sorting algorithms, of which 10
were variations on Quick Sort, and using the median of the first,
middle, and last elements was not one of the better Quick Sort
variations!  It *surprised* me too!

Have you studied the analysis of algorithms?  Marc is quite right that
using random numbers is not a substitute for the careful analysis of
an algorithm, but I think if you take a look at this implementation of
the Quick Sort algorithm, you will find that it is quite efficient --
and it is *much* more efficient than the one in the library.

The probability of that algorithm receiving its worst case input is
1/N!, where N is the number of items in the list to be sorted.  If
N > 10, that number is *vanishingly* small, much smaller than the
chance of being struck by lightning, etc., etc.  If N <= 10, then the
randomized algorithm will sort the data so quickly that it doesn't
much matter whether it hits its worst case or not!

How's that for a (tongue-in-cheek) careful analysis!

:-)

--Haydn
;;  *****************************************************
;;  *  Haydn Huntley    huntley@copper.ucs.indiana.edu  *
;;  *****************************************************

philj@tekig5.PEN.TEK.COM (Phil K Jansen) (09/26/90)

Haydn Huntley writes:
>One problem with qsort is that when the data is not in random order ...
>... worst case behavior of N*N/2 operations.  For N=1024 this can 
>take about 30 seconds, instead of the usual 0.8 seconds.
>[fastqsort doesn't do this because of random pivot]...

Marc T. Kaufman writes:
>... I really have to object [to the comment that random selection eliminates
> the 'worst case' run time.  It just changes where worst-case will be.]

A simple 'proof' that worst-case is still O(N^2) is to assume that random
pivots happen to be chosen in exactly the same place as before.  It *can*
happen, you know.

Lloyd Lim writes:
>Marc is correct ... [justification for some improvement with a random pivot]

>A better way is to pick the median of three elements ...

Haydn actually does use "Median of three" for his pivot (check the #ifdefs).
His documentation is a little out of date, though... :-).

Our Algorithms class demonstrated that insertion sort is faster for
fewer than ~25 elements.  This is because you start to notice quickSort's
subroutine overhead for each element actually moved.

So what you really want to do is CHANGE THE SORT ALGORITHM for small lists.
This will improve worst-case behavior to near the insertion-sort level.

hyperQuickSort(arguments) {
	if(numElements > 25)
	    quickSort(arguments);
	        /* recursively calls hyperQuickSort() -- twice */
	else
	    insertionSort(arguments); /* good ol' dumb insertion sort */
}
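
Fleshed out a little, the small-list half might look like this -- a sketch
only, written against a qsort-style interface; the name insertionSort is
illustrative, not from Haydn's code or the THINK library:

#include <stddef.h>

/* straight insertion sort over n elements of `size' bytes each,
   moving elements into place by adjacent byte-wise swaps */
static void insertionSort (void *base, size_t n, size_t size,
                           int (*cmp) (void *, void *))
{
    char    *p, *q, *prev;
    char    tmp;
    size_t  k;

    for (p = (char *) base + size; p < (char *) base + n * size; p += size)
        for (q = p; q > (char *) base; q -= size)
        {
            prev = q - size;
            if (cmp (prev, q) <= 0)
                break;                      /* element has found its place  */
            for (k = 0; k < size; k++)      /* swap *prev and *q, byte by byte */
            {
                tmp = prev[k];
                prev[k] = q[k];
                q[k] = tmp;
            }
        }
}

In hyperQuickSort() above, the `numElements > 25' test just chooses between
the recursive partitioning step and this routine; in fastQSort() the same
idea would amount to changing the `> 1' test in doFastQSort()'s while loop
to bail out to an insertion sort once n drops below the cutoff.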

I expect most of the speedup came from the asm{} statements for the swaps.

As Haydn said, he did get 3x improvement from optimizing the algorithm.  But
he also made it non-portable.  Of course we're all 68000 Mac users here, so
maybe portability is less of an issue.  I need it, though.

It's always worth verifying that the algorithm you're using is the one you
want before you start optimizing.  I should know -- I've written REALLY SLOW
assembler stuff.

Good luck.

Phil Jansen
-- 
                          If you repeat things often enough, they become true.
Phil Jansen               If you repeat things often enough, they become true.
philj@tekig5.pen.tek.com  If you repeat things often enough, they become true.

vd09+@andrew.cmu.edu (Vincent M. Del Vecchio) (09/26/90)

> Excerpts from netnews.comp.sys.mac.programmer: 25-Sep-90 Re: fastqsort.c
> -- 3 times .. Haydn Huntley@copper.ucs (2015)

> The way to calculate the possibility of that algorithm receiving it's
> worst case input is 1/N!, where N is the number of items in the list
> to be sorted.  If N > 10, that number is *vanishingly* small, much
> smaller than the chance of being struck by lightning, etc., etc.  If N
> <= 10, then that randomized algorithm will sort the data so quickly,
> that it doesn't much matter whether it hits it's worst case or not!


The absolute worst case may occur with a probability of only 1/N!, but
there are other cases which are very close to worst case in performance
(e.g. take the worst case and move around a small fraction of the
numbers) and that makes the 1/N! analysis seem rather irrelevant.  Note
that this is not to say that your algorithm doesn't do a good job.

+----------------------------------------------------------------------------+
| Vincent Del Vecchio     \ Disclaimer: Views expressed are not necessarily  |
| Box 4872                 \ those of any person/group I am associated with. |
| 5125 Margaret Morrison St.\   UUCP: {uunet,harvard}!andrew.cmu.edu!vd09    |
| Pittsburgh, PA  15213      \   BITNET: vd09+%andrew@cmuccvma.bitnet        |
| (412) 268-4441              \   Internet: vd09+@andrew.cmu.edu             |
+----------------------------------------------------------------------------+

huntley@copper.ucs.indiana.edu (Haydn Huntley) (09/26/90)

In article <7004@tekig5.PEN.TEK.COM> philj@tekig5.PEN.TEK.COM (Phil K Jansen) writes:
| Our Algorithms class demonstrated that the insertion sort was faster
| for less than ~25 elements.  This is because you start to notice quickSort's
| subroutine overhead for each element actually moved.
| 
| So what you really want to do is CHANGE THE SORT ALGORITHM for small lists.
| This will improve worst-case behavior to near the insertion-sort level.
| 
| hyperQuickSort(arguments) {
| 	if(numElements > 25)
| 	    quickSort(arguments);
| 	        /* recursively calls hyperQuickSort() -- twice */
| 	else
| 	    insertionSort(arguments); /* good ol' dumb insertion sort */
| }

Actually, the best general-purpose sorting algorithm I could come up
with used just this technique; however, it was only 4 times faster than
THINK's, and its code was several times longer than fastQSort()'s, so I
didn't think many people would be interested in it.  If anyone is
interested, I'd
be delighted to mail you the collection of sorting algorithms and the
code I wrote for timing, testing, and comparing them all.  It's
fun to play around with sorts, and instructive too!  :-)

Perhaps sorting algorithms are an emotional issue -- so many people
have posted and mailed trying to tell me that randomizing the
algorithm doesn't change its worst case behavior.  Technically, they
are not mistaken: the worst case behavior of fastQSort is N*N, but the
chance of observing that behavior is tiny.  If you perform one sort per
second for the rest of your life, and you are sorting arrays of 1024
elements, I think it is incredibly unlikely that you will observe
fastQSort exhibiting worst case behavior before you pass on to elsewhere.
A person may live for 3 billion seconds, but 1024! is much larger!
Fifty factorial is more than 3x10^64, and most calculators can't do
more than 69!.

| 
| I expect most of the speedup came from the asm{} statements for the swaps.
| 

Some speedup came from fixing their assembler code.  The big wins came
from randomizing the algorithm (to avoid worst case behavior on common
cases), and from using pointers instead of array indexing.
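
To illustrate that last point: an array-indexing version of a generic
routine has to compute `base + i * size' for every access, and on a 68000
that multiply is not cheap (unless the compiler strength-reduces it away);
with a running pointer the only per-element arithmetic is an add.  Here are
two versions of a routine that finds the smallest of n elements -- a
contrived sketch, not code from either library:

#include <stddef.h>

/* indexing version: every access multiplies the subscript by size */
static void *minIndexed (void *base, size_t n, size_t size,
                         int (*cmp) (void *, void *))
{
    size_t  i, best = 0;

    for (i = 1; i < n; i++)
        if (cmp ((char *) base + i * size, (char *) base + best * size) < 0)
            best = i;
    return (char *) base + best * size;
}

/* pointer version: the only arithmetic inside the loop is p += size */
static void *minPointer (void *base, size_t n, size_t size,
                         int (*cmp) (void *, void *))
{
    char    *p, *best, *end;

    best = (char *) base;
    end = (char *) base + n * size;
    for (p = best + size; p < end; p += size)
        if (cmp (p, best) < 0)
            best = p;
    return best;
}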

| As Haydn said, he did get 3x improvement from optimizing the algorithm.  But
| he also made it non-portable.  Of course we're all 68000 Mac users here, so
| maybe portability is less of an issue.  I need it, though.
|

If you'd like a portable version, just convert memSwap() to C -- the
equivalent C code is in the comments just to the right of the assembler.
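
For reference, that conversion comes out to something like this (a sketch
only, not run through my timing harness):

/* portable replacement for the asm memSwap(): exchange qSize bytes
   between a and b in a single pass */
static void memSwap (void *a, void *b, size_t qSize)
{
    register char   *p = a, *q = b;
    register char   tmp;

    while (qSize--)
    {
        tmp = *p;
        *p++ = *q;
        *q++ = tmp;
    }
}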




;;  *****************************************************
;;  *  Haydn Huntley    huntley@copper.ucs.indiana.edu  *
;;  *****************************************************

CXT105@psuvm.psu.edu (Christopher Tate) (09/27/90)

In article <60332@iuvax.cs.indiana.edu>, huntley@copper.ucs.indiana.edu (Haydn
Huntley) says:

>Perhaps sorting algorithms are an emotional issue -- so many people
>have posted and mailed trying to tell me that randomizing the
>algorithm doesn't change it's worst case behavior.  Technically, they
>are not mistaken, the worst case behavior of fastQsort is N*N, but the
>chance of observing that behavior is tiny.  If you perform one sort per
>second for the rest of your life, and you are sorting arrays of 1024
>elements, I think it is incredibly unlikely that you will observe
>fastQsort exhibiting worst case behavior before pass on to elsewhere.
>A person may live for 3 billion seconds, but 1024! is much larger!
>Fifty factorial is more than 3x10^64, and most calculators can't do
>more than 69!.

Attention!  This is NOT a flame!!!

For fastQsort, this may well be true -- the random pivots will keep you
from hitting worst-case pretty effectively.  However, for regular qsort,
the worst case is an already-sorted list.  Even if the list is "almost"
already sorted, qsort's performance will go down the tubes, and it's
surprising how often this kind of data set will crop up in supposedly
"random" lists.

Yet another reason to use fastQsort instead of THINK's qsort.... :)

-------
Christopher Tate                 |
                                 |   Sorry; it's classified.  I could tell
    cxt105@psuvm.psu.edu         |    you, but then I'd have to kill you.
    cxt105@psuvm.bitnet          |

mneerach@iiic.ethz.ch (Matthias Ulrich Neeracher) (09/29/90)

In article <90269.174529CXT105@psuvm.psu.edu> CXT105@psuvm.psu.edu (Christopher Tate) writes:
]In article <60332@iuvax.cs.indiana.edu>, huntley@copper.ucs.indiana.edu (Haydn
]Huntley) says:

]>Perhaps sorting algorithms are an emotional issue -- so many people
]>have posted and mailed trying to tell me that randomizing the
]>algorithm doesn't change it's worst case behavior.  Technically, they
]>are not mistaken, the worst case behavior of fastQsort is N*N, but the
]>chance of observing that behavior is tiny.  If you perform one sort per
]>second for the rest of your life, and you are sorting arrays of 1024
]>elements, I think it is incredibly unlikely that you will observe
]>fastQsort exhibiting worst case behavior before pass on to elsewhere.
]>A person may live for 3 billion seconds, but 1024! is much larger!
]>Fifty factorial is more than 3x10^64, and most calculators can't do
]>more than 69!.

]Attention!  This is NOT a flame!!!

Neither is this...

]For fastQsort, this may well be true -- the random pivots will keep you
]from hitting worst-case pretty effectively.  

No. For random data, a random pivot has exactly the same chance of hitting
worst-case as any `simple' pivot. A median-of-three pivot, however, as
used by Haydn, statistically improves your chances.

]However, for regular qsort,
]the worst case is an already-sorted list.  Even if the list is "almost"
]already sorted, qsort's performance will go down the tubes, and it's
]suprising how often this kind of data set will crop up in supposedly
]"random" lists.

This, however, is true. But this kind of problem could just as easily be
circumvented by picking the middle element of the range instead of the first
one. Still easier than random.

]Yet another reason to use fastQsort instead of THINK's qsort.... :)

If you are really worried about worst case, consider using heapsort instead
of quicksort.  Once you get the idea, it's really easy to program.  Heapsort
is slower by a constant factor than the average QuickSort case, but its
running time is nearly the same for any input, and it runs in completely
constant extra space.  A superb presentation is given in J. Bentley's
"Programming Pearls".
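
For the curious, the sift-down version fits in a couple dozen lines of C.
The sketch below sorts an array of ints rather than using fastQSort's
generic interface, and is an illustration of the idea only, not Bentley's
code:

#include <stddef.h>

/* sift the element at position i down into the heap occupying a[0..n-1] */
static void siftDown (int a[], size_t i, size_t n)
{
    size_t  child;
    int     tmp = a[i];

    while ((child = 2 * i + 1) < n)
    {
        if (child + 1 < n && a[child + 1] > a[child])
            child++;                /* take the larger of the two children */
        if (tmp >= a[child])
            break;                  /* the parent already dominates        */
        a[i] = a[child];
        i = child;
    }
    a[i] = tmp;
}

void heapSort (int a[], size_t n)
{
    size_t  i;
    int     tmp;

    for (i = n / 2; i > 0; i--)     /* build the heap, bottom up           */
        siftDown (a, i - 1, n);
    for (i = n; i > 1; i--)         /* repeatedly move the maximum to the end */
    {
        tmp = a[0];  a[0] = a[i - 1];  a[i - 1] = tmp;
        siftDown (a, 0, i - 1);
    }
}

Making it generic (base, n, size, and a comparison function, like fastQSort)
is mostly a matter of replacing the int assignments with byte copies.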

Matthias

-----
Matthias Neeracher                                   mneerach@iiic.ethz.ch
   "These days, though, you have to be pretty technical before you can 
    even aspire to crudeness." -- William Gibson, _Johnny Mnemonic_

laird@slum.MV.COM (Laird Heal) (09/30/90)

Excuse me for coagulating this burgeoning thread, but one can test for
the "worst case" of vanilla Quick Sort without too much difficulty; in
the first pass one normally makes at least one comparison against each
of the keys.  I have not developed a heuristic, but I would not expect
it to be too hard to determine if the incoming list were suited to the
algorithm.  Making the same optimization on a random pivot is possible 
but not as clear.

The first pass could check for sorted input or for sorted runs within it.
This determination would be O(n) and thus not burdensome on very large
lists, the only ones worth discussing.  The options would be to select
a portion of the list and apply Quick Sort or another algorithm.  What
is the best algorithm on a list of, say, ten or fewer items?
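
As for the O(n) pre-check on the first pass, a sketch of what it might look
like, written against the same comparison-function interface as fastQSort
(the function name is made up):

#include <stddef.h>

/* one pass, at most n - 1 comparisons:  returns 1 if the n elements of
   `size' bytes each at `base' are already in nondecreasing order */
static int alreadySorted (void *base, size_t n, size_t size,
                          int (*cmp) (void *, void *))
{
    char    *p, *end;

    end = (char *) base + n * size;
    for (p = (char *) base + size; p < end; p += size)
        if (cmp (p - size, p) > 0)
            return 0;
    return 1;
}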

[quotations follow; hit 'n' except for reference]

In article <90269.174529CXT105@psuvm.psu.edu> CXT105@psuvm.psu.edu (Christopher Tate) writes:
>In article <60332@iuvax.cs.indiana.edu>, huntley@copper.ucs.indiana.edu (Haydn
>Huntley) says:
>
>>Perhaps sorting algorithms are an emotional issue -- so many people
>>have posted and mailed trying to tell me that randomizing the
>>algorithm doesn't change it's worst case behavior.  Technically, they
>>are not mistaken, the worst case behavior of fastQsort is N*N, but the
>>chance of observing that behavior is tiny.  If you perform one sort per
>>second for the rest of your life, and you are sorting arrays of 1024
>>elements, I think it is incredibly unlikely that you will observe
>>fastQsort exhibiting worst case behavior before pass on to elsewhere.
>>A person may live for 3 billion seconds, but 1024! is much larger!
>>Fifty factorial is more than 3x10^64, and most calculators can't do
>>more than 69!.
>
>Attention!  This is NOT a flame!!!
>
>For fastQsort, this may well be true -- the random pivots will keep you
>from hitting worst-case pretty effectively.  However, for regular qsort,
>the worst case is an already-sorted list.  Even if the list is "almost"
>already sorted, qsort's performance will go down the tubes, and it's
>suprising how often this kind of data set will crop up in supposedly
>"random" lists.
>
>Yet another reason to use fastQsort instead of THINK's qsort.... :)
>
-- 

Laird Heal	laird@slum.MV.COM	Dishonesty is the best politics.
(Salem, NH)	+1 603 898 1406         Disclaimer:  if you want'er.