35
// Template of 256 zeroed count bins (one per possible byte value), copied by
// radixSort on each call. Built programmatically so the length is guaranteed
// to be exactly 256 — the original hand-typed literal was easy to get wrong.
// `var` added: the original assignment created an implicit global.
var _radixSort_0 = (function () {
    var zeros = [];
    for (var i = 0; i < 256; i++) {
        zeros.push(0);
    }
    return zeros;
}());
/*
RADIX SORT
Use 256 bins
Use shadow array
- Get counts
- Transform counts to pointers
- Sort from LSB - MSB
*/
/*
 * LSD radix sort for 32-bit signed integers, 8 bits (256 bins) per pass.
 *
 * - Builds all four per-byte histograms in a single sweep.
 * - Converts counts to exclusive prefix sums (bucket start offsets).
 * - Four stable scatter passes LSB -> MSB, ping-ponging between intArr
 *   and a shadow Int32Array, so the final pass lands back in intArr.
 * - The top byte has its sign bit flipped (^ 0x80) in both the histogram
 *   and the final pass so negative values order before positive ones.
 *
 * Sorts intArr in place and also returns it. Works for plain arrays and
 * typed arrays of 32-bit signed integer values.
 *
 * Changes vs. the original: typed-array histograms (Uint32Array) instead of
 * copying the 256-entry global template — the optimization the benchmarks in
 * this thread identified — and each element is loaded once per pass instead
 * of up to four times.
 */
function radixSort(intArr) {
    var len = intArr.length;
    var cpy = new Int32Array(len);
    // One histogram per byte lane; typed so counters stay integers.
    var c4 = new Uint32Array(256);
    var c3 = new Uint32Array(256);
    var c2 = new Uint32Array(256);
    var c1 = new Uint32Array(256);
    var o4 = 0;
    var o3 = 0;
    var o2 = 0;
    var o1 = 0;
    var t4, t3, t2, t1;
    var x, v;

    // Pass 0: count byte occurrences for all four lanes in one sweep.
    for (x = 0; x < len; x++) {
        v = intArr[x]; // hoisted: read the element once, not four times
        c4[v & 0xFF]++;
        c3[(v >> 8) & 0xFF]++;
        c2[(v >> 16) & 0xFF]++;
        c1[((v >> 24) & 0xFF) ^ 0x80]++; // sign-bit flip: negatives first
    }

    // Exclusive prefix sums: turn counts into bucket start offsets.
    for (x = 0; x < 256; x++) {
        t4 = o4 + c4[x]; c4[x] = o4; o4 = t4;
        t3 = o3 + c3[x]; c3[x] = o3; o3 = t3;
        t2 = o2 + c2[x]; c2[x] = o2; o2 = t2;
        t1 = o1 + c1[x]; c1[x] = o1; o1 = t1;
    }

    // Stable scatter passes, least significant byte first.
    for (x = 0; x < len; x++) {
        v = intArr[x];
        cpy[c4[v & 0xFF]++] = v;
    }
    for (x = 0; x < len; x++) {
        v = cpy[x];
        intArr[c3[(v >> 8) & 0xFF]++] = v;
    }
    for (x = 0; x < len; x++) {
        v = intArr[x];
        cpy[c2[(v >> 16) & 0xFF]++] = v;
    }
    for (x = 0; x < len; x++) {
        v = cpy[x];
        // Same sign-bit flip as in the histogram pass.
        intArr[c1[((v >> 24) & 0xFF) ^ 0x80]++] = v;
    }
    return intArr;
}

EDIT:

So far, the best/only major optimization brought to light is JS typed arrays. Using a typed array for the normal radix sort's shadow array has yielded the best results. I was also able to squeeze a little extra out of the in place quick sort using JS built in stack push/pop.


latest jsfiddle benchmark

Intel i7 870, 4GB, FireFox 8.0
2mil
radixSort(intArr): 172 ms
radixSortIP(intArr): 1738 ms
quickSortIP(arr): 661 ms
200k
radixSort(intArr): 18 ms
radixSortIP(intArr): 26 ms
quickSortIP(arr): 58 ms

It appears standard radix sort is indeed king for this work-flow. If someone has time to experiment with loop-unrolling or other modifications for it I would appreciate it.

I have a specific use case where I'd like the fastest possible sorting implementation in JavaScript. There will be large (50,000 - 2mil), unsorted (essentially random), integer (32bit signed) arrays that the client script will access, it then needs to sort and present this data.

I've implemented a fairly fast in place radix sort and in place quick sort jsfiddle benchmark but for my upper bound array length they are still fairly slow. The quick sort performs better on my upper bound array size while the radix sort performs better on my lower bound.

defaultSort is the built-in JavaScript array.sort with an integer compare function

Intel C2Q 9650, 4GB, FireFox 3.6
2mil
radixSortIP(intArr): 5554 ms
quickSortIP(arr): 1796 ms
200k
radixSortIP(intArr): 139 ms
quickSortIP(arr): 190 ms
defaultSort(intArr): 354 ms

Intel i7 870, 4GB, FireFox 8.0
2mil
radixSortIP(intArr): 990 ms
quickSortIP(arr): 882 ms
defaultSort(intArr): 3632 ms
200k
radixSortIP(intArr): 28 ms
quickSortIP(arr): 68 ms
defaultSort(intArr): 306 ms

Questions

  • Is there a better implementation of any sorting algorithm that would meet my use case/needs?
  • Are there any optimizations that can be made to my in place radix/quick sort implementations to improve performance?
    • Is there an efficient way to convert my in place radix sort from a recursive to iterative function? Memory and execution speed.

Goal

  • I am hoping these answers will help me get ~20-30% performance improvement in my benchmark test.

Clarifications/Notes

  • "DEFINE FAST" I would prefer a general case where it runs well on all modern browsers, but if there is a browser specific optimization that makes a significant improvement that may be acceptable.
  • The sorting COULD be done server side, but I'd prefer to avoid this because the JS app may become a standalone (paired with some off the shelf proprietary app that will stream sensor data to a file).
  • JavaScript may not be the best language for this but it's a requirement.
  • I've already asked this question https://stackoverflow.com/questions/7111525/fastest-way-to-sort-integer-arrays-in-javascript an incorrect answer was up-voted and the question was closed.
  • I've attempted using multiple browser window instances as a makeshift multi-threading; it didn't pan out. I'd be interested in useful info regarding spawning multiple windows for concurrency.
Community
  • 1
  • 1
Louis Ricci
  • 20,804
  • 5
  • 48
  • 62
  • 1
    It's very interesting that radix sort is slower than quicksort. Are you sure you implemented it correctly? How much time is spent creating the array? – James Nov 10 '11 at 16:08
  • have you checked for a performance increase in populating your array with floats? `test[x]=parseFloat(..rand..)` – Alex K. Nov 10 '11 at 16:19
  • Are you allowed to use typed buffers? – harold Nov 10 '11 at 16:20
  • 1
    Going with a median-of-three in your partitioning will probably give you a 10% speedup in your QuickSort implementation, and will avoid most of the problems QuickSort has with partially ordered subsets. Better yet would be a median-of-five, which avoids the median of three killers that are not uncommon. – Jim Mischel Nov 10 '11 at 16:50
  • @James - just shows how O(n) can be misleading. Perhaps the shadow array in the non-in-place radix sort is the bottleneck, but in the in place radix sort only 256 entry arrays are initialized. – Louis Ricci Nov 10 '11 at 18:15
  • @Alex K. - performance seems to suffer slightly with floats. – Louis Ricci Nov 10 '11 at 18:18
  • @harold - do you have an example / testing showing Int32Array being able to outperform []? I'd be interested in this approach, what level of browser compatibility is available? – Louis Ricci Nov 10 '11 at 18:20
  • @Jim Mischel - the in place quick sort is already using a median of 3, please refer to the js fiddle link. – Louis Ricci Nov 10 '11 at 18:37
  • Ahhhh, so it is. And I even looked at that code. Must have been asleep. – Jim Mischel Nov 10 '11 at 18:39
  • 1
    Perhaps create a http://jsperf.com/ page that would reflect tried solutions? – zatatatata Nov 13 '11 at 20:03
  • Anyone realize that passing 50,000 - 2mil pieces of data to Javascript will take longer than solving the problem server-side? Are you sure that Javascript in a browser is the best way to solve this problem? – Yzmir Ramirez Nov 13 '11 at 20:16
  • @Alex JavaScript has only one number type which is Number. So, performing `parseFloat()` on an integer value is a no-op. – Šime Vidas Nov 13 '11 at 20:31
  • I didn't look at your code, but if quicksort scales better than radix, you almost certainly did something wrong ("almost" because there's some chance you found an edge case on one/both tests). By definition an O(n) algorithm will scale better than an O(n*log(n)). – Aaron Dufour Nov 14 '11 at 08:01
  • @Aaron Dufour - by all means please look at the code (js fiddle link). The only thing I can see is that the in place QSort is iterative while the in place RSort is recursive, if you could share a way to make the RSort iterative in an efficient manner it would be great. – Louis Ricci Nov 14 '11 at 12:24
  • @LastCoder I looked at it and the algorithms look right. I can't for the life of me figure out why radix and quick appear to scale the same way. – Aaron Dufour Nov 14 '11 at 16:22
  • @AaronDufour Yeah just as Quicksort has to be slower in reality because clearly a O(n^2) algorithm can't beat an O(n log n) one ;) Or if you want it "mathy" - RadixSort is actually O(k*n) (k being the number of bits). Now sorting more than 2^32 elements isn't impossible, but highly unlikely for client side JS code. Obviously in reality constant factors and cache effects play a much larger role. I can think of lots of great O(n log n) algorithms that are beaten by O(n^2) algorithms all the time on real sized problems. – Voo Nov 14 '11 at 19:25
  • @Voo - Empirically radix sort is "widely" considered faster than quick sort with regards to 32 bit integer sorting (even for conservative array sizes < 10mil), while this is true in other languages (c++, java, x86 assembly) it appears to fall short in this instance (JavaScript). When sorting 8 bits at a time standard Radix Sort has a complexity of ~5*n (count + 4x Sort), while the in place version is ~8*n (we re-count for each of the 4 sorts). – Louis Ricci Nov 14 '11 at 20:50
  • @LastCoder Last time I checked TimSort was a good bit faster on partially sorted arrays when compared to RadixSort - that is in Java tested with 10 million entries with a 8bit radix. So it hardly is that clear-cut. – Voo Nov 14 '11 at 21:30
  • @Voo - nothing's clear cut you can find counter cases for many things, but with random data, NOT partially sorted, radix sort IN MANY languages is faster. If you know of an optimized JS implementation of TimSort i'd be interested in the benchmark results. – Louis Ricci Nov 16 '11 at 12:34
  • @Aaron Dufour - seems like the scaling issue was solved by using typed arrays for both the input and the shadow array (for standard radix sort). – Louis Ricci Nov 16 '11 at 13:04
  • 2
    Just my 2 cents... 200k items of 4 bytes each will probably fit in the on-chip CPU cache for most modern processors whereas 2 million probably won't. The random access nature of radix sort will hurt the performance once we have to use DRAM. Quick sort typically requires sequential memory access. – Jason Nov 16 '11 at 22:08
  • @Jason - The standard radix sort IS faster for 2mil. Because I use shadow array swapping the READS are all sequential and only the non temporal WRITES are random. RS:172ms vs QSIP:661ms ... – Louis Ricci Nov 17 '11 at 03:45
  • @LastCoder do you need 32 bit integers ? because some engines (like Opera's Presto, or V8 - seem to use even cheaper data-dypes for small numbers - like, say - 16 bit integers). This can be seen - when sorting regular arrays is faster than sorting typed arrays. – c69 Nov 18 '11 at 11:12
  • @c69 - yeah 32bit signed is a requirement in this case. But that's interesting about 16bit, despite being smaller x86 (32bit and more so with 64bit) processors usually incur a penalty for working heavily with small data that isn't padded/aligned correctly. – Louis Ricci Nov 18 '11 at 12:02
  • Hold on to your seat a bit longer - WebCL is coming! http://www.khronos.org/webcl/ + http://code.google.com/p/back40computing/wiki/RadixSorting – Ricardo Tomasi Nov 20 '11 at 03:38
  • @Ricardo Tomasi - the question/answer may need updating in the near future then... I look forward to it. – Louis Ricci Nov 21 '11 at 12:14

5 Answers5

12

I've tested typed arrays, the QSIP version seems to be good in modern browsers:

2 000 000 elements

          QSIP_TYPED | RDXIP_TYPED |  QSIP_STD | RDXIP_STD
----------------------------------------------------------
Chrome  |    300          1000          600        1300
Firefox |    550          1500          800        1600    

http://jsfiddle.net/u8t2a/35/

Support (source: http://caniuse.com/typedarrays):

 IE 10+   |   FF 4+  |  Chrome 7+  |  Safari 5.1+  |  Opera 11.6+   
gblazex
  • 49,155
  • 12
  • 98
  • 91
  • In Chrome 17, I get identical results for typed or not – either a bug in the test, or V8 automatically makes that optimisation. – Rich Bradshaw Nov 13 '11 at 20:19
  • @Rich - Hm... interesting. I used v15, but I can see the same difference in v17 too. – gblazex Nov 13 '11 at 20:20
  • 1
    @Rich - Oh you mean the jsperf test? It was only 20 000 elements. I removed it. It's not representative for large datasets. – gblazex Nov 13 '11 at 20:24
  • This is definitely a substantial optimization, so much so, that the "supported browser" caveat is acceptable. For intellectual curiosities sake I'd love to see some algorithm level optimizations, but this surely answers the crux of the question. – Louis Ricci Nov 14 '11 at 12:29
  • 1
    Using the typed array for the shadow array of standard radix sort has yielded the best results thus far *see my question edit*. – Louis Ricci Nov 16 '11 at 13:02
2

Have you considered a combination of algorithms to maximize cache use? I saw in the benchmark that you are switching to insertion sort when the subarrays become small. An interesting approach is to instead switch to heapsort. Used in conjunction with quicksort, it can bring down the worst case to O(nlog(n)) instead of O(n^2). Have a look at Introsort.

Tudor
  • 61,523
  • 12
  • 102
  • 142
  • The in place quick sort I'm using (that currently has the best performance) is avoiding recursion with a stack. Using a heap would add another ADT to the mix. If you could implement introsort and it's faster that would be a great help, but I don't see a compelling reason to even attempt it. – Louis Ricci Nov 16 '11 at 12:30
  • Alright I'll try to implement it and let you know. Unfortunately I'm not a great expert of javascript, but I'll see what I can do... – Tudor Nov 16 '11 at 14:09
  • @LastCoder: I found it on someone's github. Maybe you can figure out how to include it in your benchmark if it's not too hard. https://github.com/fatshotty/IntroSort.js/blob/master/introsort.js – Tudor Nov 16 '11 at 20:23
1

I fiddled with your benchmark and added my own sort function. It performs the same as radix sort, but its idea (and implementation) is simpler: it is like a radix sort, but in binary, so you only have two buckets and can do it in place. Look at http://jsfiddle.net/KaRV9/7/.

I put my code in place of "Quicksort in place" (since it is very similar to quicksort; just the pivot is selected a different way). Run them a few times; in my Chrome 15 they perform so close that it is impossible to distinguish them.

  • Very interesting, especially the speed claim, in FF8 its about 6x slower 973 ms vs 172 ms for standard radix sort. Chrome's JS engine must be vastly different. – Louis Ricci Nov 16 '11 at 20:55
  • for the first run it is a similar difference here, too, but the second, third and subsequent runs are on par with radixsort. Don't ask me why :-) –  Nov 16 '11 at 20:59
1

I am not going to comment on your sorting algorithms. you know more about those than me.

But a good idea would be to use web workers. This allows your sort to run in the background in its own thread and thus not block the interface. This would be good practice no matter what. It is well supported in Chrome and Firefox. Opera has a non-threaded version. Not sure about the support in IE, but it would be easy to work around that. There is of course overhead involved in using multiple threads.

Merge sort can be made easily into a multi-threaded version, which will give you some performance boost. Messaging comes with a time penalty of course, so it will really depend on your specific situation if it will run faster. Remember though, that the non-blocking nature might make it feel like the app is running faster for the end user.

0

EDIT: I see you're already using insertion sort for smaller subarrays. I missed that.

The good real-world approach with quicksort is to check the size of the subarray, and if it's short enough use a quick low-overhead sort that's too slow for larger arrays, such as insertion sort.

Pseudo-code is something along the lines of:

quicksort(array, start, end):
  if ( (end-start) < THRESHOLD ) {
    insertion_sort(array, start, end);
    return;
  }
  // regular quicksort here
  // ...

To determine the THRESHOLD, you need to time it on the platforms you care about — in your case, possibly different browsers. Measure the time for random arrays with different thresholds to find a close-to-optimal one. You could also choose different thresholds for different browsers if you find significant differences.

Now if your inputs aren't really random (which is pretty common) you can see if better pivot selection improves performance. A common method is the median of three.

orip
  • 73,323
  • 21
  • 116
  • 148