1

Question: Is there a "Dial_A_Bit" Radix Sort which can be used to sort on a subset of the bits in a data type?

uint64_t *Radix_Sort_Dial_A_Bit(uint64_t *a, int num_a, int sort_bits);

In particular, would a 38 bit sort on 64 bit data be possible and have speed somewhere between the 32/64 and 48/64 varieties shown below?

uint64_t *Radix_Sort_ui64_38MSB(uint64_t *a, int num_a);

Note how research into 48 and 32 bit sorts has verified both increased speed and correctness compared to sorting on all 64 bits in a uint64_t[].

It seems as though a Radix_Sort() which sorts on a subset of the data package size would be generally useful and efficient, sorting only what is needed.

There are cases in which results are calculated for each Pixel and need to be sorted. A uint64_t[] is used to hold both the calculation result and the XY location.

26 total bits (13 for X and 13 for Y, 8192 max resolution) are needed to hold the pixel XY coordinates which leaves 38 bits for the data to be sorted.

The entire 64 bit package can be sorted with the Radix_Sort_Uint64().

A faster method is to use a Radix_Sort_Uint48() (see below) so the last 16 bits are not considered in the sort. This sorts all of the data correctly and also sorts 10 of the 13 X coordinate bits which is not needed.

Since performance is almost linearly proportional to bits sorted, optimally, only the 38 most significant bits would be considered in the sort.

Even a 40 bit Radix Sort would be better than using 48 bits. I tried to generalize the working 48 bit Radix Sort to operate on 40 bits but it does not sort correctly.

QSort_uint64_38_msb():

// qsort() comparator: orders two uint64_t values by their 38 most
// significant bits only; the low 26 bits (the packed XY coordinates)
// are shifted away and never influence the ordering.
// Returns -1, 0 or +1 per the qsort comparator contract.
static inline int comp_uint64_38_msb(const void *a, const void *b)  {
    uint64_t lhs = *(const uint64_t *)a >> 26;  // discard 26 LSBs
    uint64_t rhs = *(const uint64_t *)b >> 26;  // discard 26 LSBs
    if (lhs < rhs)
        return -1;
    if (lhs > rhs)
        return 1;
    return 0;
}

As you can see, the 48 bit sort is significantly faster than the full 64 bits. The 32 bit sort is almost twice as fast as the full 64 bit. And the qsort is lagging far behind:

  Time=  2.136 sec =  3.390%, RADIX_SORT_FFFF0000  , hits=4, 0.534 sec each
  Time=  2.944 sec =  4.672%, RADIX_SORT_FFFFFF00  , hits=4, 0.736 sec each
  Time=  4.691 sec =  7.444%, RADIX_SORT_64        , hits=5, 0.938 sec each
  Time= 25.209 sec = 40.010%, QSORT_UINT64_ARRAY   , hits=4, 6.302 sec each
  Time= 26.191 sec = 41.569%, QSORT_UINT64_38_ARRAY, hits=4, 6.548 sec each

Linearizing from the 64, 48 and 32 bit results to 38 bits:

lsf 64  0.938   48  0.736    32  0.534   38   ->  0.6500

The 38 bit Radix_Sort could be 35% faster than 64 bit sorting and 17% faster than the 48 bit sort.

Even 40 bits would be quicker, 5 bytes vs 6 processed per uint64.

=========

Fastest yet, 6 of 8 byte sort of uint64[], generalized from: 32 MSbits used to sort uint64

// #############################################################################
// From: http://ideone.com/JHI0d9
// RadixSort---  for 48 MSB of uint64
// Six 256-entry count/offset tables for the byte-wide 48-MSB radix sort.
// c1 is the table for the most significant byte (bits 56..63) and c6 for
// bits 16..23.  The anonymous struct members (a C11 feature) overlay the
// flat counts[] array so all six tables can be zeroed with one memset.
// NOTE: member order matters — c6 must map onto counts[0..255], etc.
typedef union {
    struct {
        uint32_t c6[256];  // digit bits 16..23 (first/least significant pass)
        uint32_t c5[256];  // digit bits 24..31
        uint32_t c4[256];  // digit bits 32..39
        uint32_t c3[256];  // digit bits 40..47
        uint32_t c2[256];  // digit bits 48..55
        uint32_t c1[256];  // digit bits 56..63 (last/most significant pass)
    };
    uint32_t counts[256 * 6];  // flat view of all six tables for bulk zeroing
}  rscounts6_t;


// #############################################################################
// Patterned off of Radix_Sort_64 but looks only at the 48 MostSigBits
// 0XFFFF-FFFF-FFFF-0000  << Ignore the zeros, sort on 3 MostSigBytes
// Made for RGB48 stuffed into uint64 with 2 LeastSig bytes zero
// Get rid of the 7 and 8 level comps
uint64_t *radix_sort_48_msb(uint64_t *arrayA, uint32_t asize) 
{
    // Radix sort on the 48 most significant bits (bytes 2..7) of each
    // uint64_t.  Six stable LSD-first byte passes ping-pong the data
    // array -> cpy -> array ... -> array, so the sorted result ends in
    // the caller's array.  The low 16 bits never affect the ordering.
    //
    // Returns arrayA on success, NULL if scratch allocation fails (in
    // which case arrayA is left untouched).
    //
    // Fixes vs the original: removed the unused `register int ii`,
    // dropped the needless malloc cast, replaced the magic-number memset
    // size with an initializer, and added a NULL check on malloc.
    uint64_t *array = arrayA;

    /* counts[0] = most significant byte (bits 56..63) ...
       counts[5] = bits 16..23; histograms first, then start offsets */
    uint32_t counts[6][256] = { 0 };

    uint64_t *cpy = malloc(asize * sizeof *cpy);  /* scratch buffer */
    if (cpy == NULL)
        return NULL;                              /* allocation failed */

    /* one read pass builds all six histograms */
    for (uint32_t x = 0; x < asize; x++) {
        uint64_t u = array[x];
        counts[5][(u >> 16) & 0xff]++;
        counts[4][(u >> 24) & 0xff]++;
        counts[3][(u >> 32) & 0xff]++;
        counts[2][(u >> 40) & 0xff]++;
        counts[1][(u >> 48) & 0xff]++;
        counts[0][(u >> 56) & 0xff]++;
    }

    /* exclusive prefix sums: counts -> starting offsets */
    for (int d = 0; d < 6; d++) {
        uint32_t sum = 0;
        for (int v = 0; v < 256; v++) {
            uint32_t c = counts[d][v];
            counts[d][v] = sum;
            sum += c;
        }
    }

    /* six stable scatter passes, least significant of the 6 bytes first */
    for (uint32_t x = 0; x < asize; x++)
        cpy[counts[5][(array[x] >> 16) & 0xff]++] = array[x];
    for (uint32_t x = 0; x < asize; x++)
        array[counts[4][(cpy[x] >> 24) & 0xff]++] = cpy[x];
    for (uint32_t x = 0; x < asize; x++)
        cpy[counts[3][(array[x] >> 32) & 0xff]++] = array[x];
    for (uint32_t x = 0; x < asize; x++)
        array[counts[2][(cpy[x] >> 40) & 0xff]++] = cpy[x];
    for (uint32_t x = 0; x < asize; x++)
        cpy[counts[1][(array[x] >> 48) & 0xff]++] = array[x];
    for (uint32_t x = 0; x < asize; x++)
        array[counts[0][(cpy[x] >> 56) & 0xff]++] = cpy[x];

    free(cpy);
    return array;
}  // End radix_sort_48_msb().

==================================

Thanks again to Rcgldr for innovative programming suggestion! Instead of 10, 10, 9, 9, I used the fast 32 bit pattern with [4][10]

It works, but it is quite a bit slower than the 48 MSB sort: 737 msec for 40 MSB vs 588 msec for 48 MSB. :(

Maybe I coded it poorly.

    Time=  6.108 sec = 33.668%, QSORT_UINT64_ARRAY   , hits=1
    Time=  3.060 sec = 16.866%, RADIX_SORT_UINT64_REG, hits=4, 0.765 sec each
    Time=  2.947 sec = 16.241%, RADIX_SORT_UINT64_40R, hits=4, 0.737 sec each < SLOW
    Time=  2.354 sec = 12.973%, RADIX_SORT_UINT64_48R, hits=4, 0.588 sec each
    Time=  1.542 sec =  8.498%, RADIX_SORT_UINT64_32R, hits=4, 0.385 sec each
    Time=  0.769 sec =  4.236%, RADIX_SORT_64        , hits=1

The test:

  • create a random, uint64_t[36M] master array
  • sort it with both qsort and a known good Radix Sort, radixSort() to create a standard array
  • Compare the qsort and radixSort() results to verify they are identical.
  • Sort a copy of the master with 32, 40, 48 and 64 MSB Radix Sorts
  • Compare each test sort to the Standard after masking off the ignored LSBs

Here is the code:

    //=============================================================================
// From code submitted by rcgldr, Feb 8 2020
// Optimized to use Registers and to sort on 40 MSBs, ignoring 24 LSBs
// Radix sort of uint64_t[] on the 40 most significant bits (bits 24..63),
// done as four stable LSD-first passes over 10-bit digits (1024 bins each).
// pTemp is scratch of at least `count` elements; the four passes ping-pong
// pData -> pTemp -> pData -> pTemp -> pData, so the result ends in pData.
// Bits 0..23 never influence the ordering.
void radix_sort_r64_40(uint64_t *pData, uint64_t *pTemp, size_t count,
    EV_TIME_STR *tsa)
{
    static const unsigned shift[4] = { 54, 44, 34, 24 };  /* digit positions, MSD..LSD */
    size_t bins[4][1024] = { 0 };  /* per-digit histograms, then start offsets */
    size_t i, pass;

    if (tsa)  time_event(E_RADIX_SORT_UINT64_40R, tsa, E_TIME_EVENT, 1, 0);

    /* one read pass builds all four histograms */
    for (i = 0; i < count; i++) {
        uint64_t v = pData[i];
        for (pass = 0; pass < 4; pass++)
            bins[pass][(v >> shift[pass]) & 0x3ff]++;
    }

    /* exclusive prefix sums: counts -> starting offsets */
    for (pass = 0; pass < 4; pass++) {
        size_t running = 0;
        for (i = 0; i < 1024; i++) {
            size_t c = bins[pass][i];
            bins[pass][i] = running;
            running += c;
        }
    }

    /* four stable scatter passes, least significant digit first */
    for (i = 0; i < count; i++) {
        uint64_t v = pData[i];
        pTemp[bins[3][(v >> 24) & 0x3ff]++] = v;
    }
    for (i = 0; i < count; i++) {
        uint64_t v = pTemp[i];
        pData[bins[2][(v >> 34) & 0x3ff]++] = v;
    }
    for (i = 0; i < count; i++) {
        uint64_t v = pData[i];
        pTemp[bins[1][(v >> 44) & 0x3ff]++] = v;
    }
    for (i = 0; i < count; i++) {
        uint64_t v = pTemp[i];
        pData[bins[0][(v >> 54) & 0x3ff]++] = v;
    }
}  // End Radix_Sort_R64_40().

Here is the unique dif between the FAST 32 MSB version and the cloned 40 MSB slow version.

Unique lines from "~/tmp/radix.sort.32.c":
  02) void radix_sort_r64_32(uint64_t *pData, uint64_t *pTemp, size_t count,
  05) size_t mIndex[4][256] = { 0 };      /* index matrix */
  09) if(tsa)  time_event(E_RADIX_SORT_UINT64_32R, tsa, E_TIME_EVENT, 1, 0);
  13) mIndex[3][(u >> 32) & 0xff]++;  // B4
  14) mIndex[2][(u >> 40) & 0xff]++;  // B5
  15) mIndex[1][(u >> 48) & 0xff]++;  // B6
  16) mIndex[0][(u >> 56) & 0xff]++;  // B7
  22) for (i = 0; i < 256; i++) {
  31) pTemp[mIndex[3][(u >> 32) & 0xff]++] = u;
  35) pData[mIndex[2][(u >> 40) & 0xff]++] = u;
  39) pTemp[mIndex[1][(u >> 48) & 0xff]++] = u;
  43) pData[mIndex[0][(u >> 56) & 0xff]++] = u;

Unique lines from "~/tmp/radix.sort.40.c":
  01) void radix_sort_r64_40(uint64_t *pData, uint64_t *pTemp, size_t count,
  04) size_t mIndex[4][1024] = { 0 };  /* index matrix */
  08) if(tsa)  time_event(E_RADIX_SORT_UINT64_40R, tsa, E_TIME_EVENT, 1, 0);
  12) mIndex[3][(u >> 24) & 0x3ff]++;
  13) mIndex[2][(u >> 34) & 0x3ff]++;
  14) mIndex[1][(u >> 44) & 0x3ff]++;
  15) mIndex[0][(u >> 54) & 0x3ff]++;
  21) for (i = 0; i < 1024; i++)  {
  30) pTemp[mIndex[3][(u >> 24) & 0x3ff]++] = u;
  34) pData[mIndex[2][(u >> 34) & 0x3ff]++] = u;
  38) pTemp[mIndex[1][(u >> 44) & 0x3ff]++] = u;
  42) pData[mIndex[0][(u >> 54) & 0x3ff]++] = u;
BrianP007
  • 162
  • 12
  • 1
    Is this a question? – Eugene Sh. Jan 28 '20 at 17:36
  • 2
    A few different (but unrelated) points: First of all, in C you don't need to and really [shouldn't cast the result of `malloc`](https://stackoverflow.com/questions/605845/do-i-cast-the-result-of-malloc). Also to be avoided is [magic numbers](https://en.wikipedia.org/wiki/Magic_number_(programming)) (especially when the "magic number" could be gotten by the `sizeof` operator, as in `sizeof counts`). And `register` is only a suggestion, that a compiler doesn't have to bother with at all if it feels like it (except that it has to follow the linkage semantics, like no address taken etc.). – Some programmer dude Jan 28 '20 at 17:40
  • @rcgldr - Skylake i7-6700K CPU @ 4.00GHz, cache_alignment : 64, AVX2, 64 GB DDR4 3.2GHz, CL16 (F4-3200C16-16GVK) set to Interleaved for faster large block sequential. 4x32k inst/dat L1, 4x256k L2, 8M L3 – BrianP007 Feb 15 '20 at 20:02
  • @rcgldr - You mentioned cache coherency as a likely cause of degraded performance with 10 bit bins vs 8 bit. The writing is done from point A to point B within the scope of the total array size so would it not be dependent on array size and independent of bin bits? To test this, one could vary the array size from 1M to 32M and see where performance drops. This might show a performance cliff indicating where the cache was overwhelmed. The new AMD 3950x has 64 MB L3 and 8 MB total L2! Wish I had one to test with! – BrianP007 Feb 15 '20 at 20:12
  • @BrianP007 - Cache coherency - related to shared memory between cores or processors, not an issue here. I was referring to the size of the L1 cache, which is 32KB per core (both 6700K and 3770K have same cache size). A matrix of [8][256] 64 bit indexes takes 16KB of space, leaving space L1 cache for array reads. A matrix of [4][1024] 64 bit indexes takes 32KB of space, and switching from size_t to uint32_t would help during initial counting pass. During radix sorts, only one row is used at a time, so not an issue. – rcgldr Feb 15 '20 at 21:22
  • @BrianP007 - the main issue is each radix sort write is a random access write to anywhere in a (6000 x 6000 x 8 = ) 288MB array, 36 times the size of L3 cache. The "1024" sort has the same issue when creating the 1024 bins, but assuming reasonable distribution, each bin is less than 8MB (281KB in an ideal case), which should reduce the overhead related to random access writes. (Note I deleted prior comments that are no longer needed). – rcgldr Feb 15 '20 at 21:33
  • If it is an L1 cache size issue, splitting the sort to 4 (8?) subsorts, one on each core would use all 4 L1s. And then a 4 way merge? Or a+b->e, c+d->f, e+f->G cascade merge? Too much 'extended discussions in comments'? Do'h! – BrianP007 Feb 15 '20 at 22:30
  • @BrianP007 - delete some of your prior comments to avoid "extended discussion" flagging. L1 cache size mostly an issue on the initial read pass to generate the counts that get converted into indexes, and the issue here is if mIndex[][] fits in L1 cache with room to spare for array reads. I've done multi-threaded merge sort (4 threads about 3 times as fast as 1 thread), but I haven't tried multi-threading on radix sort as I suspect it's closer to being memory bandwidth limited. – rcgldr Feb 16 '20 at 00:42

4 Answers4

2

A radix sort for the 38 msb's could be done in 4 passes using {10 bit, 10 bit, 9 bit, 9 bit} counts == offsets. Change your code to use sizes: c1[1024], c2[1024], c3[512], c4[512], and the count initialization to use shifts and masks {(...>>54)&0x3ff ,(...>>44)&0x3ff, (...>>35)&0x1ff,(...>>26)&0x1ff}, and make similar changes to the rest of the code.


Try using this code for comparison. Either change your code or add the timer stuff to this code:

/*
 * Radix sort on the 38 most significant bits of each uint64_t
 * (bits 26..63), split into digits of 10, 10, 9 and 9 bits.
 * Rows 0..1 of offs are 1024-bin (10-bit) digits, rows 2..3 are
 * 512-bin (9-bit) digits.  Four stable LSD-first passes ping-pong
 * a -> b -> a -> b -> a, so the sorted result ends up in a[].
 * b[] is scratch space of at least `count` elements.
 */
void RadixSort(uint64_t *a, uint64_t *b, size_t count)
{
    uint32_t offs[4][1024] = { 0 };  /* histograms, then start offsets */
    uint32_t d, i, running, c;

    /* one read pass builds all four histograms */
    for (i = 0; i < count; i++) {
        uint64_t v = a[i];
        offs[3][(v >> 26) & 0x1ff]++;
        offs[2][(v >> 35) & 0x1ff]++;
        offs[1][(v >> 44) & 0x3ff]++;
        offs[0][(v >> 54) & 0x3ff]++;
    }

    /* exclusive prefix sums: rows 0..1 have 1024 bins, rows 2..3 have 512 */
    for (d = 0; d < 4; d++) {
        uint32_t nbins = (d < 2) ? 1024 : 512;
        running = 0;
        for (i = 0; i < nbins; i++) {
            c = offs[d][i];
            offs[d][i] = running;
            running += c;
        }
    }

    /* stable scatter passes, least significant digit first */
    for (i = 0; i < count; i++) {
        uint64_t v = a[i];
        b[offs[3][(v >> 26) & 0x1ff]++] = v;
    }
    for (i = 0; i < count; i++) {
        uint64_t v = b[i];
        a[offs[2][(v >> 35) & 0x1ff]++] = v;
    }
    for (i = 0; i < count; i++) {
        uint64_t v = a[i];
        b[offs[1][(v >> 44) & 0x3ff]++] = v;
    }
    for (i = 0; i < count; i++) {
        uint64_t v = b[i];
        a[offs[0][(v >> 54) & 0x3ff]++] = v;
    }
}

Faster still is to sort by most significant 10 bits first, creating 1024 bins, then sort the 1024 bins, {10,9,9} bit fields, least significant bits first. This will speed up the sort since each of the 1024 bins fits in cache, reducing the overhead for all of the random access writes. Note - aIndex has size 1025, to keep track of the size of the last bin.

void RadixSort3(uint64_t *, uint64_t *, size_t);

/* split array into 1024 bins according to most significant 10 bits */
/* MSD-first hybrid: partition a[] into 1024 bins in b[] by the most
   significant 10 bits, then radix sort each bin on the remaining key
   bits with RadixSort3().  bounds[] has 1025 entries so that
   bounds[i+1]-bounds[i] is the size of bin i (the extra slot bounds the
   last bin).  Each per-bin RadixSort3 ends with its result in a[], so
   the whole array finishes sorted in a[]. */
void RadixSort(uint64_t *a, uint64_t *b, size_t count)
{
    uint32_t bounds[1025] = { 0 };       /* bin boundaries */
    uint32_t i, c, total;

    for (i = 0; i < count; i++)          /* histogram of top 10 bits */
        bounds[a[i] >> 54]++;

    total = 0;                           /* counts -> start offsets */
    for (i = 0; i < 1025; i++) {
        c = bounds[i];
        bounds[i] = total;
        total += c;
    }

    for (i = 0; i < count; i++)          /* scatter into 1024 bins in b */
        b[bounds[a[i] >> 54]++] = a[i];

    /* the scatter advanced each entry to the NEXT bin's start; shift
       everything down one slot so bounds[i] is the start of bin i again */
    for (i = 1024; i != 0; i--)
        bounds[i] = bounds[i - 1];
    bounds[0] = 0;

    for (i = 0; i < 1024; i++)           /* radix sort each bin in place */
        RadixSort3(&b[bounds[i]], &a[bounds[i]], bounds[i + 1] - bounds[i]);
}

/* Radix sort on key bits 26..53 of each uint64_t, as three stable
   LSD-first passes with digit widths {10, 9, 9}: row 0 of offs is the
   1024-bin (10-bit) digit, rows 1..2 are 512-bin (9-bit) digits.
   Passes run a -> b, b -> a, a -> b, so the sorted data ends in b[].
   Used per-bin by the 1024-way MSD splitter above. */
void RadixSort3(uint64_t *a, uint64_t *b, size_t count)
{
    uint32_t offs[3][1024] = { 0 };  /* histograms, then start offsets */
    uint32_t d, i, running, c;
    uint64_t v;

    /* one read pass builds all three histograms */
    for (i = 0; i < count; i++) {
        v = a[i];
        offs[2][(v >> 26) & 0x1ff]++;
        offs[1][(v >> 35) & 0x1ff]++;
        offs[0][(v >> 44) & 0x3ff]++;
    }

    /* exclusive prefix sums: row 0 has 1024 bins, rows 1..2 have 512 */
    for (d = 0; d < 3; d++) {
        uint32_t nbins = (d == 0) ? 1024 : 512;
        running = 0;
        for (i = 0; i < nbins; i++) {
            c = offs[d][i];
            offs[d][i] = running;
            running += c;
        }
    }

    for (i = 0; i < count; i++) {    /* pass 1: bits 26..34, a -> b */
        v = a[i];
        b[offs[2][(v >> 26) & 0x1ff]++] = v;
    }
    for (i = 0; i < count; i++) {    /* pass 2: bits 35..43, b -> a */
        v = b[i];
        a[offs[1][(v >> 35) & 0x1ff]++] = v;
    }
    for (i = 0; i < count; i++) {    /* pass 3: bits 44..53, a -> b */
        v = a[i];
        b[offs[0][(v >> 44) & 0x3ff]++] = v;
    }
}
rcgldr
  • 27,407
  • 3
  • 36
  • 61
1

The 40 MSB sort performed very poorly, much slower than the 48 MSB version.

So I tried a [6][64], 36 MSB version modeled after the relatively fast 48 MSB.

I know I wanted 38 bits, not 36. The method to optimize was how to pack an XY location and a property at that point into a uint64_t. With 8k X and Y coords, that took 13+13 bits for XY and left no more than 64-26=38 bits for the data.

The current data max is 34 or 35 bits so 36 should work.

Here's the performance:

  Time=  6.104 sec = 30.673%, QSORT_UINT64_ARRAY   , hits=1
  Time=  3.117 sec = 15.663%, RADIX_SORT_UINT64_REG, hits=4, 0.779 sec each
  Time=  2.931 sec = 14.731%, RADIX_SORT_UINT64_40R, hits=4, 0.733 sec each
  Time=  2.269 sec = 11.401%, RADIX_SORT_UINT64_48R, hits=4, 0.567 sec each
  Time=  1.663 sec =  8.359%, RADIX_SORT_UINT64_36R, hits=4, 0.416 sec each < FAST
  Time=  1.516 sec =  7.620%, RADIX_SORT_UINT64_32R, hits=4, 0.379 sec each
  Time=  0.734 sec =  3.689%, RADIX_SORT_64        , hits=1

It's 27% faster than the 48 bit code.

And, it looks like it should be able to be extended to [6][128] (6 passes of 7 bits) to sort on the 42 MSBs if 36 bits gets too tight!

Here's the full code:

// Radix sort of uint64_t[] on the 36 most significant bits (bits 28..63),
// done as six stable LSD-first passes over 6-bit digits (64 bins each).
// pTemp is scratch of at least `count` elements; the six passes ping-pong
// pData -> pTemp -> ... -> pData, so the sorted result ends in pData.
// Bits 0..27 never influence the ordering.
void radix_sort_r64_36(uint64_t *pData, uint64_t *pTemp, size_t count,
    EV_TIME_STR *tsa)
{
    size_t mIndex[6][64] = { 0 };  /* index matrix: per-digit histograms, then start offsets */
    size_t * pmIndex;              /* ptr to row of matrix */
    size_t i, j, m, n;
    uint64_t u;
    if(tsa)  time_event(E_RADIX_SORT_UINT64_36R, tsa, E_TIME_EVENT, 1, 0);  

    // Digit start positions, 8-bit (48-MSB version) vs 6-bit widths:
    // 64 -- 56 48 40 32 24 16  -- 8 bits each
    // 64 -- 58 52 46 40 34 28  -- 6 bits each
    for (i = 0; i < count; i++) {       /* generate histograms */
        u = pData[i];                   // Ignores bits 0..27
        mIndex[5][(u >> 28) & 0x3F]++;  // least significant 6-bit digit
        mIndex[4][(u >> 34) & 0x3F]++;
        mIndex[3][(u >> 40) & 0x3F]++;
        mIndex[2][(u >> 46) & 0x3F]++;
        mIndex[1][(u >> 52) & 0x3F]++;
        mIndex[0][(u >> 58) & 0x3F]++;  // most significant 6-bit digit
    }

    for (j = 0; j < 6; j++) {           /* convert to indices (exclusive prefix sum) */
        pmIndex = mIndex[j];
        n = 0;
        for (i = 0; i < 64; i++) {
            m = pmIndex[i];
            pmIndex[i] = n;
            n += m;
        }
    }

    for (i = 0; i < count; i++) {  /* radix sort pass 1: bits 28..33, pData -> pTemp */
        u = pData[i];
        pTemp[mIndex[5][(u >> 28) & 0x3F]++] = u;
    }
    for (i = 0; i < count; i++) {  /* pass 2: bits 34..39, pTemp -> pData */
        u = pTemp[i];
        pData[mIndex[4][(u >> 34) & 0x3F]++] = u;
    }
    for (i = 0; i < count; i++) {  /* pass 3: bits 40..45 */
        u = pData[i];
        pTemp[mIndex[3][(u >> 40) & 0x3F]++] = u;
    }
    for (i = 0; i < count; i++) {  /* pass 4: bits 46..51 */
        u = pTemp[i];
        pData[mIndex[2][(u >> 46) & 0x3F]++] = u;
    }
    for (i = 0; i < count; i++) {  /* pass 5: bits 52..57 */
        u = pData[i];
        pTemp[mIndex[1][(u >> 52) & 0x3F]++] = u;
    }
    for (i = 0; i < count; i++) {  /* pass 6: bits 58..63 — result lands in pData */
        u = pTemp[i];
        pData[mIndex[0][(u >> 58) & 0x3F]++] = u;
    }
}  // End Radix_Sort_R64_36().

Unique dif with the 48 MSB function:

Unique lines from "/home/brianp/tmp/radix.sort.36.c":
  01) void radix_sort_r64_36(uint64_t *pData, uint64_t *pTemp, size_t count,
  04) size_t mIndex[6][64] = { 0 };  /* index matrix */
  08) if(tsa)  time_event(E_RADIX_SORT_UINT64_36R, tsa, E_TIME_EVENT, 1, 0);
  11) mIndex[5][(u >> 28) & 0x3F]++;  // N2
  12) mIndex[4][(u >> 34) & 0x3F]++;  // N3
  13) mIndex[3][(u >> 40) & 0x3F]++;  // N4
  14) mIndex[2][(u >> 46) & 0x3F]++;  // N5
  15) mIndex[1][(u >> 52) & 0x3F]++;  // N6
  16) mIndex[0][(u >> 58) & 0x3F]++;  // N7
  22) for (i = 0; i < 64; i++) {
  31) pTemp[mIndex[5][(u >> 28) & 0x3F]++] = u;
  35) pData[mIndex[4][(u >> 34) & 0x3F]++] = u;
  39) pTemp[mIndex[3][(u >> 40) & 0x3F]++] = u;
  43) pData[mIndex[2][(u >> 46) & 0x3F]++] = u;
  47) pTemp[mIndex[1][(u >> 52) & 0x3F]++] = u;
  51) pData[mIndex[0][(u >> 58) & 0x3F]++] = u;

Unique lines from "/home/brianp/tmp/radix.sort.48.c":
  01) void radix_sort_r64_48(uint64_t *pData, uint64_t *pTemp, size_t count,
  04) size_t mIndex[6][256] = { 0 };      /* index matrix */
  08) if(tsa)  time_event(E_RADIX_SORT_UINT64_48R, tsa, E_TIME_EVENT, 1, 0);
  14) mIndex[5][(u >> 16) & 0xff]++;  // B2
  15) mIndex[4][(u >> 24) & 0xff]++;  // B3
  16) mIndex[3][(u >> 32) & 0xff]++;  // B4
  17) mIndex[2][(u >> 40) & 0xff]++;  // B5
  18) mIndex[1][(u >> 48) & 0xff]++;  // B6
  19) mIndex[0][(u >> 56) & 0xff]++;  // B7
  25) for (i = 0; i < 256; i++) {
  34) pTemp[mIndex[5][(u >> 16) & 0xff]++] = u;
  38) pData[mIndex[4][(u >> 24) & 0xff]++] = u;
  42) pTemp[mIndex[3][(u >> 32) & 0xff]++] = u;
  46) pData[mIndex[2][(u >> 40) & 0xff]++] = u;
  50) pTemp[mIndex[1][(u >> 48) & 0xff]++] = u;
  54) pData[mIndex[0][(u >> 56) & 0xff]++] = u;
BrianP007
  • 162
  • 12
1

For completeness, the 6 bin, 7 bit/bin 42 MSB Radix Sort works and the performance is right in line with the 48 MSB and the 36 MSB versions.

  Time=  6.334 sec = 25.435%, QSORT_UINT64_ARRAY   , hits=1
  Time=  3.519 sec = 14.131%, RADIX_SORT_UINT64_REG, hits=4, 0.880 sec each
  Time=  3.273 sec = 13.145%, RADIX_SORT_UINT64_40R, hits=4, 0.818 sec each < anomaly
  Time=  2.680 sec = 10.764%, RADIX_SORT_UINT64_48R, hits=4, 0.670 sec each
  Time=  2.302 sec =  9.246%, RADIX_SORT_UINT64_42R, hits=4, 0.576 sec each < NEW
  Time=  2.025 sec =  8.132%, RADIX_SORT_UINT64_36R, hits=4, 0.506 sec each
  Time=  1.767 sec =  7.094%, RADIX_SORT_UINT64_32R, hits=4, 0.442 sec each
  Time=  0.955 sec =  3.835%, RADIX_SORT_64        , hits=1

Can anybody explain why the 40 MSB sort is so much slower than the 48 MSB sort?

Full Code:

// Radix sort of uint64_t[] on the 42 most significant bits (bits 22..63),
// done as six stable LSD-first passes over 7-bit digits (128 bins each).
// pTemp is scratch of at least `count` elements; the six passes ping-pong
// pData -> pTemp -> ... -> pData, so the sorted result ends in pData.
// Bits 0..21 never influence the ordering.
void radix_sort_r64_42(uint64_t *pData, uint64_t *pTemp, size_t count,
    EV_TIME_STR *tsa)
{
    size_t mIndex[6][128] = { 0 };  /* index matrix: per-digit histograms, then start offsets */
    size_t * pmIndex;              /* ptr to row of matrix */
    size_t i, j, m, n;
    uint64_t u;
    if(tsa)  time_event(E_RADIX_SORT_UINT64_42R, tsa, E_TIME_EVENT, 1, 0);  

    // Digit start positions for 8-, 7- and 6-bit digit widths:
    // 64 -- 56 48 40 32 24 16  -- 8 bits each
    // 64 -- 57 50 43 36 29 22  -- 7 bits each
    // 64 -- 58 52 46 40 34 28  -- 6 bits each
    for (i = 0; i < count; i++) {       /* generate histograms */
        u = pData[i];                   // Ignores bits 0..21
        mIndex[5][(u >> 22) & 0x7F]++;  // least significant 7-bit digit
        mIndex[4][(u >> 29) & 0x7F]++;
        mIndex[3][(u >> 36) & 0x7F]++;
        mIndex[2][(u >> 43) & 0x7F]++;
        mIndex[1][(u >> 50) & 0x7F]++;
        mIndex[0][(u >> 57) & 0x7F]++;  // most significant 7-bit digit
    }

    for (j = 0; j < 6; j++) {           /* convert to indices (exclusive prefix sum) */
        pmIndex = mIndex[j];
        n = 0;
        for (i = 0; i < 128; i++) {
            m = pmIndex[i];
            pmIndex[i] = n;
            n += m;
        }
    }

    for (i = 0; i < count; i++) {  /* radix sort pass 1: bits 22..28, pData -> pTemp */
        u = pData[i];
        pTemp[mIndex[5][(u >> 22) & 0x7F]++] = u;
    }
    for (i = 0; i < count; i++) {  /* pass 2: bits 29..35, pTemp -> pData */
        u = pTemp[i];
        pData[mIndex[4][(u >> 29) & 0x7F]++] = u;
    }
    for (i = 0; i < count; i++) {  /* pass 3: bits 36..42 */
        u = pData[i];
        pTemp[mIndex[3][(u >> 36) & 0x7F]++] = u;
    }
    for (i = 0; i < count; i++) {  /* pass 4: bits 43..49 */
        u = pTemp[i];
        pData[mIndex[2][(u >> 43) & 0x7F]++] = u;
    }
    for (i = 0; i < count; i++) {  /* pass 5: bits 50..56 */
        u = pData[i];
        pTemp[mIndex[1][(u >> 50) & 0x7F]++] = u;
    }
    for (i = 0; i < count; i++) {  /* pass 6: bits 57..63 — result lands in pData */
        u = pTemp[i];
        pData[mIndex[0][(u >> 57) & 0x7F]++] = u;
    }
}  // End Radix_Sort_R64_42().

ADD version of diff of 36 MSB vs 42 MSB

Unique lines from "~/tmp/radix.sort.36.c":
  01) void radix_sort_r64_36(uint64_t *pData, uint64_t *pTemp, size_t count,
  04) size_t mIndex[6][64] = { 0 };  /* index matrix */
  11) mIndex[5][(u >> 28) & 0x3F]++;  // N2
  12) mIndex[4][(u >> 34) & 0x3F]++;  // N3
  13) mIndex[3][(u >> 40) & 0x3F]++;  // N4
  14) mIndex[2][(u >> 46) & 0x3F]++;  // N5
  15) mIndex[1][(u >> 52) & 0x3F]++;  // N6
  16) mIndex[0][(u >> 58) & 0x3F]++;  // N7
  22) for (i = 0; i < 64; i++) {
  31) pTemp[mIndex[5][(u >> 28) & 0x3F]++] = u;
  35) pData[mIndex[4][(u >> 34) & 0x3F]++] = u;
  39) pTemp[mIndex[3][(u >> 40) & 0x3F]++] = u;
  43) pData[mIndex[2][(u >> 46) & 0x3F]++] = u;
  47) pTemp[mIndex[1][(u >> 52) & 0x3F]++] = u;
  51) pData[mIndex[0][(u >> 58) & 0x3F]++] = u;

19 Unique lines from "~/tmp/radix.sort.42.c":
  01) void radix_sort_r64_42(uint64_t *pData, uint64_t *pTemp, size_t count,
  04) size_t mIndex[6][128] = { 0 };  /* index matrix */
  10) // 64 -- 56 48 40 32 24 16  -- 8 bits each
  11) // 64 -- 57 50 43 36 29 22  -- 7 bits each
  12) // 64 -- 58 52 46 40 34 28  -- 6 bits each
  15) mIndex[5][(u >> 22) & 0x7F]++;  // N2
  16) mIndex[4][(u >> 29) & 0x7F]++;  // N3
  17) mIndex[3][(u >> 36) & 0x7F]++;  // N4
  18) mIndex[2][(u >> 43) & 0x7F]++;  // N5
  19) mIndex[1][(u >> 50) & 0x7F]++;  // N6
  20) mIndex[0][(u >> 57) & 0x7F]++;  // N7
  26) for (i = 0; i < 128; i++) {
  35) pTemp[mIndex[5][(u >> 22) & 0x7F]++] = u;
  39) pData[mIndex[4][(u >> 29) & 0x7F]++] = u;
  43) pTemp[mIndex[3][(u >> 36) & 0x7F]++] = u;
  47) pData[mIndex[2][(u >> 43) & 0x7F]++] = u;
  51) pTemp[mIndex[1][(u >> 50) & 0x7F]++] = u;
  55) pData[mIndex[0][(u >> 57) & 0x7F]++] = u;
0

2 more sorting algorithm test results:

  Time=  6.208 sec = 21.838%, QSORT_UINT64_ARRAY   , hits=1
  Time=  3.358 sec = 11.813%, RADIX_SORT_UINT64_REG, hits=4, 0.840 sec each
  Time=  2.525 sec =  8.884%, RADIX_SORT_UI64_AA99 , hits=4, 0.631 sec each <NEW
  Time=  2.509 sec =  8.825%, RADIX_SORT_UINT64_48R, hits=4, 0.627 sec each
  Time=  2.461 sec =  8.658%, RADIX_SORT_UI64_1024 , hits=4, 0.615 sec each <NEW
  Time=  2.223 sec =  7.822%, RADIX_SORT_UINT64_42R, hits=4, 0.556 sec each
  Time=  2.215 sec =  7.791%, RADIX_SORT_UI64_40_85, hits=4, 0.554 sec each
  Time=  1.930 sec =  6.788%, RADIX_SORT_UINT64_36R, hits=4, 0.482 sec each
  Time=  1.710 sec =  6.014%, RADIX_SORT_UINT64_32R, hits=4, 0.427 sec each
  Time=  0.915 sec =  3.220%, COMP_UINT64_ARRAYS   , hits=32, 0.029 sec each

Another run without Firefox hogging the system:

  Time=  6.156 sec = 23.199%, QSORT_UINT64_ARRAY   , hits=1
  Time=  2.993 sec = 11.277%, RADIX_SORT_UINT64_REG, hits=4, 0.748 sec each
  Time=  2.409 sec =  9.077%, RADIX_SORT_UI64_AA99 , hits=4, 0.602 sec each < NEW
  Time=  2.330 sec =  8.778%, RADIX_SORT_UI64_1024 , hits=4, 0.582 sec each < NEW
  Time=  2.241 sec =  8.443%, RADIX_SORT_UINT64_48R, hits=4, 0.560 sec each
  Time=  2.124 sec =  8.002%, RADIX_SORT_UI64_40_85, hits=4, 0.531 sec each
  Time=  1.982 sec =  7.468%, RADIX_SORT_UINT64_42R, hits=4, 0.495 sec each
  Time=  1.725 sec =  6.499%, RADIX_SORT_UINT64_36R, hits=4, 0.431 sec each
  Time=  1.507 sec =  5.677%, RADIX_SORT_UINT64_32R, hits=4, 0.377 sec each
  Time=  0.889 sec =  3.348%, COMP_UINT64_ARRAYS   , hits=32, 0.028 sec each

The RADIX_SORT_UI64_AA99 uses bin bits [10, 10, 9, 9] in 4 passes.

The RADIX_SORT_UI64_1024 sorts by most significant 10 bits first.

The RADIX_SORT_UINT64_42R uses 6 bins of 7 bits each

Note the dueling 40 MSB, 8 bin 5 bit RADIX_SORT_UI64_40_85 and the 42 MSB, 6 bin 7 bit RADIX_SORT_UINT64_42R relative performance. The 42 MSB sort is often much faster though the 40 bit edges it out at times.

The compiler is GCC, optimized for speed:

gcc -Ofast -ffast-math -m64 -march=native -funroll-loops -fopenmp -flto -finline-functions -Wuninitialized  ~/bin/pb.c  -lm  -o  ~/bin/pb_a 

Are there better compiler options?

gcc version 7.4.1 20190905 [gcc-7-branch revision 275407] (SUSE Linux)

==========================

Full Code:

// =============================================================================
// Sort with bin bits 10, 10, 9,9
// From code by rcgldr StackOverflow Feb 8, 2020
// Sorts a[] on its 38 most significant bits (bits 26..63) using digit
// widths {10, 10, 9, 9}: rows 0..1 of mIndex are 1024-bin (10-bit)
// digits, rows 2..3 are 512-bin (9-bit) digits.  Four stable LSD-first
// passes ping-pong a -> b -> a -> b -> a, so the result ends in a[].
// b[] is scratch of at least `count` elements; bits 0..25 never affect
// the ordering.
void RadixSort_aa99(uint64_t *a, uint64_t *b, size_t count, EV_TIME_STR *tsa)
{
uint32_t mIndex[4][1024] = { 0 };  /* index matrix: histograms, then start offsets */
uint32_t * pmIndex;                /* ptr to row of matrix */
uint32_t i, j, m, n;
uint64_t u;

if(tsa)  time_event(E_RADIX_SORT_UI64_AA99, tsa, E_TIME_EVENT, 1, 0);  
for (i = 0; i < count; i++) {       /* generate histograms */
    u = a[i];
    mIndex[3][(u >> 26) & 0x1ff]++;  /* 9-bit digit, bits 26..34 */
    mIndex[2][(u >> 35) & 0x1ff]++;  /* 9-bit digit, bits 35..43 */
    mIndex[1][(u >> 44) & 0x3ff]++;  /* 10-bit digit, bits 44..53 */
    mIndex[0][(u >> 54) & 0x3ff]++;  /* 10-bit digit, bits 54..63 */
}

for (j = 0; j < 2; j++) {           /* convert to indices: 10-bit rows (1024 bins) */
    pmIndex = mIndex[j];
    n = 0;
    for (i = 0; i < 1024; i++)  {
        m = pmIndex[i];
        pmIndex[i] = n;
        n += m;
    }
}
for (j = 2; j < 4; j++) {           /* 9-bit rows (512 bins) */
    pmIndex = mIndex[j];
    n = 0;
    for (i = 0; i < 512; i++)  {
        m = pmIndex[i];
        pmIndex[i] = n;
        n += m;
    }
}

pmIndex = mIndex[3];
for (i = 0; i < count; i++)  {      /* radix sort pass 1: bits 26..34, a -> b */
    u = a[i];
    b[pmIndex[(u >> 26) & 0x1ff]++] = u;
}
pmIndex = mIndex[2];
for (i = 0; i < count; i++)  {      /* pass 2: bits 35..43, b -> a */
    u = b[i];
    a[pmIndex[(u >> 35) & 0x1ff]++] = u;
}
pmIndex = mIndex[1];
for (i = 0; i < count; i++)  {      /* pass 3: bits 44..53, a -> b */
    u = a[i];
    b[pmIndex[(u >> 44) & 0x3ff]++] = u;
}
pmIndex = mIndex[0];
for (i = 0; i < count; i++)  {      /* pass 4: bits 54..63 — result lands in a */
    u = b[i];
    a[pmIndex[(u >> 54) & 0x3ff]++] = u;
}
}


// =============================================================================
/* split array into 1024 bins according to most significant 10 bits */
// MSD-first hybrid: partitions a[] into 1024 bins in b[] by the most
// significant 10 bits, then calls RadixSort3() on each bin to sort the
// remaining key bits.  aIndex has 1025 entries so aIndex[i+1]-aIndex[i]
// gives the size of bin i (the extra slot bounds the last bin).  Each
// per-bin RadixSort3 finishes with its result in a[], so the whole
// array ends up sorted in a[].  b[] is scratch of `count` elements.
void RadixSort_1024(uint64_t *a, uint64_t *b, size_t count, EV_TIME_STR *tsa)
{
uint32_t aIndex[1025] = {0};            /* index array: bin boundaries */
uint32_t i, m, n;
if(tsa)  time_event(E_RADIX_SORT_UI64_1024, tsa, E_TIME_EVENT, 1, 0);  
for(i = 0; i < count; i++)          /* generate histogram of top 10 bits */
    aIndex[(a[i] >> 54)]++;
n = 0;                              /* convert to indices (exclusive prefix sum) */
for (i = 0; i < 1025; i++)  {
    m = aIndex[i];
    aIndex[i] = n;
    n += m;
}
for(i = 0; i < count; i++)          /* sort by ms 10 bits: scatter into bins in b */
    b[aIndex[a[i]>>54]++] = a[i];
/* the scatter advanced each entry to the next bin's start; shift the
   array down one slot so aIndex[i] is the start of bin i again */
for(i = 1024; i; i--)               /* restore aIndex */
    aIndex[i] = aIndex[i-1];
aIndex[0] = 0;
for(i = 0; i < 1024; i++)           /* radix sort the 1024 bins */
    RadixSort3(&b[aIndex[i]], &a[aIndex[i]], aIndex[i+1]-aIndex[i]);
}

/* Radix sort on key bits 26..53 of each uint64_t, as three stable
   LSD-first passes with digit widths {10, 9, 9}: row 0 of tbl is the
   1024-bin (10-bit) digit, rows 1..2 are 512-bin (9-bit) digits.
   Passes run a -> b, b -> a, a -> b, so the sorted data ends in b[].
   Called once per bin by RadixSort_1024() above. */
void RadixSort3(uint64_t *a, uint64_t *b, size_t count)
{
    uint32_t tbl[3][1024] = { 0 };  /* histograms, then start offsets */
    uint32_t row, i, acc, cnt;
    uint64_t v;

    /* one read pass builds all three histograms */
    for (i = 0; i < count; i++) {
        v = a[i];
        tbl[2][(v >> 26) & 0x1ff]++;
        tbl[1][(v >> 35) & 0x1ff]++;
        tbl[0][(v >> 44) & 0x3ff]++;
    }

    /* exclusive prefix sums: row 0 has 1024 bins, rows 1..2 have 512 */
    for (row = 0; row < 3; row++) {
        uint32_t nbins = (row == 0) ? 1024 : 512;
        acc = 0;
        for (i = 0; i < nbins; i++) {
            cnt = tbl[row][i];
            tbl[row][i] = acc;
            acc += cnt;
        }
    }

    for (i = 0; i < count; i++) {    /* pass 1: bits 26..34, a -> b */
        v = a[i];
        b[tbl[2][(v >> 26) & 0x1ff]++] = v;
    }
    for (i = 0; i < count; i++) {    /* pass 2: bits 35..43, b -> a */
        v = b[i];
        a[tbl[1][(v >> 35) & 0x1ff]++] = v;
    }
    for (i = 0; i < count; i++) {    /* pass 3: bits 44..53, a -> b */
        v = a[i];
        b[tbl[0][(v >> 44) & 0x3ff]++] = v;
    }
}
BrianP007
  • 162
  • 12