
I use the uint64_t radix sort provided by Louis Ricci (answered Aug 24 '15 at 18:00), Radix_Sort_Uint64. It is amazingly fast.

I have a data structure which contains two uint32_t items, and I want to sort a large array (20+ million elements) looking only at the first or last 32 bits, but I want the sort routine to move the entire 64-bit package as a unit.

Is there a C-language uint64_t radix sort which orders based on a subset of the entire 64-bit quantity, as if the data were masked with 0XFFFFFFFF00000000?

  • It should be fairly easy to adapt the code in the answer you found, using a similar bitshift and mask technique, except you'll only need 4 sorting passes instead of 8 (plus an initial pass to pre-calculate the counts and offsets), and only using 4 bytes of the value instead of 8. – Ian Abbott Nov 02 '17 at 16:49
  • I looked at the code and it was not obvious which variables to subset: o* t* counts or size. An easy solution would be to just write a qsort function to cast the uint64_t pointers to uint32_t and compare them. But, the qsort on my data proved to be much slower than the radix sort. It seems like a fairly general tool; sort 64 bit chunks looking only at the first 32 bits. – PatrickPhotog Nov 02 '17 at 16:53
  • I think you'll need `c1` to `c4`, `counts[256 * 4]`, `o1` to `o4`, `t1` to `t4`. Set `t4` to `t1` to the bytes of the value you are sorting by. E.g. for most significant 32 bits, `t4 = (array[x] >> 32) & 0xff;` `t3 = (array[x] >> 40) & 0xff;` `t2 = (array[x] >> 48) & 0xff;` `t1 = (array[x] >> 56) & 0xff;`. – Ian Abbott Nov 02 '17 at 17:46
  • The main reason for the speed difference is probably because the sort code you found trades memory space for extra speed, using a temporary buffer the same size as the original data. This allows it to replace swap operations with simple assignment operations, copying the items back and forth between the temporary buffer and the original array at each pass. – Ian Abbott Nov 02 '17 at 17:58
  • With gcc and the following options, I am seeing qsort take ~80% longer than the radix sort on positive double data, and I have checked that the two sorts agree on my data:
    gcc -Ofast -ffast-math -march=native -m64 -fopenmp -funroll-loops -flto
    // Time=1.543 sec = 24.289%, Event RADIX_SORT_64 , hits=3, 0.514 sec each
    // Time=2.746 sec = 43.212%, Event QSORT_DOUBLE_ARRAY , hits=3, 0.915 sec each
    // Time=1.522 sec = 24.120%, Event RADIX_SORT_64 , hits=3, 0.507 sec each
    // Time=2.743 sec = 43.454%, Event QSORT_DOUBLE_ARRAY , hits=3, 0.914 sec each
    – PatrickPhotog Nov 02 '17 at 21:46
  • @IanAbbott - radix sort is faster because its time function = F(8 n) if sorting 64 bit integers 8 bits at a time, while quick sort averages around F(1.39 n log2(n)). For 20 million elements, the quick sort time function is ~ F(34 n), about 4 times as large as radix sort's. This is somewhat offset by the fact that at deeper levels of recursion, quick sort swaps take place within cache, while every radix sort write could be anywhere within the destination array. – rcgldr Nov 03 '17 at 03:23

3 Answers


Example C code. It uses fewer local variables than the example linked to in the original post, allowing a C compiler to keep those variables in registers. This program takes less than 0.5 seconds to sort 20 million (20*1024*1024 = 20971520) 64-bit unsigned integers by the upper 32 bits on my system (Intel 3770K 3.5 GHz CPU, Windows 7 Pro 64-bit).

/*  radix sort via upper 32 bits of 64 bit unsigned integers */

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <stdint.h>

void RadixSort(uint64_t * pData, uint64_t * pTemp, size_t count)
{
    size_t mIndex[4][256] = { 0 };      /* index matrix */
    size_t * pmIndex;                   /* ptr to row of matrix */
    size_t i, j, m, n;
    uint64_t u;

    for (i = 0; i < count; i++) {       /* generate histograms */
        u = pData[i];
        mIndex[3][(u >> 32) & 0xff]++;
        mIndex[2][(u >> 40) & 0xff]++;
        mIndex[1][(u >> 48) & 0xff]++;
        mIndex[0][(u >> 56) & 0xff]++;
    }

    for (j = 0; j < 4; j++) {           /* convert to indices */
        pmIndex = mIndex[j];
        n = 0;
        for (i = 0; i < 256; i++) {
            m = pmIndex[i];
            pmIndex[i] = n;
            n += m;
        }
    }

    for (i = 0; i < count; i++) {       /* radix sort */
        u = pData[i];
        pTemp[mIndex[3][(u >> 32) & 0xff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = pTemp[i];
        pData[mIndex[2][(u >> 40) & 0xff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = pData[i];
        pTemp[mIndex[1][(u >> 48) & 0xff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = pTemp[i];
        pData[mIndex[0][(u >> 56) & 0xff]++] = u;
    }
}

#define COUNT (20*1024*1024)            /* number of elements */

static clock_t dwTimeStart;             /* clock values */
static clock_t dwTimeStop;

int main( )
{
uint64_t * pData;
uint64_t * pTemp;
uint64_t r;
size_t i;

    /* allocate memory */
    pData  = (uint64_t *)malloc(COUNT*sizeof(uint64_t));
    if(pData == NULL){
        return 0;
    }
    pTemp  = (uint64_t *)malloc(COUNT*sizeof(uint64_t));
    if(pTemp == NULL){
        free(pData);
        return 0;
    }

    for(i = 0; i < COUNT; i++){         /* generate test data */
        r  = (((uint64_t)((rand()>>4) & 0xff))<< 0);
        r |= (((uint64_t)((rand()>>4) & 0xff))<< 8);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<16);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<24);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<32);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<40);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<48);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<56);
        pData[i] = r;
    }

    dwTimeStart = clock();
    RadixSort(pData, pTemp, COUNT);     /* sort array */
    dwTimeStop = clock();
    printf("Number of ticks %d\n", dwTimeStop-dwTimeStart);
    for(i = 1; i < COUNT; i++){         /* check sort */
        if((pData[i-1]>>32) > (pData[i]>>32)){
            break;
        }
    }
    if(i != COUNT)
        printf("sort error\n");
    free(pData);
    return(0);
}
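
The question also asks about keying on the low 32 bits. The same structure works with shifts 0, 8, 16 and 24 instead of 32..56; below is a minimal, untimed sketch (RadixSortLow32 is only an illustrative name).

/*  radix sort via lower 32 bits of 64 bit unsigned integers (sketch) */
void RadixSortLow32(uint64_t * pData, uint64_t * pTemp, size_t count)
{
    size_t mIndex[4][256] = { 0 };      /* index matrix */
    size_t * pmIndex;                   /* ptr to row of matrix */
    size_t i, j, m, n;
    uint64_t u;

    for (i = 0; i < count; i++) {       /* generate histograms */
        u = pData[i];
        mIndex[3][(u >>  0) & 0xff]++;
        mIndex[2][(u >>  8) & 0xff]++;
        mIndex[1][(u >> 16) & 0xff]++;
        mIndex[0][(u >> 24) & 0xff]++;
    }

    for (j = 0; j < 4; j++) {           /* convert to indices */
        pmIndex = mIndex[j];
        n = 0;
        for (i = 0; i < 256; i++) {
            m = pmIndex[i];
            pmIndex[i] = n;
            n += m;
        }
    }

    for (i = 0; i < count; i++) {       /* radix sort, least significant key byte first */
        u = pData[i];
        pTemp[mIndex[3][(u >>  0) & 0xff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = pTemp[i];
        pData[mIndex[2][(u >>  8) & 0xff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = pData[i];
        pTemp[mIndex[1][(u >> 16) & 0xff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = pTemp[i];
        pData[mIndex[0][(u >> 24) & 0xff]++] = u;
    }
}
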
– rcgldr

The register-optimized code from rcgldr (RADIX_SORT_UINT64_32R()) is significantly faster than the original code when sorting on only the 32 most significant bits:

  Time= 1.851 sec = 15.648%, RADIX_SORT_FFFF0000  , hits=4, 0.463 sec each  << OLD
  Time= 1.552 sec = 13.120%, RADIX_SORT_UINT64_32R, hits=4, 0.388 sec each  << NEW

Averaged over 4 runs on an array of 36.2 million values, 100% of which are distinct, the new version takes 83.8% as long to run. After each run, the results of the two sorts are shifted right by 32 bits, compared, and found to be identical.
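
Comparing only the shifted keys avoids false mismatches: two sorts may order ties (elements with equal upper 32 bits) differently, so only the keys are guaranteed to agree position by position. A minimal sketch of such a check (the array and function names here are only placeholders):

/* Return the index of the first position where the upper-32-bit keys of two
   sorted arrays disagree, or count if they agree everywhere. */
size_t first_key_mismatch(const uint64_t *a, const uint64_t *b, size_t count)
{
    size_t i;
    for (i = 0; i < count; i++)
        if ((a[i] >> 32) != (b[i] >> 32))
            return i;
    return count;
}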

Comparing to QSort and the previous version of Radix_Sort() on all 64 bits:

Time=  6.125 sec = 62.359%, QSORT_UINT64_ARRAY   , hits=1
Time=  0.832 sec =  8.468%, RADIX_SORT_64        , hits=1  << OLD
Time=  0.770 sec =  7.842%, RADIX_SORT_UINT64_REG, hits=1  << NEW

The QSort takes almost 8 times as long as this new version. The original 'RADIX_SORT_64()' function is 8% slower.

Generalizing the code from rcgldr to 48 and 64 bits and comparing sorts of a uint64_t[32M] on 64, 48 and 32 bits with the rcgldr method:

Time=  3.130 sec = 20.342%, RADIX_SORT_UINT64_REG, hits=4, 0.782 sec each
Time=  2.336 sec = 15.180%, RADIX_SORT_UINT64_48R, hits=4, 0.584 sec each
Time=  1.540 sec = 10.007%, RADIX_SORT_UINT64_32R, hits=4, 0.385 sec each

Note the expected near-linear scaling: roughly 20%, 15% and 10% of program time for sorting on 64, 48 and 32 bits, respectively.
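
A sketch of what the 48-bit generalization can look like, following the structure of rcgldr's RadixSort() above (this is an illustrative reconstruction, not the code that produced the timings): six byte-wide passes over bits 16..63, folded into a loop for brevity; an even number of passes leaves the result back in pData.

/*  radix sort via upper 48 bits of 64 bit unsigned integers (sketch) */
void RadixSort48(uint64_t * pData, uint64_t * pTemp, size_t count)
{
    size_t mIndex[6][256] = { 0 };      /* one histogram per key byte */
    size_t * pmIndex;
    size_t i, j, m, n;
    uint64_t u, * pSrc, * pDst;

    for (i = 0; i < count; i++) {       /* generate histograms */
        u = pData[i];
        for (j = 0; j < 6; j++)
            mIndex[5 - j][(u >> (16 + 8 * j)) & 0xff]++;
    }

    for (j = 0; j < 6; j++) {           /* convert counts to indices */
        pmIndex = mIndex[j];
        n = 0;
        for (i = 0; i < 256; i++) {
            m = pmIndex[i];
            pmIndex[i] = n;
            n += m;
        }
    }

    for (j = 0; j < 6; j++) {           /* six passes, least significant key byte first */
        pSrc = (j & 1) ? pTemp : pData;
        pDst = (j & 1) ? pData : pTemp;
        for (i = 0; i < count; i++) {
            u = pSrc[i];
            pDst[mIndex[5 - j][(u >> (16 + 8 * j)) & 0xff]++] = u;
        }
    }
}
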

// =============================================================================
// Create 2 identical uint64_t[]s with the upper 32 bits mirrored in the lower
// 32, sort one with qsort() and the other with radixSort11110000(), and
// compare the results
int test_uint64_32_sort(EV_TIME_STR *tsa)  {
uint64_t *la1, *la2;    
int ii, lcount=3615232, zmem=0, debug=0, ucount=1, ri, er12=0, ep=63;
time_t tt;  // This Time! 
float rssec=2.0f;  // RadixSort_SEConds elapsed time from Radixsort()
    time_event(E_GENERATE_RANDOM_ARRAY, tsa, E_TIME_EVENT, 1, 0); 
    srand((unsigned) time(&tt));
    la1=(uint64_t *)alloc_check((lcount)*8, zmem=0, "LLong_ara_1", debug);
    la2=(uint64_t *)alloc_check((lcount)*8, zmem=0, "LLong_ara_2", debug);

    for(ii=0; ii < lcount; ii++)  {  // Look only SHORT range
        ri = rand();  // Random int
        la1[ii] = ((uint64_t)ri << 32) | (0XFFFFFFFFu - ri);  // Reflect val in lower 32
    }

    // Make identical copies in the other []
    time_event(E_MEMCPY_DOUBLE_ARRAY, tsa, E_TIME_EVENT, 1, 0); 
    memcpy((void *) la2, (void *) la1, lcount * sizeof(uint64_t));

    time_event(E_QSORT_UINT64_ARRAY, tsa, E_TIME_EVENT, 1, 0); 
    qsort(la1, lcount, sizeof(uint64_t), comp_uint6411110000);

    time_event(E_RADIX_SORT64_11110000, tsa, E_TIME_EVENT, 1, 0); 
    radixSort11110000(la2, lcount, &rssec, &ucount);

    time_event(E_SPLIT_RGBJ_MEM,   tsa, E_TIME_EVENT, 1, 0); 
    for(ii=er12=0; ii < lcount; ii++)  { 
        if(la1[ii] != la2[ii])  {  
            er12++;  
            if((--ep) > 0)  {
                printf("II %d) Er%d, l1=0X%016llX, l2=0X%016llX\n", 
                    ii, ep, la1[ii], la2[ii]);  FF_SO;  }
        }  // Count Error Mismatches
        if(!(ii%100000))  {  
            printf("II %d) l1=0X%016llX, l2=0X%016llX\n", 
                ii, la1[ii], la2[ii]);  FF_SO;  }
    }
    printf("T63S: Er1/2 = %d \n", er12);  FF_SO;
    if(ucount)  {
        printf("T63S: RadixSort time = %.3f ms, unique=%d = %.3f%%\n",
            rssec*1.0E3f, ucount, (100.0f * ucount / lcount));  FF_SO;
    }
    free_bb(la1);  free_bb(la2); 
    return er12;  // Return the number of mismatches between the two sorted arrays
}



// =============================================================================
// Based on original code from https://ideone.com/JHI0d9
// and suggestions from  Ian Abbott
// https://stackoverflow.com/questions/47080353/radix-sort-c-code-to-sort-uint64-t-looking-only-at-32-msb-bits
// Hacked code may be unsuitable for any use and should not be used by anyone
// Sort uint64[] by looking ONLY at the 32 most significant bits.
// The rsSec and uCount out-params match the call in test_uint64_32_sort();
// the code that fills them was trimmed from the posted version and is
// reconstructed at the end of this function (assumptions: clock() timing,
// count of distinct upper-32-bit keys).
uint64_t *radixSort11110000(uint64_t *arrayA, uint32_t size, float *rsSec, int *uCount)  {
register uint64_t *array=arrayA;  // Slam arg into Register!
register uint64_t std;  // STanDard (previous key) to compare others for uniqueness
register int dist=0;  // Distinct, unique values found if *UCount arg is TRUE
clock_t rt0 = clock();  // Start of elapsed-time measurement
    rscounts4_t counts;
    memset(&counts, 0, 256 * 4 * sizeof(uint32_t));
    uint64_t * cpy = (uint64_t *)malloc(size * sizeof(uint64_t));
    uint32_t o4=0, o3=0, o2=0, o1=0;
    uint32_t t4, t3, t2, t1;
    register uint32_t x;
    // calculate counts
    for(x = 0; x < size; x++) {
        t4 = (array[x] >> 32) & 0xff;
        t3 = (array[x] >> 40) & 0xff;
        t2 = (array[x] >> 48) & 0xff;
        t1 = (array[x] >> 56) & 0xff;
        counts.c4[t4]++;
        counts.c3[t3]++;
        counts.c2[t2]++;
        counts.c1[t1]++;
    }
    // convert counts to offsets
    for(x = 0; x < 256; x++) {
        t4 = o4 + counts.c4[x];
        t3 = o3 + counts.c3[x];
        t2 = o2 + counts.c2[x];
        t1 = o1 + counts.c1[x];
        counts.c4[x] = o4;
        counts.c3[x] = o3;
        counts.c2[x] = o2;
        counts.c1[x] = o1;
        o4 = t4; 
        o3 = t3; 
        o2 = t2; 
        o1 = t1;
    }
    // radix
    for(x = 0; x < size; x++) {
        t4 = (array[x] >> 32) & 0xff;
        cpy[counts.c4[t4]] = array[x];
        counts.c4[t4]++;  }
    for(x = 0; x < size; x++) {
        t3 = (cpy[x] >> 40) & 0xff;
        array[counts.c3[t3]] = cpy[x];
        counts.c3[t3]++;  }
    for(x = 0; x < size; x++) {
        t2 = (array[x] >> 48) & 0xff;
        cpy[counts.c2[t2]] = array[x];
        counts.c2[t2]++;  }
    for(x = 0; x < size; x++) {
        t1 = (cpy[x] >> 56) & 0xff;
        array[counts.c1[t1]] = cpy[x];
        counts.c1[t1]++;  }
    free(cpy);
    if(uCount && *uCount)  {  // Count distinct upper-32-bit keys (reconstruction)
        if(size)  {
            std = array[0];  dist = 1;
            for(x = 1; x < size; x++)  {
                if((array[x] >> 32) != (std >> 32))  dist++;
                std = array[x];
            }
        }
        *uCount = dist;
    }
    if(rsSec)  *rsSec = (float)(clock() - rt0) / (float)CLOCKS_PER_SEC;
    return array;
}  // End radixSort11110000().


// #############################################################################
// From: http://ideone.com/JHI0d9
// Counts/offsets union used by radixSort11110000(); this typedef must appear
// before that function in the source file.
typedef union {
    struct {
        uint32_t c4[256];
        uint32_t c3[256];
        uint32_t c2[256];
        uint32_t c1[256];
    };
    uint32_t counts[256 * 4];
}  rscounts4_t;


// =============================================================================
// Compare only the MSB 4 bytes of a uint64 by masking each with 
// 0XFFFFFFFF00000000 before comparison
int comp_uint6411110000(const void *a, const void *b)  {
    return (
      ( 
        ( ( *( (uint64_t *)a ) ) & 0XFFFFFFFF00000000ULL ) > 
        ( ( *( (uint64_t *)b ) ) & 0XFFFFFFFF00000000ULL ) 
      )
     - 
      (  
        ( ( *( (uint64_t *)a ) ) & 0XFFFFFFFF00000000ULL ) < 
        ( ( *( (uint64_t *)b ) ) & 0XFFFFFFFF00000000ULL )
      )
    );
}  // End comp_uint6411110000().



// Both sorted arrays were identical. 
// T63S: Er1/2 = 0
// TE: Top 90% events in desc order (7/312):
//  Time=  0.157 sec = 71.282%, QSORT_UINT64_ARRAY   , hits=1
//  Time=  0.029 sec = 13.119%, RADIX_SORT64_11110000, hits=1
//  Time=  0.026 sec = 11.872%, GENERATE_RANDOM_ARRAY, hits=1

// 71.282 / 13.119 = 5.43
// The RadixSort is over 5x faster than the qsort

Perhaps the qsort comparator could be handled more efficiently.
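
One possibility (an untested sketch, not benchmarked here): shift the key down to a 32-bit value instead of masking the full 64 bits, then apply the same branchless greater/less idiom to the narrower type. It orders identically to comp_uint6411110000().

// Possible alternative comparator: compare the upper 32 bits as 32-bit values.
int comp_uint64_msb32(const void *a, const void *b)  {
    uint32_t ka = (uint32_t)(*(const uint64_t *)a >> 32);
    uint32_t kb = (uint32_t)(*(const uint64_t *)b >> 32);
    return (ka > kb) - (ka < kb);
}  // End comp_uint64_msb32().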