Question: Is there a "Dial_A_Bit" Radix Sort which can be used to sort on a subset of the bits in a data type?
uint64_t *Radix_Sort_Dial_A_Bit(uint64_t *a, int num_a, int sort_bits);
In particular, would a 38 bit sort on 64 bit data be possible and have speed somewhere between the 32/64 and 48/64 varieties shown below?
uint64_t *Radix_Sort_ui64_38MSB(uint64_t *a, int num_a);
Note how research into 48 and 32 bit sorts has verified both increased speed and correctness compared to sorting on all 64 bits in a uint64_t[].
It seems as though a Radix_Sort() which sorts on a subset of the data package size would be generally useful and efficient, sorting only what is needed.
There are cases in which results are calculated for each Pixel and need to be sorted. A uint64_t[] is used to hold both the calculation result and the XY location.
26 total bits (13 for X and 13 for Y, 8192 max resolution) are needed to hold the pixel XY coordinates which leaves 38 bits for the data to be sorted.
The entire 64 bit package can be sorted with the Radix_Sort_Uint64().
A faster method is to use a Radix_Sort_Uint48() (see below) so the last 16 bits are not considered in the sort. This sorts all of the data correctly and also sorts 10 of the 13 X coordinate bits which is not needed.
Since performance is almost linearly proportional to bits sorted, optimally, only the 38 most significant bits would be considered in the sort.
Even a 40 bit Radix Sort would be better than using 48 bits. I tried to generalize the working 48 bit Radix Sort to operate on 40 bits but it does not sort correctly.
QSort_uint64_38_msb():
/* qsort() comparator that orders two uint64_t values by their 38 most
 * significant bits only; the 26 LSBs (the packed XY coordinates) are
 * shifted away before comparing.  Returns -1, 0, or +1. */
static inline int comp_uint64_38_msb(const void *a, const void *b) {
    const uint64_t ca = *(const uint64_t *)a >> 26; /* discard 26 LSBs */
    const uint64_t cb = *(const uint64_t *)b >> 26; /* discard 26 LSBs */
    if (ca < cb)
        return -1;
    if (ca > cb)
        return 1;
    return 0;
}
As you can see, the 48 bit sort is significantly faster than the full 64 bits. The 32 bit sort is almost twice as fast as the full 64 bit. And the qsort is lagging far behind:
Time= 2.136 sec = 3.390%, RADIX_SORT_FFFF0000 , hits=4, 0.534 sec each
Time= 2.944 sec = 4.672%, RADIX_SORT_FFFFFF00 , hits=4, 0.736 sec each
Time= 4.691 sec = 7.444%, RADIX_SORT_64 , hits=5, 0.938 sec each
Time= 25.209 sec = 40.010%, QSORT_UINT64_ARRAY , hits=4, 6.302 sec each
Time= 26.191 sec = 41.569%, QSORT_UINT64_38_ARRAY, hits=4, 6.548 sec each
Linearizing from the 64, 48 and 32 bit results to 38 bits:
Least-squares fit of (bits sorted, seconds): 64 -> 0.938, 48 -> 0.736, 32 -> 0.534; interpolating predicts 38 bits -> ~0.650 sec.
The 38 bit Radix_Sort could be 35% faster than 64 bit sorting and 17% faster than the 48 bit sort.
Even 40 bits would be quicker, 5 bytes vs 6 processed per uint64.
=========
Fastest yet, 6 of 8 byte sort of uint64[], generalized from: 32 MSbits used to sort uint64
// #############################################################################
// From: http://ideone.com/JHI0d9
// RadixSort--- for 48 MSB of uint64
// #############################################################################
// From: http://ideone.com/JHI0d9
// Counting tables for a 6-pass, byte-at-a-time LSD radix sort on the
// 48 MSBs of a uint64_t.  c6 is the least significant byte that is sorted
// (bits 16..23) and c1 the most significant (bits 56..63).  The anonymous
// struct (C11) overlays the flat counts[] array so the whole table can be
// cleared with one memset.
typedef union {
    struct {
        uint32_t c6[256];   // bits 16..23
        uint32_t c5[256];   // bits 24..31
        uint32_t c4[256];   // bits 32..39
        uint32_t c3[256];   // bits 40..47
        uint32_t c2[256];   // bits 48..55
        uint32_t c1[256];   // bits 56..63
    };
    uint32_t counts[256 * 6];
} rscounts6_t;
// #############################################################################
// Patterned off of Radix_Sort_64 but looks only at the 48 MostSigBits:
// 0xFFFF-FFFF-FFFF-0000 << Ignore the zeros, sort on the 6 MostSigBytes.
// Made for RGB48 stuffed into uint64_t with the 2 LeastSig bytes ignored
// (the byte-7 and byte-8 passes of the full 64-bit sort are dropped).
//
// Sorts arrayA in place, ascending by the 48 MSBs.  Each counting pass is
// stable, so entries whose top 48 bits are equal keep their input order.
// Returns arrayA on success, or NULL if the scratch buffer could not be
// allocated (arrayA is left untouched in that case — the original version
// dereferenced an unchecked malloc()).
uint64_t *radix_sort_48_msb(uint64_t *arrayA, uint32_t asize)
{
    uint64_t *array = arrayA;
    rscounts6_t counts;
    uint32_t o6 = 0, o5 = 0, o4 = 0, o3 = 0, o2 = 0, o1 = 0;
    uint32_t t6, t5, t4, t3, t2, t1;
    uint32_t x;

    /* Scratch buffer for the ping-pong passes; check before use. */
    uint64_t *cpy = malloc(asize * sizeof *cpy);
    if (cpy == NULL)
        return NULL;
    memset(&counts, 0, sizeof counts);  /* zero all 6*256 counters */

    /* Pass 1: one sweep builds the histograms for all six byte digits. */
    for (x = 0; x < asize; x++) {
        t6 = (array[x] >> 16) & 0xff;
        t5 = (array[x] >> 24) & 0xff;
        t4 = (array[x] >> 32) & 0xff;
        t3 = (array[x] >> 40) & 0xff;
        t2 = (array[x] >> 48) & 0xff;
        t1 = (array[x] >> 56) & 0xff;
        counts.c6[t6]++;
        counts.c5[t5]++;
        counts.c4[t4]++;
        counts.c3[t3]++;
        counts.c2[t2]++;
        counts.c1[t1]++;
    }
    /* Pass 2: exclusive prefix sums turn each histogram into start offsets. */
    for (x = 0; x < 256; x++) {
        t6 = o6 + counts.c6[x];
        t5 = o5 + counts.c5[x];
        t4 = o4 + counts.c4[x];
        t3 = o3 + counts.c3[x];
        t2 = o2 + counts.c2[x];
        t1 = o1 + counts.c1[x];
        counts.c6[x] = o6;
        counts.c5[x] = o5;
        counts.c4[x] = o4;
        counts.c3[x] = o3;
        counts.c2[x] = o2;
        counts.c1[x] = o1;
        o6 = t6;
        o5 = t5;
        o4 = t4;
        o3 = t3;
        o2 = t2;
        o1 = t1;
    }
    /* Six stable scatter passes, least significant sorted byte first,
     * ping-ponging between array and cpy; an even pass count lands the
     * fully sorted data back in array. */
    for (x = 0; x < asize; x++) {
        t6 = (array[x] >> 16) & 0xff;
        cpy[counts.c6[t6]] = array[x];
        counts.c6[t6]++; }
    for (x = 0; x < asize; x++) {
        t5 = (cpy[x] >> 24) & 0xff;
        array[counts.c5[t5]] = cpy[x];
        counts.c5[t5]++; }
    for (x = 0; x < asize; x++) {
        t4 = (array[x] >> 32) & 0xff;
        cpy[counts.c4[t4]] = array[x];
        counts.c4[t4]++; }
    for (x = 0; x < asize; x++) {
        t3 = (cpy[x] >> 40) & 0xff;
        array[counts.c3[t3]] = cpy[x];
        counts.c3[t3]++; }
    for (x = 0; x < asize; x++) {
        t2 = (array[x] >> 48) & 0xff;
        cpy[counts.c2[t2]] = array[x];
        counts.c2[t2]++; }
    for (x = 0; x < asize; x++) {
        t1 = (cpy[x] >> 56) & 0xff;
        array[counts.c1[t1]] = cpy[x];
        counts.c1[t1]++; }
    free(cpy);
    return array;
} // End radix_sort_48_msb().
==================================
Thanks again to Rcgldr for innovative programming suggestion! Instead of 10, 10, 9, 9, I used the fast 32 bit pattern with [4][10]
It works, but it is quite a bit slower than the 48 MSB sort: 737 msec for 40 MSB vs 588 msec for 48 MSB. :(
Maybe I coded it poorly.
Time= 6.108 sec = 33.668%, QSORT_UINT64_ARRAY , hits=1
Time= 3.060 sec = 16.866%, RADIX_SORT_UINT64_REG, hits=4, 0.765 sec each
Time= 2.947 sec = 16.241%, RADIX_SORT_UINT64_40R, hits=4, 0.737 sec each < SLOW
Time= 2.354 sec = 12.973%, RADIX_SORT_UINT64_48R, hits=4, 0.588 sec each
Time= 1.542 sec = 8.498%, RADIX_SORT_UINT64_32R, hits=4, 0.385 sec each
Time= 0.769 sec = 4.236%, RADIX_SORT_64 , hits=1
The test:
- create a random, uint64_t[36M] master array
- sort it with both qsort and a known good Radix Sort, radixSort() to create a standard array
- Compare the qsort and radixSort() results and verify they are identical.
- Sort a copy of the master with 32, 40, 48 and 64 MSB Radix Sorts
- Compare each test sort to the Standard after masking off the ignored LSBs
Here is the code:
//=============================================================================
// From code submitted by rcgldr, Feb 8 2020
// Optimized to use Registers and to sort on 40 MSBs, ignoring 24 LSBs
//=============================================================================
// From code submitted by rcgldr, Feb 8 2020
// Sorts pData[0..count) ascending on its 40 MSBs (bits 24..63), split into
// four 10-bit digits; the 24 LSBs are ignored.  pTemp is caller-supplied
// scratch of the same size.  Four stable scatter passes, least significant
// digit first, ping-pong between pData and pTemp and end back in pData.
void radix_sort_r64_40(uint64_t *pData, uint64_t *pTemp, size_t count,
    EV_TIME_STR *tsa)
{
    size_t mIndex[4][1024] = { 0 };             /* per-digit count/offset table */
    static const unsigned dshift[4] = { 54, 44, 34, 24 }; /* row 0 = MSB digit */
    size_t i;
    int d;
    if(tsa) time_event(E_RADIX_SORT_UINT64_40R, tsa, E_TIME_EVENT, 1, 0);
    /* Histogram all four 10-bit digits in a single sweep. */
    for (i = 0; i < count; i++) {
        uint64_t u = pData[i];
        for (d = 0; d < 4; d++)
            mIndex[d][(u >> dshift[d]) & 0x3ff]++;
    }
    /* Exclusive prefix sums convert counts to start offsets per digit. */
    for (d = 0; d < 4; d++) {
        size_t total = 0;
        for (i = 0; i < 1024; i++) {
            size_t c = mIndex[d][i];
            mIndex[d][i] = total;
            total += c;
        }
    }
    /* Scatter passes: digit 3 (bits 24..33) first, digit 0 (bits 54..63)
     * last; src/dst swap each pass so the result lands in pData. */
    {
        uint64_t *src = pData, *dst = pTemp;
        for (d = 3; d >= 0; d--) {
            for (i = 0; i < count; i++) {
                uint64_t u = src[i];
                dst[mIndex[d][(u >> dshift[d]) & 0x3ff]++] = u;
            }
            uint64_t *swap = src;
            src = dst;
            dst = swap;
        }
    }
} // End Radix_Sort_R64_40().
Here is the unique dif between the FAST 32 MSB version and the cloned 40 MSB slow version.
Unique lines from "~/tmp/radix.sort.32.c":
02) void radix_sort_r64_32(uint64_t *pData, uint64_t *pTemp, size_t count,
05) size_t mIndex[4][256] = { 0 }; /* index matrix */
09) if(tsa) time_event(E_RADIX_SORT_UINT64_32R, tsa, E_TIME_EVENT, 1, 0);
13) mIndex[3][(u >> 32) & 0xff]++; // B4
14) mIndex[2][(u >> 40) & 0xff]++; // B5
15) mIndex[1][(u >> 48) & 0xff]++; // B6
16) mIndex[0][(u >> 56) & 0xff]++; // B7
22) for (i = 0; i < 256; i++) {
31) pTemp[mIndex[3][(u >> 32) & 0xff]++] = u;
35) pData[mIndex[2][(u >> 40) & 0xff]++] = u;
39) pTemp[mIndex[1][(u >> 48) & 0xff]++] = u;
43) pData[mIndex[0][(u >> 56) & 0xff]++] = u;
Unique lines from "~/tmp/radix.sort.40.c":
01) void radix_sort_r64_40(uint64_t *pData, uint64_t *pTemp, size_t count,
04) size_t mIndex[4][1024] = { 0 }; /* index matrix */
08) if(tsa) time_event(E_RADIX_SORT_UINT64_40R, tsa, E_TIME_EVENT, 1, 0);
12) mIndex[3][(u >> 24) & 0x3ff]++;
13) mIndex[2][(u >> 34) & 0x3ff]++;
14) mIndex[1][(u >> 44) & 0x3ff]++;
15) mIndex[0][(u >> 54) & 0x3ff]++;
21) for (i = 0; i < 1024; i++) {
30) pTemp[mIndex[3][(u >> 24) & 0x3ff]++] = u;
34) pData[mIndex[2][(u >> 34) & 0x3ff]++] = u;
38) pTemp[mIndex[1][(u >> 44) & 0x3ff]++] = u;
42) pData[mIndex[0][(u >> 54) & 0x3ff]++] = u;