I am trying to convert this fast radix sort code to work with __uint128_t's.
typedef union {
struct {
uint32_t c8[256];
uint32_t c7[256];
uint32_t c6[256];
uint32_t c5[256];
uint32_t c4[256];
uint32_t c3[256];
uint32_t c2[256];
uint32_t c1[256];
};
uint32_t counts[256 * 8];
} rscounts_t;
uint64_t * radixSort(uint64_t * array, uint32_t size) {
rscounts_t counts;
memset(&counts, 0, 256 * 8 * sizeof(uint32_t));
uint64_t * cpy = (uint64_t *)malloc(size * sizeof(uint64_t));
uint32_t o8=0, o7=0, o6=0, o5=0, o4=0, o3=0, o2=0, o1=0;
uint32_t t8, t7, t6, t5, t4, t3, t2, t1;
uint32_t x;
// calculate counts
for(x = 0; x < size; x++) {
t8 = array[x] & 0xff;
t7 = (array[x] >> 8) & 0xff;
t6 = (array[x] >> 16) & 0xff;
t5 = (array[x] >> 24) & 0xff;
t4 = (array[x] >> 32) & 0xff;
t3 = (array[x] >> 40) & 0xff;
t2 = (array[x] >> 48) & 0xff;
t1 = (array[x] >> 56) & 0xff;
counts.c8[t8]++;
counts.c7[t7]++;
counts.c6[t6]++;
counts.c5[t5]++;
counts.c4[t4]++;
counts.c3[t3]++;
counts.c2[t2]++;
counts.c1[t1]++;
}
// convert counts to offsets
for(x = 0; x < 256; x++) {
t8 = o8 + counts.c8[x];
t7 = o7 + counts.c7[x];
t6 = o6 + counts.c6[x];
t5 = o5 + counts.c5[x];
t4 = o4 + counts.c4[x];
t3 = o3 + counts.c3[x];
t2 = o2 + counts.c2[x];
t1 = o1 + counts.c1[x];
counts.c8[x] = o8;
counts.c7[x] = o7;
counts.c6[x] = o6;
counts.c5[x] = o5;
counts.c4[x] = o4;
counts.c3[x] = o3;
counts.c2[x] = o2;
counts.c1[x] = o1;
o8 = t8;
o7 = t7;
o6 = t6;
o5 = t5;
o4 = t4;
o3 = t3;
o2 = t2;
o1 = t1;
}
// radix
for(x = 0; x < size; x++) {
t8 = array[x] & 0xff;
cpy[counts.c8[t8]] = array[x];
counts.c8[t8]++;
}
for(x = 0; x < size; x++) {
t7 = (cpy[x] >> 8) & 0xff;
array[counts.c7[t7]] = cpy[x];
counts.c7[t7]++;
}
for(x = 0; x < size; x++) {
t6 = (array[x] >> 16) & 0xff;
cpy[counts.c6[t6]] = array[x];
counts.c6[t6]++;
}
for(x = 0; x < size; x++) {
t5 = (cpy[x] >> 24) & 0xff;
array[counts.c5[t5]] = cpy[x];
counts.c5[t5]++;
}
for(x = 0; x < size; x++) {
t4 = (array[x] >> 32) & 0xff;
cpy[counts.c4[t4]] = array[x];
counts.c4[t4]++;
}
for(x = 0; x < size; x++) {
t3 = (cpy[x] >> 40) & 0xff;
array[counts.c3[t3]] = cpy[x];
counts.c3[t3]++;
}
for(x = 0; x < size; x++) {
t2 = (array[x] >> 48) & 0xff;
cpy[counts.c2[t2]] = array[x];
counts.c2[t2]++;
}
for(x = 0; x < size; x++) {
t1 = (cpy[x] >> 56) & 0xff;
array[counts.c1[t1]] = cpy[x];
counts.c1[t1]++;
}
free(cpy);
return array;
}
I blindly followed the pattern and doubled the number of variables making t16...t1 and o16...o1 and counts.c16 to counts.c1 but it does not give the right answer. It sorts uints less that 2^64 but does not sort larger ints correctly. I was wondering if >>
is the problem. Should I expect it to work for __uint128's in gcc? Can anyone make a version that does work with __int128_t's. I would like to sort the array as quickly as possible.
My non-working code:
typedef union {
struct {
uint32_t c16[256];
uint32_t c15[256];
uint32_t c14[256];
uint32_t c13[256];
uint32_t c12[256];
uint32_t c11[256];
uint32_t c10[256];
uint32_t c9[256];
uint32_t c8[256];
uint32_t c7[256];
uint32_t c6[256];
uint32_t c5[256];
uint32_t c4[256];
uint32_t c3[256];
uint32_t c2[256];
uint32_t c1[256];
};
uint32_t counts[256 * 16];
} rscounts_t;
__uint128_t * radixSort(__uint128_t * array, uint32_t size) {
rscounts_t counts;
memset(&counts, 0, 256 * 16 * sizeof(uint32_t));
__uint128_t * cpy = (__uint128_t *)malloc(size * sizeof(__uint128_t));
uint32_t o16=0, o15=0, o14=0, o13=0, o12=0, o11=0, o10=0, o9=0, o8=0, o7=0, o6=0, o5=0, o4=0, o3=0, o2=0, o1=0;
uint32_t t16, t15, t14, t13, t12, t11, t10, t9, t8, t7, t6, t5, t4, t3, t2, t1;
uint32_t x;
// calculate counts
for(x = 0; x < size; x++) {
t16 = array[x] & 0xff;
t15 = (array[x] >> 8) & 0xff;
t14 = (array[x] >> 16) & 0xff;
t13 = (array[x] >> 24) & 0xff;
t12 = (array[x] >> 32) & 0xff;
t11 = (array[x] >> 40) & 0xff;
t10 = (array[x] >> 48) & 0xff;
t9 = (array[x] >> 56) & 0xff;
t8 = (array[x] >> 64) & 0xff;
t7 = (array[x] >> 72) & 0xff;
t6 = (array[x] >> 80) & 0xff;
t5 = (array[x] >> 88) & 0xff;
t4 = (array[x] >> 96) & 0xff;
t3 = (array[x] >> 104) & 0xff;
t2 = (array[x] >> 112) & 0xff;
t1 = (array[x] >> 120) & 0xff;
counts.c16[t16]++;
counts.c15[t15]++;
counts.c14[t14]++;
counts.c13[t13]++;
counts.c12[t12]++;
counts.c11[t11]++;
counts.c10[t10]++;
counts.c9[t9]++;
counts.c8[t8]++;
counts.c7[t7]++;
counts.c6[t6]++;
counts.c5[t5]++;
counts.c4[t4]++;
counts.c3[t3]++;
counts.c2[t2]++;
counts.c1[t1]++;
}
// convert counts to offsets
for(x = 0; x < 256; x++) {
t16 = o16 + counts.c16[x];
t15 = o15 + counts.c15[x];
t14 = o14 + counts.c14[x];
t13 = o13 + counts.c13[x];
t12 = o12 + counts.c12[x];
t11 = o11 + counts.c11[x];
t10 = o10 + counts.c10[x];
t9 = o9 + counts.c9[x];
t8 = o8 + counts.c8[x];
t7 = o7 + counts.c7[x];
t6 = o6 + counts.c6[x];
t5 = o5 + counts.c5[x];
t4 = o4 + counts.c4[x];
t3 = o3 + counts.c3[x];
t2 = o2 + counts.c2[x];
t1 = o1 + counts.c1[x];
counts.c16[x] = o16;
counts.c15[x] = o15;
counts.c14[x] = o14;
counts.c13[x] = o13;
counts.c12[x] = o12;
counts.c11[x] = o11;
counts.c10[x] = o10;
counts.c9[x] = o9;
counts.c8[x] = o8;
counts.c7[x] = o7;
counts.c6[x] = o6;
counts.c5[x] = o5;
counts.c4[x] = o4;
counts.c3[x] = o3;
counts.c2[x] = o2;
counts.c1[x] = o1;
o16 = t16;
o15 = t15;
o14 = t14;
o13 = t13;
o12 = t12;
o11 = t11;
o10 = t10;
o9 = t9;
o8 = t8;
o7 = t7;
o6 = t6;
o5 = t5;
o4 = t4;
o3 = t3;
o2 = t2;
o1 = t1;
}
// radix
for(x = 0; x < size; x++) {
t16 = array[x] & 0xff;
cpy[counts.c16[t16]] = array[x];
counts.c16[t16]++;
}
for(x = 0; x < size; x++) {
t15 = (cpy[x] >> 8) & 0xff;
array[counts.c15[t15]] = cpy[x];
counts.c15[t15]++;
}
for(x = 0; x < size; x++) {
t14 = (array[x] >> 16) & 0xff;
cpy[counts.c14[t14]] = array[x];
counts.c14[t14]++;
}
for(x = 0; x < size; x++) {
t13 = (cpy[x] >> 24) & 0xff;
array[counts.c13[t13]] = cpy[x];
counts.c13[t13]++;
}
for(x = 0; x < size; x++) {
t12 = (array[x] >> 32) & 0xff;
cpy[counts.c12[t12]] = array[x];
counts.c12[t12]++;
}
for(x = 0; x < size; x++) {
t11 = (cpy[x] >> 40) & 0xff;
array[counts.c11[t11]] = cpy[x];
counts.c11[t11]++;
}
for(x = 0; x < size; x++) {
t10 = (array[x] >> 48) & 0xff;
cpy[counts.c10[t10]] = array[x];
counts.c10[t10]++;
}
for(x = 0; x < size; x++) {
t9 = (cpy[x] >> 56) & 0xff;
array[counts.c9[t9]] = cpy[x];
counts.c9[t9]++;
}
for(x = 0; x < size; x++) {
t8 = (cpy[x] >> 64) & 0xff;
cpy[counts.c8[t8]] =array[x];
counts.c8[t8]++;
}
for(x = 0; x < size; x++) {
t7 = (cpy[x] >> 72) & 0xff;
array[counts.c7[t7]] = cpy[x];
counts.c7[t7]++;
}
for(x = 0; x < size; x++) {
t6 = (array[x] >> 80) & 0xff;
cpy[counts.c6[t6]] = array[x];
counts.c6[t6]++;
}
for(x = 0; x < size; x++) {
t5 = (cpy[x] >> 88) & 0xff;
array[counts.c5[t5]] = cpy[x];
counts.c5[t5]++;
}
for(x = 0; x < size; x++) {
t4 = (array[x] >> 96) & 0xff;
cpy[counts.c4[t4]] = array[x];
counts.c4[t4]++;
}
for(x = 0; x < size; x++) {
t3 = (cpy[x] >> 104) & 0xff;
array[counts.c3[t3]] = cpy[x];
counts.c3[t3]++;
}
for(x = 0; x < size; x++) {
t2 = (array[x] >> 112) & 0xff;
cpy[counts.c2[t2]] = array[x];
counts.c2[t2]++;
}
for(x = 0; x < size; x++) {
t1 = (cpy[x] >> 120) & 0xff;
array[counts.c1[t1]] = cpy[x];
counts.c1[t1]++;
}
free(cpy);
return array;
}