Problem with making radix sort __uint128_t array

Question

I am trying to convert this fast radix sort code to work with __uint128_t's.

typedef union {
    struct {
        uint32_t c8[256];
        uint32_t c7[256];
        uint32_t c6[256];
        uint32_t c5[256];
        uint32_t c4[256];
        uint32_t c3[256];
        uint32_t c2[256];
        uint32_t c1[256];
    };
    uint32_t counts[256 * 8];
} rscounts_t;

uint64_t * radixSort(uint64_t * array, uint32_t size) {
    rscounts_t counts;
    memset(&counts, 0, 256 * 8 * sizeof(uint32_t));
    uint64_t * cpy = (uint64_t *)malloc(size * sizeof(uint64_t));
    uint32_t o8=0, o7=0, o6=0, o5=0, o4=0, o3=0, o2=0, o1=0;
    uint32_t t8, t7, t6, t5, t4, t3, t2, t1;
    uint32_t x;
    // calculate counts
    for(x = 0; x < size; x++) {
        t8 = array[x] & 0xff;
        t7 = (array[x] >> 8) & 0xff;
        t6 = (array[x] >> 16) & 0xff;
        t5 = (array[x] >> 24) & 0xff;
        t4 = (array[x] >> 32) & 0xff;
        t3 = (array[x] >> 40) & 0xff;
        t2 = (array[x] >> 48) & 0xff;
        t1 = (array[x] >> 56) & 0xff;
        counts.c8[t8]++;
        counts.c7[t7]++;
        counts.c6[t6]++;
        counts.c5[t5]++;
        counts.c4[t4]++;
        counts.c3[t3]++;
        counts.c2[t2]++;
        counts.c1[t1]++;
    }
    // convert counts to offsets
    for(x = 0; x < 256; x++) {
        t8 = o8 + counts.c8[x];
        t7 = o7 + counts.c7[x];
        t6 = o6 + counts.c6[x];
        t5 = o5 + counts.c5[x];
        t4 = o4 + counts.c4[x];
        t3 = o3 + counts.c3[x];
        t2 = o2 + counts.c2[x];
        t1 = o1 + counts.c1[x];
        counts.c8[x] = o8;
        counts.c7[x] = o7;
        counts.c6[x] = o6;
        counts.c5[x] = o5;
        counts.c4[x] = o4;
        counts.c3[x] = o3;
        counts.c2[x] = o2;
        counts.c1[x] = o1;
        o8 = t8; 
        o7 = t7; 
        o6 = t6; 
        o5 = t5; 
        o4 = t4; 
        o3 = t3; 
        o2 = t2; 
        o1 = t1;
    }
    // radix
    for(x = 0; x < size; x++) {
        t8 = array[x] & 0xff;
        cpy[counts.c8[t8]] = array[x];
        counts.c8[t8]++;
    }
    for(x = 0; x < size; x++) {
        t7 = (cpy[x] >> 8) & 0xff;
        array[counts.c7[t7]] = cpy[x];
        counts.c7[t7]++;
    }
    for(x = 0; x < size; x++) {
        t6 = (array[x] >> 16) & 0xff;
        cpy[counts.c6[t6]] = array[x];
        counts.c6[t6]++;
    }
    for(x = 0; x < size; x++) {
        t5 = (cpy[x] >> 24) & 0xff;
        array[counts.c5[t5]] = cpy[x];
        counts.c5[t5]++;
    }
    for(x = 0; x < size; x++) {
        t4 = (array[x] >> 32) & 0xff;
        cpy[counts.c4[t4]] = array[x];
        counts.c4[t4]++;
    }
    for(x = 0; x < size; x++) {
        t3 = (cpy[x] >> 40) & 0xff;
        array[counts.c3[t3]] = cpy[x];
        counts.c3[t3]++;
    }
    for(x = 0; x < size; x++) {
        t2 = (array[x] >> 48) & 0xff;
        cpy[counts.c2[t2]] = array[x];
        counts.c2[t2]++;
    }
    for(x = 0; x < size; x++) {
        t1 = (cpy[x] >> 56) & 0xff;
        array[counts.c1[t1]] = cpy[x];
        counts.c1[t1]++;
    }
    free(cpy);
    return array;
}

I blindly followed the pattern and doubled the number of variables making t16...t1 and o16...o1 and counts.c16 to counts.c1 but it does not give the right answer. It sorts uints less that 2^64 but does not sort larger ints correctly. I was wondering if >> is the problem. Should I expect it to work for __uint128's in gcc? Can anyone make a version that does work with __int128_t's. I would like to sort the array as quickly as possible.

My non-working code:

typedef union {
    struct {
        uint32_t c16[256];
        uint32_t c15[256];
        uint32_t c14[256];
        uint32_t c13[256];
        uint32_t c12[256];
        uint32_t c11[256];
        uint32_t c10[256];
        uint32_t c9[256];
        uint32_t c8[256];
        uint32_t c7[256];
        uint32_t c6[256];
        uint32_t c5[256];
        uint32_t c4[256];
        uint32_t c3[256];
        uint32_t c2[256];
        uint32_t c1[256];
    };
    uint32_t counts[256 * 16];
} rscounts_t;

__uint128_t * radixSort(__uint128_t * array, uint32_t size) {
    rscounts_t counts;
    memset(&counts, 0, 256 * 16 * sizeof(uint32_t));
    __uint128_t * cpy = (__uint128_t *)malloc(size * sizeof(__uint128_t));
    uint32_t o16=0, o15=0, o14=0, o13=0, o12=0, o11=0, o10=0, o9=0, o8=0, o7=0, o6=0, o5=0, o4=0, o3=0, o2=0, o1=0;
    uint32_t t16, t15, t14, t13, t12, t11, t10, t9, t8, t7, t6, t5, t4, t3, t2, t1;
    uint32_t x;
    // calculate counts
    for(x = 0; x < size; x++) {
        t16 = array[x] & 0xff;
        t15 = (array[x] >> 8) & 0xff;
        t14 = (array[x] >> 16) & 0xff;
        t13 = (array[x] >> 24) & 0xff;
        t12 = (array[x] >> 32) & 0xff;
        t11 = (array[x] >> 40) & 0xff;
        t10 = (array[x] >> 48) & 0xff;
        t9 = (array[x] >> 56) & 0xff;
        t8 = (array[x] >> 64) & 0xff;
        t7 = (array[x] >> 72) & 0xff;
        t6 = (array[x] >> 80) & 0xff;
        t5 = (array[x] >> 88) & 0xff;
        t4 = (array[x] >> 96) & 0xff;
        t3 = (array[x] >> 104) & 0xff;
        t2 = (array[x] >> 112) & 0xff;
        t1 = (array[x] >> 120) & 0xff;
        counts.c16[t16]++;
        counts.c15[t15]++;
        counts.c14[t14]++;
        counts.c13[t13]++;
        counts.c12[t12]++;
        counts.c11[t11]++;
        counts.c10[t10]++;
        counts.c9[t9]++;
        counts.c8[t8]++;
        counts.c7[t7]++;
        counts.c6[t6]++;
        counts.c5[t5]++;
        counts.c4[t4]++;
        counts.c3[t3]++;
        counts.c2[t2]++;
        counts.c1[t1]++;
    }
    // convert counts to offsets
    for(x = 0; x < 256; x++) {
        t16 = o16 + counts.c16[x];
        t15 = o15 + counts.c15[x];
        t14 = o14 + counts.c14[x];
        t13 = o13 + counts.c13[x];
        t12 = o12 + counts.c12[x];
        t11 = o11 + counts.c11[x];
        t10 = o10 + counts.c10[x];
        t9 = o9 + counts.c9[x];
        t8 = o8 + counts.c8[x];
        t7 = o7 + counts.c7[x];
        t6 = o6 + counts.c6[x];
        t5 = o5 + counts.c5[x];
        t4 = o4 + counts.c4[x];
        t3 = o3 + counts.c3[x];
        t2 = o2 + counts.c2[x];
        t1 = o1 + counts.c1[x];
    counts.c16[x] = o16;
        counts.c15[x] = o15;
        counts.c14[x] = o14;
        counts.c13[x] = o13;
        counts.c12[x] = o12;
        counts.c11[x] = o11;
        counts.c10[x] = o10;
        counts.c9[x] = o9;
        counts.c8[x] = o8;
        counts.c7[x] = o7;
        counts.c6[x] = o6;
        counts.c5[x] = o5;
        counts.c4[x] = o4;
        counts.c3[x] = o3;
        counts.c2[x] = o2;
        counts.c1[x] = o1;
    o16 = t16; 
        o15 = t15; 
        o14 = t14; 
        o13 = t13; 
        o12 = t12; 
        o11 = t11; 
        o10 = t10; 
        o9 = t9;
        o8 = t8; 
        o7 = t7; 
        o6 = t6; 
        o5 = t5; 
        o4 = t4; 
        o3 = t3; 
        o2 = t2; 
        o1 = t1;
    }
    // radix
    
    for(x = 0; x < size; x++) {
        t16 = array[x] & 0xff;
        cpy[counts.c16[t16]] = array[x];
        counts.c16[t16]++;
    }
    for(x = 0; x < size; x++) {
        t15 = (cpy[x] >> 8) & 0xff;
        array[counts.c15[t15]] = cpy[x];
        counts.c15[t15]++;
    }
    for(x = 0; x < size; x++) {
        t14 = (array[x] >> 16) & 0xff;
        cpy[counts.c14[t14]] = array[x];
        counts.c14[t14]++;
    }
    for(x = 0; x < size; x++) {
        t13 = (cpy[x] >> 24) & 0xff;
        array[counts.c13[t13]] = cpy[x];
        counts.c13[t13]++;
    }
    for(x = 0; x < size; x++) {
        t12 = (array[x] >> 32) & 0xff;
        cpy[counts.c12[t12]] = array[x];
        counts.c12[t12]++;
    }
    for(x = 0; x < size; x++) {
        t11 = (cpy[x] >> 40) & 0xff;
        array[counts.c11[t11]] = cpy[x];
        counts.c11[t11]++;
    }
    for(x = 0; x < size; x++) {
        t10 = (array[x] >> 48) & 0xff;
        cpy[counts.c10[t10]] = array[x];
        counts.c10[t10]++;
    }
    for(x = 0; x < size; x++) {
        t9 = (cpy[x] >> 56) & 0xff;
        array[counts.c9[t9]] = cpy[x];
        counts.c9[t9]++;
    }
    for(x = 0; x < size; x++) {
        t8 = (cpy[x] >> 64) & 0xff;
       cpy[counts.c8[t8]] =array[x];
        counts.c8[t8]++;
    }
    for(x = 0; x < size; x++) {
        t7 = (cpy[x] >> 72) & 0xff;
        array[counts.c7[t7]] = cpy[x];
        counts.c7[t7]++;
    }
    for(x = 0; x < size; x++) {
        t6 = (array[x] >> 80) & 0xff;
        cpy[counts.c6[t6]] = array[x];
        counts.c6[t6]++;
    }
    for(x = 0; x < size; x++) {
        t5 = (cpy[x] >> 88) & 0xff;
        array[counts.c5[t5]] = cpy[x];
        counts.c5[t5]++;
    }
    for(x = 0; x < size; x++) {
        t4 = (array[x] >> 96) & 0xff;
        cpy[counts.c4[t4]] = array[x];
        counts.c4[t4]++;
    }
    for(x = 0; x < size; x++) {
        t3 = (cpy[x] >> 104) & 0xff;
        array[counts.c3[t3]] = cpy[x];
        counts.c3[t3]++;
    }
    for(x = 0; x < size; x++) {
        t2 = (array[x] >> 112) & 0xff;
        cpy[counts.c2[t2]] = array[x];
        counts.c2[t2]++;
    }
    for(x = 0; x < size; x++) {
        t1 = (cpy[x] >> 120) & 0xff;
        array[counts.c1[t1]] = cpy[x];
        counts.c1[t1]++;
    }

    free(cpy);
    return array;
}

"Can anyone make a version that does work with __int128_t's. " --> Yes someone could, but best if you post your attempt first. — chux - Reinstate Monica, Jul 23 '21 at 00:10
Shifts of `__uint128_t` are supposed to work, yes, and in some simple tests gcc appears to generate correct code for doing so. It's much more likely you have some bug in your code, but it's so long and repetitive that it will be hard to find by inspection. Find an input set that reproduces the problem, add it to the question, and fire up your debugger. — Nate Eldredge, Jul 23 '21 at 05:50
Btw, it seems like this code would be far better if written with arrays and loops, instead of having 16 of every variable and hoping you don't make any typos when cutting and pasting and changing constants from line to line. — Nate Eldredge, Jul 23 '21 at 05:51
@NateEldredge Arrays and loops sound like a good idea, assuming that doesn't negatively affect optimization. — Simd, Jul 23 '21 at 05:55
@Anush: There's only one way to find out - try it. At a guess I might expect it would improve optimization, since loops and arrays will make it easier for the compiler to use SIMD instructions if possible. — Nate Eldredge, Jul 23 '21 at 05:57
@NateEldredge Would you be able to post an answer along these lines (please)? — Simd, Jul 23 '21 at 06:00

Bubbler · Accepted Answer · 2021-07-23T06:45:23.577

The t8/c8 block you just edited actually had more bug.

    //...
    for(x = 0; x < size; x++) {
        t10 = (array[x] >> 48) & 0xff;
        cpy[counts.c10[t10]] = array[x];
        counts.c10[t10]++;
    }
    for(x = 0; x < size; x++) {
        t9 = (cpy[x] >> 56) & 0xff;
        array[counts.c9[t9]] = cpy[x];
        counts.c9[t9]++;
    }
    for(x = 0; x < size; x++) {
        t8 = (cpy[x] >> 64) & 0xff; //<== See?
       cpy[counts.c8[t8]] =array[x];
        counts.c8[t8]++;
    }
    for(x = 0; x < size; x++) {
        t7 = (cpy[x] >> 72) & 0xff;
        array[counts.c7[t7]] = cpy[x];
        counts.c7[t7]++;
    }
    for(x = 0; x < size; x++) {
        t6 = (array[x] >> 80) & 0xff;
        cpy[counts.c6[t6]] = array[x];
        counts.c6[t6]++;
    }
    //...

Each block alternates the roles between array and cpy, one being the array to read the values and the other to write them in sorted order in that radix. Your t8 block breaks the pattern. It should be changed as follows:

    for(x = 0; x < size; x++) {
        t8 = (array[x] >> 64) & 0xff;
        cpy[counts.c8[t8]] = array[x];
        counts.c8[t8]++;
    }

Nate Eldredge raised a very good point in this comment.

Btw, it seems like this code would be far better if written with arrays and loops, instead of having 16 of every variable and hoping you don't make any typos when cutting and pasting and changing constants from line to line.

So I quickly wrote a version using loops: (typedef, naming, and formatting are just my preference)

typedef __uint128_t u128;
typedef uint32_t u32;
u128* radix_sort(u128* array, u32 size) {
    u32 counts[16][256];
    memset(&counts, 0, 256 * 16 * sizeof(u32));
    u128* cpy = (u128*) malloc(size * sizeof(u128));
    u32 o[16] = {0};
    u32 t[16];
    u32 x, pos;
    u128* array_from, * array_to;
    for (x = 0; x < size; x++) {
        for (pos = 0; pos < 16; pos++) {
            t[pos] = (array[x] >> 8 * pos) & 0xff;
            counts[pos][t[pos]]++;
        }
    }
    for (x = 0; x < 256; x++) {
        for (pos = 0; pos < 16; pos++) {
            t[pos] = o[pos] + counts[pos][x];
            counts[pos][x] = o[pos];
            o[pos] = t[pos];
        }
    }
    for (pos = 0; pos < 16; pos++) {
        array_from = pos % 2 == 0 ? array : cpy;
        array_to = pos % 2 == 0 ? cpy : array;
        for (x = 0; x < size; x++) {
            t[pos] = (array_from[x] >> 8 * pos) & 0xff;
            array_to[counts[pos][t[pos]]] = array_from[x];
            counts[pos][t[pos]]++;
        }
    }
    free(cpy);
    return array;
}

Much shorter and easier to read :) (Not sure if it's better to write array_from/array_to like this or to swap after each iteration)

Godbolt shows that with optimizations turned on, GCC indeed fully unrolls inner loops of length 16 (and seemingly reorders the instructions as necessary).

This version gives yet another insight: every t[pos] variable is written once and used in the same inner loop, and the value is never referenced later (they're simply overwritten in the later loops). So we can optimize u32 t[16] and all t[pos]'s into simply t, giving this:

typedef __uint128_t u128;
typedef uint32_t u32;
u128* radix_sort(u128* array, u32 size) {
    u32 counts[16][256];
    memset(&counts, 0, 256 * 16 * sizeof(u32));
    u128* cpy = (u128*) malloc(size * sizeof(u128));
    u32 o[16] = {0};
    u32 t, x, pos;
    u128* array_from, * array_to;
    for (x = 0; x < size; x++) {
        for (pos = 0; pos < 16; pos++) {
            t = (array[x] >> 8 * pos) & 0xff;
            counts[pos][t]++;
        }
    }
    for (x = 0; x < 256; x++) {
        for (pos = 0; pos < 16; pos++) {
            t = o[pos] + counts[pos][x];
            counts[pos][x] = o[pos];
            o[pos] = t;
        }
    }
    for (pos = 0; pos < 16; pos++) {
        array_from = pos % 2 == 0 ? array : cpy;
        array_to = pos % 2 == 0 ? cpy : array;
        for (x = 0; x < size; x++) {
            t = (array_from[x] >> 8 * pos) & 0xff;
            array_to[counts[pos][t]] = array_from[x];
            counts[pos][t]++;
        }
    }
    free(cpy);
    return array;
}

Problem with making radix sort __uint128_t array

1 Answers1