How can one make this dynamic bit range code GCC compliant for 64 bit compilers?

Question

I am trying to update for linux, GCC, and 64 bit use and preserve in a github Ken Silverman's Paint N Draw 3D C software. I got his permission but he's too busy to help. I don't want to do a bad job and I am not a bit-twiddling expert so I'd like to fix the main parts before I upload it.

In his code pnd3d.c he used a struct called bitmal_t * that contains a malloc (I think his element mal means the size of a malloc) and a size to indicate a voxel-distance as an unsigned int (in 2009 it was a 32 bit ) bit chain amongst the bits of a concatenated set of 32 bit ints. So basically, distance is a function of how many bits on (1) along the extended bit chain. For collisions, he looks up and down for zeros and ones.

Here is his bitmal_t:

    //buf: cast to: octv_t* or surf_t*
    //bit: 1 bit per sizeof(buf[0]); 0=free, 1=occupied
typedef struct bit { void *buf; unsigned int mal, *bit, ind, num, siz; } bitmal_t;

Here is his range finding code that goes up and down the bit-range looking for a one or a zero. I posted his originals, not my crappy nonworking version.

Here is all the code snippets you would need to reproduce it.

static __forceinline int dntil0 (unsigned int *lptr, int z, int zsiz)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z < zsiz) && (lptr[z>>5]&(1<<KMOD32(z)))) z++; return(z);
    int i;
        //WARNING: zsiz must be multiple of 32!
    i = (lptr[z>>5]|((1<<KMOD32(z))-1)); z &= ~31;
    while (i == 0xffffffff)
    {
        z += 32; if (z >= zsiz) return(zsiz);
        i = lptr[z>>5];
    }
    return(bsf(~i)+z);
}

static __forceinline int uptil0 (unsigned int *lptr, int z)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z > 0) && (lptr[(z-1)>>5]&(1<<KMOD32(z-1)))) z--; return(z);
    int i;
    if (!z) return(0); //Prevent possible crash
    i = (lptr[(z-1)>>5]|(-1<<KMOD32(z))); z &= ~31;
    while (i == 0xffffffff)
    {
        z -= 32; if (z < 0) return(0);
        i = lptr[z>>5];
    }
    return(bsr(~i)+z+1);
}

static __forceinline int dntil1 (unsigned int *lptr, int z, int zsiz)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z < zsiz) && (!(lptr[z>>5]&(1<<KMOD32(z))))) z++; return(z);
    int i;
        //WARNING: zsiz must be multiple of 32!
    i = (lptr[z>>5]&(-1<<KMOD32(z))); z &= ~31;
    while (!i)
    {
        z += 32; if (z >= zsiz) return(zsiz);
        i = lptr[z>>5];
    }
    return(bsf(i)+z);
}

static __forceinline int uptil1 (unsigned int *lptr, int z)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z > 0) && (!(lptr[(z-1)>>5]&(1<<KMOD32(z-1))))) z--; return(z);
    int i;
    if (!z) return(0); //Prevent possible crash
    i = (lptr[(z-1)>>5]&((1<<KMOD32(z))-1)); z &= ~31;
    while (!i)
    {
        z -= 32; if (z < 0) return(0);
        i = lptr[z>>5];
    }
    return(bsr(i)+z+1);
}

Here are his set range to ones and zeroes functions:

//Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 0's
#ifndef _WIN64

static __forceinline void setzrange0 (void *vptr, int z0, int z1)
{
    int z, ze, *iptr = (int *)vptr;
    if (!((z0^z1)&~31)) { iptr[z0>>5] &= ((~(-1<<z0))|(-1<<z1)); return; }
    z = (z0>>5); ze = (z1>>5);
    iptr[z] &=~(-1<<z0); for(z++;z<ze;z++) iptr[z] = 0;
    iptr[z] &= (-1<<z1);
}

    //Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 1's
static __forceinline void setzrange1 (void *vptr, int z0, int z1)
{
    int z, ze, *iptr = (int *)vptr;
    if (!((z0^z1)&~31)) { iptr[z0>>5] |= ((~(-1<<z1))&(-1<<z0)); return; }
    z = (z0>>5); ze = (z1>>5);
    iptr[z] |= (-1<<z0); for(z++;z<ze;z++) iptr[z] = -1;
    iptr[z] |=~(-1<<z1);
}

#else

static __forceinline void setzrange0 (void *vptr, __int64 z0, __int64 z1)
{
    unsigned __int64 z, ze, *iptr = (unsigned __int64 *)vptr;
    if (!((z0^z1)&~63)) { iptr[z0>>6] &= ((~(LL(-1)<<z0))|(LL(-1)<<z1)); return; }
    z = (z0>>6); ze = (z1>>6);
    iptr[z] &=~(LL(-1)<<z0); for(z++;z<ze;z++) iptr[z] = LL(0);
    iptr[z] &= (LL(-1)<<z1);
}

    //Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 1's
static __forceinline void setzrange1 (void *vptr, __int64 z0, __int64 z1)
{
    unsigned __int64 z, ze, *iptr = (unsigned __int64 *)vptr;
    if (!((z0^z1)&~63)) { iptr[z0>>6] |= ((~(LL(-1)<<z1))&(LL(-1)<<z0)); return; }
    z = (z0>>6); ze = (z1>>6);
    iptr[z] |= (LL(-1)<<z0); for(z++;z<ze;z++) iptr[z] = LL(-1);
    iptr[z] |=~(LL(-1)<<z1);
}

#endif

This is Ken Silverman's original code! Ken wrote the DOOM engine, so he's good just not a good team coder. — daemondave, May 04 '19 at 16:06
Imagine how I've been struggling... Ken does have some very object-oriented thinking in his C, there is some genius there, but it is a hidden thought process. — daemondave, May 04 '19 at 16:09
Truly efficient as it should be, the code was not supposed to be "enterprise-level supportable", that's what it seems. Soo... IMHO, your only option is reduced to the following: 1) taking "Hacker's delight" book in your hands 2) checking suspicious or erroneous lines one by one, comparing them on both x32 and x64 platforms with corresponding compilers. Good luck! =) — MasterAler, May 04 '19 at 20:00
You might want to compile this and look at the resulting asm. And write some unit tests that pass on the original. For x86-64, you might want to use SSE2, because that's baseline for 64-bit. GCC (unlike MSVC) assumes no strict-aliasing violations, so the set bit range functions (that cast an incoming pointer to signed `int*` (!) or `uint64_t` depending on WIN64 or not) might need to be compiled with `-fno-strict-aliasing` to make pointer-casting well-defined. You should replace the loop part of those functions with `memset`, or a hand-written SSE intrinsics loop. — Peter Cordes, May 21 '19 at 19:14

Peter Cordes · Answer 1 · 2019-05-21T19:48:22.130

Write some unit tests that pass on the original!

First of all, SSE2 is baseline for x86-64, so you should definitely be using that instead of just 64-bit integers.

GCC (unlike MSVC) assumes no strict-aliasing violations, so the set bit range functions (that cast an incoming pointer to signed int* (!!) or uint64_t* depending on WIN64 or not) might need to be compiled with -fno-strict-aliasing to make pointer-casting well-defined.

You could replace the loop part of the set/clear bit-range functions with memset (which gcc may inline), or a hand-written SSE intrinsics loop if you expect the size to usually be small (like under 200 bytes or so, not worth the overhead of calling libc memset)

I think those dntil0 functions in the first block are just bit-search loops for the first 0 or first 1 bit, forward or backward.

Rewrite them from scratch with SSE2 intrinsics: _mm_cmpeq_epi8 / _mm_movemask_epi8 to find the first byte that isn't all-0 or all-1 bits, then use bsf or bsr on that.

See the glibc source code for SSE2 memchr, or any simpler SSE2-optimized implementation, to find out how to do the byte-search part. Or glibc memmem for an example of comparing for equal, but that's easy: instead of looking for a non-zero _mm_movemask_epi8() (indicating there was a match), look for a result that's != 0xffff (all ones) to indicate that there was a mismatch. Use bsf or bsr on that bitmask to find the byte index into the SIMD vector.

So in total you'll use BSR or BSF twice in each function: one to find the byte index within the SIMD vector, and again to find the bit-index within the target byte.

For the bit-scan function, use GCC __builtin_clz or __builtin_ctz to find the first 1 bit. Bit twiddling: which bit is set?

To search for the first zero instead of the first one, bitwise invert, like __builtin_ctz( ~p[idx] ) where p is an unsigned char* into your search buffer (that you were using _mm_loadu_si128() on), and idx is an offset within that 16 byte window. (That you calculated with __builtin_ctz() on the movemask result that broke out of the vector loop.)

How the original worked:

z -= 32 is looping by 32 bits (the size of an int, because this was written assuming it would be compiled for x86 Windows or x86-64 Windows).

lptr[z>>5]; is converting the bit index to an int index. So it's simply looping over the buffer 1 int at a time.

When it finds a 4-byte element that's != 0xFFFFFFFF, it has found an int containing a bit that's not 1; i.e. it contains the bit we're looking for. So it uses bsf or bsr to bit-scan and find the position of that bit within this int.
It adds that to z (the bit-position of the start of this int).

This is exactly the same algorithm I described above, but implemented one integer at a time instead of 16 bytes at a time.

It should really be using uint32_t or unsigned int for bit-manipulations, not signed int, but it obviously works correctly on MSVC.

if (z >= zsiz) return(zsiz); This is the size check to break out of the loop if no bit is found.

How can one make this dynamic bit range code GCC compliant for 64 bit compilers?

1 Answers1