1

I am trying to update for linux, GCC, and 64 bit use and preserve in a github Ken Silverman's Paint N Draw 3D C software. I got his permission but he's too busy to help. I don't want to do a bad job and I am not a bit-twiddling expert so I'd like to fix the main parts before I upload it.

In his code pnd3d.c he used a struct called bitmal_t * that contains a malloc (I think his element mal means the size of a malloc) and a size to indicate a voxel-distance as an unsigned int (in 2009 it was a 32 bit ) bit chain amongst the bits of a concatenated set of 32 bit ints. So basically, distance is a function of how many bits on (1) along the extended bit chain. For collisions, he looks up and down for zeros and ones.

Here is his bitmal_t:

    //buf: cast to: octv_t* or surf_t*
    //bit: 1 bit per sizeof(buf[0]); 0=free, 1=occupied
typedef struct bit { void *buf; unsigned int mal, *bit, ind, num, siz; } bitmal_t;

Here is his range finding code that goes up and down the bit-range looking for a one or a zero. I posted his originals, not my crappy nonworking version.

Here is all the code snippets you would need to reproduce it.

static __forceinline int dntil0 (unsigned int *lptr, int z, int zsiz)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z < zsiz) && (lptr[z>>5]&(1<<KMOD32(z)))) z++; return(z);
    int i;
        //WARNING: zsiz must be multiple of 32!
    i = (lptr[z>>5]|((1<<KMOD32(z))-1)); z &= ~31;
    while (i == 0xffffffff)
    {
        z += 32; if (z >= zsiz) return(zsiz);
        i = lptr[z>>5];
    }
    return(bsf(~i)+z);
}

static __forceinline int uptil0 (unsigned int *lptr, int z)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z > 0) && (lptr[(z-1)>>5]&(1<<KMOD32(z-1)))) z--; return(z);
    int i;
    if (!z) return(0); //Prevent possible crash
    i = (lptr[(z-1)>>5]|(-1<<KMOD32(z))); z &= ~31;
    while (i == 0xffffffff)
    {
        z -= 32; if (z < 0) return(0);
        i = lptr[z>>5];
    }
    return(bsr(~i)+z+1);
}

static __forceinline int dntil1 (unsigned int *lptr, int z, int zsiz)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z < zsiz) && (!(lptr[z>>5]&(1<<KMOD32(z))))) z++; return(z);
    int i;
        //WARNING: zsiz must be multiple of 32!
    i = (lptr[z>>5]&(-1<<KMOD32(z))); z &= ~31;
    while (!i)
    {
        z += 32; if (z >= zsiz) return(zsiz);
        i = lptr[z>>5];
    }
    return(bsf(i)+z);
}

static __forceinline int uptil1 (unsigned int *lptr, int z)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z > 0) && (!(lptr[(z-1)>>5]&(1<<KMOD32(z-1))))) z--; return(z);
    int i;
    if (!z) return(0); //Prevent possible crash
    i = (lptr[(z-1)>>5]&((1<<KMOD32(z))-1)); z &= ~31;
    while (!i)
    {
        z -= 32; if (z < 0) return(0);
        i = lptr[z>>5];
    }
    return(bsr(i)+z+1);
}

Here are his set range to ones and zeroes functions:

//Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 0's
#ifndef _WIN64

static __forceinline void setzrange0 (void *vptr, int z0, int z1)
{
    int z, ze, *iptr = (int *)vptr;
    if (!((z0^z1)&~31)) { iptr[z0>>5] &= ((~(-1<<z0))|(-1<<z1)); return; }
    z = (z0>>5); ze = (z1>>5);
    iptr[z] &=~(-1<<z0); for(z++;z<ze;z++) iptr[z] = 0;
    iptr[z] &= (-1<<z1);
}

    //Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 1's
static __forceinline void setzrange1 (void *vptr, int z0, int z1)
{
    int z, ze, *iptr = (int *)vptr;
    if (!((z0^z1)&~31)) { iptr[z0>>5] |= ((~(-1<<z1))&(-1<<z0)); return; }
    z = (z0>>5); ze = (z1>>5);
    iptr[z] |= (-1<<z0); for(z++;z<ze;z++) iptr[z] = -1;
    iptr[z] |=~(-1<<z1);
}

#else

static __forceinline void setzrange0 (void *vptr, __int64 z0, __int64 z1)
{
    unsigned __int64 z, ze, *iptr = (unsigned __int64 *)vptr;
    if (!((z0^z1)&~63)) { iptr[z0>>6] &= ((~(LL(-1)<<z0))|(LL(-1)<<z1)); return; }
    z = (z0>>6); ze = (z1>>6);
    iptr[z] &=~(LL(-1)<<z0); for(z++;z<ze;z++) iptr[z] = LL(0);
    iptr[z] &= (LL(-1)<<z1);
}

    //Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 1's
static __forceinline void setzrange1 (void *vptr, __int64 z0, __int64 z1)
{
    unsigned __int64 z, ze, *iptr = (unsigned __int64 *)vptr;
    if (!((z0^z1)&~63)) { iptr[z0>>6] |= ((~(LL(-1)<<z1))&(LL(-1)<<z0)); return; }
    z = (z0>>6); ze = (z1>>6);
    iptr[z] |= (LL(-1)<<z0); for(z++;z<ze;z++) iptr[z] = LL(-1);
    iptr[z] |=~(LL(-1)<<z1);
}

#endif
daemondave
  • 309
  • 2
  • 12
  • This is Ken Silverman's original code! Ken wrote the DOOM engine, so he's good just not a good team coder. – daemondave May 04 '19 at 16:06
  • Sorry it's a mess but I didn't make it. – daemondave May 04 '19 at 16:07
  • Imagine how I've been struggling... Ken does have some very object-oriented thinking in his C, there is some genius there, but it is a hidden thought process. – daemondave May 04 '19 at 16:09
  • @P__J__ "implementation defined"? No. **undefined**: `-1< – EOF May 04 '19 at 18:03
  • Truly efficient as it should be, the code was not supposed to be "enterprise-level supportable", that's what it seems. Soo... IMHO, your only option is reduced to the following: 1) taking "Hacker's delight" book in your hands 2) checking suspicious or erroneous lines one by one, comparing them on both x32 and x64 platforms with corresponding compilers. Good luck! =) – MasterAler May 04 '19 at 20:00
  • I detect sarcasm... ;) – daemondave May 04 '19 at 20:29
  • You might want to compile this and look at the resulting asm. And write some unit tests that pass on the original. For x86-64, you might want to use SSE2, because that's baseline for 64-bit. GCC (unlike MSVC) assumes no strict-aliasing violations, so the set bit range functions (that cast an incoming pointer to signed `int*` (!) or `uint64_t` depending on WIN64 or not) might need to be compiled with `-fno-strict-aliasing` to make pointer-casting well-defined. You should replace the loop part of those functions with `memset`, or a hand-written SSE intrinsics loop. – Peter Cordes May 21 '19 at 19:14

1 Answers1

2

Write some unit tests that pass on the original!

First of all, SSE2 is baseline for x86-64, so you should definitely be using that instead of just 64-bit integers.

GCC (unlike MSVC) assumes no strict-aliasing violations, so the set bit range functions (that cast an incoming pointer to signed int* (!!) or uint64_t* depending on WIN64 or not) might need to be compiled with -fno-strict-aliasing to make pointer-casting well-defined.

You could replace the loop part of the set/clear bit-range functions with memset (which gcc may inline), or a hand-written SSE intrinsics loop if you expect the size to usually be small (like under 200 bytes or so, not worth the overhead of calling libc memset)


I think those dntil0 functions in the first block are just bit-search loops for the first 0 or first 1 bit, forward or backward.

Rewrite them from scratch with SSE2 intrinsics: _mm_cmpeq_epi8 / _mm_movemask_epi8 to find the first byte that isn't all-0 or all-1 bits, then use bsf or bsr on that.

See the glibc source code for SSE2 memchr, or any simpler SSE2-optimized implementation, to find out how to do the byte-search part. Or glibc memmem for an example of comparing for equal, but that's easy: instead of looking for a non-zero _mm_movemask_epi8() (indicating there was a match), look for a result that's != 0xffff (all ones) to indicate that there was a mismatch. Use bsf or bsr on that bitmask to find the byte index into the SIMD vector.

So in total you'll use BSR or BSF twice in each function: one to find the byte index within the SIMD vector, and again to find the bit-index within the target byte.

For the bit-scan function, use GCC __builtin_clz or __builtin_ctz to find the first 1 bit. Bit twiddling: which bit is set?

To search for the first zero instead of the first one, bitwise invert, like __builtin_ctz( ~p[idx] ) where p is an unsigned char* into your search buffer (that you were using _mm_loadu_si128() on), and idx is an offset within that 16 byte window. (That you calculated with __builtin_ctz() on the movemask result that broke out of the vector loop.)


How the original worked:

z -= 32 is looping by 32 bits (the size of an int, because this was written assuming it would be compiled for x86 Windows or x86-64 Windows).

lptr[z>>5]; is converting the bit index to an int index. So it's simply looping over the buffer 1 int at a time.

When it finds a 4-byte element that's != 0xFFFFFFFF, it has found an int containing a bit that's not 1; i.e. it contains the bit we're looking for. So it uses bsf or bsr to bit-scan and find the position of that bit within this int.
It adds that to z (the bit-position of the start of this int).

This is exactly the same algorithm I described above, but implemented one integer at a time instead of 16 bytes at a time.

It should really be using uint32_t or unsigned int for bit-manipulations, not signed int, but it obviously works correctly on MSVC.

if (z >= zsiz) return(zsiz); This is the size check to break out of the loop if no bit is found.

Peter Cordes
  • 328,167
  • 45
  • 605
  • 847