3

Consider a 64 bit unsigned integer which contains exactly one 0000b valued nibble at a position dividable by four.

Is it there a sub-linear, i.e. better than O(16) algorithm which extracts the position of this null nibble? SIMD solutions are acceptable as well.

seloga7376
  • 33
  • 3

1 Answers1

2

One approach is to use a variant of Alan Mycroft's null-byte detection algorithm. Bytes containing zero are turned into 0x80, other bytes are turned into 0x00. This can trivially be modified to work on nibbles instead of bytes by adjusting the masks. Using the Posix function ffsll(), we can then find the first set bit and make the necessary adjustment to the bit index, as ffsll() uses one-based bit positions and Mycroft's algorithm marks the most significant bit in the zero nibble rather than the least significant one.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Adapted from Alan Mycroft's null-byte detection algorithm 
   newsgroup comp.lang.c, 1987/04/08,
   https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ
*/
int zero_nibble_position (uint64_t a)
{
    const uint64_t nibble_lsb = 0x1111111111111111ULL;
    const uint64_t nibble_msb = 0x8888888888888888ULL; 
    uint64_t t = (a - nibble_lsb) & (~a & nibble_msb);
    return (t) ? (ffsll (t) - 4) : -1;
}

int zero_nibble_position_ref (uint64_t a)
{
    if (!(a & (0xfULL <<  0))) return  0;
    if (!(a & (0xfULL <<  4))) return  4;
    if (!(a & (0xfULL <<  8))) return  8;
    if (!(a & (0xfULL << 12))) return 12;
    if (!(a & (0xfULL << 16))) return 16;
    if (!(a & (0xfULL << 20))) return 20;
    if (!(a & (0xfULL << 24))) return 24;
    if (!(a & (0xfULL << 28))) return 28;
    if (!(a & (0xfULL << 32))) return 32;
    if (!(a & (0xfULL << 36))) return 36;
    if (!(a & (0xfULL << 40))) return 40;
    if (!(a & (0xfULL << 44))) return 44;
    if (!(a & (0xfULL << 48))) return 48;
    if (!(a & (0xfULL << 52))) return 52;
    if (!(a & (0xfULL << 56))) return 56;
    if (!(a & (0xfULL << 60))) return 60;
    return -1;
}
/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <gmars...@gmail.com>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)

int main (void)
{
    for (int i = 0; i < 1000000000; i++) {
        uint64_t a  = KISS64;
        int res = zero_nibble_position (a);
        int ref = zero_nibble_position_ref (a);
        if (res != ref) {
            printf ("a=%016llx  res=%d  ref=%d\n", a, res, ref);
            return EXIT_FAILURE;
        }
    } 
    return EXIT_SUCCESS;
}

If your platform does not support the POSIX function ffsll(), you could use a compiler specific built-in instead, such as gcc's __builtin_ctz(), MSVC's _BitScanForward64() or the Intel compiler's _tzcnt_u64(), use inline assembly to access a machine instruction that delivers a count of trailing zeros, or roll your own, e.g. like so:

int clzll (uint64_t a)
{
    uint64_t r = 64;
    if (a >= 0x100000000ULL) { a >>= 32; r -= 32; }
    if (a >= 0x000010000ULL) { a >>= 16; r -= 16; }
    if (a >= 0x000000100ULL) { a >>=  8; r -=  8; }
    if (a >= 0x000000010ULL) { a >>=  4; r -=  4; }
    if (a >= 0x000000004ULL) { a >>=  2; r -=  2; }
    r -= a - (a & (a >> 1));
    return r;
}

int ffsll (uint64_t a)
{
    return 64 - clzll(a & -a);
}

Since we do not need a fully generalized ffsll() implementation here, one can also construct a faster variant based on a previous answer of mine, which uses an in-register lookup table:

/* return the position of a single set bit at (one-based) position n*4 */
int bit_pos (uint64_t a)
{
    const uint64_t magic_multiplier = 
        (( 0ULL << 60) | ( 1ULL << 56) | ( 2ULL << 52) | ( 3ULL << 48) |
         ( 4ULL << 44) | ( 5ULL << 40) | ( 6ULL << 36) | ( 7ULL << 32) |
         ( 8ULL << 28) | ( 9ULL << 24) | (10ULL << 20) | (11ULL << 16) |
         (12ULL << 12) | (13ULL <<  8) | (14ULL <<  4) | (15ULL <<  0));
    return (int)((((a >> 3) * magic_multiplier) >> 60) * 4 + 4);
}

/* special version for MSBs of nibbles only! */
int ffsll (uint64_t a)
{
#if NEVER_MORE_THAN_ONE_ZERO_NIBBLE
    /* find the position of the only bit set */
    return bit_pos (a);
#else // NEVER_MORE_THAN_ONE_ZERO_NIBBLE
    /* isolate least significant set bit and find its position */
    return bit_pos (a & -a);
#endif // NEVER_MORE_THAN_ONE_ZERO_NIBBLE
}
njuffa
  • 23,970
  • 4
  • 78
  • 130