
I'm tinkering around with AVX2 instructions and I'm looking for a fast way to count the number of leading zeros in a __m256i word (which has 256 bits).

So far, I have figured out the following way:

// Computes the number of leading zero bits.
// Here, avx_word is of type __m256i.

if (!_mm256_testz_si256(avx_word, avx_word)) {
  uint64_t word = _mm256_extract_epi64(avx_word, 0);
  if (word > 0)
    return (__builtin_clzll(word));

  word = _mm256_extract_epi64(avx_word, 1);
  if (word > 0)
    return (__builtin_clzll(word) + 64);

  word = _mm256_extract_epi64(avx_word, 2);
  if (word > 0)
    return (__builtin_clzll(word) + 128);

  word = _mm256_extract_epi64(avx_word, 3);
  return (__builtin_clzll(word) + 192);
} else
  return 256; // word is entirely zero

However, I find it rather clumsy to figure out the exact non-zero word within the 256-bit register.

Does anybody know if there is a more elegant (or faster) way to do this?

Just as additional information: I actually want to compute the index of the first set bit for arbitrarily long vectors created by logical ANDs, and I am comparing the performance of standard 64-bit operations with SSE and AVX2 code. Here is my entire test code:

#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include <sys/time.h>
#include <stdalign.h>

#define ALL  0xFFFFFFFF
#define NONE 0x0


#define BV_SHIFTBITS ((size_t)    6)
#define BV_MOD_WORD  ((size_t)   63)
#define BV_ONE       ((uint64_t)  1)
#define BV_ZERO      ((uint64_t)  0)
#define BV_WORDSIZE  ((uint64_t) 64)


uint64_t*
Vector_new(
    size_t num_bits) {

  assert ((num_bits % 256) == 0);
  size_t num_words = num_bits >> BV_SHIFTBITS;
  size_t mod = num_bits & BV_MOD_WORD;
  if (mod > 0)
    assert (0);
  uint64_t* words;
  posix_memalign((void**) &(words), 32, sizeof(uint64_t) * num_words);
  for (size_t i = 0; i < num_words; ++i)
    words[i] = 0;
  return words;
}


void
Vector_set(
    uint64_t* vector,
    size_t pos) {

  const size_t word_index = pos >> BV_SHIFTBITS;
  const size_t offset     = pos & BV_MOD_WORD;
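  // Bit positions are stored MSB-first within each 64-bit word: bit index 0
  // maps to the most significant bit of words[0].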
  vector[word_index] |= (BV_ONE << (BV_MOD_WORD - offset));
}


size_t
Vector_and_first_bit(
    uint64_t** vectors,
    const size_t num_vectors,
    const size_t num_words) {

  for (size_t i = 0; i < num_words; ++i) {
    uint64_t word = vectors[0][i];
    for (size_t j = 1; j < num_vectors; ++j)
      word &= vectors[j][i];
    if (word > 0)
      return (1 + i * BV_WORDSIZE + __builtin_clzll(word));
  }
  return 0;
}


size_t
Vector_and_first_bit_256(
    uint64_t** vectors,
    const size_t num_vectors,
    const size_t num_avx_words) {

  for (size_t i = 0; i < num_avx_words; ++i) {
    const size_t addr_offset = i << 2;
    __m256i avx_word = _mm256_load_si256(
        (__m256i const*) (vectors[0] + addr_offset));

    // AND the AVX words
    for (size_t j = 1; j < num_vectors; ++j) {
      avx_word = _mm256_and_si256(
        avx_word,
        _mm256_load_si256((__m256i const*) (vectors[j] + addr_offset))
      );
    }

    // test whether resulting AVX word is not zero
    if (!_mm256_testz_si256(avx_word, avx_word)) {
      uint64_t word = _mm256_extract_epi64(avx_word, 0);
      const size_t shift = i << 8;
      if (word > 0)
        return (1 + shift + __builtin_clzll(word));

      word = _mm256_extract_epi64(avx_word, 1);
      if (word > 0)
        return (1 + shift + __builtin_clzll(word) + 64);

      word = _mm256_extract_epi64(avx_word, 2);
      if (word > 0)
        return (1 + shift + __builtin_clzll(word) + 128);

      word = _mm256_extract_epi64(avx_word, 3);
      return (1 + shift + __builtin_clzll(word) + 192);
    }
  }
  return 0;
}


size_t
Vector_and_first_bit_128(
    uint64_t** vectors,
    const size_t num_vectors,
    const size_t num_avx_words) {

  for (size_t i = 0; i < num_avx_words; ++i) {
    const size_t addr_offset = i << 1;
    __m128i avx_word = _mm_load_si128(
        (__m128i const*) (vectors[0] + addr_offset));

    // AND the AVX words
    for (size_t j = 1; j < num_vectors; ++j) {
      avx_word = _mm_and_si128(
        avx_word,
        _mm_load_si128((__m128i const*) (vectors[j] + addr_offset))
      );
    }

    // test whether resulting AVX word is not zero
    if (!_mm_test_all_zeros(avx_word, avx_word)) {
      uint64_t word = _mm_extract_epi64(avx_word, 0);
      if (word > 0)
        return (1 + (i << 7) + __builtin_clzll(word));

      word = _mm_extract_epi64(avx_word, 1);
      return (1 + (i << 7) + __builtin_clzll(word) + 64);
    }
  }
  return 0;
}


uint64_t*
make_random_vector(
    const size_t num_bits,
    const size_t probability) {

  uint64_t* vector = Vector_new(num_bits);
  for (size_t i = 0; i < num_bits; ++i) {
    const int x = rand() % 10;
    if (x >= (int) probability)
      Vector_set(vector, i);
  }
  return vector;
}


size_t
millis(
    const struct timeval* end,
    const struct timeval* start) {

  struct timeval e = *end;
  struct timeval s = *start;
  return (1000 * (e.tv_sec - s.tv_sec) + (e.tv_usec - s.tv_usec) / 1000);
}


int
main(
    int argc,
    char** argv) {

  if (argc != 6) {
    printf("Usage: %s num_vectors size num_iterations num_dimensions probability\n", argv[0]);
    return 1;
  }

  srand(time(NULL));

  const size_t num_vectors = atoi(argv[1]);
  const size_t size = atoi(argv[2]);
  const size_t num_iterations = atoi(argv[3]);
  const size_t num_dimensions = atoi(argv[4]);
  const size_t probability = atoi(argv[5]);
  const size_t num_words = size / 64;
  const size_t num_sse_words = num_words / 2;
  const size_t num_avx_words = num_words / 4;

  assert(num_vectors > 0);
  assert(size > 0);
  assert(num_iterations > 0);
  assert(num_dimensions > 0);

  struct timeval t1;
  gettimeofday(&t1, NULL);

  uint64_t*** vectors = (uint64_t***) malloc(sizeof(uint64_t**) * num_vectors);
  for (size_t j = 0; j < num_vectors; ++j) {
    vectors[j] = (uint64_t**) malloc(sizeof(uint64_t*) * num_dimensions);
    for (size_t i = 0; i < num_dimensions; ++i)
      vectors[j][i] = make_random_vector(size, probability);
  }

  struct timeval t2;
  gettimeofday(&t2, NULL);
  printf("Creation: %zu ms\n", millis(&t2, &t1));



  size_t* results_64    = (size_t*) malloc(sizeof(size_t) * num_vectors);
  size_t* results_128   = (size_t*) malloc(sizeof(size_t) * num_vectors);
  size_t* results_256   = (size_t*) malloc(sizeof(size_t) * num_vectors);


  gettimeofday(&t1, NULL);
  for (size_t j = 0; j < num_iterations; ++j)
    for (size_t i = 0; i < num_vectors; ++i)
      results_64[i] = Vector_and_first_bit(vectors[i], num_dimensions,
          num_words);
  gettimeofday(&t2, NULL);
  const size_t millis_64 = millis(&t2, &t1);
  printf("64            : %zu ms\n", millis_64);


  gettimeofday(&t1, NULL);
  for (size_t j = 0; j < num_iterations; ++j)
    for (size_t i = 0; i < num_vectors; ++i)
      results_128[i] = Vector_and_first_bit_128(vectors[i],
          num_dimensions, num_sse_words);
  gettimeofday(&t2, NULL);
  const size_t millis_128 = millis(&t2, &t1);
  const double factor_128 = (double) millis_64 / (double) millis_128;
  printf("128           : %zu ms (factor: %.2f)\n", millis_128, factor_128);

  gettimeofday(&t1, NULL);
  for (size_t j = 0; j < num_iterations; ++j)
    for (size_t i = 0; i < num_vectors; ++i)
      results_256[i] = Vector_and_first_bit_256(vectors[i],
          num_dimensions, num_avx_words);
  gettimeofday(&t2, NULL);
  const size_t millis_256 = millis(&t2, &t1);
  const double factor_256 = (double) millis_64 / (double) millis_256;
  printf("256           : %zu ms (factor: %.2f)\n", millis_256, factor_256);


  for (size_t i = 0; i < num_vectors; ++i) {
    if (results_64[i] != results_256[i])
      printf("ERROR: %zu (64) != %zu (256) with i = %zu\n", results_64[i],
          results_256[i], i);
    if (results_64[i] != results_128[i])
      printf("ERROR: %zu (64) != %zu (128) with i = %zu\n", results_64[i],
          results_128[i], i);
  }


  free(results_64);
  free(results_128);
  free(results_256);

  for (size_t j = 0; j < num_vectors; ++j) {
    for (size_t i = 0; i < num_dimensions; ++i)
      free(vectors[j][i]);
    free(vectors[j]);
  }
  free(vectors);
  return 0;
}

To compile:

gcc -o main main.c -O3 -Wall -Wextra -pedantic-errors -Werror -march=native -std=c99 -fno-tree-vectorize

To execute:

./main 1000 8192 50000 5 9

The parameters mean: 1000 test cases, vectors of length 8192 bits, 50000 test repetitions (the last two parameters are minor tweaks).

Sample output for the above call on my machine:

Creation: 363 ms
64            : 15000 ms
128           : 10070 ms (factor: 1.49)
256           : 6784 ms (factor: 2.21)
Sven Hager
  • Is throughput or latency most important here? Are you doing this for just one vector, or is it useful to lzcount multiple vectors in parallel? (No specific idea in mind, but either branchy or branchless could be better. And for throughput, store / reload can be better than `vpextrq`, especially because extracting from the high 128 is impossible with one instruction; `vpextrq` only works on `xmm` registers, so `_mm256_extract_epi64(vec, 3)` would need a `vextracti128 xmm0, ymm1, 1` first.) – Peter Cordes Mar 11 '18 at 01:37

4 Answers


If your input values are uniformly distributed, almost all of the time the highest set bit will be in the top 64 bits of the vector (the chance that they're all zero is only 1 in 2^64). A branch on this condition will predict very well. @Nejc's answer is good for that case.

But many problems where lzcnt is part of the solution have a uniformly distributed output (or similar), so a branchless version has an advantage. Not strictly uniform, but anything where it's common for the highest set bit to be somewhere other than the highest 64 bits.


Wim's idea of lzcnt on a compare bitmap to find the right element is a very good approach.

However, runtime-variable indexing of the vector with a store/reload is probably better than a shuffle. Store-forwarding latency is low (maybe 5 to 7 cycles on Skylake), and that latency is in parallel with the index generation (compare / movemask / lzcnt). The movd/vpermd/movd lane-crossing shuffle strategy takes 5 cycles after the index is known, to get the right element into an integer register. (See http://agner.org/optimize/)

I think this version should have better latency on Haswell/Skylake (and Ryzen), and also better throughput. (vpermd is quite slow on Ryzen, so the store/reload version should be very good there.) The address calculation for the load should have similar latency to the store-forwarding, so it's a toss-up which one is actually on the critical path.

Aligning the stack by 32 to avoid cache-line splits on a 32-byte store takes extra instructions, so this is best if it can inline into a function that uses it multiple times, or already needs that much alignment for some other __m256i.

#include <stdint.h>
#include <immintrin.h>

#ifndef _MSC_VER
#include <stdalign.h>  //MSVC is missing this?
#else
#include <intrin.h>
#pragma intrinsic(_BitScanReverse)  // https://msdn.microsoft.com/en-us/library/fbxyd7zd.aspx suggests this
#endif

// undefined result for mask=0, like BSR
uint32_t bsr_nonzero(uint32_t mask)
{
// on Intel, bsr has a minor advantage for the first step
// for AMD, BSR is slow so you should use 31-LZCNT.

   //return 31 - _lzcnt_u32(mask);
 // Intel's docs say there should be a _bit_scan_reverse(x), maybe try that with ICC

   #ifdef _MSC_VER
     unsigned long tmp;
     _BitScanReverse(&tmp, mask);
     return tmp;
   #else
     return 31 - __builtin_clz(mask);
   #endif
}

And the interesting part:

int mm256_lzcnt_si256(__m256i vec)
{
    __m256i   nonzero_elem = _mm256_cmpeq_epi8(vec, _mm256_setzero_si256());
    unsigned  mask = ~_mm256_movemask_epi8(nonzero_elem);

    if (mask == 0)
        return 256;  // if this is rare, branching is probably good.

    alignas(32)  // gcc chooses to align elems anyway, with its clunky code
    uint8_t elems[32];
    _mm256_storeu_si256((__m256i*)elems, vec);

//    unsigned   lz_msk   = _lzcnt_u32(mask);
//    unsigned   idx = 31 - lz_msk;          // can use bsr to get the 31-x, because mask is known to be non-zero.
//  This takes the 31-x latency off the critical path, in parallel with final lzcnt
    unsigned   idx = bsr_nonzero(mask);
    unsigned   lz_msk = 31 - idx;
    unsigned   highest_nonzero_byte = elems[idx];
    return     lz_msk * 8 + _lzcnt_u32(highest_nonzero_byte) - 24;
               // lzcnt(byte)-24, because we don't want to count the leading 24 bits of padding.
}    

On Godbolt with gcc7.3 -O3 -march=haswell, we get asm like this to count ymm1 into esi.

        vpxor   xmm0, xmm0, xmm0
        mov     esi, 256
        vpcmpeqd        ymm0, ymm1, ymm0
        vpmovmskb       eax, ymm0
        xor     eax, -1                      # ~mask and set flags, unlike NOT
        je      .L35
        bsr     eax, eax
        vmovdqa YMMWORD PTR [rbp-48], ymm1   # note no dependency on anything earlier; OoO exec can run it early
        mov     ecx, 31
        mov     edx, eax                     # this is redundant, gcc should just use rax later.  But it's zero-latency on HSW/SKL and Ryzen.
        sub     ecx, eax
        movzx   edx, BYTE PTR [rbp-48+rdx]   # has to wait for the index in edx
        lzcnt   edx, edx
        lea     esi, [rdx-24+rcx*8]          # lzcnt(byte) + lzcnt(vectormask) * 8
.L35:

For finding the highest non-zero element (the 31 - lzcnt(~movemask)), we use bsr to directly get the bit (and thus byte) index, and take a subtract off the critical path. This is safe as long as we branch on the mask being zero. (A branchless version would need to initialize the register to avoid an out-of-bounds index).

On AMD CPUs, bsr is significantly slower than lzcnt. On Intel CPUs, they're the same performance, except for minor variations in output-dependency details.

bsr with an input of zero leaves the destination register unmodified, but GCC doesn't provide a way to take advantage of that. (Intel only documents it as undefined output, but AMD documents the actual behaviour of Intel / AMD CPUs as producing the old value in the destination register).

bsr sets ZF if the input was zero, rather than based on the output like most instructions. (This and the output dependency may be why it's slow on AMD.) Branching on the BSR flags is not particularly better than branching on ZF as set by xor eax,-1 to invert the mask, which is what gcc does. Anyway, Intel does document a _BitScanReverse(&idx, mask) intrinsic that returns a bool, but gcc doesn't support it (not even with x86intrin.h). The GNU C builtin doesn't return a boolean to let you use the flag result, but maybe gcc would make smart asm using the flag output of bsr if you check for the input C variable being non-zero.


Using a dword (uint32_t) array and vmovmskps would let the 2nd lzcnt use a memory source operand instead of needing a movzx to zero-extend a single byte. But lzcnt has a false dependency on Intel CPUs before Skylake, so compilers might tend to load separately and use lzcnt same,same as a workaround anyway. (I didn't check.)
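
For reference, here is a minimal sketch of that dword variant (mine, not part of the original answer; it uses __builtin_clz in place of the bsr_nonzero helper above, so assume GCC or clang):

#include <stdint.h>
#include <stdalign.h>
#include <immintrin.h>

// Sketch: compare 32-bit elements, vmovmskps to an 8-bit mask, lzcnt the selected dword.
int mm256_lzcnt_si256_dword(__m256i vec)
{
    __m256i  zero_elem = _mm256_cmpeq_epi32(vec, _mm256_setzero_si256());
    unsigned mask = ~(unsigned)_mm256_movemask_ps(_mm256_castsi256_ps(zero_elem)) & 0xFFu;

    if (mask == 0)
        return 256;                        // whole vector is zero

    alignas(32) uint32_t elems[8];
    _mm256_storeu_si256((__m256i*)elems, vec);

    unsigned idx    = 31 - __builtin_clz(mask);  // highest non-zero dword (0..7)
    unsigned lz_msk = 7 - idx;                   // number of all-zero dwords above it
    return lz_msk * 32 + _lzcnt_u32(elems[idx]); // elems[idx] is non-zero; the load can fold into lzcnt
}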

Wim's version needs lz_msk-24 because the high 24 bits are always zero with an 8-bit mask. But a 32-bit mask fills a 32-bit register.

This version with 8 bit elements and a 32-bit mask is the reverse: we need to lzcnt the selected byte, not including the 24 leading zero bits in the register. So our -24 moves to a different spot, not part of the critical path for indexing the array.

gcc chooses to do it as part of a single 3-component LEA (reg + reg*scale - const), which is great for throughput, but puts it on the critical path after the final lzcnt. (It's not free because 3-component LEA has extra latency vs. reg + reg*scale on Intel CPUs. See Agner Fog's instruction tables).

A multiply by 8 can be done as part of an lea, but a multiply by 32 would need a shift (or be folded into two separate LEAs).


Intel's optimization manual says (Table 2-24) even Sandybridge can forward from a 256-bit store to single-byte loads without a problem, so I think it's fine on AVX2 CPUs, the same as forwarding to 32-bit loads of 4-byte-aligned chunks of the store.

Peter Cordes
  • Hi, thanks for the in-depth explanation. The movemask approach looks quite elegant. However, for some reason I could not figure out yet, the function "mm256_lzcnt_si256" you proposed does not compute the same indices as the code I listed. – Sven Hager Mar 11 '18 at 12:34
  • @SvenHager: Thanks, I spotted (and fixed) a bug: lzcnt_u32 of a zero-extended byte needs a `-24`. Was that the only bug? I didn't test it myself, since I was sure the main logic was correct, and was mostly thinking about performance issues. Correctness is just a matter of debugging and minor changes :P – Peter Cordes Mar 11 '18 at 12:54
  • Hmm, still an issue there ... However, it must be a minor thing, as the indices computed by your code are close to those computed by mine. I will post my test program above. – Sven Hager Mar 11 '18 at 13:12
  • I posted my eval code in the question. Sorry that it's a bit lengthy, as it also contains the corresponding functions for 64 and 128 bits. – Sven Hager Mar 11 '18 at 13:40
  • @SvenHager: I don't have time to look at it right now; I'll get back to this later today and debug my answer, if you or someone else doesn't spot the remaining bug before then. – Peter Cordes Mar 11 '18 at 13:45
  • Thank you very much, I really appreciate your support. =) – Sven Hager Mar 11 '18 at 13:49
  • @SvenHager I think the bug resides in your code. I spotted yesterday that you extract 64-bit integers in the wrong order - according to lzcnt "offsets" that you use (0, 64, 128, 192) the order should be (3,2,1,0) and not the opposite. – Nejc Mar 11 '18 at 16:12
  • When I correct the extraction order in @SvenHager's method, both of my functions and also the function by PeterCordes return the same results. Btw, if you are interested, I added some benchmark data to my response, check it out. And thanks for all the advice. – Nejc Mar 11 '18 at 18:46

(Update: new answer since 2019-01-31)

Three alternatives are:

  • Peter Cordes' excellent answer. Fast. This solution is not branchless, which should not be a problem, unless the input is frequently zero with an irregular pattern of occurrences.

  • My previous answer which is in the edit history of this answer now. Less efficient than Peter Cordes' answer, but branchless.

  • This answer. Very fast if the data from the 2 tiny lookup tables is in L1 cache. The L1 cache footprint is 128 bytes. Branchless. It may suffer from cache misses when it is not called often.

In this answer, the input epi64 vector is compared with zero, which produces a mask. This mask is converted to a 4-bit index i_mask (by _mm256_movemask_pd). With index i_mask, two values are read from the two lookup tables: 1. the index of the first (from the left) nonzero 64-bit element, and 2. the number of leading zero bits contributed by the preceding all-zero elements. Finally, the _lzcnt_u64 of the first nonzero 64-bit element is computed and added to the lookup table value. The function mm256_lzcnt_si256 implements this method:

#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>
#include <stdalign.h>
/* gcc -Wall -m64 -O3 -march=haswell clz_avx256_upd.c */


int mm256_lzcnt_si256(__m256i input)
{   
    /* Version with lookup tables and scratch array included in the function                                                                  */

    /* Two tiny lookup tables (64 bytes each, less space is possible with uint8_t or uint16_t arrays instead of uint32_t):                       */
    /* i_mask  (input==0)                 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111                        */
    /* ~i_mask (input!=0)                 1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000                        */
    static const uint32_t indx[16]   = {   3,   3,   3,   3,   3,   3,   3,   3,   2,   2,   2,   2,   1,   1,   0,   0};
    static const uint32_t lz_msk[16] = {   0,   0,   0,   0,   0,   0,   0,   0,  64,  64,  64,  64, 128, 128, 192, 192};

    alignas(32)  uint64_t tmp[4]     = {   0,   0,   0,   0};                /* tmp is a scratch array of 32 bytes, preferably 32 byte aligned   */ 

                          _mm256_storeu_si256((__m256i*)&tmp[0], input);     /* Store input in the scratch array                                 */
    __m256i  mask       = _mm256_cmpeq_epi64(input, _mm256_setzero_si256()); /* Check which 64 bits elements are zero                            */
    uint32_t i_mask     = _mm256_movemask_pd(_mm256_castsi256_pd(mask));     /* Move vector mask to integer mask                                 */
    uint64_t input_i    = tmp[indx[i_mask]];                                 /* Load the first (from the left) non-zero 64 bit element input_i   */
    int32_t  lz_input_i = _lzcnt_u64(input_i);                               /* Count the number of leading zeros in input_i                     */
    int32_t  lz         = lz_msk[i_mask] + lz_input_i;                       /* Add the number of leading zeros of the preceding 64 bit elements */
             return lz;
}    


int mm256_lzcnt_si256_v2(__m256i input, uint64_t* restrict tmp, const uint32_t* indx, const uint32_t* lz_msk)
{   
    /* Version that compiles to nice assembly, although, after inlining there won't be any difference between the different versions.            */
                          _mm256_storeu_si256((__m256i*)&tmp[0], input);     /* Store input in the scratch array                                 */
    __m256i  mask       = _mm256_cmpeq_epi64(input, _mm256_setzero_si256()); /* Check which 64 bits elements are zero                            */
    uint32_t i_mask     = _mm256_movemask_pd(_mm256_castsi256_pd(mask));     /* Move vector mask to integer mask                                 */
    uint64_t input_i    = tmp[indx[i_mask]];                                 /* Load the first (from the left) non-zero 64 bit element input_i   */
    int32_t  lz_input_i = _lzcnt_u64(input_i);                               /* Count the number of leading zeros in input_i                     */
    int32_t  lz         = lz_msk[i_mask] + lz_input_i;                       /* Add the number of leading zeros of the preceding 64 bit elements */
             return lz;
}    


__m256i bit_mask_avx2_lsb(unsigned int n)               
{           
    __m256i ones       = _mm256_set1_epi32(-1);
    __m256i cnst32_256 = _mm256_set_epi32(256,224,192,160, 128,96,64,32);
    __m256i shift      = _mm256_set1_epi32(n);   
            shift      = _mm256_subs_epu16(cnst32_256,shift);  
                  return _mm256_srlv_epi32(ones,shift);
}


int print_avx2_hex(__m256i ymm)
{
    long unsigned int x[4];
        _mm256_storeu_si256((__m256i*)x,ymm);
        printf("%016lX %016lX %016lX %016lX  ", x[3],x[2],x[1],x[0]);
    return 0;
}


int main()
{
    unsigned int i;
    __m256i x;

    printf("mm256_lzcnt_si256\n");
    for (i = 0; i < 257; i++){
        printf("x=");
        x = bit_mask_avx2_lsb(i);
        print_avx2_hex(x);
        printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
    }
    printf("\n");

    x = _mm256_set_epi32(0,0,0,0, 0,15,1,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
    x = _mm256_set_epi32(0,0,0,8, 0,0,0,256);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
    x = _mm256_set_epi32(0,0x100,0,8, 0,192,0,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
    x = _mm256_set_epi32(-1,0x100,0,8, 0,0,32,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));

   /* Set arrays for mm256_lzcnt_si256_v2:                          */
    alignas(32) static const uint32_t indx[16]   = {   3,   3,   3,   3,   3,   3,   3,   3,   2,   2,   2,   2,   1,   1,   0,   0};
    alignas(32) static const uint32_t lz_msk[16] = {   0,   0,   0,   0,   0,   0,   0,   0,  64,  64,  64,  64, 128, 128, 192, 192};
    alignas(32)              uint64_t tmp[4]     = {   0,   0,   0,   0};
    printf("\nmm256_lzcnt_si256_v2\n");
    for (i = 0; i < 257; i++){
        printf("x=");
        x = bit_mask_avx2_lsb(i);
        print_avx2_hex(x);
        printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
    }
    printf("\n");

    x = _mm256_set_epi32(0,0,0,0, 0,15,1,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
    x = _mm256_set_epi32(0,0,0,8, 0,0,0,256);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
    x = _mm256_set_epi32(0,0x100,0,8, 0,192,0,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
    x = _mm256_set_epi32(-1,0x100,0,8, 0,0,32,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));

    return 0;
}

The output suggests that the code is correct:

$ ./a.out
mm256_lzcnt_si256
x=0000000000000000 0000000000000000 0000000000000000 0000000000000000  lzcnt(x)=256 
x=0000000000000000 0000000000000000 0000000000000000 0000000000000001  lzcnt(x)=255 
...
x=0000000000000000 0000000000000000 7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=129 
x=0000000000000000 0000000000000000 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=128 
x=0000000000000000 0000000000000001 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=127 
...
x=7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=1 
x=FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=0 

x=0000000000000000 0000000000000000 000000000000000F 0000000100000000  lzcnt(x)=188 
x=0000000000000000 0000000000000008 0000000000000000 0000000000000100  lzcnt(x)=124 
x=0000000000000100 0000000000000008 00000000000000C0 0000000000000000  lzcnt(x)=55 
x=FFFFFFFF00000100 0000000000000008 0000000000000000 0000002000000000  lzcnt(x)=0 

Function mm256_lzcnt_si256_v2 is an alternative version of the same function, but now the pointers to the lookup tables and the scratch array are passed with the function call. This leads to clean assembly code (no stack operations), and gives an impression of which instructions are needed after inlining mm256_lzcnt_si256 in a loop.

With gcc 8.2 and options -m64 -O3 -march=skylake:

mm256_lzcnt_si256_v2:
        vpxor   xmm1, xmm1, xmm1
        vmovdqu YMMWORD PTR [rdi], ymm0
        vpcmpeqq        ymm0, ymm0, ymm1
        vmovmskpd       ecx, ymm0
        mov     eax, DWORD PTR [rsi+rcx*4]
        lzcnt   rax, QWORD PTR [rdi+rax*8]
        add     eax, DWORD PTR [rdx+rcx*4]
        vzeroupper
        ret

In a loop context, and with inlining, vpxor is likely hoisted outside the loop.
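
As a hedged illustration (my sketch, not part of this answer) of the intended usage: the caller hoists the lookup tables and the scratch buffer out of the loop and passes them in, so that after inlining only the store, vpcmpeqq, vmovmskpd, the two table loads, the lzcnt and the add remain per iteration:

#include <stddef.h>
#include <stdint.h>
#include <stdalign.h>
#include <x86intrin.h>

/* mm256_lzcnt_si256_v2 as defined above */
int mm256_lzcnt_si256_v2(__m256i input, uint64_t* restrict tmp,
                         const uint32_t* indx, const uint32_t* lz_msk);

/* Hypothetical caller: compute the lzcnt of every __m256i in an array. */
void lzcnt_many(const __m256i* vecs, int* out, size_t n)
{
    static const uint32_t indx[16]   = {3,3,3,3, 3,3,3,3, 2,2,2,2, 1,1,0,0};
    static const uint32_t lz_msk[16] = {0,0,0,0, 0,0,0,0, 64,64,64,64, 128,128,192,192};
    alignas(32) uint64_t tmp[4];              /* scratch, overwritten on every call */

    for (size_t i = 0; i < n; ++i)
        out[i] = mm256_lzcnt_si256_v2(vecs[i], tmp, indx, lz_msk);
}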

wim
  • I think this is one of those times when store/reload beats ALU, because store-forwarding latency can run in parallel with index-calculation (vpcmpeq / movemask / bsr latency). See my answer. – Peter Cordes Mar 11 '18 at 06:00
  • That is a good point. The `ymm` store can start even before the `vcmpeq`. I have updated the answer to redirect the reader to your answer. – wim Mar 11 '18 at 09:05
  • Interesting idea to use a LUT instead of `x - bsr(movemask)`. It might even be lower latency. Looking back at this, I feel like `lzcnt` on the mask should have avoided the `31-` with BSR, and maybe made it branchless in my version. Memory-source 64-bit `lzcnt` is nice (but unlaminates an indexed addr mode on Skylake, where the false dependency is removed :/) – Peter Cordes Jan 31 '19 at 05:08
  • The `indx` LUT can be 8-bit without costing any extra instructions. (MOVZX vs. MOV is free on Intel and Ryzen and only 1 byte longer, and I don't think indexing a static LUT with PIE/PIC would change the fact that you'd want a separate load before a memory-source LZCNT, even if you made it `uintptr_t[]`.) `lz_msk` should stay 32-bit though to allow folding that load, so the total space saving is pretty small. And we can't do an 8-bit `add al, [rdx+rcx]` because the result can be 256, which doesn't fit. Plus partial registers suck. – Peter Cordes Jan 31 '19 at 05:10
  • @PeterCordes: Thanks for your comments. [Godbolt results](https://godbolt.org/z/N8J8Vv) with 8-bit LUTs, which confirms your comment. Can you explain what you mean by: *you'd want a separate load before a memory-source LZCNT*? – wim Jan 31 '19 at 07:57
  • I was thinking about doing the address calculation with ADD instead of using an indexed addressing mode. But I was getting mixed up with what the `lzcnt reg,mem` was indexing: it's into `tmp`, not a static table. So you could maybe do `add rdi, [rsi + rcx*8]` (stays micro-fused because of read/write dst reg) / `lzcnt rax, [rdi]` (stays micro-fused even on SKL because non-indexed addr mode). But then you need `const uintptr_t indx[]`, and you need to convince C that you're adding byte offsets, not scaled array indices. Either with `char*` or casting to `uintptr_t` and back. – Peter Cordes Jan 31 '19 at 08:08
  • But more likely `tmp` is on the stack and the compiler doesn't have its address in a register. So it would just MOV or MOVZX load and use an indexed addr mode for lzcnt. With a static table, you'd have either a RIP-relative LEA to create `rsi`, or `movzx eax, [indx + rcx]` which doesn't even need a SIB byte :) Compilers probably wouldn't put `indx[]` on the stack from immediate data because that would be 4x `imm32` . `lz_msk` would be efficient to store from immediates because of half of it being zeros, if it was a byte array. But anyway, static rodata is fine. – Peter Cordes Jan 31 '19 at 08:11
  • @PeterCordes: Thanks for your answer. I'm afraid I don't have enough x86 background knowledge to understand all the details, but thanks anyway! – wim Jan 31 '19 at 08:35

Since you are also asking for a more elegant (i.e. simpler) way to do this: on my computer, your code runs as fast as the one below. In both cases it took 45 milliseconds to compute the result for 10 million 256-bit words.

Since I was filling the AVX registers with (four) randomly generated uniformly distributed 64-bit integers (and not uniformly distributed 256-bit integers), the order of iteration through the array had no impact on the result of my benchmark test. Also, even though this is almost needless to say, the compiler was smart enough to unroll the loop.

uint32_t countLeadZeros(__m256i const& reg)
{
  alignas(32) uint64_t v[4];
  _mm256_store_si256((__m256i*)&v[0], reg);

  for (int i = 3; i >= 0; --i)
    if (v[i]) return _lzcnt_u64(v[i]) + (3 - i)*64;

  return 256;
}

EDIT: as can be seen in the discussion below my answer and in my edit history, I initially took an approach similar to the one of @PeterCordes (but he provided a better-optimized solution). I changed my approach once I started doing benchmarks because I had completely overlooked the fact that practically all of my inputs had the most significant bit located within the top 64 bits of the AVX word.

After I realized the mistake I had made, I decided to do the benchmarks more properly. I will present two results below. I searched through the edit history of my post and from there I copy-pasted the function I submitted (but later edited out) before I changed my approach and went for the branched version. That function is presented below. I compared the performance of my "branched" function, my "branchless" function, and the branchless function that was independently developed by @PeterCordes. His version is superior to mine in terms of performance - see his excellently written post that contains lots of useful details.

int countLeadZeros(__m256i const& reg){

  __m256i zero = _mm256_setzero_si256();
  __m256i cmp = _mm256_cmpeq_epi64(reg, zero);

  int mask = _mm256_movemask_epi8(cmp);

  if (mask == 0xffffffff) return 256;

  int first_nonzero_idx = 3 - (_lzcnt_u32(~mask) >> 3);

  alignas(32) uint64_t stored[4]; // edit: added alignas(32)
  _mm256_store_si256((__m256i*)stored, reg);

  int lead_zero_count = _lzcnt_u64(stored[first_nonzero_idx]);

  return (3 - first_nonzero_idx) * 64 + lead_zero_count;
}

Benchmark number 1

I will present the test code in pseudocode to make this short. I actually used an AVX implementation of a random number generator that generates random numbers blazingly fast. First, let's do the test on inputs that make branch prediction really hard:

tick()
for(int i = 0; i < N; ++i)
{
   // "xoroshiro128+"-based random generator was actually used
   __m256i in = _mm256_set_epi64x(rand()%2, rand()%2, rand()%2, rand()%2);

   res = countLeadZeros(in);  
}
tock();

For 10 million repetitions, the function from the top of my post takes 200ms. The implementation that I initially developed requires only 65ms to do the same job. But the function provided by @PeterCordes takes the cake by consuming only 60ms.

Benchmark number 2

Now let's turn to the test that I originally used. Again, pseudocode:

tick()
for(int i = 0; i < N; ++i)
{
   // "rand()" represents random 64-bit int; xoroshiro128+ waw actually used here
   __m256i in = _mm256_set_epi64x(rand(), rand(), rand(), rand());

   res = countLeadZeros(in);  
}
tock();

In this case, the version with branches is faster; 45ms are required to compute 10 million results. The function by @PeterCordes takes 50ms to complete and my "branchless" implementation requires 55ms to do the same job.

I don't think I dare to draw any general conclusions from this. It seems to me that the branchless approach is better as it offers a more stable computation time, but whether you need that stability or not probably depends on the use case.

EDIT: the random generator.

This is an extended reply to a comment by @PeterCordes. As I stated above, the benchmark test code is just pseudocode. If anyone is interested in how I actually generated the numbers, here is a quick description.

I used the xoroshiro128+ algorithm, which was released into the public domain and is available at this website. It is quite simple to rewrite the algorithm with AVX instructions so that four numbers are generated in parallel. I wrote a class that accepts the so-called initial seed (128 bits) as a parameter. I obtain the seeds (states) for each of the four parallel generators by first copying the initial seed four times; after that I apply the jump function to the i-th parallel generator i times, i = {0, 1, 2, 3}. Each jump advances the internal state J=2^64 steps forward. This means I can generate 4*J numbers (more than enough for all everyday purposes), four at a time, before any parallel generator starts to repeat a sequence of numbers that were already produced by any other generator in the current session. I control the range of the produced numbers with the _mm256_srli_epi64 instruction; I use a shift of 63 for the first test and no shift for the second one.
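
For the curious, here is a minimal sketch of the per-step update (my reconstruction, not Nejc's actual class; seeding and the jump function are omitted, and the rotation/shift constants 55, 14, 36 are the ones from the original xoroshiro128+ reference implementation):

#include <immintrin.h>

/* Four independent xoroshiro128+ generators, one per 64-bit lane.
   s[0] and s[1] hold the four per-lane state halves.              */
static inline __m256i xoroshiro128p_x4_next(__m256i s[2])
{
    const __m256i s0     = s[0];
    __m256i       s1     = s[1];
    const __m256i result = _mm256_add_epi64(s0, s1);    /* output: s0 + s1 per lane */

    s1 = _mm256_xor_si256(s1, s0);
    /* s[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14); rotl emulated with two shifts and an OR */
    s[0] = _mm256_xor_si256(
               _mm256_xor_si256(
                   _mm256_or_si256(_mm256_slli_epi64(s0, 55), _mm256_srli_epi64(s0, 9)),
                   s1),
               _mm256_slli_epi64(s1, 14));
    /* s[1] = rotl(s1, 36) */
    s[1] = _mm256_or_si256(_mm256_slli_epi64(s1, 36), _mm256_srli_epi64(s1, 28));

    return result;
}

The 0-or-1 inputs for the first benchmark are then just _mm256_srli_epi64(xoroshiro128p_x4_next(s), 63), and the second benchmark uses the returned vector without a shift.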

Nejc
  • You need `alignas(32) uint64_t v[4];` if you're going to use an aligned-store intrinsic. And BTW, both your version and the question branch, so could mispredict if the input isn't a uniform distribution (i.e. if the high qword isn't almost always non-zero). But if that is the case, taking advantage of that with branch prediction is pretty good. – Peter Cordes Mar 11 '18 at 05:54
  • You're right, I added alignas to the sample straight away. Initially I tried to come up with a function without a branch, but I could not match the performance of mine/OP's function (I did not benchmark the others). No matter what I tried, in the best cases my function was ~20% slower than the one I posted in the reply. I was almost surprised because I was sure that the branchless function would be significantly faster. – Nejc Mar 11 '18 at 06:03
  • @PeterCordes this is the fastest version of branchless function I could come up with. See any room for improvement? (never mind that if clause, that can be removed with small tweak) https://pastebin.com/XeBWAmXU – Nejc Mar 11 '18 at 06:06
  • With a uniform *input* distribution, you will get almost no branch mispredicts because the high qword only has a 1 in 2^64 chance of being 0. Counting only it will be much faster than checking everything branchlessly. – Peter Cordes Mar 11 '18 at 06:06
  • @PeterCordes yes, I completely understand that. But as I said in the post I was testing this with uniformly distributed 64-bit integers and the function with the branch was nevertheless faster. That's what surprised me. See my comment which I posted 1 second before your reply; I pasted the link to branchless function there. – Nejc Mar 11 '18 at 06:09
  • Your branchless version is very similar to my answer; I used basically the same strategy, using `n - lzcnt(movemask)` or `bsr` to index an array. But I used a uint8_t array to avoid needing shifts, because `movzx` is cheap. – Peter Cordes Mar 11 '18 at 06:09
  • Anyway, I don't find it surprising that branchy beats branchless when the branches predict perfectly. Fewer instructions and a shorter critical path is a nice win. In your benchmark, were you looping over a giant array in memory, or looping multiple times over a smaller array (that fit in L1d cache), or generating the vectors on the fly with an AVX2 xorshift+ or some other PRNG? If looping over memory, both versions are probably close to limited by memory-bandwidth bottlenecks. – Peter Cordes Mar 11 '18 at 06:13
  • Giant array in memory, filled with precomputed numbers. You are probably right about memory bandwidth having too large an effect on this benchmark, yes. – Nejc Mar 11 '18 at 06:19
  • @PeterCordes I actually wrote AVX implementation of xoroshiro128+ a while ago so I could quickly repeat the benchmark now without large arrays: the results of the benchmark are the same; the bandwidth was not a bottleneck after all. – Nejc Mar 11 '18 at 06:30
  • Nice update with the benchmarks, thanks for testing mine. :) But `_mm256_set_epi64x(rand(), rand(), rand(), rand());` isn't really an AVX2 PRNG, if you actually wrote it that way. [I used an AVX2 `xorshift+`](https://unix.stackexchange.com/questions/323845/whats-the-fastest-way-to-generate-a-1-gb-text-file-containing-random-digits/324520#324520) that runs 4x 64-bit PRNGs in parallel in elements of a `__m256i`, for a case where performance was more important than quality (like this case; anything that defeats branch prediction is fine). Only AVX512 has rotates; AVX2 has to emulate them. – Peter Cordes Mar 12 '18 at 00:39
  • I wrote a class that accepts single 128-bit seed as parameter. The seed is copied four times in the constructor (once per "parallel generator"), and then jump instructions are used {0, 1, 2, 3}-times. Each jump advances the internal state J=2^64 steps forward. This means I can generate 4*J numbers, four at a time without repeating sequences. Works fine for various purposes. I started with xoroshiro128+ implementation in C and rewrote it with AVX intrinsics. The original implementation with corresponding jump instructions is published here: http://xoroshiro.di.unimi.it/ – Nejc Mar 12 '18 at 00:45
  • Oh, so you don't literally use `_mm256_set_epi64x` on a scalar function inside the inner loop. So that's not nearly as slow as what you wrote in your question. But still, `xorshift128+` is faster (and somewhat lower quality) than `xoroshiro128+`, especially in SIMD without AVX512 for rotates. – Peter Cordes Mar 12 '18 at 00:49
  • I am aware of that, but I had the implementation for xoroshiro128+ ready at hand because I wrote and tested it for some other project in the past. :) I'll edit the answer and add a brief description. Seems like my "pseudocode" was not "pseudo" enough, hah. :) – Nejc Mar 12 '18 at 00:54
  • @PeterCordes I quickly benchmarked the generator. It seems like 30ms are needed to produce 4*10e6 random numbers. If I subtract that time from the benchmark results above, the ratio of execution times of our branchless algorithms becomes about 4/5 (your function is faster). The benchmark seems OK, but I'll re-check it and edit the answer when I have the time. BTW, would you have any idea why the execution times of the branchless algorithms in the first test are longer than in the second test? Does _lzcnt_u64 produce a result faster for larger numbers? I launched the tests several times and the difference was always there. – Nejc Mar 12 '18 at 01:53
  • That's an interesting and useful benchmark, actually. Different strategies will be more or less friendly to surrounding code that bottlenecks on ALU throughput or latency, rather than the actual throughput of the vector lzcnt itself. The extra ALU uops in Wim's version to shuffle instead of generate an index were a definite downside in that respect. – Peter Cordes Mar 12 '18 at 01:54
  • `_mm256_srli_epi64` isn't `%2`, really. I assumed you meant `_mm256_and_si256` with `set1_epi64(1)`. xoroshiro already bottlenecks on vector ALU shift throughput, I think, so adding another shift is probably what slows it down. What CPU are you testing on? `vpsrlq ymm,ymm, imm8` has only 1 per clock throughput on Haswell (port 0 only), but 2 per clock throughput on Skylake (port 0/1). Of course on Skylake, it competes for port 1 against `lzcnt`. Anyway, try using an `and` for your mod 2 if you're mostly bottlenecked on port0 throughput. – Peter Cordes Mar 12 '18 at 02:00

I've got a version which is not really "elegant", but is faster here (Apple LLVM version 9.0.0 (clang-900.0.39.2)):

#define NOT_ZERO(x) (!!(x))

#ifdef UNIFORM_DISTRIBUTION
#define LIKELY(x)           __builtin_expect(NOT_ZERO(x), 1)
#define UNLIKELY(x)         __builtin_expect(NOT_ZERO(x), 0)
#else
#define LIKELY(x)           (x)
#define UNLIKELY(x)         (x)
#endif


inline unsigned int clz_u128(uint64_t a, uint64_t b, int not_a, int not_b) {
    if(UNLIKELY(not_a)) {
        if(UNLIKELY(not_b)) {
            return 128;
        } else {
            return (__builtin_clzll(b)) + 64;
        }
    } else {
        return (__builtin_clzll(a));
    }
}

unsigned int clz_u256(__m256i packed) {
    const uint64_t a_0 = (uint64_t)_mm256_extract_epi64(packed, 0);
    const uint64_t a_1 = (uint64_t)_mm256_extract_epi64(packed, 1);
    const uint64_t b_0 = (uint64_t)_mm256_extract_epi64(packed, 2);
    const uint64_t b_1 = (uint64_t)_mm256_extract_epi64(packed, 3);

    const int not_a_0 = !a_0;
    const int not_a_1 = !a_1;

    if(UNLIKELY(not_a_0 & not_a_1)) {
        return clz_u128(b_0, b_1, !b_0, !b_1) + 128;
    } else {
        return clz_u128(a_0, a_1, not_a_0, not_a_1);
    }
}

It splits the bigger problem into smaller ones and uses the fact that, if the vector distribution is uniform, it is far more likely for the higher bits to be non-zero than the lower bits.

Just add #define UNIFORM_DISTRIBUTION if a uniform distribution is expected, for extra performance.

gpnuma
  • You're assuming a uniform distribution. This is often not the case; in many problems where it's interesting / useful to use an lzcnt, the *output* is approximately uniformly distributed in some range, e.g. in 0 .. 256. – Peter Cordes Mar 11 '18 at 05:56
  • Your vector indexing or variable naming is weird. ` _mm256_extract_epi64(packed, 3);` is the highest element of the vector, and if it's non-zero then it contains the bit we're looking for. But you've called it `low_1`. – Peter Cordes Mar 11 '18 at 06:03
  • You're right it was confusing, I edited and now use neutral variables. I've also taken care of the uniform/non uniform distribution possibilities. – gpnuma Mar 11 '18 at 13:34