9

I'm writing C++ code to find the first byte in memory that is not 0xFF. To exploit bitscanforward, I had written an inline assembly routine that I like very much. But for "readability" as well as future proofing (i.e. SIMD vectorization) I thought I would give the g++ optimizer a chance. g++ didn't vectorize, but it did arrive at nearly the same non-SIMD solution I did. But for some reason, its version runs much slower, 260000x slower (i.e. I have to loop my version 260,000x more to get to the same execution time). I expected some difference, but not THAT much! Can someone point out why this might be? I just want to know, so as to avoid making a mistake in future inline assembly code.

The C++ starting point is following, (in terms of counting accuracy, there is a bug in this code, but I've simplified it for this speed test):

uint64_t count3 (const void *data, uint64_t const &nBytes) {
      // Scan forward 8 bytes at a time and return the byte offset of the
      // first 8-byte block that is not all 0xFF, clamped to nBytes.
      // Granularity is sizeof(uint64_t); the exact byte inside the block
      // is deliberately not resolved in this speed-test version.
      //
      // Fixes vs. the original: pointer arithmetic is done on a char
      // pointer (arithmetic on void* is a GCC extension that clang
      // rejects); the block is loaded with memcpy instead of a
      // type-punned dereference (avoids strict-aliasing/alignment UB);
      // and the do/while no longer dereferences before checking the
      // bound, so nBytes == 0 no longer reads out of bounds.
      // NOTE(review): still assumes nBytes is a multiple of 8 — the last
      // block is loaded in full.  TODO confirm with callers.
      const unsigned char *bytes = (const unsigned char *)data;
      uint64_t count = 0;
      while (count < nBytes) {
         uint64_t block;
         __builtin_memcpy(&block, bytes + count, sizeof(block));
         if ( block != (uint64_t)-1 ) {
/*          count += __builtin_ctz(~block);   ignore this for speed test*/
            break;
         }
         count += sizeof(block);
      }
      return (count > nBytes ? nBytes : count);
}

The assembly code g++ came up with is:

# g++ output for count3(): scan 8-byte blocks until one differs from -1,
# then clamp the resulting offset to nBytes.
# rdi = data, rsi = &nBytes (nBytes is passed by const reference).
_Z6count3PKvRKm:
.LFB33:
    .cfi_startproc
    mov rdx, QWORD PTR [rsi]    # load nBytes through the reference
    xor eax, eax                # count = 0
    jmp .L19                    # do/while: enter at the compare
    .p2align 4,,10
    .p2align 3
.L21:
    add rax, 8                  # count += sizeof(uint64_t)
    cmp rax, rdx
    jnb .L18                    # exit when count >= nBytes (unsigned)
.L19:
    cmp QWORD PTR [rdi+rax], -1 # single compare-with-memory; can macro-fuse with je
    je  .L21
.L18:
    cmp rax, rdx
    cmova   rax, rdx            # return min(count, nBytes)
    ret
    .cfi_endproc

My inline assembly is

# Hand-written count2(): same structure as the g++ output, but it loads
# the block into r9 and tests it with xor r9, -1 (a NOT that sets flags)
# instead of a single cmp-with-memory.  xor/jnz cannot macro-fuse the way
# cmp/jcc can, so the loop carries extra uops per block.
_Z6count2PKvRKm:
.LFB32:
    .cfi_startproc
    push    rbx                 # rbx is call-preserved (the "b" constraint)
    .cfi_def_cfa_offset 16
    .cfi_offset 3, -16
    mov rbx, QWORD PTR [rsi]    # load nBytes through the reference

    # count trailing bytes of 0xFF 
    xor     rax, rax            # count = 0
.ctxff_loop_69:          
    mov     r9,  QWORD PTR [rdi+rax]    # load 8-byte block
    xor     r9, -1              # complement: result is zero iff block was all 0xFF
    jnz   .ctxff_final_69       # stop at first non-0xFF block
    add     rax, 8     
    cmp     rax, rbx 
    jl    .ctxff_loop_69        # note: signed compare here, gcc used jnb (unsigned)
.ctxff_final_69:         
    cmp     rax,rbx  
    cmova   rax,rbx             # return min(count, nBytes)
    pop rbx
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc

As far as I can see, it is substantially identical, except for the method by which it compares the data byte against 0xFF. But I cannot believe this would cause such a great difference in computation time.

It's conceivable my test method is causing the error, but all I do is change the function name and iteration length in the following, simple for-loop shown below: (when N is 1<<20, and all bytes of 'a' except the last byte is 0xFF)

test 1

   for (uint64_t i=0; i < ((uint64_t)1<<15); i++) {
      n = count3(a,N);
   }

test 2

   for (uint64_t i=0; i < ((uint64_t)1<<33); i++) {
      n = count2(a,N);
   }

EDIT:

Here are my real inline assembly codes, with the SSE count1(), x86-64 count2(), and then the plain-old-C++ versions count0() and count3(). I fell down this rabbit hole hoping that I could get g++ to take my count0() and arrive, on its own, at my count1() or even count2(). But alas it did nothing, absolutely no optimization :( I should add that my platform doesn't have AVX2, which is why I was hoping to get g++ to automatically vectorize, so that the code would automatically update when I update my platform.

In terms of the explicit register use in the inline assembly, if I didn't make them explicitly, g++ would reuse the same registers for nBytes and count.

In terms of speedup, between XMM and QWORD, I found the real benefit is simply the "loop-unroll" effect, which I replicate in count2().

uint32_t count0(const uint8_t *data, uint64_t const &nBytes) {
  // Byte-by-byte scan: return the index of the first byte that is not
  // 0xFF, or nBytes if every byte matches.
  //
  // Fix: the loop index was a signed `int`, which is implicitly converted
  // for the comparison with the unsigned 64-bit bound and overflows
  // (undefined behavior) once nBytes exceeds INT_MAX.  Use uint64_t.
  // NOTE(review): the uint32_t return type still truncates results at
  // 2^32; widening it would change the interface, so it is only
  // documented here.
  for (uint64_t i = 0; i < nBytes; i++)
    if (data[i] != 0xFF) return (uint32_t)i;

  return (uint32_t)nBytes;
}
// SSE/AVX version: compare 16 bytes per iteration against all-ones with
// vpcmpeqb and branch on the inverted movemask; tzcnt of the mask gives
// the byte offset of the first non-0xFF byte inside the hit block.
// Returns min(count, nBytes) truncated to uint32_t.
// NOTE(review): the asm statement reads the memory behind `data` but
// declares no "memory" clobber (and no memory input), so the compiler is
// allowed to hoist/CSE the whole statement — this is what defeated the
// benchmark loop discussed above.  It also assumes nBytes % 16 == 0,
// requires AVX (VEX encodings) and -masm=intel, and pins count/nBytes/
// data to rax/rbx/rdx via the "a"/"b"/"d" constraints.
uint32_t count1(const void *data, uint64_t const &nBytes) {
  uint64_t count;
  __asm__("# count trailing bytes of 0xFF \n"
    "   xor     %[count], %[count]  \n"
    " vpcmpeqb  xmm0, xmm0, xmm0  \n" // xmm0 = 16 bytes of 0xFF (all-ones pattern)

    ".ctxff_next_block_%=:        \n"
    " vpcmpeqb  xmm1, xmm0, XMMWORD PTR [%[data]+%[count]]  \n"
    " vpmovmskb r9, xmm1         \n"
    " xor     r9, 0xFFFF       \n" // test if all match (bonus negate r9)
    " jnz   .ctxff_tzc_%=        \n" // if !=0, STOP & tzcnt negated r9
    " add     %[count], 16       \n" // else advance one 16-byte block
    " cmp     %[count], %[nBytes] \n"
    " jl    .ctxff_next_block_%=  \n" // while count < nBytes, loop
    " jmp   .ctxff_done_%=      \n" // else done + ALL bytes were 0xFF

    ".ctxff_tzc_%=:           \n"
    " tzcnt   r9, r9          \n" // count bytes up to non-0xFF
    " add     %[count], r9    \n"

    ".ctxff_done_%=:          \n" // more than 'nBytes' could be tested,
    " cmp     %[count],%[nBytes]  \n" // find minimum
    " cmova   %[count],%[nBytes]  "
    : [count] "=a" (count)                     // output pinned to rax
    : [nBytes] "b" (nBytes), [data] "d" (data) // inputs pinned to rbx/rdx
    : "r9", "xmm0", "xmm1"                     // scratch regs; no "memory" clobber (see NOTE)
  );
  return count;
};

// Scalar x86-64 version with the 8-byte scan loop unrolled 2x.  Stops at
// the first block that is not all 0xFF, converts the bsf bit index into
// a byte offset (shr 3), and clamps to nBytes.
// NOTE(review): like count1, this asm reads *data but lists no "memory"
// clobber, so the compiler may legally hoist the whole statement out of
// a loop.  The 2x unroll assumes nBytes % 16 == 0.
uint64_t count2 (const void *data, uint64_t const &nBytes) {
    uint64_t count;
  __asm__("# count trailing bytes of 0xFF \n"
    "    xor     %[count], %[count]  \n"

    ".ctxff_loop_%=:          \n"
    "    mov     r9,  QWORD PTR [%[data]+%[count]] \n"
    "    xor     r9, -1          \n" 
    "    jnz   .ctxff_final_%=    \n"
    "    add     %[count], 8     \n" 
    "    mov     r9,  QWORD PTR [%[data]+%[count]] \n"  // <--loop-unroll: second block per iteration
    "    xor     r9, -1          \n" 
    "    jnz   .ctxff_final_%=    \n"
    "    add     %[count], 8     \n" 
    "    cmp     %[count], %[nBytes] \n"
    "    jl    .ctxff_loop_%=    \n"
    "    jmp   .ctxff_done_%=   \n" 

    ".ctxff_final_%=:            \n"
    "    bsf   r9,  r9           \n" // do tz count on r9 (either of first QWORD bits or XMM bytes)
    "    shr     r9,  3          \n" // scale the BSF bit index down to a byte offset
    "    add     %[count], r9    \n"
    ".ctxff_done_%=:          \n" // more than 'nBytes' bytes could have been tested,
    "    cmp     %[count],%[nBytes]  \n" // find minimum of count and nBytes
    "    cmova   %[count],%[nBytes]  "
    : [count] "=a" (count)                     // output pinned to rax
    : [nBytes] "b" (nBytes), [data] "D" (data) // inputs pinned to rbx/rdi
    : "r9"                                     // scratch; no "memory" clobber (see NOTE)
  );
  return count;
}

inline static uint32_t tzcount(uint64_t const &qword) {
  // Trailing-zero count of a non-zero 64-bit value.
  //
  // Fix: replaced the raw `tzcnt %0, %1` inline asm with
  // __builtin_ctzll.  The asm only assembled with the intended operand
  // order under -masm=intel, and on CPUs without BMI1 the tzcnt encoding
  // silently executes as BSF; the builtin lets the compiler pick the
  // right instruction and can constant-fold.
  // Precondition (unchanged from the callers): qword != 0 — ctz of zero
  // is undefined, and tzcnt/bsf disagree on that input anyway.
  return (uint32_t)__builtin_ctzll(qword);
};

uint64_t count3 (const void *data, uint64_t const &nBytes) {
      // Scan forward 8 bytes at a time; on the first block that is not
      // all 0xFF, add the exact byte offset of the first non-0xFF byte
      // (trailing-zero count of the inverted block, scaled from bits to
      // bytes).  Result is clamped to nBytes.
      //
      // Fixes vs. the original:
      //  - `count += tzcount(~block)` added a BIT index, not a byte
      //    offset — the asm versions scale with `shr r9, 3`, and this
      //    C version was missing the equivalent >> 3.
      //  - char-pointer arithmetic instead of void* arithmetic (GCC
      //    extension), and a memcpy load instead of a type-punned
      //    dereference (strict-aliasing/alignment UB).
      //  - nBytes == 0 no longer reads out of bounds (the do/while
      //    dereferenced before checking the bound).
      // NOTE(review): still assumes nBytes % 8 == 0; the final block is
      // loaded in full.
      const unsigned char *bytes = (const unsigned char *)data;
      uint64_t count = 0;
      while (count < nBytes) {
        uint64_t block;
        __builtin_memcpy(&block, bytes + count, sizeof(block));
        if ( block != (uint64_t)-1 ) {
          // ~block is non-zero here, so ctz is well defined; bit index
          // divided by 8 is the byte offset within the block.
          count += (uint64_t)__builtin_ctzll(~block) >> 3;
          break;
        }
        count += sizeof(block);
      }
      return (count > nBytes ? nBytes : count);
}

uint32_t N = 1<<20;   // bytes scanned per call (1 MiB)

int main(int argc, char **argv) {
  // Benchmark driver: fill `a` with 0xFF and repeatedly scan it with
  // count2(), accumulating the result so the calls can't be discarded.
  //
  // Fixes: printf'ing a uint64_t through "%x" is undefined behavior
  // (format specifier/argument mismatch) — print it as unsigned long
  // long; the unused variable `j` is removed.
  // NOTE(review): `a[N]` is a variable-length array (a GCC extension in
  // C++) of 1 MiB on the stack — consider static or heap storage.

  unsigned char a[N];
  __builtin_memset(a, 0xFF, N);

  uint64_t n = 0;
  for (uint64_t i = 0; i < ((uint64_t)1<<18); i++) {
    n += count2(a, N);
  }

  printf("\n\n %x %llx %x\n", N, (unsigned long long)n, 0);
  return (int)n;   // truncation to int is deliberate: just an optimization sink
}
codechimp
  • 1,509
  • 1
  • 14
  • 21
  • Apparently your code assumes: 1) `nBytes >= sizeof(block)` 2) `(nBytes % sizeof(block)) == 0` 3) `data` is 64bit aligned. Given that... Like you, my attention is drawn to the `cmp`. What happens if you change your inline asm to compare directly with memory? Do the speeds sync up? – David Wohlferd Apr 24 '16 at 00:19
  • 5
    A speed diff of 260000x is an almost sure sign that the faster test is not actually doing any work. There's nothing in either loop that can account for that much speed difference. (Perhaps self-modifying code could run that slowly, or like Douglas says, paging to disk.) Are you sure the 260k repeat loop is actually repeating the full work, and not just a mostly empty loop doing just an `add` or something? **There's certainly some kind of measurement error. Post the full code**. – Peter Cordes Apr 24 '16 at 01:10
  • Also, note that `xor r9, -1` is the same as `not r9`, and neither one can macro-fuse with `jnz`. Your best bet is to `mov reg, -1` outside the loop, and then use `cmp reg, [mem]` inside the loop. That will let your `cmp` macro-fuse with the jcc on Intel SnB-family CPUs, which isn't possible when using an immediate and memory operand. (See http://agner.org/optimize/). Also, you could use `break` instead of `goto done`. – Peter Cordes Apr 24 '16 at 01:14
  • 1
    Since SSE2 is baseline for x86-64, you can and should be using it. Get a vector of all-0xFF (using `pcmpeqw xmm1,xmm1`) and compare against it with `pcmpeqb`. Use `psubb` to sum the compare results (subtract the -1), occasionally using `psadbw` to horizontally sum the bytes into 16bit words often enough to avoid overflow. Non-matching count = total bytes - match count. Or, to break on the first one you find, use `pmovmskb eax, xmm0` on the compare result, and `cmp eax, 0xFFFF / jne` to fall through only if all 16 vector elements compared equal. – Peter Cordes Apr 24 '16 at 01:21
  • What CPU microarchitecture are you testing on? Is it something weird with branch predictors that don't like having two branches really close to each other? What do the perf counters say for the slow version? Even a branch mispredict every iteration doesn't explain 260k, though. – Peter Cordes Apr 24 '16 at 01:26
  • @PeterCordes I don't believe OP is actually saying 260,000 times faster. Read it again `I have to loop my version 260,000x more to get to the same execution time.` I'm reading that to say that if Count3 can do X loops in N seconds, count2 can do X + 260,000 in N seconds. It's hard to say how significant that really is without knowing the actual values for X and N. – David Wohlferd Apr 24 '16 at 02:19
  • @PeterCordes I know about the SSE option, I actually did that as well, but the speedup, at least using XMM was, marginal. Using QWORD (8-bytes) is ~1op per block, using XMM (16-bytes) is ~2 ops per block. It would seem more useful with AVX2 or AVX512. – codechimp Apr 24 '16 at 03:36
  • @user4602856: Your scalar version only counts with a granularity of 8 bytes. If you can do that for SIMD, then use `pcmpeqq` / `psubq` so you don't need a `psadb` outer loop to avoid overflow. – Peter Cordes Apr 24 '16 at 03:43
  • With unrolling, the scalar version can probably achieve two loads per clock, including the compare-and-branch. The vector version needs four uops per compare-and-branch, or three per compare-and-count. With 128b xmm vectors using AVX1 for non-destructive operations, subtract one uop each (can compare-and-load without destroying a vector of all-ones). So for compare-and-count, you should be able to saturate the load unit on a SnB-family CPU loading 2x 128b per clock, but scalar you can probably only do 2x 64b per clock at best. – Peter Cordes Apr 24 '16 at 03:48
  • @PeterCordes, the scalar version shown was a dumbed down version simple for this speed test, the real version counts with granularity of 1 with one `bsf` operation when a 'disturbed' block of data is found. And with regards to the speed, I have already done the test. SSE is better but not 2x. – codechimp Apr 24 '16 at 03:49
  • How do you efficiently get a result from 0 to 8 with only one `bsf`? What if bytes 3 and 7 are "disturbed"? `bsf` can only tell you the position of the first mismatch, and requires a `not`. If you care about per-byte counts, SIMD is going to be significantly faster than scalar, like 2x for block counts, more for byte counts. Scalar only looks good for finding the first block, because `cmp reg, [mem]/jne` can macro-fuse into a single uop. (Haswell can macro-fuse two cmp/jcc pairs in the same decode block, but previous CPU can only macro-fuse the first) – Peter Cordes Apr 24 '16 at 03:54
  • Also note that `bsf` is slow on AMD CPUs. If you encode it as `rep bsf`, it will run as `tzcnt` on CPUs that support that insn, giving the same result as `bsf` (except when the input is all zero) but setting flags differently. For some reason AMD Pilederiver/Steamroller have 2 m-op `tzcnt` but 6 m-op `bsf`, so be careful with that insn if you care about your code running on AMD. SIMD is going to be at least a bit faster, and work well on all CPUs, so just use it. For big arrays, you'll bottleneck on memory, of course. Fewer wider loads allows more in-flight loads with limited load buffers – Peter Cordes Apr 24 '16 at 03:58
  • @PeterCordes I'm not counting ALL 0xFF, just up to the first non-0xFF. Also, I know something about the data, 0xFF is most likely to happen in large clumps, not individual bytes. The `bsf` is just sloppiness on my part but at this point it doesn't matter since, 1) my cpu doesn't actually support real `tzcnt`, 2) `bsf` happens only once, at the end. my CPU is i5-2550K – codechimp Apr 24 '16 at 11:32
  • `xor` / `jnz` can't macro-fuse, but `cmp` / `je` *can* macro-fuse. You could speed up your scalar loop by a factor of 1.5 by reducing it from 9 fused-domain uops to 8. Similarly, your SIMD loop is shooting itself in the foot by using `xor` instead of `cmp`. Also, you're using an indexed addressing mode as well as an add/cmp/jcc loop overhead. You can save one uop by using either a pointer-increment or a negative index counting up towards zero (but that still leaves you with 5 uops in the loop, so it still takes 2 cycles to issue an iteration). Prefer using a one-register addressing mode. – Peter Cordes Apr 24 '16 at 12:07

2 Answers2

6

Answer to the question title

Now that you've posted the full code: the call to count2(a,N) is hoisted out of the loop in main. The run time still increases very slightly with the loop count (e.g. 1<<18), but all that loop is doing is a single add. The compiler optimizes it to look more like this source:

uint64_t hoisted_count = count2(a,N);
for (uint64_t i=0; i < ((uint64_t)1<<18); i++) {
   n += hoisted_count;   // doesn't optimize to a multiply
}

There is no register conflict: %rax holds the result of the asm statement inlined from count2. It's then used as a source operand in the tiny loop that multiplies it by n through repeated addition.

(see the asm on the Godbolt Compiler Explorer, and note all the compiler warnings about arithmetic on void*s: clang refuses to compile your code):

## the for() loop in main, when using count2()
.L23:
    addq    %rax, %r12
    subq    $1, %rdx
    jne     .L23

%rdx is the loop counter here, and %r12 is the accumulator that holds n. IDK why gcc doesn't optimize it to a constant-time multiply.

Presumably the version that was 260k times slower didn't manage to hoist the whole count2 out of the loop. From gcc's perspective, the inline asm version is much simpler: the asm statement is treated as a pure function of its inputs, and gcc doesn't even know anything about it touching memory. The C version touches a bunch of memory, and is much more complicated to prove that it can be hoisted.

Using a "memory" clobber in the asm statement did prevent it from being hoisted when I checked on godbolt. You can tell from the presence or absence of a branch target in main before the vector block.

But anyway, the run time will be something like n + rep_count vs. n * rep_count.

The asm statement doesn't use a "memory" clobber or any memory inputs to tell gcc that it reads the memory pointed to by the input pointers. Incorrect optimizations could happen, e.g. being hoisted out of a loop that modified array elements. (See the Clobbers section in the manual for an example of using a dummy anonymous struct memory input instead of a blanket "memory" clobber. Unfortunately I don't think that's usable when the block of memory doesn't have compile-time-constant size.)

I think -fno-inline prevents hoisting because the function isn't marked with __attribute__((const)) or the slightly weaker __attribute__((pure)) to indicate no side-effects. After inlining, the optimizer can see that for the asm statement.


count0 doesn't get optimized to anything good because gcc and clang can't auto-vectorize loops where the number of iterations isn't known at the start. i.e. they suck at stuff like strlen or memchr, or search loops in general, even if they're told that it's safe to access memory beyond the end of the point where the search loop exits early (e.g. using char buf[static 512] as a function arg).


Optimizations for your asm code:

Like I commented on the question, using xor reg, 0xFFFF / jnz is silly compared to cmp reg, 0xFFFF / jnz, because cmp/jcc can macro-fuse into a compare-and-branch uop. cmp reg, mem / jne can also macro-fuse, so the scalar version that does a load/xor/branch is using 3x as many uops per compare. (Of course, Sandybridge can only micro-fuse the load if it doesn't use an indexed addressing mode. Also, SnB can only macro-fuse one pair per decode block, but you'd probably get the first cmp/jcc and the loop branch to macro-fuse.) Anyway, the xor is a bad idea. It's better to only xor right before the tzcnt, since saving uops in the loop is more important than code-size or uops total.

Your scalar loop is 9 fused-domain uops, which is one too many to issue at one iteration per 2 clocks. (SnB is a 4-wide pipeline, and for tiny loops it can actually sustain that.)


The indenting in the code in the first version of the question, with the count += __builtin_ctz at the same level as the if, made me think you were counting mismatch blocks, rather than just finding the first.

Unfortunately the asm code I wrote for the first version of this answer doesn't solve the same problem as the OP's updated and clearer code. See an old version of this answer for SSE2 asm that counts 0xFF bytes using pcmpeqb/paddb, and psadbw for the horizontal sum to avoid wraparound.


Getting a speedup with SSE2 (or AVX):

Branching on the result of a pcmpeq takes many more uops than branching on a cmp. If our search array is big, we can use a loop that tests multiple vectors at once, and then figure out which byte had our hit after breaking out of the loop.

This optimization applies to AVX2 as well.

Here's my attempt, using GNU C inline asm with -masm=intel syntax. (Intrinsics might give better results, esp. when inlining, because the compiler understands intrinsics and so can do constant-propagation through them, and stuff like that. OTOH, you can often beat the compiler with hand-written asm if you understand the trade-offs and the microarchitecture you're targeting. Also, if you can safely make some assumptions, but you can't easily communicate them to the compiler.)

#include <stdint.h>
#include <immintrin.h>

// compile with -masm=intel
// len must be a multiple of 32  (TODO: cleanup loop)
// buf should be 16B-aligned for best performance
// Scan `bitmap` (len bytes, len % 32 == 0) for the first byte that is
// not 0xFF — i.e. the first zero bit when the buffer is viewed as a
// bitmap of ones.  Each iteration compares two 16-byte vectors against
// all-ones with AVX vpcmpeqb, ANDs the results, and branches on the
// combined movemask; the exact byte is then resolved with movemask +
// tzcnt/bsf after leaving the loop.  Returns the byte offset of the
// first non-0xFF byte, or len if every byte matched.
size_t find_first_zero_bit_avx1(const char *bitmap, size_t len) {
    // return size_t not uint64_t.  This same code works in 32bit mode, and in the x32 ABI where pointers are 32bit

    __m128i pattern, vtmp1, vtmp2;
    const char *result_pos;
    int tmpi;

    const char *bitmap_start = bitmap;

    asm (  // modifies the bitmap pointer, but we're inside a wrapper function
      "vpcmpeqw   %[pat], %[pat],%[pat]\n\t"          // all-ones

      ".p2align 4\n\t"   // force 16B loop alignment, for the benefit of CPUs without a loop buffer
      //IACA_START  // See the godbolt link for the macro definition
      ".Lcount_loop%=:\n\t"
//      "  movdqu    %[v1], [ %[p] ]\n\t"
//      "  pcmpeqb   %[v1], %[pat]\n\t"        // for AVX: fold the load into vpcmpeqb, making sure to still use a one-register addressing mode so it can micro-fuse
//      "  movdqu    %[v2], [ %[p] + 16 ]\n\t"
//      "  pcmpeqb   %[v2], %[pat]\n\t"

      "  vpcmpeqb  %[v1], %[pat], [ %[p] ]\n\t"  // Actually use AVX, to get a big speedup over the OP's scalar code on his SnB CPU
      "  vpcmpeqb  %[v2], %[pat], [ %[p] + 16 ]\n\t"

      "  vpand     %[v2], %[v2], %[v1]\n\t"         // combine the two results from this iteration
      "  vpmovmskb  %k[result], %[v2]\n\t"
      "  cmp       %k[result], 0xFFFF\n\t"          // k modifier: eax instead of rax
      "  jne     .Lfound%=\n\t"

      "  add       %[p], 32\n\t"
      "  cmp       %[p], %[endp]\n\t"              // this is only 2 uops after the previous cmp/jcc.  We could re-arrange the loop and put the branches farther apart if needed.  (e.g. start with a vpcmpeqb outside the loop, so each iteration actually sets up for the next)
      "  jb     .Lcount_loop%=\n\t"
      //IACA_END

      // any necessary code for the not-found case, e.g. bitmap = endp
      "  mov     %[result], %[endp]\n\t"
      "  jmp    .Lend%=\n\t"

      ".Lfound%=:\n\t"                       // we have to figure out which vector the first non-match was in, based on v1 and (v2&v1)
                                  // We could just search the bytes over again, but we don't have to.
                                  // we could also check v1 first and branch, instead of checking both and using a branchless check.
      "  xor       %k[result], 0xFFFF\n\t"
      "  tzcnt     %k[result], %k[result]\n\t"  // runs as bsf on older CPUs: same result for non-zero inputs, but different flags.  Faster than bsf on AMD
      "  add       %k[result], 16\n\t"          // result = byte count in case v1 is all-ones.  In that case, v2&v1 = v2

      "  vpmovmskb %k[tmp], %[v1]\n\t"
      "  xor       %k[tmp], 0xFFFF\n\t"
      "  bsf       %k[tmp], %k[tmp]\n\t"        // bsf sets ZF if its *input* was zero.  tzcnt's flag results are based on its output.  For AMD, it would be faster to use more insns (or a branchy strategy) and avoid bsf, but Intel has fast bsf.
      "  cmovnz    %k[result], %k[tmp]\n\t"     // if there was a non-match in v1, use it instead of tzcnt(v2)+16

      "  add       %[result], %[p]\n\t"         // If we needed to force 64bit, we could use %q[p].  But size_t should be 32bit in the x32 ABI, where pointers are 32bit.  This is one advantage to using size_t over uint64_t
      ".Lend%=:\n\t"
      : [result] "=&a" (result_pos),   // force compiler to pick eax/rax to save a couple bytes of code-size from the special cmp eax, imm32  and xor eax,imm32 encodings
        [p] "+&r" (bitmap),
        // throw-away outputs to let the compiler allocate registers.  All early-clobbered so they aren't put in the same reg as an input
        [tmp] "=&r" (tmpi),
        [pat] "=&x" (pattern),
        [v1] "=&x" (vtmp1), [v2] "=&x" (vtmp2)
      : [endp] "r" (bitmap+len)
        // doesn't compile: len isn't a compile-time constant
        // , "m" ( ({ struct { char x[len]; } *dummy = (typeof(dummy))bitmap ; *dummy; }) )  // tell the compiler *which* memory is an input.
      : "memory" // we read from data pointed to by bitmap, but bitmap[0..len] isn't an input, only the pointer.
    );

    return result_pos - bitmap_start;
}

This actually compiles and assembles to asm that looks like what I expected, but I didn't test it. Note that it leaves all register allocation to the compiler, so it's more inlining-friendly. Even without inlining, it doesn't force use of a call-preserved register that has to get saved/restored (e.g. your use of a "b" constraint).

Not done: scalar code to handle the last sub-32B chunk of data.

static perf analysis for Intel SnB-family CPUs based on Agner Fog's guides / tables. See also the tag wiki. I'm assuming we're not bottlenecked on cache throughput, so this analysis only applies when the data is hot in L2 cache, or maybe only L1 cache is fast enough.

This loop can issue out of the front-end at one iteration (two vectors) per 2 clocks, because it's 7 fused-domain uops. (The front-end issues in groups of 4). (It's probably actually 8 uops, if the two cmp/jcc pairs are decoded in the same block. Haswell and later can do two macro-fusions per decode group, but previous CPUs can only macro-fuse the first. We could software-pipeline the loop so the early-out branch is farther from the p < endp branch.)

All of these fused-domain uops include an ALU uop, so the bottleneck will be on ALU execution ports. Haswell added a 4th ALU unit that can handle simple non-vector ops, including branches, so could run this loop at one iteration per 2 clocks (16B per clock). Your i5-2550k (mentioned in comments) is a SnB CPU.

I used IACA to count uops per port, since it's time consuming to do it by hand. IACA is dumb and thinks there's some kind of inter-iteration dependency other than the loop counter, so I had to use -no_interiteration:

g++ -masm=intel -Wall -Wextra -O3 -mtune=haswell find-first-zero-bit.cpp -c -DIACA_MARKS
iaca -64 -arch IVB -no_interiteration find-first-zero-bit.o

Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - find-first-zero-bit.o
Binary Format - 64Bit
Architecture  - SNB
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 2.50 Cycles       Throughput Bottleneck: Port1, Port5

Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |
-------------------------------------------------------------------------
| Cycles | 2.0    0.0  | 2.5  | 1.0    1.0  | 1.0    1.0  | 0.0  | 2.5  |
-------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is expected
! - instruction not supported, was not accounted in Analysis

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   2^   |           | 1.0 | 1.0   1.0 |           |     |     | CP | vpcmpeqb xmm1, xmm0, xmmword ptr [rdx]
|   2^   |           | 0.6 |           | 1.0   1.0 |     | 0.4 | CP | vpcmpeqb xmm2, xmm0, xmmword ptr [rdx+0x10]
|   1    | 0.9       | 0.1 |           |           |     | 0.1 | CP | vpand xmm2, xmm2, xmm1
|   1    | 1.0       |     |           |           |     |     |    | vpmovmskb eax, xmm2
|   1    |           |     |           |           |     | 1.0 | CP | cmp eax, 0xffff
|   0F   |           |     |           |           |     |     |    | jnz 0x18
|   1    | 0.1       | 0.9 |           |           |     |     | CP | add rdx, 0x20
|   1    |           |     |           |           |     | 1.0 | CP | cmp rdx, rsi
|   0F   |           |     |           |           |     |     |    | jb 0xffffffffffffffe1

On SnB: pcmpeqb can run on p1/p5. Fused compare-and-branch can only run on p5. Non-fused cmp can run on p015. Anyway, if one of the branches doesn't macro-fuse, the loop can run at one iteration per 8/3 = 2.666 cycles. With macro-fusion, best-case is 7/3 = 2.333 cycles. (IACA doesn't try to simulate distribution of uops to ports exactly the way the hardware would dynamically make those decisions. However, we can't expect perfect scheduling from the hardware either, so 2 vectors per 2.5 cycles is probably reasonable with both macro-fusions happening. Uops that could have used port0 will sometimes steal port1 or port5, reducing throughput.)

As I said before, Haswell handles this loop better. IACA thinks HSW could run the loop at one iteration per 1.75c, but that's clearly wrong because the taken loop-branch ends the issue group. It will issue in a repeating 4,3 uop pattern. But the execution units can handle more throughput than the frontend for this loop, so it should really be able to keep up with the frontend on Haswell/Broadwell/Skylake and run at one iteration per 2 clocks.

Further unrolling of more vpcmpeqb / vpand is only 2 uops per vector (or 3 without AVX, where we'd load into a scratch and then use that as the destination for pcmpeqb.) So with sufficient unrolling, we should be able to do 2 vector loads per clock. Without AVX, this wouldn't be possible without the PAND trick, since a vector load/compare/movmsk/test-and-branch is 4 uops. Bigger unrolls make more work to decode the final position where we found a match: a scalar cmp-based cleanup loop might be a good idea once we're in the area. You could maybe use the same scalar loop for cleanup of non-multiple-of-32B sizes.

If using SSE, with movdqu / pcmpeqb xmm,xmm, we can use an indexed addressing mode without it costing us uops, because a movdqu load is always a single load uop regardless of addressing mode. (It doesn't need to micro-fuse with anything, unlike a store). This lets us save a uop of loop overhead by using a base pointer pointing to the end of the array, and the index counting up from zero. e.g. add %[idx], 32 / js to loop while the index is negative.

With AVX, however, we can save 2 uops by using a single-register addressing mode so vpcmpeqb %[v1], %[pat], [ %[p] + 16 ] can micro-fuse. This means we need the add/cmp/jcc loop structure I used in the example. The same applies to AVX2.

Community
  • 1
  • 1
Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
  • @user4602856: I had another idea for reducing overhead in the SIMD loop. Updated with code that should work 1.5 to 2x as fast as your SIMD loop, on your SnB CPU. – Peter Cordes Apr 25 '16 at 12:24
2

So I think I found the problem. I think one of the registers used in my inline assembly, despite the clobber list, was conflicting with g++'s use of them, and was corrupting the test iteration. I fed the g++ version of the code back in as inline assembly and got the same 260000x acceleration as my own. Also, in retrospect, the "accelerated" computation time was absurdly short.

Finally, I was so focused on the code embodied as a function that I failed to notice that g++ had, in fact, inlined (I was using -O3 optimization) the function into the test for-loop as well. When I forced g++ not to inline (i.e. -fno-inline), the 260000x acceleration disappeared.

I think g++ failed to take into account the inline assembly code's "clobber list" when it in-lined the entire function without my permission.

Lesson learned. I need to do better on inline assembly constraints or block inline-ing of the function with __attribute__ ((noinline))

EDIT: Definitely found that g++ is using rax for the main() for-loop counter, in conflict with my use of rax.

codechimp
  • 1,509
  • 1
  • 14
  • 21
  • "without your permission"? One of the benefits of GNU C inline asm syntax is that it can inline, so you can wrap it with a function with no overhead. Also, if your code clobbered the loop counter after inlining, that's either a gcc bug, or far more likely, a bug in your code. You probably got the constraints (outputs/inputs/constraints) wrong. Also, it's best to let gcc pick registers for you if need temporaries inside your asm block, by using output-only operands that the C never touches. See [the bottom of this answer](http://stackoverflow.com/a/34522750/224132) for guides. – Peter Cordes Apr 24 '16 at 03:39
  • I could have gotten the constraints wrong, but I was explicit on all. ... : [count] "=a" (count) : [nBytes] "b" (nBytes), [data] "D" (data) : "r9" – codechimp Apr 24 '16 at 03:40
  • That looks ok. It's possible you found a gcc bug. Post your source code. It's absolutely a bug in either gcc or your code if inlining breaks your code. Oh, your inline asm depends on the data in memory, but doesn't specify a `"memory"` clobber. (If you don't ask for it, the actual data pointed to by an input pointer is *not* considered an input. The gcc inline asm docs have an example of using a `struct` as an input to tell the compiler which data). Did you actually find the problem in the compiler output with your inline asm? Or are you just guessing that that was the problem? – Peter Cordes Apr 24 '16 at 04:06
  • 4
    @user4602856 Post the inline asm and we can look at the constraints/clobbers/etc. Without the code, we're just guessing. – David Wohlferd Apr 24 '16 at 04:31
  • I'm guessing on the problem. I looked but could not find the explicit reg conflict. But I'm a newb on this inline assembly biz, so I just assumed I did something wrong. – codechimp Apr 24 '16 at 11:52
  • Correction: I found the problem. g++ uses `rax` for the counter main() for-loop, while I use `rax` for 0xFF byte `count` in the assembly – codechimp Apr 24 '16 at 12:03
  • I had a look at the asm output from the code you posted. Your constraints are fine, and gcc is making correct code. I updated my answer with the real reason: gcc hoists the inlined `count2` out of the loop, but it's not doing that for `count1`. So the inline asm version just optimizes a *lot* better, defeating your repeat-loop. Your conclusions at the end of your answer turned out to be wrong. – Peter Cordes Apr 24 '16 at 14:17