11

Is there a gcc pragma or something I can use to force gcc to generate branch-free instructions on a specific section of code?

I have a piece of code that I want gcc to compile to branch-free code using cmov instructions:

int foo(int *a, int n, int x) {
    int i = 0, j = n;

    while (i < n) {
#ifdef PREFETCH
        __builtin_prefetch(a+16*i + 15);
#endif /* PREFETCH */
        j = (x <= a[i]) ? i : j;
        i = (x <= a[i]) ? 2*i + 1 : 2*i + 2;
    }
    return j;
}

and, indeed, it does so:

morin@soprano$ gcc -O4 -S -c test.c -o -    
    .file   "test.c"
    .text
    .p2align 4,,15
    .globl  foo
    .type   foo, @function
foo:
.LFB0:
    .cfi_startproc
    testl   %esi, %esi
    movl    %esi, %eax
    jle .L2
    xorl    %r8d, %r8d
    jmp .L3
    .p2align 4,,10
    .p2align 3
.L6:
    movl    %ecx, %r8d
.L3:
    movslq  %r8d, %rcx
    movl    (%rdi,%rcx,4), %r9d
    leal    (%r8,%r8), %ecx      # put 2*i in ecx
    leal    1(%rcx), %r10d       # put 2*i+1 in r10d
    addl    $2, %ecx             # put 2*i+2 in ecx
    cmpl    %edx, %r9d
    cmovge  %r10d, %ecx          # put 2*i+1 in ecx if appropriate
    cmovge  %r8d, %eax           # set j = i if appropriate
    cmpl    %esi, %ecx
    jl  .L6
.L2:
    rep ret
    .cfi_endproc
.LFE0:
    .size   foo, .-foo
    .ident  "GCC: (Ubuntu 4.8.2-19ubuntu1) 4.8.2"
    .section    .note.GNU-stack,"",@progbits

(Yes, I realize the loop is a branch, but I'm talking about the choice operators inside the loop.)

Unfortunately, when I enable the __builtin_prefetch call, gcc generates branchy code:

morin@soprano$ gcc -DPREFETCH -O4 -S -c test.c -o -
    .file   "test.c"
    .text
    .p2align 4,,15
    .globl  foo
    .type   foo, @function
foo:
.LFB0:
    .cfi_startproc
    testl   %esi, %esi
    movl    %esi, %eax
    jle .L7
    xorl    %ecx, %ecx
    jmp .L5
    .p2align 4,,10
    .p2align 3
.L3:
    movl    %ecx, %eax           # this is the x <= a[i] branch
    leal    1(%rcx,%rcx), %ecx
    cmpl    %esi, %ecx
    jge .L11
.L5:
    movl    %ecx, %r8d           # this is the main branch
    sall    $4, %r8d             # setup the prefetch
    movslq  %r8d, %r8            # setup the prefetch
    prefetcht0  60(%rdi,%r8,4)   # do the prefetch
    movslq  %ecx, %r8
    cmpl    %edx, (%rdi,%r8,4)   # compare x with a[i]
    jge .L3
    leal    2(%rcx,%rcx), %ecx   # this is the x > a[i] branch
    cmpl    %esi, %ecx
    jl  .L5
.L11:
    rep ret
.L7:
    .p2align 4,,5
    rep ret
    .cfi_endproc
.LFE0:
    .size   foo, .-foo
    .ident  "GCC: (Ubuntu 4.8.2-19ubuntu1) 4.8.2"
    .section    .note.GNU-stack,"",@progbits

I've tried using __attribute__((optimize("if-conversion2"))) on this function, but that has no effect.

The reason I care so much is that I haved hand-edited compiler-generated branch-free code (from the first example) to include the prefetcht0 instructions and it runs considerably faster than both of the versions gcc produces.

b4hand
  • 9,550
  • 4
  • 44
  • 49
Pat Morin
  • 1,688
  • 1
  • 12
  • 12
  • 1
    what optimization level are you compiling with? because I have a real hard time beating the compiler when I write normal code and use -O3 or -Ofast – Grady Player May 27 '15 at 15:28
  • I guess you can hint gcc about the probability of condition being true or false. It seems like your gcc is very convinced that the branch is either mostly true or mostly false so that runtime prediction works better than predication? – user3528438 May 27 '15 at 15:34
  • @GradyPlayer: This is compiled with -O4, and in the first example it happily uses cmov operations. – Pat Morin May 27 '15 at 18:17
  • @user3528438 : In that case, gcc is completely wrong. Each branch has a 50% chance of being taken at each step. – Pat Morin May 27 '15 at 18:19
  • Apparently, this is not specific to prefetching as such. Simply adding e.g. `asm ("");` instead of `__builtin_prefetch` has the same effect. – Jukka Suomela May 27 '15 at 20:05
  • @JukkaSuomela I can understand `asm` being a problem (who knows what it's doing) but gcc controls the code generated by the `__builtin_prefetch`, so I wouldn't have thought it would have such a profound effect. `clang` seems to handle it better. – Pat Morin May 27 '15 at 20:22

2 Answers2

6

If you really rely on that level of optimization, you have to write your own assembler stubs.

Reason is that even a modification elsewhere in the code might change the code the compiler (that is not gcc specific) emits. Also, a different version of gcc, different options (e.g. -fomit-frame-pointer) can change the code dramatically.

You should really only do this if you have to. Other influences might have much more impact, like cache configuration, memory allocation (DRAM-page/bank), execution order compared with concurrently run programs, CPU association, and much more. Play with compiler optimizations first. Command line options you will find in the docs (you did not post the version used, therefore not more specific).

A (serious) alternative would be to use clang/llvm. Or just help the gcc team improve their optimizers. You would not be the first. Note also that gcc has made massive improvements specifically for ARM over the last versions.

b4hand
  • 9,550
  • 4
  • 44
  • 49
too honest for this site
  • 12,050
  • 4
  • 30
  • 52
  • Unfortunately, this isn't acceptable. The code above is just a minimum working example. In reality, this is a templated C++ method in which "int* a" is "T* a" and "int x" is "T x". I can't write assembly stubs for every possible type T. This code is part of a project to study the effects of array layouts on searching. The prefetch instruction is very specific for a reason: The cache line containing the prefetched address will be accessed exactly four iterations after the prefetch instruction is issued. – Pat Morin May 27 '15 at 18:26
  • @PatMorin: No reason to blame (downvote? - bad attitude) the messenger for an unpleasant message. The gcc team will very likely be happy if you help them enhance their optimization algorithms. Alternatively, you might try clang/llvm; that might prove better here, or - at least - should be easier to extend with your own optimizer. Also note you tagged the question C and there are no templates in C, so you might as well blame yourself for giving wrong information. – too honest for this site May 27 '15 at 19:30
  • Fair enough. Sorry. I just wanted to highlight that writing assembly is not an answer to the question and jumped the gun. I'd remove my downvote, but it seems I'm not allowed to unless you change your answer. – Pat Morin May 27 '15 at 19:42
  • @PatMorin: Accepted. Just trying to help. (Its not for a downvote per se, but I just think it was not fair). – too honest for this site May 27 '15 at 19:54
  • Indeed, clang does what I want in this case (but does the opposite in another case that g++ handles well.) – Pat Morin May 27 '15 at 21:02
  • @PatMorin: I've written myself enough compilers/interpreters already to assure you this is quiet a minefield. Even more for high-end CPUs (whereas ARM is still easier than x86 - RISC vs. CISC instruction set with tons of legacy stuff). If you needs some well-defined environment, you might have a look at much simpler architectures like Cortex-M. They migh have less variation, but I would not bet on that. If you really have to rely on the code, you might really have to ressemble to assembler. Possibly automatically generated (I can recommend Python very much use it myself for this). – too honest for this site May 27 '15 at 21:04
5

It looks like gcc might have troubles to generate branch-free code on variables used in loop conditions and post-conditions, together with the constraints of keeping temporary registers alive across a pseudo-function intrinsic call.

There is something suspicious, the generated code from your function is different when using -funroll-all-loops and -fguess-branch-probability. I generates many return instructions. It smells like a little bug in gcc, around the rtl pass of the compiler, or simplifications of blocks of codes.

The following code is branch-less in both cases. This would be a good reason to submit a bug to GCC. At level -O3, GCC should always generate the same code.

int foo( int *a,  int n,  int x) {
     int c, i = 0, j = n;

    while (i < n) {
#ifdef PREFETCH
        __builtin_prefetch(a+16*i + 15);
#endif /* PREFETCH */
        c = (x > a[i]);
        j = c ? j : i;
        i = 2*i + 1 + c;
    }
    return j;
}

which generates this

        .cfi_startproc
        testl   %esi, %esi
        movl    %esi, %eax
        jle     .L4
        xorl    %ecx, %ecx
        .p2align 4,,10
        .p2align 3
.L3:
        movslq  %ecx, %r8
        cmpl    %edx, (%rdi,%r8,4)
        setl    %r8b
        cmovge  %ecx, %eax
        movzbl  %r8b, %r8d
        leal    1(%r8,%rcx,2), %ecx
        cmpl    %ecx, %esi
        jg      .L3
.L4:
        rep ret
        .cfi_endproc

and this

        .cfi_startproc
        testl   %esi, %esi
        movl    %esi, %eax
        jle     .L5
        xorl    %ecx, %ecx
        .p2align 4,,10
        .p2align 3
.L4:
        movl    %ecx, %r8d
        sall    $4, %r8d
        movslq  %r8d, %r8
        prefetcht0      60(%rdi,%r8,4)
        movslq  %ecx, %r8
        cmpl    %edx, (%rdi,%r8,4)
        setl    %r8b
        testb   %r8b, %r8b
        movzbl  %r8b, %r9d
        cmove   %ecx, %eax
        leal    1(%r9,%rcx,2), %ecx
        cmpl    %ecx, %esi
        jg      .L4
.L5:
        rep ret
        .cfi_endproc
Pierre
  • 437
  • 3
  • 4