
I wrote C code like this:

#include <stdio.h>
#define N 19

int main(void){
    int a[N];
    int ans = 0;

    for(int i = 0; i < N; ++i){
        a[i] = 0;
    }
    for(;;){
        int i;
        ++ans;
        for(i = N - 1; a[i] == 2; --i){
            if(i == 0){
                printf("%d\n", ans);
                return 0;
            }else{
                a[i] = 0;
            }
        }
        ++a[i];
    }
}

This counts the number of ways to choose N (= 19) numbers from 0 to 2, and prints that count (3^19 = 1,162,261,467).
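
For reference, here is a minimal standalone sketch (not part of the program above) that cross-checks the expected count by computing 3^N directly:

#include <stdio.h>
#define N 19

int main(void){
    /* The loop-based program enumerates every length-N sequence over {0, 1, 2},
       so the count it prints should equal 3^N. */
    long long ways = 1;
    for(int i = 0; i < N; ++i){
        ways *= 3;
    }
    printf("%lld\n", ways);  /* prints 1162261467 */
    return 0;
}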

I compiled this code with gcc. -O1 was faster than -O2. Why was -O2 optimization worse than -O1?

  • CPU: Intel(R) Core(TM) i7-8565U, x86_64
  • OS: Arch Linux (5.9.1-arch1-1)
  • compiler: gcc (GCC) 10.2.0

Edit:

Running gcc with the -S option produced the following assembly code:

-O1

    .file   "a.c"
    .text
    .section    .rodata.str1.1,"aMS",@progbits,1
.LC0:
    .string "%d\n"
    .text
    .globl  main
    .type   main, @function
main:
.LFB11:
    .cfi_startproc
    subq    $104, %rsp
    .cfi_def_cfa_offset 112
    movq    %fs:40, %rax
    movq    %rax, 88(%rsp)
    xorl    %eax, %eax
    movq    %rsp, %rax
    leaq    76(%rsp), %rdx
.L2:
    movl    $0, (%rax)
    addq    $4, %rax
    cmpq    %rdx, %rax
    jne .L2
    movl    $0, %esi
    jmp .L7
.L4:
    movslq  %edx, %rdx
    addl    $1, %ecx
    movl    %ecx, (%rsp,%rdx,4)
.L7:
    addl    $1, %esi
    movl    72(%rsp), %ecx
    leaq    68(%rsp), %rax
    movl    $18, %edx
    cmpl    $2, %ecx
    jne .L4
.L5:
    movl    $0, 4(%rax)
    subl    $1, %edx
    movl    (%rax), %ecx
    cmpl    $2, %ecx
    jne .L4
    subq    $4, %rax
    testl   %edx, %edx
    jne .L5
    leaq    .LC0(%rip), %rdi
    movl    $0, %eax
    call    printf@PLT
    movq    88(%rsp), %rax
    subq    %fs:40, %rax
    jne .L14
    movl    $0, %eax
    addq    $104, %rsp
    .cfi_remember_state
    .cfi_def_cfa_offset 8
    ret
.L14:
    .cfi_restore_state
    call    __stack_chk_fail@PLT
    .cfi_endproc
.LFE11:
    .size   main, .-main
    .ident  "GCC: (GNU) 10.2.0"
    .section    .note.GNU-stack,"",@progbits

-O2

    .file   "a.c"
    .text
    .section    .rodata.str1.1,"aMS",@progbits,1
.LC0:
    .string "%d\n"
    .section    .text.startup,"ax",@progbits
    .p2align 4
    .globl  main
    .type   main, @function
main:
.LFB11:
    .cfi_startproc
    subq    $104, %rsp
    .cfi_def_cfa_offset 112
    movl    $9, %ecx
    xorl    %esi, %esi
    movq    %fs:40, %rax
    movq    %rax, 88(%rsp)
    xorl    %eax, %eax
    movq    %rsp, %rdx
    movq    %rdx, %rdi
    rep stosq
    movl    $0, (%rdi)
    leaq    68(%rsp), %rdi
.L6:
    movl    72(%rsp), %ecx
    addl    $1, %esi
    movq    %rdi, %rax
    movl    $18, %edx
    cmpl    $2, %ecx
    je  .L4
    jmp .L3
    .p2align 4,,10
    .p2align 3
.L5:
    subq    $4, %rax
    testl   %edx, %edx
    je  .L14
.L4:
    movl    (%rax), %ecx
    movl    $0, 4(%rax)
    subl    $1, %edx
    cmpl    $2, %ecx
    je  .L5
.L3:
    movslq  %edx, %rdx
    addl    $1, %ecx
    movl    %ecx, (%rsp,%rdx,4)
    jmp .L6
    .p2align 4,,10
    .p2align 3
.L14:
    xorl    %eax, %eax
    leaq    .LC0(%rip), %rdi
    call    printf@PLT
    movq    88(%rsp), %rax
    subq    %fs:40, %rax
    jne .L15
    xorl    %eax, %eax
    addq    $104, %rsp
    .cfi_remember_state
    .cfi_def_cfa_offset 8
    ret
.L15:
    .cfi_restore_state
    call    __stack_chk_fail@PLT
    .cfi_endproc
.LFE11:
    .size   main, .-main
    .ident  "GCC: (GNU) 10.2.0"
    .section    .note.GNU-stack,"",@progbits

And the benchmark results are:

$ gcc a.c -O1
$ time ./a.out
1162261467

real    0m0.895s
user    0m0.894s
sys 0m0.000s
$ time ./a.out
1162261467

real    0m0.912s
user    0m0.911s
sys 0m0.000s
$ time ./a.out
1162261467

real    0m0.925s
user    0m0.924s
sys 0m0.001s
$ gcc a.c -O2
$ time ./a.out
1162261467

real    0m1.570s
user    0m1.568s
sys 0m0.000s
$ time ./a.out
1162261467

real    0m1.567s
user    0m1.562s
sys 0m0.004s
$ time ./a.out
1162261467

real    0m1.576s
user    0m1.568s
sys 0m0.001s
$ gcc a.c -O3
$ time ./a.out
1162261467

real    0m1.613s
user    0m1.612s
sys 0m0.000s
$ time ./a.out
1162261467

real    0m1.608s
user    0m1.599s
sys 0m0.003s
$ time ./a.out
1162261467

real    0m1.628s
user    0m1.628s
sys 0m0.000s
$ gcc a.c -Ofast
$ time ./a.out
1162261467

real    0m1.571s
user    0m1.570s
sys 0m0.001s
$ time ./a.out
1162261467

real    0m1.604s
user    0m1.595s
sys 0m0.004s
$ time ./a.out
1162261467

real    0m1.616s
user    0m1.613s
sys 0m0.000s
$ gcc a.c -O0
$ time ./a.out
1162261467

real    0m2.457s
user    0m2.456s
sys 0m0.001s
$ time ./a.out
1162261467

real    0m2.526s
user    0m2.525s
sys 0m0.000s
$ time ./a.out
1162261467

real    0m2.565s
user    0m2.565s
sys 0m0.000s

Edit:

I edited the code like this:

#include <stdio.h>
#define N 19

volatile int answer;

int main(void){
    int a[N];
    int ans = 0;

    for(int i = 0; i < N; ++i){
        a[i] = 0;
    }
    for(;;){
        int i;
        ++ans;
        for(i = N - 1; a[i] == 2; --i){
            if(i == 0){
                answer = ans;
                return 0;
            }else{
                a[i] = 0;
            }
        }
        ++a[i];
    }
}

And I measured again:

$ gcc a.c -O1
$ time ./a.out

real    0m0.924s
user    0m0.924s
sys 0m0.000s
$ time ./a.out

real    0m0.950s
user    0m0.949s
sys 0m0.000s
$ time ./a.out

real    0m0.993s
user    0m0.989s
sys 0m0.004s
$ gcc a.c -O2
$ time ./a.out

real    0m1.637s
user    0m1.636s
sys 0m0.000s
$ time ./a.out

real    0m1.661s
user    0m1.656s
sys 0m0.004s
$ time ./a.out

real    0m1.656s
user    0m1.654s
sys 0m0.001s

Edit:

I added the [[likely]] attribute after for(;;) (and compiled as C++ with g++, since [[likely]] is a C++ attribute):

#include <stdio.h>
#define N 19

int main(void){
    int a[N];
    int ans = 0;

    for(int i = 0; i < N; ++i){
        a[i] = 0;
    }
    for(;;) [[likely]] {
        int i;
        ++ans;
        for(i = N - 1; a[i] == 2; --i){
            if(i == 0){
                printf("%d\n", ans);
                return 0;
            }else{
                a[i] = 0;
            }
        }
        ++a[i];
    }
}

Then the benchmark results changed:

$ g++ a.cpp -O1
$ for i in {1..5}; do time ./a.out; done
1162261467
./a.out  0.65s user 0.00s system 99% cpu 0.653 total
1162261467
./a.out  0.65s user 0.00s system 99% cpu 0.657 total
1162261467
./a.out  0.66s user 0.00s system 99% cpu 0.656 total
1162261467
./a.out  0.66s user 0.00s system 99% cpu 0.665 total
1162261467
./a.out  0.66s user 0.00s system 99% cpu 0.660 total
$ g++ a.cpp -O2
$ for i in {1..5}; do time ./a.out; done
1162261467
./a.out  0.66s user 0.00s system 99% cpu 0.661 total
1162261467
./a.out  0.65s user 0.00s system 99% cpu 0.648 total
1162261467
./a.out  0.66s user 0.00s system 99% cpu 0.659 total
1162261467
./a.out  0.65s user 0.00s system 99% cpu 0.654 total
1162261467
./a.out  0.66s user 0.00s system 99% cpu 0.657 total

-O2 is as fast as -O1! Thank you @Acorn.

  • Check the generated assembly and find out – underscore_d Nov 19 '20 at 10:35
  • `was faster` How did you measure "fastness"? `was worse` How did you measure it? – KamilCuk Nov 19 '20 at 10:35
  • I can reproduce with my GCC. But since you asked on a specific arch, then please add the assembly to the question! Compile with `-S`. – Antti Haapala -- Слава Україні Nov 19 '20 at 10:38
  • Some 90% of all strange benchmarking questions posted on SO can be explained by incorrect benchmarking. So maybe start there. How are we to reproduce this? – Lundin Nov 19 '20 at 10:38
  • @Lundin `time ./a.out` in bash, 0.9 cpu seconds vs 1.5 cpu seconds, seems reliable. – Antti Haapala -- Слава Україні Nov 19 '20 at 10:39
  • Try `gcc -Ofast ...` and see http://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html to try and reduce the `-O2` options to the one that makes your code run slower :-) – pmg Nov 19 '20 at 10:40
  • @pmg -Ofast I get 1.5~ cpu seconds. Same for -O3, -O2. -O1 ~0.9 seconds, -O0 3 seconds – Antti Haapala -- Слава Україні Nov 19 '20 at 10:41
  • What happens if you replace printf with a volatile dummy variable write? – Lundin Nov 19 '20 at 10:43
  • @Lundin printf prints at the end only – Antti Haapala -- Слава Україні Nov 19 '20 at 10:45
  • @AnttiHaapala Yes but it too may or may not be compiled along with the -O option. – Lundin Nov 19 '20 at 10:46
  • I think this is *again* a duplicate of Peter Cordes' loads stuff... – Antti Haapala -- Слава Україні Nov 19 '20 at 10:53
  • https://stackoverflow.com/a/62185084/918959 this. Except that this time it is optimized. – Antti Haapala -- Слава Україні Nov 19 '20 at 10:54
  • I.e. because the array indexing *reads* from memory / *writes* to memory, if this happens too often in the pipeline it stalls. – Antti Haapala -- Слава Україні Nov 19 '20 at 10:55
  • @AnttiHaapala There's a lot of store-forwarding, but only about 2/3rds of it is to the element that's about to be read next. As "carry" propagates to lower and lower elements, there's more room for OoO exec to hide the latency of multiple independent chains of store forwarding. On SKL I'm seeing ~2.48 IPC, ~2.34 fused-domain uops/clock with `perf stat` for `-O2`. (macro-fusion makes uops != insns.) And BTW, it's not *my* load stuff, don't blame me for Sandybridge's weirdness :P (j/k, I know what you meant, but turns out I wasn't the first to notice either; that answer of mine links a blog.) – Peter Cordes Nov 21 '20 at 09:19
  • @AnttiHaapala: `-O1` hits 3.72 IPC, 3.61 fused-domain uops/clock. I'm also on Arch Linux, same versions as OP, and same CPU microarchitecture (Kaby / Coffee Lake are the same as Skylake; I set up Arch Linux's microcode update so my loop buffer is disabled). Interestingly, `gcc -O1 -march=native` is even faster, 913ms vs. 961. (3.56 billion cycles vs. 3.75). But exact same instruction count, so 3.92 IPC. – Peter Cordes Nov 21 '20 at 09:21
  • @AnttiHaapala: Also note that the dep chain for any given counter is only 3 store-forwardings long, starting from a `mov [mem], 0` that breaks the dependency. (With only a control dependency on the previous chain, thus speculatively executed without waiting.) Store forwarding effectively "renames" memory, avoiding WAW and WAR hazards, so OoO exec can overlap them, making dependency not matter. https://blog.stuffedcow.net/2014/01/x86-memory-disambiguation/ / https://en.wikipedia.org/wiki/Memory_disambiguation#Avoiding_WAR_and_WAW_dependencies. So SF *latency* shouldn't be a problem at all. – Peter Cordes Nov 21 '20 at 09:34
  • Hmm, `-O1 -march=native` vs. `-O1` generic numbers aren't super stable; some sets of runs (`perf stat -r4`) the generic `-O1` is almost as fast as the best "native", and sometimes "native" is almost as slow as the generic usually is. The only diff is `inc`/`dec` instead of `add` or `sub reg,1`, so just code alignment, presumably affecting branch-predictor aliasing. The inner loop branches don't touch 32-byte boundaries in either case, so we're not seeing the effects of the JCC erratum mitigation. – Peter Cordes Nov 21 '20 at 10:12

2 Answers


Why was -O2 optimization worse than -O1?

Higher optimization levels usually give better performance, but you can find exceptions, particularly in micro-benchmarks like this one.

The code and data that your program uses are so small that caches and memory accesses are unlikely to be an issue. However, the code is branch-heavy, which means performance comes down to static and dynamic branch prediction.

If your compiler gets it wrong, as in this case, you can try to give it more information, either with likely/unlikely hints or by profiling the program (profile-guided optimization).
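
For example, in plain C (where the C++ [[likely]] attribute is not available before C23), the same kind of hint can be expressed with GCC's __builtin_expect. Below is a sketch of the original program with such a hint added; the exact placement is an illustration, not necessarily the poster's final change:

#include <stdio.h>
#define N 19

int main(void){
    int a[N];
    int ans = 0;

    for(int i = 0; i < N; ++i){
        a[i] = 0;
    }
    for(;;){
        int i;
        ++ans;
        /* Hint to GCC: the carry condition a[i] == 2 is expected to be
           false most of the time, so the inner loop usually exits right
           away.  __builtin_expect(expr, 0) marks expr as likely false. */
        for(i = N - 1; __builtin_expect(a[i] == 2, 0); --i){
            if(i == 0){
                printf("%d\n", ans);
                return 0;
            }else{
                a[i] = 0;
            }
        }
        ++a[i];
    }
}

Alternatively, profile-guided optimization (compile with -fprofile-generate, run the program on representative input, then recompile with -fprofile-use) lets GCC measure the real branch probabilities instead of guessing.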

Acorn
  • I added `[[likely]]` after `for(;;)`, and then -O2 became as fast as -O1! Thanks a lot! – fiveseven Nov 21 '20 at 08:14
  • @fiveseven: In practice, `-fprofile-generate` / test run / `-fprofile-use` (profile-guided optimization) generally fixes any branch layout decisions. (And branchy vs. branchless, like in [gcc optimization flag -O3 makes code slower than -O2](https://stackoverflow.com/q/28875325)). If you can do profiling runs (with input that's representative of your real typical workloads), that's usually better than manually trying to guess or figure out which conditions would benefit from manual hints, for large-scale programs. (And more future-proof! Changes to other code may change which path is taken.) – Peter Cordes Nov 21 '20 at 08:51
  • I tried that. `$ g++ a.cpp -O2 -fprofile-generate`, `$ ./a.out`, then `g++ a.cpp -O2 -fprofile-use`, and it worked! (0.70s, 0.69s, 0.68s, 0.67s, 0.71s) Thank you – fiveseven Nov 21 '20 at 09:19

-O2 turns on many options in addition to -O1, for example -falign-functions, -falign-jumps, -falign-labels and -falign-loops. Each of them seemed to have a negative performance impact when added on top of -O1. I have an i7-8550U and GCC 9.3.0-17ubuntu1~20.04.

I believe branch prediction failures make this hard on the processor.
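
A rough way to check this claim yourself (these are standard GCC flags, and the timings will of course vary by machine) is to start from -O1 and add the alignment options one at a time:

$ gcc a.c -O1 -falign-functions && time ./a.out
$ gcc a.c -O1 -falign-jumps && time ./a.out
$ gcc a.c -O1 -falign-labels && time ./a.out
$ gcc a.c -O1 -falign-loops && time ./a.out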