Why does adding inline assembly comments cause such radical change in GCC's generated code?

Question

So, I had this code:

constexpr unsigned N = 1000;
// Element-wise byte addition, indexed form: sum[i] = a[i] + b[i] for i in [0, N).
// `i < N` compares an int with the unsigned constant N, so i is converted to
// unsigned for the comparison.
void f1(char* sum, char* a, char* b) {
    for(int i = 0; i < N; ++i) {
        sum[i] = a[i] + b[i];  // char addition; the result is stored back as a char
    }
}

// Element-wise byte addition, pointer form: walks sum, a and b in lockstep
// until sum reaches sum + N, storing *a + *b into *sum at each step.
void f2(char* sum, char* a, char* b) {
    char* end = sum + N;       // one past the last element to be written
    while(sum != end) {
        *sum++ = *a++ + *b++;  // store the sum, then advance all three pointers
    }
}

I wanted to see the code that GCC 4.7.2 would generate, so I ran `g++ -march=native -O3 -masm=intel -S a.c++ -std=c++11` and got the following output:

        .file   "a.c++"
        .intel_syntax noprefix
        .text
        .p2align 4,,15
        .globl  _Z2f1PcS_S_
        .type   _Z2f1PcS_S_, @function
# f1(char*, char*, char*): listing with a 16-byte SIMD loop and a scalar byte loop.
# NOTE: the '#' annotations in this function were added for readers; they are not GCC output.
# Register roles as used below: rdi = store destination, rsi and rdx = load sources
# (presumably sum, a, b under the System V AMD64 convention -- TODO confirm target ABI).
_Z2f1PcS_S_:
.LFB0:
        .cfi_startproc
        lea     rcx, [rdx+16]           # rcx = rdx + 16
        lea     rax, [rdi+16]           # rax = rdi + 16
        cmp     rdi, rcx
        setae   r8b                     # r8b = (rdi >= rdx + 16), unsigned
        cmp     rdx, rax
        setae   cl                      # cl = (rdx >= rdi + 16), unsigned
        or      cl, r8b
        je      .L5                     # neither holds: 16-byte windows at rdi and rdx overlap
        lea     rcx, [rsi+16]           # same overlap test, now rdi against rsi
        cmp     rdi, rcx
        setae   cl                      # cl = (rdi >= rsi + 16), unsigned
        cmp     rsi, rax
        setae   al                      # al = (rsi >= rdi + 16), unsigned
        or      cl, al
        je      .L5                     # windows at rdi and rsi overlap: take the scalar-only path
        xor     eax, eax                # rax = byte offset for the vector loop, starting at 0
        .p2align 4,,10
        .p2align 3
# Vector loop: 16 bytes per iteration, offsets 0..991.
.L3:
        movdqu  xmm0, XMMWORD PTR [rdx+rax]     # unaligned 16-byte load
        movdqu  xmm1, XMMWORD PTR [rsi+rax]     # unaligned 16-byte load
        paddb   xmm0, xmm1                      # 16 byte-wise additions at once
        movdqu  XMMWORD PTR [rdi+rax], xmm0     # unaligned 16-byte store
        add     rax, 16
        cmp     rax, 992                # 992 = 62 * 16, largest multiple of 16 below 1000
        jne     .L3
        mov     ax, 8                   # 8 bytes remain; rax was 992, so writing ax leaves eax = 8
        mov     r9d, 992                # r9 = offset at which the scalar loop starts
# Scalar loop setup. On entry: eax = number of bytes to process, r9 = starting offset.
.L2:
        sub     eax, 1
        lea     rcx, [rdx+r9]           # rcx = rdx + offset (edx is reused as a temporary in .L4)
        add     rdi, r9                 # advance the destination by the offset
        lea     r8, [rax+1]             # r8 = byte count, now as a 64-bit loop bound
        add     rsi, r9                 # advance the other source by the offset
        xor     eax, eax                # rax = index within the scalar loop
        .p2align 4,,10
        .p2align 3
# Scalar loop: one byte per iteration until rax reaches r8.
.L4:
        movzx   edx, BYTE PTR [rcx+rax]
        add     dl, BYTE PTR [rsi+rax]
        mov     BYTE PTR [rdi+rax], dl
        add     rax, 1
        cmp     rax, r8
        jne     .L4
        rep                             # prefix for the following ret ("rep ret")
        ret
# Overlap fallback: run the scalar loop over all 1000 bytes from offset 0.
.L5:
        mov     eax, 1000
        xor     r9d, r9d
        jmp     .L2
        .cfi_endproc
.LFE0:
        .size   _Z2f1PcS_S_, .-_Z2f1PcS_S_
        .p2align 4,,15
        .globl  _Z2f2PcS_S_
        .type   _Z2f2PcS_S_, @function
# f2(char*, char*, char*): listing with a 16-byte SIMD loop and a scalar byte loop.
# NOTE: the '#' annotations in this function were added for readers; they are not GCC output.
# Register roles as used below: rdi = store destination, rsi and rdx = load sources
# (presumably sum, a, b under the System V AMD64 convention -- TODO confirm target ABI).
_Z2f2PcS_S_:
.LFB1:
        .cfi_startproc
        lea     rcx, [rdx+16]           # rcx = rdx + 16
        lea     rax, [rdi+16]           # rax = rdi + 16
        cmp     rdi, rcx
        setae   r8b                     # r8b = (rdi >= rdx + 16), unsigned
        cmp     rdx, rax
        setae   cl                      # cl = (rdx >= rdi + 16), unsigned
        or      cl, r8b
        je      .L19                    # neither holds: 16-byte windows at rdi and rdx overlap
        lea     rcx, [rsi+16]           # same overlap test, now rdi against rsi
        cmp     rdi, rcx
        setae   cl                      # cl = (rdi >= rsi + 16), unsigned
        cmp     rsi, rax
        setae   al                      # al = (rsi >= rdi + 16), unsigned
        or      cl, al
        je      .L19                    # windows at rdi and rsi overlap: take the scalar-only path
        xor     eax, eax                # rax = byte offset for the vector loop, starting at 0
        .p2align 4,,10
        .p2align 3
# Vector loop: 16 bytes per iteration, offsets 0..991.
.L17:
        movdqu  xmm0, XMMWORD PTR [rdx+rax]     # unaligned 16-byte load
        movdqu  xmm1, XMMWORD PTR [rsi+rax]     # unaligned 16-byte load
        paddb   xmm0, xmm1                      # 16 byte-wise additions at once
        movdqu  XMMWORD PTR [rdi+rax], xmm0     # unaligned 16-byte store
        add     rax, 16
        cmp     rax, 992                # 992 = 62 * 16, largest multiple of 16 below 1000
        jne     .L17
        add     rdi, 992                # advance all three pointers past the vectorized part
        add     rsi, 992
        add     rdx, 992
        mov     r8d, 8                  # 8 bytes remain for the scalar loop
# Scalar loop setup. On entry: r8 = number of bytes to process.
.L16:
        xor     eax, eax                # rax = index within the scalar loop
        .p2align 4,,10
        .p2align 3
# Scalar loop: one byte per iteration until rax reaches r8.
.L18:
        movzx   ecx, BYTE PTR [rdx+rax]
        add     cl, BYTE PTR [rsi+rax]
        mov     BYTE PTR [rdi+rax], cl
        add     rax, 1
        cmp     rax, r8
        jne     .L18
        rep                             # prefix for the following ret ("rep ret")
        ret
# Overlap fallback: run the scalar loop over all 1000 bytes with the pointers unchanged.
.L19:
        mov     r8d, 1000
        jmp     .L16
        .cfi_endproc
.LFE1:
        .size   _Z2f2PcS_S_, .-_Z2f2PcS_S_
        .ident  "GCC: (GNU) 4.7.2"
        .section        .note.GNU-stack,"",@progbits

I suck at reading assembly, so I decided to add some markers to know where the bodies of the loops went:

constexpr unsigned N = 1000;
void f1(char* sum, char* a, char* b) {  // indexed byte-wise add with a marker in the loop body
    for(int i = 0; i < N; ++i) {        // int i is converted to unsigned for the comparison with N
        asm("# im in ur loop");         // basic asm; its text is only an assembler comment
        sum[i] = a[i] + b[i];
    }
}

void f2(char* sum, char* a, char* b) {  // pointer-walking byte-wise add with a marker in the loop body
    char* end = sum + N;                // one past the last element to be written
    while(sum != end) {
        asm("# im in ur loop");         // basic asm; its text is only an assembler comment
        *sum++ = *a++ + *b++;           // store the sum, then advance all three pointers
    }
}

And GCC spat this out:

    .file   "a.c++"
    .intel_syntax noprefix
    .text
    .p2align 4,,15
    .globl  _Z2f1PcS_S_
    .type   _Z2f1PcS_S_, @function
# f1 with the asm marker: a single scalar loop, one byte per iteration.
# NOTE: annotations were added for readers. The only comment coming from the source's
# asm statement is the "im in ur loop" line between #APP and #NO_APP.
_Z2f1PcS_S_:
.LFB0:
    .cfi_startproc
    xor eax, eax                        # rax = loop index, starting at 0
    .p2align 4,,10
    .p2align 3
.L2:
# The asm statement's text follows, inside the loop body; the marker line names a.c++ line 4.
#APP
# 4 "a.c++" 1
    # im in ur loop
# 0 "" 2
#NO_APP
    movzx   ecx, BYTE PTR [rdx+rax]     # load one byte from rdx + index
    add cl, BYTE PTR [rsi+rax]          # add the byte at rsi + index
    mov BYTE PTR [rdi+rax], cl          # store the result at rdi + index
    add rax, 1
    cmp rax, 1000                       # loop until the index reaches 1000
    jne .L2
    rep                                 # prefix for the following ret ("rep ret")
    ret
    .cfi_endproc
.LFE0:
    .size   _Z2f1PcS_S_, .-_Z2f1PcS_S_
    .p2align 4,,15
    .globl  _Z2f2PcS_S_
    .type   _Z2f2PcS_S_, @function
# f2 with the asm marker: a single scalar loop, one byte per iteration.
# NOTE: annotations were added for readers. The only comment coming from the source's
# asm statement is the "im in ur loop" line between #APP and #NO_APP.
_Z2f2PcS_S_:
.LFB1:
    .cfi_startproc
    xor eax, eax                        # rax = loop index, starting at 0
    .p2align 4,,10
    .p2align 3
.L6:
# The asm statement's text follows, inside the loop body; the marker line names a.c++ line 12.
#APP
# 12 "a.c++" 1
    # im in ur loop
# 0 "" 2
#NO_APP
    movzx   ecx, BYTE PTR [rdx+rax]     # load one byte from rdx + index
    add cl, BYTE PTR [rsi+rax]          # add the byte at rsi + index
    mov BYTE PTR [rdi+rax], cl          # store the result at rdi + index
    add rax, 1
    cmp rax, 1000                       # loop until the index reaches 1000
    jne .L6
    rep                                 # prefix for the following ret ("rep ret")
    ret
    .cfi_endproc
.LFE1:
    .size   _Z2f2PcS_S_, .-_Z2f2PcS_S_
    .ident  "GCC: (GNU) 4.7.2"
    .section    .note.GNU-stack,"",@progbits

This is considerably shorter, and has some significant differences like the lack of SIMD instructions. I was expecting the same output, with some comments somewhere in the middle of it. Am I making some wrong assumption here? Is GCC's optimizer hindered by asm comments?

I'd expect GCC (and most compilers) to treat ASM constructs like black boxes, so they can't reason about what happens through such a box. And that does inhibit many optimizations, especially those that are carried across loop boundaries. — Ira Baxter, Dec 19 '12 at 15:05
Try the extended `asm` form with empty output and clobber lists. — Kerrek SB, Dec 19 '12 at 15:05
@R.MartinhoFernandes: `asm("# im in ur loop" : : );` (see [documentation](http://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html)) — Mike Seymour, Dec 19 '12 at 15:21
@KerrekSB, Mike, thanks; that does not make a difference though. I guess I need to get more acquainted with assembly. — R. Martinho Fernandes, Dec 19 '12 at 15:24
Note that you can get a bit more help when looking at the generated assembly by adding the `-fverbose-asm` flag, which adds some annotations to help identify how things are moving around between registers. — Matthew Slattery, Dec 19 '12 at 15:30
Very interesting. Can be used to selectively avoid optimization in loops? — SChepurin, Dec 19 '12 at 15:32
@KerrekSB A do nothing `asm` statement is useless by definition. — curiousguy, Oct 06 '15 at 00:29
You get two different answers because you ask two different questions: translate this loop to assembly and translate these statements in a loop in assembly. — curiousguy, Oct 06 '15 at 00:38

score 65 · Accepted Answer · answered Dec 19 '12 at 15:18

65

The interactions with optimisations are explained about halfway down the "Assembler Instructions with C Expression Operands" page in the documentation.

GCC doesn't try to understand any of the actual assembly inside the asm; the only thing it knows about the content is what you (optionally) tell it in the output and input operand specification and the register clobber list.

In particular, note:

An asm instruction without any output operands will be treated identically to a volatile asm instruction.

and

The volatile keyword indicates that the instruction has important side-effects [...]

So the presence of the asm inside your loop has inhibited a vectorisation optimisation, because GCC assumes it has side effects.

answered Dec 19 '12 at 15:18

Matthew Slattery

45,290
8
103
119

1

Note that the side-effects of a Basic Asm statement must not include modifying registers or any memory that your C++ code ever reads/writes. But yes, the `asm` statement has to run once for every time it would in the C++ abstract machine, and GCC chooses not to vectorize and then emit the asm 16 times in a row per `paddb`. That would I think be legal though, because the char accesses aren't `volatile`. (Unlike with an extended asm statement with a `"memory"` clobber) – Peter Cordes Mar 11 '20 at 20:47
1

See https://gcc.gnu.org/wiki/ConvertBasicAsmToExtended for reasons not to use GNU C Basic Asm statements in general. Although this use case (just a comment marker) is one of the few where it's not unreasonable to try it. – Peter Cordes Mar 11 '20 at 20:56

score 23 · Answer 2 · answered Dec 19 '12 at 15:14

Note that gcc vectorized the code, splitting the loop body into two parts, the first processing 16 items at a time, and the second doing the remainder later.

As Ira commented, the compiler doesn't parse the asm block, so it does not know that it's just a comment. Even if it did, it has no way of knowing what you intended. The optimized loops have the body doubled; should it put your asm in each? Would you like it that it isn't executed 1000 times? It doesn't know, so it goes the safe route and falls back to the simple single loop.

score 3 · Answer 3 · answered Dec 20 '12 at 16:31

I don't agree with the "gcc doesn't understand what is in the asm() block". For example, gcc can deal quite well with optimising parameters, and even re-arranging asm() blocks such that they intermingle with the generated C code. This is why, if you look at inline assembler in for example the Linux kernel, it is nearly always prefixed with __volatile__ to ensure that the compiler "doesn't move the code around". I have had gcc move my "rdtsc" around, which threw off my measurements of the time it took to do certain things.

As documented, gcc treats certain types of asm() blocks as "special", and thus doesn't optimise the code either side of the block.

That's not to say that gcc won't, sometimes, get confused by inline assembler blocks, or simply decide to give up on some particular optimisation because it can't follow the consequences of the assembler code, etc, etc. More importantly, it can often get confused by missing clobber tags - so if you have some instruction like cpuid that changes the value of EAX-EDX, but you wrote the code so that it only uses EAX, the compiler may store things in EBX, ECX and EDX, and then your code acts very strange when these registers are overwritten... If you are lucky, it crashes immediately - then it's easy to figure out what goes on. But if you are unlucky, it crashes way down the line... Another tricky one is the divide instruction that gives a second result in edx. If you don't care about the modulo, it's easy to forget that EDX was changed.

gcc really doesn't understand what is in the asm block - you have to tell it via an extended asm statement. without this extra information, gcc will not move around such blocks. gcc also doesn't get confused in the cases you state - you simply made a programming error by telling gcc it can use those registers when in fact, your code clobbers them. — Remember Monica, Nov 25 '15 at 10:34
Late reply, but I think it's worth saying. `volatile asm` tells GCC the code may have 'important side effects', and it'll deal with it with more special care. It may _still_ be deleted as part of dead-code-optimization or moved out. Interaction with C code needs to assume such (rare) case and impose strict sequential evaluation (e.g. by creating dependencies within the asm). — edmz, Nov 11 '17 at 13:06
GNU C Basic asm (no operand constraints, like the OP's `asm("")`) is implicitly volatile, just like Extended asm with no output operands. GCC doesn't understand the asm template string, only the constraints; that's why it's *essential* to accurately and completely describe your asm to the compiler using constraints. Substituting operands into the template string doesn't take any more understanding than `printf` using a format string. TL:DR: don't use GNU C Basic asm for anything, except maybe use-cases like this with pure comments. — Peter Cordes, Mar 11 '20 at 20:51

curiousguy · Answer 4 · 2020-03-13T02:16:18.983

-2

This answer is now modified: it was originally written with a mindset considering inline Basic Asm as a pretty strongly specified tool, but it's nothing like that in GCC. Basic Asm is weak and so the answer was edited.

Each assembly comment acts as a breakpoint.

EDIT: But a broken one, as you use Basic Asm. Inline asm (an asm statement inside a function body) without an explicit clobber list is a weakly specified feature in GCC and its behavior is hard to define. It doesn't seem (I don't fully grasp its guarantees) attached to anything in particular, so while the assembly code must be run at some point if the function is run, it isn't clear when it is run for any non-trivial optimization level. A breakpoint that can be reordered with neighboring instructions isn't a very useful "breakpoint". END EDIT

You could run your program in an interpreter that breaks at each comment and prints out the state of every variable (using debug information). These points must exist so that you observe the environment (state of registers and memory).

Without the comment, no observation point exists, and the loop is compiled as a single mathematical function taking an environment and producing a modified environment.

You want to know the answer to a meaningless question: you want to know how each instruction (or maybe block, or maybe range of instructions) is compiled, but no single isolated instruction (or block) is compiled; the whole stuff is compiled as a whole.

A better question would be:

Hello GCC. Why do you believe this asm output is implementing the source code? Please explain step by step, with every assumption.

But then you wouldn't want to read a proof longer than the asm output, written in terms of GCC's internal representation.

edited Mar 13 '20 at 02:16

answered Oct 06 '15 at 02:22

curiousguy

8,038
2
40
58

1

*These points must exist so that you observe the environment (state of registers and memory).* - this might be true for unoptimized code. With optimizations enabled, whole functions might disappear from the binary. We're talking about optimized code here. – Bartek Banachewicz Oct 07 '15 at 12:24
@BartekBanachewicz No, they must still exist by definition. "_We're talking about optimized code here_" No, we are talking about `asm`. – curiousguy Oct 07 '15 at 13:03
1

We're talking about assembly generated as a result of compiling with optimizations enabled. Hence you're **wrong** in stating that anything must exist. – Bartek Banachewicz Oct 07 '15 at 13:05
@BartekBanachewicz Are you saying that the content of `asm` statement do not have to exist? – curiousguy Jun 03 '18 at 00:01
`asm` with no outputs (including Basic Asm) is implicitly `volatile`, and thus can't be optimized away (unless the whole function is never called and thus optimized away). But there's no implicit `"memory"` clobber in a Basic Asm statement, and even then non-escaped local vars don't have to be in sync with memory or even exist. See https://gcc.gnu.org/wiki/ConvertBasicAsmToExtended for reasons not to use Basic Asm (no in/out/clobber constraints) – Peter Cordes Mar 11 '20 at 20:55
@PeterCordes I previously assumed that Basic Asm was different and much stronger. As specified, it seems almost never guaranteed to work and not usable. I assume that many programmers made the same assumptions I made. – curiousguy Mar 12 '20 at 03:10
1

Yeah, IDK why anyone ever would, and agree that nobody ever should. As the link in my last comment explains, nobody ever should, and there has been debate about strengthening it (e.g. with an implicit `"memory"` clobber) as a bandaid for the existing buggy code that surely exists. Even for instructions like `asm("cli")` that only affect part of the architectural state that compiler-generated code doesn't touch, you still need it ordered wrt. compiler-generated loads/stores (e.g. if you're disabling interrupts around a critical section). – Peter Cordes Mar 12 '20 at 03:17
1

With it not being safe to clobber the red-zone, even inefficient manual save/restore of registers (with push/pop) inside the asm statement isn't safe, unless you `add rsp, -128` first. But doing that is just obviously braindead. – Peter Cordes Mar 12 '20 at 03:20
@PeterCordes 1) I couldn't find the words "stack" or "red zone" in GCC docs on Basic Asm; OTOH it says "_Clobbering registers (which basic asm does not support) can give better performance than push/pop._" which suggests the stack is usable in Basic Asm. 2) Except for "_outside of C functions, you must use basic asm_" [Basic-Asm](https://gcc.gnu.org/onlinedocs/gcc/Basic-Asm.html) Basic Asm looks like an essentially broken feature. Its use should trigger at least a warning (one for each TU, not one warning for every instance). – curiousguy Mar 13 '20 at 02:01
It's something you can't do in Extended Asm either - [How do I tell gcc that my inline assembly clobbers part of the stack?](https://stackoverflow.com/q/39160450) / [Using base pointer register in C++ inline asm](https://stackoverflow.com/q/34520013). Most ABIs don't have a red-zone, though, and you can always compile with `-mno-red-zone`. That performance vs. push/pop was presumably written with other cases in mind, like i386, or x86-64 kernel code, or other cases where push/pop is safe but slow. – Peter Cordes Mar 13 '20 at 02:29
1

Currently GCC treats Basic Asm exactly equivalent to `asm("" :::)` (implicitly volatile because it has no outputs, but not tied to the rest of the code by input or output dependencies. And no `"memory"` clobber). And of course it doesn't do `%operand` replacement on the template string, so literal `%` doesn't have to be escaped as `%%`. So yes, agreed, deprecating Basic Asm outside of `__attribute__((naked))` functions and global scope would be a good idea. – Peter Cordes Mar 13 '20 at 02:32

Why does adding inline assembly comments cause such radical change in GCC's generated code?

4 Answers

Linked