3

I'm trying to learn x86-64 inline assembly and decided to implement this very simple swap method that simply orders a and b in ascending order:

#include <stdio.h>

void swap(int* a, int* b) /* Intended to leave *a <= *b. Every asm statement below is separate and has no operands or clobbers. */
{
    asm(".intel_syntax noprefix");          /* switch the assembler to Intel syntax */
    asm("mov    eax, DWORD PTR [rdi]");     /* eax = int at [rdi]; rdi is hard-coded, no constraint ties it to `a` */
    asm("mov    ebx, DWORD PTR [rsi]");     /* ebx = int at [rsi]; rsi is hard-coded, no constraint ties it to `b` */
    asm("cmp    eax, ebx");
    asm("jle    .L1");                      /* skip the stores when eax <= ebx */
    asm("mov    DWORD PTR [rdi], ebx");     /* otherwise write the two values back exchanged */
    asm("mov    DWORD PTR [rsi], eax");
    asm(".L1:");                            /* named label: defined again in every emitted copy of this body */
    asm(".att_syntax noprefix");            /* leave Intel syntax again */
}

/* Reads three integers from stdin, calls swap() on the adjacent pairs
 * (0,1), (1,2), (0,1) and prints the three values on one line. */
int main()
{
    int input[3];

    /* NOTE: the return value of scanf is not checked. */
    scanf("%d%d%d", &input[0], &input[1], &input[2]);

    /* Three compare-and-swap passes over adjacent elements. */
    swap(&input[0], &input[1]);
    swap(&input[1], &input[2]);
    swap(&input[0], &input[1]);

    printf("%d %d %d\n", input[0], input[1], input[2]);

    return 0;
}

The above code works as expected when I run it with this command:

> gcc main.c
> ./a.out
> 3 2 1
> 1 2 3

However, as soon as I turn optimization on I get the following error messages:

> gcc -O2 main.c
> main.c: Assembler messages:
> main.c:12: Error: symbol `.L1' is already defined
> main.c:12: Error: symbol `.L1' is already defined
> main.c:12: Error: symbol `.L1' is already defined

If I've understood it correctly, this is because gcc tries to inline my swap function when optimization is turned on, causing the label .L1 to be defined multiple times in the assembly file.

I've tried to find an answer to this problem, but nothing seems to work. In this previously asked question it's suggested to use local labels instead, and I've tried that as well:

#include <stdio.h>

void swap(int* a, int* b) /* Intended to leave *a <= *b; the branch targets a numeric local label. No asm statement has operands or clobbers. */
{
    asm(".intel_syntax noprefix");          /* switch the assembler to Intel syntax */
    asm("mov    eax, DWORD PTR [rdi]");     /* eax = int at [rdi]; rdi is hard-coded, no constraint ties it to `a` */
    asm("mov    ebx, DWORD PTR [rsi]");     /* ebx = int at [rsi]; rsi is hard-coded, no constraint ties it to `b` */
    asm("cmp    eax, ebx");
    asm("jle    1f");                       /* 1f = the next label `1:` in the forward direction */
    asm("mov    DWORD PTR [rdi], ebx");     /* otherwise write the two values back exchanged */
    asm("mov    DWORD PTR [rsi], eax");
    asm("1:");                              /* numeric local label */
    asm(".att_syntax noprefix");            /* leave Intel syntax again */
}

But when trying to run the program I now get a segmentation fault instead:

> gcc -O2 main.c
> ./a.out
> 3 2 1
> Segmentation fault

I also tried the suggested solution to this previously asked question and changed the name .L1 to CustomLabel1 in case there would be a name collision, but it still gives me the old error:

> gcc -O2 main.c
> main.c: Assembler messages:
> main.c:12: Error: symbol `CustomLabel1' is already defined
> main.c:12: Error: symbol `CustomLabel1' is already defined
> main.c:12: Error: symbol `CustomLabel1' is already defined

Finally I also tried this suggestion:

void swap(int* a, int* b) /* Intended to leave *a <= *b; tries `%=` to obtain a unique label name. No asm statement has operands or clobbers. */
{
    asm(".intel_syntax noprefix");          /* switch the assembler to Intel syntax */
    asm("mov    eax, DWORD PTR [rdi]");     /* eax = int at [rdi]; rdi is hard-coded, no constraint ties it to `a` */
    asm("mov    ebx, DWORD PTR [rsi]");     /* ebx = int at [rsi]; rsi is hard-coded, no constraint ties it to `b` */
    asm("cmp    eax, ebx");
    asm("jle    label%=");                  /* `%=` is passed through literally in this operand-less asm form, so the assembler rejects it */
    asm("mov    DWORD PTR [rdi], ebx");     /* otherwise write the two values back exchanged */
    asm("mov    DWORD PTR [rsi], eax");
    asm("label%=:");                        /* same literal `%=` problem in the label definition */
    asm(".att_syntax noprefix");            /* leave Intel syntax again */
}

But then I get these errors instead:

main.c: Assembler messages:
main.c:9: Error: invalid character '=' in operand 1
main.c:12: Error: invalid character '%' in mnemonic
main.c:9: Error: invalid character '=' in operand 1
main.c:12: Error: invalid character '%' in mnemonic
main.c:9: Error: invalid character '=' in operand 1
main.c:12: Error: invalid character '%' in mnemonic
main.c:9: Error: invalid character '=' in operand 1
main.c:12: Error: invalid character '%' in mnemonic

So, my question is:

How can I use labels in inline assembly?


This is the disassembly output for the optimized version:

> gcc -O2 -S main.c

    .file   "main.c"
    .section    .text.unlikely,"ax",@progbits
.LCOLDB0:
    .text
.LHOTB0:
    .p2align 4,,15
    .globl  swap
    .type   swap, @function
swap:
.LFB23:
    .cfi_startproc
#APP
# 5 "main.c" 1
    .intel_syntax noprefix
# 0 "" 2
# 6 "main.c" 1
    mov eax, DWORD PTR [rdi]
# 0 "" 2
# 7 "main.c" 1
    mov ebx, DWORD PTR [rsi]
# 0 "" 2
# 8 "main.c" 1
    cmp eax, ebx
# 0 "" 2
# 9 "main.c" 1
    jle 1f
# 0 "" 2
# 10 "main.c" 1
    mov DWORD PTR [rdi], ebx
# 0 "" 2
# 11 "main.c" 1
    mov DWORD PTR [rsi], eax
# 0 "" 2
# 12 "main.c" 1
    1:
# 0 "" 2
# 13 "main.c" 1
    .att_syntax noprefix
# 0 "" 2
#NO_APP
    ret
    .cfi_endproc
.LFE23:
    .size   swap, .-swap
    .section    .text.unlikely
.LCOLDE0:
    .text
.LHOTE0:
    .section    .rodata.str1.1,"aMS",@progbits,1
.LC1:
    .string "%d%d%d"
.LC2:
    .string "%d %d %d\n"
    .section    .text.unlikely
.LCOLDB3:
    .section    .text.startup,"ax",@progbits
.LHOTB3:
    .p2align 4,,15
    .globl  main
    .type   main, @function
main:
.LFB24:
    .cfi_startproc
    subq    $40, %rsp
    .cfi_def_cfa_offset 48
    movl    $.LC1, %edi
    movq    %fs:40, %rax
    movq    %rax, 24(%rsp)
    xorl    %eax, %eax
    leaq    8(%rsp), %rcx
    leaq    4(%rsp), %rdx
    movq    %rsp, %rsi
    call    __isoc99_scanf
#APP
# 5 "main.c" 1
    .intel_syntax noprefix
# 0 "" 2
# 6 "main.c" 1
    mov eax, DWORD PTR [rdi]
# 0 "" 2
# 7 "main.c" 1
    mov ebx, DWORD PTR [rsi]
# 0 "" 2
# 8 "main.c" 1
    cmp eax, ebx
# 0 "" 2
# 9 "main.c" 1
    jle 1f
# 0 "" 2
# 10 "main.c" 1
    mov DWORD PTR [rdi], ebx
# 0 "" 2
# 11 "main.c" 1
    mov DWORD PTR [rsi], eax
# 0 "" 2
# 12 "main.c" 1
    1:
# 0 "" 2
# 13 "main.c" 1
    .att_syntax noprefix
# 0 "" 2
# 5 "main.c" 1
    .intel_syntax noprefix
# 0 "" 2
# 6 "main.c" 1
    mov eax, DWORD PTR [rdi]
# 0 "" 2
# 7 "main.c" 1
    mov ebx, DWORD PTR [rsi]
# 0 "" 2
# 8 "main.c" 1
    cmp eax, ebx
# 0 "" 2
# 9 "main.c" 1
    jle 1f
# 0 "" 2
# 10 "main.c" 1
    mov DWORD PTR [rdi], ebx
# 0 "" 2
# 11 "main.c" 1
    mov DWORD PTR [rsi], eax
# 0 "" 2
# 12 "main.c" 1
    1:
# 0 "" 2
# 13 "main.c" 1
    .att_syntax noprefix
# 0 "" 2
# 5 "main.c" 1
    .intel_syntax noprefix
# 0 "" 2
# 6 "main.c" 1
    mov eax, DWORD PTR [rdi]
# 0 "" 2
# 7 "main.c" 1
    mov ebx, DWORD PTR [rsi]
# 0 "" 2
# 8 "main.c" 1
    cmp eax, ebx
# 0 "" 2
# 9 "main.c" 1
    jle 1f
# 0 "" 2
# 10 "main.c" 1
    mov DWORD PTR [rdi], ebx
# 0 "" 2
# 11 "main.c" 1
    mov DWORD PTR [rsi], eax
# 0 "" 2
# 12 "main.c" 1
    1:
# 0 "" 2
# 13 "main.c" 1
    .att_syntax noprefix
# 0 "" 2
#NO_APP
    movl    8(%rsp), %r8d
    movl    4(%rsp), %ecx
    movl    $.LC2, %esi
    movl    (%rsp), %edx
    xorl    %eax, %eax
    movl    $1, %edi
    call    __printf_chk
    movq    24(%rsp), %rsi
    xorq    %fs:40, %rsi
    jne .L6
    xorl    %eax, %eax
    addq    $40, %rsp
    .cfi_remember_state
    .cfi_def_cfa_offset 8
    ret
.L6:
    .cfi_restore_state
    call    __stack_chk_fail
    .cfi_endproc
.LFE24:
    .size   main, .-main
    .section    .text.unlikely
.LCOLDE3:
    .section    .text.startup
.LHOTE3:
    .ident  "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.4) 5.4.0 20160609"
    .section    .note.GNU-stack,"",@progbits
Community
  • 1
  • 1
  • Have you tried naming the label without the dot? Assembly labels generated from C normally have the dot form. I think it might be reserved. – Petr Skocik Mar 08 '17 at 20:58
  • the latter solution is the correct syntax. It's just that your code doesn't work properly and crashes. – Jean-François Fabre Mar 08 '17 at 21:00
  • @PSkocik Yes, I tried naming it "CustomLabel1" (without a dot), and it still gives me the same error. – fighting_falcon93 Mar 08 '17 at 21:06
  • 1
    I may be wrong, but maybe you're retrieving the parameters from the stack, and there's no valid stack since the function is inlined and the parameters are optimized out. – Jean-François Fabre Mar 08 '17 at 21:08
  • @Jean-FrançoisFabre But then why does it work correctly when running without optimization? – fighting_falcon93 Mar 08 '17 at 21:08
  • because when not inlined it remains a call, and parameters are passed on the stack. – Jean-François Fabre Mar 08 '17 at 21:09
  • 1
    Tried it. `__attribute__((__noinline__))` fixes it. – Petr Skocik Mar 08 '17 at 21:10
  • @Jean-FrançoisFabre When running without optimization, the 2 parameters are passed in registers `rdi` and `rsi`. Does this change when running with `-O2`? Isn't passing parameters on the stack slower than passing them in registers? – fighting_falcon93 Mar 08 '17 at 21:15
  • @PSkocik Yeah, but now the compiler won't inline the function right? – fighting_falcon93 Mar 08 '17 at 21:15
  • 1
    @fighting_falcon93 just disassemble the optimized version to find out. My answer is then inaccurate... – Jean-François Fabre Mar 08 '17 at 21:16
  • @Jean-FrançoisFabre I've added the dissasemble output of the optimized version to the question. – fighting_falcon93 Mar 08 '17 at 21:23
  • Note: the single `asm` statements can be arbitrarily reordered by the compiler. I strongly doubt that's what you want! How about reading the documentation? and don't rely on the registers being set as you think. gcc provides extensions to make this all transparent to both sides. – too honest for this site Mar 08 '17 at 21:26
  • You need to tell GCC what to store in `rsi` and `rdi` in the asm statement parameters, you can't just expect them to have the values you need. – interjay Mar 08 '17 at 21:30
  • similar issue here: http://stackoverflow.com/questions/24471469/force-gcc-to-pass-arguments-in-registers – Jean-François Fabre Mar 08 '17 at 21:34
  • @PSkocik: Forbidding inlining is the wrong approach. Instead 1) don't use assembly unless **unavoidable** (mostly if you have to use special instructions) 2) Use the gcc facilities to access variables from assembly to make it transparent to where the data is located. – too honest for this site Mar 08 '17 at 21:34
  • @Olaf But if the compiler decides to inline my functions, shouldn't it also be the compilers task to set up the registers in a way so the function is still working as before? Let's say I wrote the function in C instead, and the compiler decided to inline it, it would have to make this work aswell? – fighting_falcon93 Mar 08 '17 at 21:38
  • 1
    @fighting_falcon93: And how would the compiler know which registers your assembly code uses?? - Exactly! By you telling it. Again: Read the documentation. The `asm` extension is well documented. But in the first place: do not use assembly. That function above is commonly used in normal C. The compiler know this pattern, if not written too complicated, the compiler well possibly can use a single instruction or even none at all, depending on the surrounding code. – too honest for this site Mar 08 '17 at 21:40
  • @Olaf By setting registers and pushing the stack in the exact same way as if it would have been a function, except that the function call itself is never done? If I specify that a function written in C takes 2 parameters, the compiler will have to "translate" these 2 parameters into a format that works for an inlined version of the same function, that's why I expected it to do the same for the registers. – fighting_falcon93 Mar 08 '17 at 21:46
  • 1
    I don't think the segfault is related to your labels, it'll be because the optimizer rearranges things. @fighting_falcon93 You could think of the compiler as doing that and then optimizing the result - except the optimizer doesn't know what your inline assembly is doing so it more-or-less ignores it and puts the variables in whatever registers are most convenient. – user253751 Mar 08 '17 at 22:18
  • @fighting_falcon93: That's not how inline assembly works and the reason why I multiple times pointed you at the documentation! As you seem to prefer to ignore this, feel free to discuss this with the compiler, it might have more time; good bye. – too honest for this site Mar 08 '17 at 22:29
  • @Olaf I'll read the documentation, I just explained why I expected the compiler to do this automaticly :) – fighting_falcon93 Mar 08 '17 at 22:48
  • You could also do something like this: `void swap(int *a, int *b) { __asm__ ( "cmp %[b], %[a]\n\t" "jle 1f\n\t" "xchg %[a], %[b]\n\t" "1:\n" : [a]"+r"(*a),[b]"+r"(*b) : : "cc" ); return; }` – Michael Petch Mar 09 '17 at 05:40
  • @MichaelPetch - yikes! don't know if the OP is that keen on saving bytes yet, but you definitely don't want to use `xchg` on any processor much younger than 20 years old! It has an implicit lock prefix - and can cost a lot of cycles. And as a sync primitive, it can be very, very expensive. It's a pity it ever attained such a purpose. It would be an incredibly useful instruction otherwise... – Brett Hale Mar 09 '17 at 05:53
  • @BrettHale Implicit lock with _XCHG_ only occurs when you use a memory operand. I use 2 registers here, so there is no lock on _XCHG_. – Michael Petch Mar 09 '17 at 05:54
  • 1
    @MichaelPetch - you're right - I jumped the gun there! I also wonder if, given the OPs disassembly clearly being x86-64, a couple of `cmov`s might be a better. I'm never quite sure with `cmov`. I suppose it's better than a cold / unpredictable branch? That Peter Cordes guy would know, right down to the transistor level:) – Brett Hale Mar 09 '17 at 05:57
  • 1
    @BrettHale : *CMOV*s would work, and only reason I didn't suggest it was that there'd be no use for the label (and then part of his question disappears lol) – Michael Petch Mar 09 '17 at 06:01
  • @MichaelPetch + @BrettHale: Yeah, I'm going to learn to use the `cmov` instruction aswell, but I wanted to start with conditional jumps first, and therefore I wrote the code in a not fully optimal way to suit the learning ;) Is it correct that `cmov` doesn't cause branch mispredictions as the instruction is always fetched, but the execution step decides whether to mov or not? – fighting_falcon93 Mar 09 '17 at 10:44
  • A question to the people downvoting this question: Why? I asked an honest question about something that I'm trying to learn, I explained the problem in detail, I did my own research before asking, and I asked the question in a way so that it could be answered effectively. I know there are a lot of people with the opinion that I should let the compiler do this for me, but does that mean it's a bad question? – fighting_falcon93 Mar 09 '17 at 10:51
  • Possibly downvotes might be related to the fact that the question posed in the title is not really your problem at all. In your question you have a proper answer of using `jle 1f` and a label `1:` . That is perfectly fine. The reason your code doesn't work when run in some environments (optimized code especially) is that you incorrectly use `asm` statements without understanding the intricacies of inline assembler. It is likely the reason so many people suggested not using inline assembly unless one has to. – Michael Petch Mar 09 '17 at 15:55
  • Inline GCC assembly is also a bad way to learn assembly because it can be difficult to get right and if you don't get it right it can cause subtle bugs that may only manifest in the future. GCC doesn't analyze what the assembly template does. GCC inline assembler templates force the developer to tell it what they want. – Michael Petch Mar 09 '17 at 15:58
  • You might have posed a different question had you taken the code and run it through a debugger like _GDB_ or dumped the executable out with _OBJDUMP_. You would have likely observed that the `swap` function was inlined and inserted right into `main` and that the normal parameter passing mechanism didn't apply - they were optimized out. As well by not specifying proper constraints and clobbers the code generator would have tried to perform optimizations that would create further potential conflicts. – Michael Petch Mar 09 '17 at 16:12

3 Answers

9

There are plenty of tutorials - including this one (probably the best I know of), and some info on operand size modifiers.

Here's the first implementation - swap_2 :

/*
 * Orders two ints in place with a single extended-asm statement (AT&T syntax):
 * loads *a and *b into two scratch registers, compares them, and stores them
 * back exchanged when *a > *b.
 *
 * Operands: %0 = a, %1 = b (pointers, "+r"); %2 = tmp0, %3 = tmp1 (outputs, "=r").
 */
void swap_2 (int *a, int *b)
{
    int tmp0, tmp1; /* written only by the asm; they give it two scratch registers */

    __asm__ volatile (
        "movl (%0), %k2\n\t" /* %2 (tmp0) = (*a) */
        "movl (%1), %k3\n\t" /* %3 (tmp1) = (*b) */
        "cmpl %k3, %k2\n\t"
        "jle  %=f\n\t"       /* if (%2 <= %3) (at&t!) */
        "movl %k3, (%0)\n\t" /* *a = tmp1 */
        "movl %k2, (%1)\n\t" /* *b = tmp0 */
        "%=:\n\t"            /* target of the jle above */

        : "+r" (a), "+r" (b), "=r" (tmp0), "=r" (tmp1) :
        : "memory" /* "cc" */ ); /* "memory": the asm stores through a and b */
}

A few notes:

  • volatile (or __volatile__) is required, as the compiler only 'sees' (a) and (b) (and doesn't 'know' you're potentially exchanging their contents), and would otherwise be free to optimize the whole asm statement away - tmp0 and tmp1 would otherwise be considered unused variables too.

  • "+r" means that this is both an input and output that may be modified; only it isn't in this case, and they could strictly be input only - more on that in a bit...

  • The 'l' suffix on 'movl' isn't really necessary; neither is the 'k' (32-bit) length modifier for the registers. Since you're using the Linux (ELF) ABI, an int is 32 bits for both IA32 and x86-64 ABIs.

  • The %= token generates a unique label for us. BTW, the jump syntax <label>f means a forward jump, and <label>b means back.

  • For correctness, we need "memory" as the compiler has no way of knowing if values from dereferenced pointers have been changed. This may be an issue in more complex inline asm surrounded by C code, as it invalidates all currently held values in memory - and is often a sledgehammer approach. Appearing at the end of a function in this fashion, it's not going to be an issue - but you can read more on it here (see: Clobbers)

  • The "cc" flags register clobber is detailed in the same section. On x86, it does nothing. Some writers include it for clarity, but since practically all non-trivial asm statements affect the flags register, it's just assumed to be clobbered by default.

Here's the C implementation - swap_1 :

/* Orders the pair in place: on return *a <= *b. */
void swap_1 (int *a, int *b)
{
    const int first = *a;
    const int second = *b;

    if (first <= second)
    {
        return; /* already in ascending order */
    }

    *a = second;
    *b = first;
}

Compiling with gcc -O2 for x86-64 ELF, I get identical code. Just a bit of luck that the compiler chose tmp0 and tmp1 to use the same free registers for temps... cutting out the noise, like the .cfi directives, etc., gives:

swap_2:
        movl (%rdi), %eax
        movl (%rsi), %edx
        cmpl %edx, %eax
        jle  21f
        movl %edx, (%rdi)
        movl %eax, (%rsi)
        21:
        ret

As stated, the swap_1 code was identical, except that the compiler chose .L1 for its jump label. Compiling the code with -m32 generated the same code (apart from using the tmp registers in a different order). There's more overhead, as the IA32 ELF ABI passes parameters on the stack, whereas the x86-64 ABI passes the first two parameters in %rdi and %rsi respectively.


Treating (a) and (b) as input only - swap_3 :

void swap_3 (int *a, int *b)
{
    int tmp0, tmp1;

    __asm__ volatile (
        "mov (%[a]), %[x]\n\t" /* x = (*a) */
        "mov (%[b]), %[y]\n\t" /* y = (*b) */
        "cmp %[y], %[x]\n\t"
        "jle  %=f\n\t"         /* if (x <= y) (at&t!) */
        "mov %[y], (%[a])\n\t"
        "mov %[x], (%[b])\n\t"
        "%=:\n\t"

        : [x] "=&r" (tmp0), [y] "=&r" (tmp1)
        : [a] "r" (a), [b] "r" (b) : "memory" /* "cc" */ );
}

I've done away with the 'l' suffix and 'k' modifiers here, because they're not needed. I've also used the 'symbolic name' syntax for operands, as it often helps to make the code more readable.

(a) and (b) are now indeed input-only registers. So what's the "=&r" syntax mean? The & denotes an early clobber operand. In this case, the value may be written to before we finish using the input operands, and therefore the compiler must choose registers different from those selected for the input operands.

Once again, the compiler generates identical code as it did for swap_1 and swap_2.


I wrote way more than I planned on this answer, but as you can see, it's very difficult to maintain awareness of all the information the compiler must be made aware of, as well as the idiosyncrasies of each instruction set (ISA) and ABI.

Brett Hale
  • 21,653
  • 2
  • 61
  • 90
  • Very detailed explanation, and the code works without any segfaults and produces the correct output, so I'll choose this one as the answer. Although, may I ask, why doesn't the compiler do this automatically? Shouldn't the compiler be able to read the asm instructions that we write and understand what registers and memory locations that have been changed? May I also ask, how would this have looked in Intel-syntax? Once again thanks for your very detailed answer. – fighting_falcon93 Mar 09 '17 at 10:34
  • @fighting_falcon93 Depending on the complexity of an inline assembler template a code analysis tool may not be able to determine exactly what changes and what the side effects are in an assembler template. Rather than GCC developers wasting their efforts on something that may not always work they force the developer to be explicit and force them to describe exactly what the input, outputs, clobbers and other side effects are. – Michael Petch Mar 09 '17 at 17:07
3

You cannot just put a bunch of asm statements inline like that. The optimizer is free to re-order, duplicate, and drop them based on what constraints it knows. (In your case, it knows none.)

So firstly, you should consolidate the asm together, with proper read/write/clobber constraints. Secondly, there is a special asm goto form that gives the assembly access to C-level labels.

/*
 * Orders two ints in place so that *a <= *b on return (x86, AT&T syntax).
 *
 * The work is split into two asm statements: the first loads *a and *b into
 * compiler-chosen registers, the second compares them and, when *a > *b,
 * stores them back exchanged. The second statement is an `asm goto`, so its
 * branch targets the C label L1 rather than an assembler-level label.
 *
 * `__asm__` is spelled out (instead of `asm`) so the function also compiles
 * under strict -std=c99 / -std=c11.
 */
void swap(int *a, int *b) {
    int tmp1, tmp2;
    __asm__(
        "mov (%2), %0\n"
        "mov (%3), %1\n"
        // "=&r" (early clobber): %0 is written before %3 is read, so the
        // outputs must not be allocated to the registers holding a or b
        : "=&r" (tmp1), "=&r" (tmp2)
        : "r" (a), "r" (b)
        : "memory"   // pointer in register doesn't imply that the pointed-to memory has to be "in sync"
        // or use "m" memory source operands to let the compiler pick the addressing mode
    );
    __asm__ goto(
        "cmp %1, %0\n"
        "jle %l4\n"          // tmp1 <= tmp2: already ordered, jump to L1
        "mov %1, (%2)\n"     // *a = tmp2
        "mov %0, (%3)\n"     // *b = tmp1
        :
        : "r" (tmp1), "r" (tmp2), "r" (a), "r" (b)
        : "cc", "memory"     // flags are changed by cmp; *a and *b are stored to
        : L1                 // operand %l4: labels are numbered after the four inputs
    );
L1:
    return;
}
Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
ephemient
  • 198,619
  • 38
  • 280
  • 391
  • 1
    I tried your suggestion, but it still generates a segmentation fault when compiled with `-O2`, however it runs fine without any optimization. – fighting_falcon93 Mar 08 '17 at 21:55
  • @fighting_falcon93 Maybe try compiling to assembly code (gcc -S) so you can look at what the compiler is actually doing – user253751 Mar 08 '17 at 22:19
  • 3
    I might have tried something similar (without the goto); avoiding the "memory" clobber; and getting the assembler to pick temp registers: `int temp1, temp2; __asm__ ( "mov %[a], %[tmp1]\n\t" "mov %[b], %[tmp2]\n\t" "cmp %[tmp2], %[tmp1]\n\t" "jle 1f\n\t" "mov %[tmp2], %[a]\n\t" "mov %[tmp1], %[b]\n\t" "1:\n" : [a]"+m"(*a),[b]"+m"(*b), [tmp1]"=r"(temp1), [tmp2]"=r"(temp2) : : "cc" );` – Michael Petch Mar 08 '17 at 22:45
  • 1
    Don't mix `"r"` constraints with explicit register use. The compiler is free to choose `%edx` or `%ecx` for `(a)` and `(b)`, regardless of what you list as clobbered registers. Consider what happens if `%edx` is assigned to `%1`... – Brett Hale Mar 08 '17 at 22:46
  • @BrettHale Oh that's probably it. Easy fix, just have to create a couple temporaries... – ephemient Mar 08 '17 at 22:48
  • On top of what Brett mentioned be careful. If the contents of a register may be modified never use it as an input operand. If you potentially change a register make sure they are output operand (or input/output) because the compiler may potentially mess up if it can assume the registers used will contain the same value before and after the template is run. – Michael Petch Mar 08 '17 at 22:59
  • I'm reading all the comments and trying to follow ;) I thought that there was a caller and callee convention that specifies which registers that must be pushed/popped before/after a function call, and which specific registers that should be used for sending parameters etc. Is this not the case? Is the compiler allowed to do this as it wants? @ephemient: Temporaries? You mean like copying the parameters into local function variables and then using them in the assembly code? – fighting_falcon93 Mar 08 '17 at 23:07
  • @fighting_falcon93 There are caller/callee save registers, but the C compiler can choose to use any registers for any purpose before or after your asm code runs. The ABI isn't respected for inlined calls anyway. Yes, local variables in the C function. – ephemient Mar 08 '17 at 23:10
  • It's not safe to dereference a pointer input operand (even for reading) without a "memory" clobber or a dummy memory input operand that tells the compiler you're reading from the pointed-to memory. Just use `"m"` inputs / outputs and let the compiler pick an addressing mode. – Peter Cordes Sep 25 '18 at 00:41
0

You cannot assume values are in any particular register in your asm code -- you need to use constraints to tell gcc what values you want to read and write and get it to tell you which register they are in. The gcc docs tell you most of what you need to know, but are pretty dense. There are also tutorials out there that you can easily find with a web search (here or here)

Chris Dodd
  • 119,907
  • 13
  • 134
  • 226