0

What is the compiler doing in the beginning of the switch statement (snippet section below) to come up with the address in %rax so it can notrack jmpq *%rax to the correct offset ?

Are the constants 0xe07 and 0xdfb padding ?

lea    0xe07(%rip),%rax 
lea    0xdfb(%rip),%rdx

Snippet

10 [1]      switch (c) {
    0x5555555551ec  <+   35>        0f be 45 fc              movsbl -0x4(%rbp),%eax
    0x5555555551f0  <+   39>        83 e8 2d                 sub    $0x2d,%eax
    0x5555555551f3  <+   42>        83 f8 32                 cmp    $0x32,%eax
    0x5555555551f6  <+   45>        0f 87 18 01 00 00        ja     0x555555555314 <aa2i(char)+331>
    0x5555555551fc  <+   51>        89 c0                    mov    %eax,%eax
    0x5555555551fe  <+   53>        48 8d 14 85 00 00 00 00  lea    0x0(,%rax,4),%rdx
    0x555555555206  <+   61>        48 8d 05 07 0e 00 00     lea    0xe07(%rip),%rax        # 0x555555556014
    0x55555555520d  <+   68>        8b 04 02                 mov    (%rdx,%rax,1),%eax
    0x555555555210  <+   71>        48 98                    cltq
    0x555555555212  <+   73>        48 8d 15 fb 0d 00 00     lea    0xdfb(%rip),%rdx        # 0x555555556014
    0x555555555219  <+   80>        48 01 d0                 add    %rdx,%rax
    0x55555555521c  <+   83>        3e ff e0                 notrack jmpq *%rax

cpp code

#include <QCoreApplication>

const int ANY=20;       //number representing an X (any amino acid) internally
const int GAP=21;       //number representing a gap internally

char aa2i(char c) {

    //A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V
    if (c >= 'a' && c <= 'z') c += 'A' - 'a';
    switch (c) {
        case 'A':
            return 0;
        case 'R':
            return 1;
        case 'N':
            return 2;
        case 'D':
            return 3;
        case 'C':
            return 4;
        case 'Q':
            return 5;
        case 'E':
            return 6;
        case 'G':
            return 7;
        case 'H':
            return 8;
        case 'I':
            return 9;
        case 'L':
            return 10;
        case 'K':
            return 11;
        case 'M':
            return 12;
        case 'F':
            return 13;
        case 'P':
            return 14;
        case 'S':
            return 15;
        case 'T':
            return 16;
        case 'W':
            return 17;
        case 'Y':
            return 18;
        case 'V':
            return 19;
        case 'X':
            return ANY;
        case 'J':
            return ANY;
        case 'O':
            return ANY;
        case 'U':
            return 4;  //Selenocystein -> Cystein
        case 'B':
            return 3;  //D (or N)
        case 'Z':
            return 6;  //E (or Q)
        case '-':
            return GAP;
        case '.':
            return GAP;
        case '_':
            return GAP;
    }
    if (c >= 0 && c <= 32) return -1; // white space and control characters
    return -2;
}

int main(int argc, char *argv[])
{   
    aa2i('R');

}

Disassembly:

            6 [1]   char aa2i(char c) {
    0x5555555551c9                  f3 0f 1e fa              endbr64
    0x5555555551cd  <+    4>        55                       push   %rbp
    0x5555555551ce  <+    5>        48 89 e5                 mov    %rsp,%rbp
    0x5555555551d1  <+    8>        89 f8                    mov    %edi,%eax
    0x5555555551d3  <+   10>        88 45 fc                 mov    %al,-0x4(%rbp)
            9 [1]       if (c >= 'a' && c <= 'z') c += 'A' - 'a';
    0x5555555551d6  <+   13>        80 7d fc 60              cmpb   $0x60,-0x4(%rbp)
    0x5555555551da  <+   17>        7e 10                    jle    0x5555555551ec <aa2i(char)+35>
    0x5555555551dc  <+   19>        80 7d fc 7a              cmpb   $0x7a,-0x4(%rbp)
    0x5555555551e0  <+   23>        7f 0a                    jg     0x5555555551ec <aa2i(char)+35>
    0x5555555551e2  <+   25>        0f b6 45 fc              movzbl -0x4(%rbp),%eax
    0x5555555551e6  <+   29>        83 e8 20                 sub    $0x20,%eax
    0x5555555551e9  <+   32>        88 45 fc                 mov    %al,-0x4(%rbp)
            10 [1]      switch (c) {
    0x5555555551ec  <+   35>        0f be 45 fc              movsbl -0x4(%rbp),%eax
    0x5555555551f0  <+   39>        83 e8 2d                 sub    $0x2d,%eax
    0x5555555551f3  <+   42>        83 f8 32                 cmp    $0x32,%eax
    0x5555555551f6  <+   45>        0f 87 18 01 00 00        ja     0x555555555314 <aa2i(char)+331>
    0x5555555551fc  <+   51>        89 c0                    mov    %eax,%eax
    0x5555555551fe  <+   53>        48 8d 14 85 00 00 00 00  lea    0x0(,%rax,4),%rdx
    0x555555555206  <+   61>        48 8d 05 07 0e 00 00     lea    0xe07(%rip),%rax        # 0x555555556014
    0x55555555520d  <+   68>        8b 04 02                 mov    (%rdx,%rax,1),%eax
    0x555555555210  <+   71>        48 98                    cltq
    0x555555555212  <+   73>        48 8d 15 fb 0d 00 00     lea    0xdfb(%rip),%rdx        # 0x555555556014
    0x555555555219  <+   80>        48 01 d0                 add    %rdx,%rax
    0x55555555521c  <+   83>        3e ff e0                 notrack jmpq *%rax
            12 [1]              return 0;
    0x55555555521f  <+   86>        b8 00 00 00 00           mov    $0x0,%eax
    0x555555555224  <+   91>        e9 03 01 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            14 [1]              return 1;
    0x555555555229  <+   96>        b8 01 00 00 00           mov    $0x1,%eax
    0x55555555522e  <+  101>        e9 f9 00 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            16 [1]              return 2;
    0x555555555233  <+  106>        b8 02 00 00 00           mov    $0x2,%eax
    0x555555555238  <+  111>        e9 ef 00 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            18 [1]              return 3;
    0x55555555523d  <+  116>        b8 03 00 00 00           mov    $0x3,%eax
    0x555555555242  <+  121>        e9 e5 00 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            20 [1]              return 4;
    0x555555555247  <+  126>        b8 04 00 00 00           mov    $0x4,%eax
    0x55555555524c  <+  131>        e9 db 00 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            22 [1]              return 5;
    0x555555555251  <+  136>        b8 05 00 00 00           mov    $0x5,%eax
    0x555555555256  <+  141>        e9 d1 00 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            24 [1]              return 6;
    0x55555555525b  <+  146>        b8 06 00 00 00           mov    $0x6,%eax
    0x555555555260  <+  151>        e9 c7 00 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            26 [1]              return 7;
    0x555555555265  <+  156>        b8 07 00 00 00           mov    $0x7,%eax
    0x55555555526a  <+  161>        e9 bd 00 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            28 [1]              return 8;
    0x55555555526f  <+  166>        b8 08 00 00 00           mov    $0x8,%eax
    0x555555555274  <+  171>        e9 b3 00 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            30 [1]              return 9;
    0x555555555279  <+  176>        b8 09 00 00 00           mov    $0x9,%eax
    0x55555555527e  <+  181>        e9 a9 00 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            32 [1]              return 10;
    0x555555555283  <+  186>        b8 0a 00 00 00           mov    $0xa,%eax
    0x555555555288  <+  191>        e9 9f 00 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            34 [1]              return 11;
    0x55555555528d  <+  196>        b8 0b 00 00 00           mov    $0xb,%eax
    0x555555555292  <+  201>        e9 95 00 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            36 [1]              return 12;
    0x555555555297  <+  206>        b8 0c 00 00 00           mov    $0xc,%eax
    0x55555555529c  <+  211>        e9 8b 00 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            38 [1]              return 13;
    0x5555555552a1  <+  216>        b8 0d 00 00 00           mov    $0xd,%eax
    0x5555555552a6  <+  221>        e9 81 00 00 00           jmpq   0x55555555532c <aa2i(char)+355>
            40 [1]              return 14;
    0x5555555552ab  <+  226>        b8 0e 00 00 00           mov    $0xe,%eax
    0x5555555552b0  <+  231>        eb 7a                    jmp    0x55555555532c <aa2i(char)+355>
            42 [1]              return 15;
    0x5555555552b2  <+  233>        b8 0f 00 00 00           mov    $0xf,%eax
    0x5555555552b7  <+  238>        eb 73                    jmp    0x55555555532c <aa2i(char)+355>
            44 [1]              return 16;
    0x5555555552b9  <+  240>        b8 10 00 00 00           mov    $0x10,%eax
    0x5555555552be  <+  245>        eb 6c                    jmp    0x55555555532c <aa2i(char)+355>
            46 [1]              return 17;
    0x5555555552c0  <+  247>        b8 11 00 00 00           mov    $0x11,%eax
    0x5555555552c5  <+  252>        eb 65                    jmp    0x55555555532c <aa2i(char)+355>
            48 [1]              return 18;
    0x5555555552c7  <+  254>        b8 12 00 00 00           mov    $0x12,%eax
    0x5555555552cc  <+  259>        eb 5e                    jmp    0x55555555532c <aa2i(char)+355>
            50 [1]              return 19;
    0x5555555552ce  <+  261>        b8 13 00 00 00           mov    $0x13,%eax
    0x5555555552d3  <+  266>        eb 57                    jmp    0x55555555532c <aa2i(char)+355>
            52 [1]              return ANY;
    0x5555555552d5  <+  268>        b8 14 00 00 00           mov    $0x14,%eax
    0x5555555552da  <+  273>        eb 50                    jmp    0x55555555532c <aa2i(char)+355>
            54 [1]              return ANY;
    0x5555555552dc  <+  275>        b8 14 00 00 00           mov    $0x14,%eax
    0x5555555552e1  <+  280>        eb 49                    jmp    0x55555555532c <aa2i(char)+355>
            56 [1]              return ANY;
    0x5555555552e3  <+  282>        b8 14 00 00 00           mov    $0x14,%eax
    0x5555555552e8  <+  287>        eb 42                    jmp    0x55555555532c <aa2i(char)+355>
            58 [1]              return 4;  //Selenocystein -> Cystein
    0x5555555552ea  <+  289>        b8 04 00 00 00           mov    $0x4,%eax
    0x5555555552ef  <+  294>        eb 3b                    jmp    0x55555555532c <aa2i(char)+355>
            60 [1]              return 3;  //D (or N)
    0x5555555552f1  <+  296>        b8 03 00 00 00           mov    $0x3,%eax
    0x5555555552f6  <+  301>        eb 34                    jmp    0x55555555532c <aa2i(char)+355>
            62 [1]              return 6;  //E (or Q)
    0x5555555552f8  <+  303>        b8 06 00 00 00           mov    $0x6,%eax
    0x5555555552fd  <+  308>        eb 2d                    jmp    0x55555555532c <aa2i(char)+355>
            64 [1]              return GAP;
    0x5555555552ff  <+  310>        b8 15 00 00 00           mov    $0x15,%eax
    0x555555555304  <+  315>        eb 26                    jmp    0x55555555532c <aa2i(char)+355>
            66 [1]              return GAP;
    0x555555555306  <+  317>        b8 15 00 00 00           mov    $0x15,%eax
    0x55555555530b  <+  322>        eb 1f                    jmp    0x55555555532c <aa2i(char)+355>
            68 [1]              return GAP;
    0x55555555530d  <+  324>        b8 15 00 00 00           mov    $0x15,%eax
    0x555555555312  <+  329>        eb 18                    jmp    0x55555555532c <aa2i(char)+355>
            70 [1]      if (c >= 0 && c <= 32) return -1; // white space and control characters
    0x555555555314  <+  331>        80 7d fc 00              cmpb   $0x0,-0x4(%rbp)
    0x555555555318  <+  335>        78 0d                    js     0x555555555327 <aa2i(char)+350>
    0x55555555531a  <+  337>        80 7d fc 20              cmpb   $0x20,-0x4(%rbp)
    0x55555555531e  <+  341>        7f 07                    jg     0x555555555327 <aa2i(char)+350>
    0x555555555320  <+  343>        b8 ff ff ff ff           mov    $0xffffffff,%eax
    0x555555555325  <+  348>        eb 05                    jmp    0x55555555532c <aa2i(char)+355>
            71 [1]      return -2;
    0x555555555327  <+  350>        b8 fe ff ff ff           mov    $0xfffffffe,%eax
            72 [1]  }
    0x55555555532c  <+  355>        5d                       pop    %rbp
    0x55555555532d  <+  356>        c3                       retq
Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
Lydon Ch
  • 8,637
  • 20
  • 79
  • 132
  • 2
    They are not padding, they are the offset to `0x555555556014` as helpfully printed by the disassembler. I assume that's where the jump table is. Instead of disassembly you might want to look at the actual assembly instead. – Jester Jan 28 '23 at 19:46
  • This doesn't address the question, but `if (c >= 'a' && c <= 'z') c += 'A' - 'a';` should be replaced with `c = std::toupper(c);`. The first snippet makes a couple of assumptions about character encodings. – Pete Becker Jan 28 '23 at 19:49
  • As discussed on [SIMD code for transforming one-letter amino acid code into integer between 0 and 22](https://stackoverflow.com/posts/comments/132656720), GCC uses an actual jump table, not a data lookup table, so this is pretty bad, requiring correct branch prediction for every amino acid. Only clang makes decent asm from a `switch` for this. Are you optimizing the fallback for targets without AVX2, or the cleanup code for the last few array elements? Harold's `vpshufb` code should be faster than the best case for a LUT on large arrays. – Peter Cordes Jan 28 '23 at 20:04
  • @PeteBecker: The OP's previous questions about this code were about optimizing it, so they probably want to make those ASCII assumptions to get performance gains, not calling a libc function that does a lookup in a LOCALE-dependent character table. Something like this can compile to `lea` for the addition plus `sub` / `cmp` / `cmovb` to branchlessly select the `c+0x20` or `c` based on the range-check. – Peter Cordes Jan 28 '23 at 20:11
  • This looks like a debug build, so of course it's garbage. Note that it's reloading `c` from `-0x4(%rbp)` where it must have been spilled in the first place. – Peter Cordes Jan 28 '23 at 20:13
  • A more efficient method would be to put the numbers into an array and use the letters as an index. Look up tables are more efficient than switch statements. – Thomas Matthews Jan 28 '23 at 20:16
  • @PeterCordes, yes this is NOT simd version, this is the original one with no optimisation whatsoever – Lydon Ch Jan 28 '23 at 20:27
  • @ThomasMatthews: A clever compiler can sometimes compile a switch into an array lookup, when every case just returns a constant. As I mentioned, clang actually does that with this code, but not GCC. Lookup tables are more efficient than jump tables, and using a switch in the C++ source might or might not be fast, depending on the cleverness of the compiler and the complexity of the code. – Peter Cordes Jan 28 '23 at 20:33
  • @PeterCordes -- `std::toupper` is just a table lookup. Hand-rolling a non-portable replacement is almost certainly not a significant improvement. – Pete Becker Jan 28 '23 at 20:34
  • @PeteBecker: It's not "just" a lookup table, it's a library function that doesn't fully inline. Note the call to `call __ctype_tolower_loc@PLT` to get the LUT address. Looking at a single function that can't inline make it look even worse than when this inlines into a loop in a larger function, since it costs an extra `push`/`pop` to save/restore a call-preserved reg to hold `c` across that call. And hopefully that `__ctype..._loc` call could be hoisted out of a loop. https://godbolt.org/z/ohYYbEz7W shows doing it manually is shorter in this case, for this one-character function. – Peter Cordes Jan 28 '23 at 20:54
  • @PeterCordes -- have you timed the two, and determined that the library function is, in fact, a bottleneck? That, of course, is the only valid basis for rolling your own. – Pete Becker Jan 28 '23 at 20:57
  • @PeteBecker: If you consider a loop calling this function and hoisting out the call and load of the pointer, it's maybe still ahead. But it's still extra an extra load in a function that might bottleneck on load throughput if optimized to use a bigger LUT instead of so much ALU work to reduce the number of cases. Clang invented an immediate bitmap tested with `bt` before indexing the table for the switch. (OTOH, that makes `tolower` redundant as well, just make the upper and lower case ranges part of the LUT you use instead of this switch.) – Peter Cordes Jan 28 '23 at 20:57
  • @PeterCordes -- so, you rewrite code that you think "might bottleneck", rather than rewriting code that **is** a bottleneck? Again: do you know that this code is a significant bottleneck in the application (that you haven't seen)? – Pete Becker Jan 28 '23 at 20:59
  • @PeteBecker: I'm familiar enough with the pipeline in modern Intel and AMD CPUs to estimate performance bottlenecks when branch mispredicts and cache misses aren't happening. If using this `switch` unchanged, yes, `tolower` is probably a win as a drop-in replacement, in a loop that calls this function on multiple characters, assuming most of the overhead can hoist out. But like I discussed [in comments on the OP's previous question](https://stackoverflow.com/posts/comments/132718916), and my prev. comment here, all the extra ALU work should just get rolled into a larger 128 or 256-entry LUT. – Peter Cordes Jan 28 '23 at 21:01
  • @PeteBecker: I had been forgetting how much extra ALU work clang invented to use a LUT for this switch, so a loop using it would be pretty far from just loading a new character and then doing another load from a LUT. (Which would bottleneck at 1 char per clock cycle on Ice Lake and Zen 2 and older, which can do 2 loads per clock cycle.) In that case an extra LUT lookups for `tolower` would be likely to run into a throughput bottleneck on load execution units, so yes spending 4 ALU uops (lea / sub/cmp/cmov) would be a throughput win in a well-optimized loop. – Peter Cordes Jan 28 '23 at 21:06
  • @PeteBecker: As for "rewrite", I'd be unlikely to have written it exactly this way in the first place. But the OP's code exists, and the rewrite would be to change it to use `tolower`. (But I know what you mean, reimplement standard functions that already exist. There's certainly merit to your suggestion, but part of my point was that if it doesn't need to be locale-aware, it's potentially slower to use a function that does take locales into account. I had been expecting that it wouldn't inline at all, which would make it obvious garbage. But if the call can be hoisted, it can be ok.) – Peter Cordes Jan 28 '23 at 21:08
  • @PeteBecker: On huge advantage to an arithmetic implementation instead of lookup-table in general is that it can potentially auto-vectorize to a few instructions that do it for 16 or 32 chars in parallel. That doesn't happen here because the result is used for a non-trivial `switch` mapping, but it's very much something to consider when working with known-ASCII data. – Peter Cordes Jan 28 '23 at 21:36
  • @PeterCordes -- okay, now it's an optimization that doesn't apply here, but nevertheless should have been done? This is getting rather far afield. – Pete Becker Jan 28 '23 at 23:14

1 Answers1

2

This is the so called RIP-Relative Addressing (described in Intel Volume 2. Chapter 2.2.1.6). Basically it is adding the offset (0xe07 and 0xdfb) to the address of the next instruction. Most probably this is the location where the jump table is located, and it looks like it is just after the code of the function.

In first case address of the next instruction is 0x55555555520d + 0xe07 = 0x555555556014. In second case address of the next instruction is 0x555555555219 + 0xdfb = 0x555555556014.

Hristo Iliev
  • 114
  • 5