1

Here's a basic working C++ program.

#include <string>
#include <string_view>
#include <iostream>

// Pairs a value of arbitrary type with a short textual tag
// (main() below uses the tag as a unit suffix such as "hrs" or "s").
template<class ValueType>
struct BasicType {
    ValueType value{};   // the wrapped value
    std::string type{};  // owned copy of the tag text

    // Stores T as the value and copies the characters viewed by desc
    // into the owned string member.
    BasicType(ValueType T, const std::string_view desc)
        : value{T}
        , type{desc}
    {}
};

// Writes a BasicType to the stream as its value immediately followed by its
// tag text (no separator), then hands the stream back so calls can be chained.
// Note: BT is received by value, so each insertion works on a copy of the
// argument.
template<class T>
auto operator<<(std::ostream& out, const BasicType<T> BT) -> std::ostream& {
    (out << BT.value) << BT.type;
    return out;
}

// BasicType fixed to int; the constructor simply forwards both arguments
// to the base class.
struct IntT : BasicType<int> {
    IntT(int value, const std::string_view desc)
        : BasicType<int>(value, desc) {}
};
// BasicType fixed to float; the constructor simply forwards both arguments
// to the base class.
struct FloatT : BasicType<float> {
    FloatT(float value, const std::string_view desc)
        : BasicType<float>(value, desc) {}
};

// Builds one IntT and one FloatT and prints them to std::cout,
// separated by the literal " and ".
int main() {
    IntT hours(3, "hrs");
    FloatT seconds(2.5f, "s");

    // Same insertions, in the same order, as a single chained expression.
    std::cout << hours;
    std::cout << " and ";
    std::cout << seconds;

    return 0;
}

And its output is quite obvious:

3hrs and 2.5s

There is no issue with the code and it's just a test sample. I was however curious as to how some of the major different compilers treat this code differently to better understand their internal workings and the pros/cons of each compiler.

I used Compiler Explorer to test this out just to look at the differences in the generated assembly. I'm compiling each under c++17 with O2 optimization for performance speed rather than O1 for code size.


Here's the generated assembly from Clang(trunk) with compiler flags set to: -std=c++17 -O2

main:                                   # @main
        push    rbp
        push    r15
        push    r14
        push    r13
        push    r12
        push    rbx
        sub     rsp, 168
        mov     dword ptr [rsp + 128], 3
        lea     r15, [rsp + 152]
        mov     qword ptr [rsp + 136], r15
        mov     dword ptr [rsp + 152], 7565928
        mov     qword ptr [rsp + 144], 3
        mov     dword ptr [rsp + 48], 1075838976
        lea     r12, [rsp + 72]
        mov     qword ptr [rsp + 56], r12
        mov     word ptr [rsp + 72], 115
        mov     qword ptr [rsp + 64], 1
        mov     dword ptr [rsp + 88], 3
        lea     r13, [rsp + 112]
        mov     qword ptr [rsp + 96], r13
        mov     dword ptr [rsp + 112], 7565928
        mov     qword ptr [rsp + 104], 3
        mov     edi, offset std::cout
        mov     esi, 3
        call    std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
        mov     rsi, qword ptr [rsp + 96]
        mov     rdx, qword ptr [rsp + 104]
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        mov     edi, offset std::cout
        mov     esi, offset .L.str.2
        mov     edx, 5
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        movss   xmm0, dword ptr [rsp + 48]      # xmm0 = mem[0],zero,zero,zero
        movss   dword ptr [rsp + 8], xmm0
        lea     rbp, [rsp + 32]
        mov     qword ptr [rsp + 16], rbp
        mov     r14, qword ptr [rsp + 56]
        mov     rbx, qword ptr [rsp + 64]
        mov     qword ptr [rsp], rbx
        cmp     rbx, 15
        jbe     .LBB0_4
        lea     rdi, [rsp + 16]
        mov     rsi, rsp
        xor     edx, edx
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_create(unsigned long&, unsigned long)
        mov     qword ptr [rsp + 16], rax
        mov     rcx, qword ptr [rsp]
        mov     qword ptr [rsp + 32], rcx
        test    rbx, rbx
        jne     .LBB0_8
        jmp     .LBB0_11
.LBB0_4:
        mov     rax, rbp
        test    rbx, rbx
        je      .LBB0_11
.LBB0_8:
        cmp     rbx, 1
        jne     .LBB0_10
        mov     cl, byte ptr [r14]
        mov     byte ptr [rax], cl
        jmp     .LBB0_11
.LBB0_10:
        mov     rdi, rax
        mov     rsi, r14
        mov     rdx, rbx
        call    memcpy@PLT
.LBB0_11:
        mov     rax, qword ptr [rsp]
        mov     qword ptr [rsp + 24], rax
        mov     rcx, qword ptr [rsp + 16]
        mov     byte ptr [rcx + rax], 0
        movss   xmm0, dword ptr [rsp + 8]       # xmm0 = mem[0],zero,zero,zero
        cvtss2sd        xmm0, xmm0
        mov     edi, offset std::cout
        call    std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
        mov     rsi, qword ptr [rsp + 16]
        mov     rdx, qword ptr [rsp + 24]
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        mov     rdi, qword ptr [rsp + 16]
        cmp     rdi, rbp
        je      .LBB0_15
        call    operator delete(void*)
.LBB0_15:
        mov     rdi, qword ptr [rsp + 96]
        cmp     rdi, r13
        je      .LBB0_17
        call    operator delete(void*)
.LBB0_17:
        mov     rdi, qword ptr [rsp + 56]
        cmp     rdi, r12
        je      .LBB0_19
        call    operator delete(void*)
.LBB0_19:
        mov     rdi, qword ptr [rsp + 136]
        cmp     rdi, r15
        je      .LBB0_21
        call    operator delete(void*)
.LBB0_21:
        xor     eax, eax
        add     rsp, 168
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        pop     rbp
        ret
        mov     rbx, rax
        mov     rdi, qword ptr [rsp + 16]
        cmp     rdi, rbp
        je      .LBB0_25
        call    operator delete(void*)
        jmp     .LBB0_25
        mov     rbx, rax
.LBB0_25:
        mov     rdi, qword ptr [rsp + 96]
        cmp     rdi, r13
        je      .LBB0_27
        call    operator delete(void*)
.LBB0_27:
        mov     rdi, qword ptr [rsp + 56]
        cmp     rdi, r12
        je      .LBB0_29
        call    operator delete(void*)
.LBB0_29:
        mov     rdi, qword ptr [rsp + 136]
        cmp     rdi, r15
        je      .LBB0_31
        call    operator delete(void*)
.LBB0_31:
        mov     rdi, rbx
        call    _Unwind_Resume@PLT
_GLOBAL__sub_I_example.cpp:             # @_GLOBAL__sub_I_example.cpp
        push    rax
        mov     edi, offset std::__ioinit
        call    std::ios_base::Init::Init() [complete object constructor]
        mov     edi, offset std::ios_base::Init::~Init() [complete object destructor]
        mov     esi, offset std::__ioinit
        mov     edx, offset __dso_handle
        pop     rax
        jmp     __cxa_atexit                    # TAILCALL

.L.str.2:
        .asciz  " and "

And Clang is generating 147 lines of assembly.


Here is GCC(trunk) generated assembly with compiler options set to: -std=c++17 -O2

.LC0:
        .string "hrs"
.LC2:
        .string "s"
.LC3:
        .string " and "
main:
        push    rbx
        mov     edx, OFFSET FLAT:.LC0+3
        mov     esi, OFFSET FLAT:.LC0
        sub     rsp, 192
        lea     rax, [rsp+24]
        lea     rdi, [rsp+8]
        mov     DWORD PTR [rsp], 3
        mov     QWORD PTR [rsp+8], rax
        call    void std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_construct<char const*>(char const*, char const*, std::forward_iterator_tag) [clone .isra.0]
        lea     rax, [rsp+72]
        mov     edx, OFFSET FLAT:.LC2+1
        mov     esi, OFFSET FLAT:.LC2
        lea     rdi, [rsp+56]
        mov     DWORD PTR [rsp+48], 0x40200000
        mov     QWORD PTR [rsp+56], rax
        call    void std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_construct<char const*>(char const*, char const*, std::forward_iterator_tag) [clone .isra.0]
        mov     eax, DWORD PTR [rsp]
        lea     rsi, [rsp+8]
        lea     rdi, [rsp+104]
        mov     DWORD PTR [rsp+96], eax
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) [complete object constructor]
        mov     esi, DWORD PTR [rsp+96]
        mov     edi, OFFSET FLAT:_ZSt4cout
        call    std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
        mov     rdx, QWORD PTR [rsp+112]
        mov     rsi, QWORD PTR [rsp+104]
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        mov     esi, OFFSET FLAT:.LC3
        mov     edi, OFFSET FLAT:_ZSt4cout
        call    std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)
        movss   xmm0, DWORD PTR [rsp+48]
        lea     rsi, [rsp+56]
        lea     rdi, [rsp+152]
        mov     rbx, rax
        movss   DWORD PTR [rsp+144], xmm0
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) [complete object constructor]
        pxor    xmm0, xmm0
        mov     rdi, rbx
        cvtss2sd        xmm0, DWORD PTR [rsp+144]
        call    std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
        mov     rdx, QWORD PTR [rsp+160]
        mov     rsi, QWORD PTR [rsp+152]
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        lea     rdi, [rsp+152]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
        lea     rdi, [rsp+104]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
        lea     rdi, [rsp+56]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
        lea     rdi, [rsp+8]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
        add     rsp, 192
        xor     eax, eax
        pop     rbx
        ret
        mov     rbx, rax
        jmp     .L10
        mov     rbx, rax
        jmp     .L7
        mov     rbx, rax
        jmp     .L8
        mov     rbx, rax
        jmp     .L9
main.cold:
        lea     rdi, [rsp+152]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
.L8:
        lea     rdi, [rsp+104]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
.L9:
        lea     rdi, [rsp+56]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
.L10:
        lea     rdi, [rsp+8]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
        mov     rdi, rbx
        call    _Unwind_Resume
_GLOBAL__sub_I_main:
        sub     rsp, 8
        mov     edi, OFFSET FLAT:_ZStL8__ioinit
        call    std::ios_base::Init::Init() [complete object constructor]
        mov     edx, OFFSET FLAT:__dso_handle
        mov     esi, OFFSET FLAT:_ZStL8__ioinit
        mov     edi, OFFSET FLAT:_ZNSt8ios_base4InitD1Ev
        add     rsp, 8
        jmp     __cxa_atexit

And this produces 95 lines of assembly instructions.


Finally, we get to MSVC (x64 msvc v19.latest) with compiler options set to: /std:c++17 /O2. Needless to say, the output is too large to post here directly, so here is the link with the MSVC options: Compiler Explorer. Amazingly, this generates a little over 1600 lines of code.


A jump from about 95 to 150 lines of instructions from GCC to Clang is one thing, but the jump to over 1500 lines is something else. What gives? Is this major difference in generated assembly instructions due to the compilers themselves and how MSVC implements its version of the C++17 language, or is it related to how Compiler Explorer works with each compiler? And why is there such a dramatic jump in instructions?

Nicol Bolas
  • 449,505
  • 63
  • 781
  • 982
Francis Cugler
  • 7,788
  • 2
  • 28
  • 59
  • 2
    My bet is MSVC inlines more of iostream. Honestly, comparing anything that involves it will be hard, even across different versions of the same compiler or the same version with different versions of the standard library, because the inlining vs call tradeoff is really not so clear for io-bounded operations. I'd recommend making a test program that's not focused on iostreams. – Marcus Müller Jan 06 '22 at 21:45
  • @MarcusMüller Good point, but it's amazing how GCC and Clang are in the 90 - 150 range and MSVC well here's a book for you! – Francis Cugler Jan 06 '22 at 21:46
  • not really. the difference might really be replacing one `call` with an inline of `operator<<` or something. Or do you statically link all three? – Marcus Müller Jan 06 '22 at 21:47
  • @MarcusMüller I'm more familiar with MSVC than the other compilers, I have learned a little bit about Clangs LLVM, it's just trying to better understand the tools that I'm working with as opposed to the actual language and its features directly. – Francis Cugler Jan 06 '22 at 21:48
  • @MarcusMüller I just tried it within Compiler Explorer itself for right now, I have not tried or tested it with any IDE or working environment on my own PC. Also, I would only be testing MSVC and GCC if that was the case and not Clang... – Francis Cugler Jan 06 '22 at 21:49
  • 1
    Similar question: https://stackoverflow.com/questions/61879225/why-does-the-msvc-c-compiler-expand-a-simple-hello-world-into-4000-lines-of-as – user17732522 Jan 06 '22 at 21:50
  • 2
    @FrancisCugler if you add `-s` to the clang and gcc calls, they will have to include all the binary code they use in the executable itself, and can't rely on external code (in this case, in libstdc++). Don't know what the "static" flag for MSVC is, but there's surely one. Then you get something that's fair to compare! Otherwise, it makes little difference if 1000 lines are in your executable or in a shared library it loads. – Marcus Müller Jan 06 '22 at 21:52
  • @user17732522 Thank you for the link... very close to what I'm witnessing. So I'm gathering that it's MSVC and how they are implementing the iostream library. – Francis Cugler Jan 06 '22 at 21:52
  • @MarcusMüller True and I do understand that. I know a good portion of the MSVC flags, as for GCC and Clang I know some of the basic ones, but I'd have to look the rest of them up, and what you are saying does make sense about what Compiler Explorer is showing based on the flags. It may not necessarily be showing the entire generated binaries. – Francis Cugler Jan 06 '22 at 21:54
  • @MarcusMüller Well, this is kind of off topic but once I do eventually get a new PC with at least Windows 10 maybe 11 ... I don't mind Linux but I'm not as familiar with it and a lot of my uses still revolves around the Windows Environment... but once I have that, I probably won't even be looking at C++17 anymore. I'll move to C++20 with at least an MSVS 19 looking forward to C++23. As for now, I'm still stuck on Windows 7 lol. I can't even use DX12 right now, only DX11 but Vulkan is a viable API, but I would love to start using C++20 features. – Francis Cugler Jan 06 '22 at 21:58
  • @MarcusMüller Especially with the use of Modules and trying to eliminate the "PreProcessor" as much as possible. – Francis Cugler Jan 06 '22 at 22:00
  • @MarcusMüller I did add the `-s` option on Compiler Explorer and it's still giving me the same generated assembly for both `GCC` and `Clang`... – Francis Cugler Jan 06 '22 at 22:05
  • @user17732522 Or it could be how they are handling the stack frame, linkage, etc. and the difference between Window's DLLs versus the POSIX platforms, or it could just be Microsoft's implementation of the standard library. However, the link to the related question was very informative. – Francis Cugler Jan 06 '22 at 22:11
  • 2
    @FrancisCugler I don't know how things work on Windows. But it seems to me that most of that extra stuff are template instantiations and related type information. At least on godbolt, if you use only e.g. `stdio.h` instead of `<iostream>`, the output is short as expected. There is also a question [here](https://stackoverflow.com/questions/56699545/redundant-template-instantiations-left-over-by-msvc) showing that MSVC does emit template instantiations which it wouldn't have to from a pure C++ perspective, since the definitions are required in every translation unit anyway. – user17732522 Jan 06 '22 at 22:25
  • @user17732522 Sounds like it makes sense considering that MSVC is mostly engineered towards Windows and their implementation as opposed to GCC and Clang for Linux & Mac and their implementations of the basic C/C++ Runtime Library. It's not always the case but typically MSVC is coupled with Windows, GCC & GNU with Linux and Clang/LLVM with Apple/Mac... There's a lot of variables and layers of abstraction involved to consider without doing any kind of deep analysis. And this only accounts for the x86 instruction set architecture, primarily Intel/AMD... – Francis Cugler Jan 07 '22 at 00:47
  • 1
    @MarcusMüller: IIRC, MSVC doesn't precisely *inline* iostreams code either, it just clutters the asm output listing with stand-alone definitions of lots of functions that aren't actually called. I think I recall reading that if you actually assembled that `.asm` output, you'd get link errors because MSVC's asm output includes conflicting symbols, and unlike gcc/clang isn't truly usable as-is. So one should find the asm just for the function one's actually interested in, such as `main`. An optimized build should inline most template functions, even if MSVC emits stand-alone defs too. – Peter Cordes Jan 07 '22 at 05:55
  • 1
    @PeterCordes uff, but interesting! – Marcus Müller Jan 07 '22 at 12:31
  • @PeterCordes Yeah,I'm not requiring it to be answered yet I feel it is worth asking. I'm not looking for an opinion. I'm just curious as to the difference in the generated assemblies that I'm seeing. I don't really have to know it, at least not for now. Yet the vast difference in the amount of instructions between MSVC and the other two and where it's coming from is worth asking about to gain insight into that curiosity. Yes it might be a bit vague to answer as there are many parts, many variables involved yet the question itself is still specific to one thing. Yes it is quite interesting! – Francis Cugler Jan 07 '22 at 12:52
  • @PeterCordes Having various people trying to answer and explain it where each has their own level of expertise will definitely provide that insight and it may even help future Software and maybe even Hardware Engineers down the road. As for me, I'm purely self taught, independent study and I started with C++ years ago and I've looked into many other things such as building a CPU from Logic Gates while knowing how they are made from electrical components. I've looked into Hardware Emulation, Compiler Design, OS Design... I even picked up Python for AI Programming. If it involves a circuit... :) – Francis Cugler Jan 07 '22 at 12:53

0 Answers0