1

I have two functions below which in terms of execution should be identical. The only difference is that in the second function, funcB(), I moved the branch on (decobj == NULL) outside the loop which is always NULL anyway. I thought that the compiler would easily optimize funcA() so that it would have identical execution time to FuncB(), but that's not the case and there is a big difference in execution time:

FuncA took: 3644530 uS
FuncB took: 1664598 uS
FuncB took: 1626528 uS
FuncA took: 3108780 uS
Finished!

I am using Visual Studio 2019 (Preview), compiled x64 with full optimization enabled:

enter image description here

Does anyone know how to get Visual Studio to optimize funcA() to make it as fast as funcB()?

#include <iostream>
#include <chrono>

using namespace std;

struct DecoderObj {
    int decode(int i, int j) {
        return i + j;
    }
};

static DecoderObj* decobj = nullptr;
static const int iterations = 3000;

void funcA() {
    auto start = chrono::high_resolution_clock::now();
    for (int rep = 0; rep < iterations; rep++) {
        for (int i = 0; i < 999; i++) {
            for (int j = 0; j < 999; j++) {
                int a = decobj ? decobj->decode(i, j) : i - j;
                if (a > 99999) {
                    cout << "This should never print";
                    return;
                }
            }
        }
    }
    auto end = chrono::high_resolution_clock::now();
    cout << "FuncA took: " << chrono::duration_cast<chrono::microseconds>(end - start).count() << " uS" << endl;
}

void funcB() {
    auto start = chrono::high_resolution_clock::now();
    for (int rep = 0; rep < iterations; rep++) {
        if (decobj != nullptr) {
            for (int i = 0; i < 999; i++) {
                for (int j = 0; j < 999; j++) {
                    int a = decobj->decode(i, j);
                    if (a > 99999) {
                        cout << "This should never print";
                        return;
                    }
                }
            }
        }
        else { //decobj is NULL
            for (int i = 0; i < 999; i++) {
                for (int j = 0; j < 999; j++) {
                    int a = i - j;
                    if (a > 99999) {
                        cout << "This should never print";
                        return;
                    }
                }
            }
        }
    }
    auto end = chrono::high_resolution_clock::now();
    cout << "FuncB took: " << chrono::duration_cast<chrono::microseconds>(end - start).count() << " uS" << endl;
}

int main() {
    funcA();
    funcB();
    funcB();
    funcA();

    std::cout << "Finished!\n";
}

The other odd thing I noticed is that if I add decobj = nullptr; to the beginning of funcB(), the function actually slows down and takes about twice as long. Not sure why that would be the case as it's even more obvious for the compiler to completely eliminate the first branch and simply only generate code for the second branch. I don't see why doing that would have slowed down function execution.

Here is the assembly code for funcA() (I removed the timers):

; 17   :     for (int rep = 0; rep < iterations; rep++) {

    mov r9, QWORD PTR ?decobj@@3PEAUDecoderObj@@EA
    xor r10d, r10d
    npad    6
$LL4@funcA:

; 18   :         for (int i = 0; i < 999; i++) {

    xor r8d, r8d
    npad    13
$LL7@funcA:

; 19   :             for (int j = 0; j < 999; j++) {

    xor eax, eax
    mov edx, r8d
$LL10@funcA:

; 20   :                 int a = decobj ? decobj->decode(i, j) : i - j;

    lea ecx, DWORD PTR [rax+r8]
    test    r9, r9
    jne SHORT $LN14@funcA
    mov ecx, edx
$LN14@funcA:

; 21   :                 if (a > 99999) {

    cmp ecx, 99999              ; 0001869fH
    jg  SHORT $LN20@funcA

; 19   :             for (int j = 0; j < 999; j++) {

    inc eax
    dec edx
    cmp eax, 999                ; 000003e7H
    jl  SHORT $LL10@funcA

; 18   :         for (int i = 0; i < 999; i++) {

    inc r8d
    cmp r8d, 999                ; 000003e7H
    jl  SHORT $LL7@funcA

; 16   :     //auto start = chrono::high_resolution_clock::now();
; 17   :     for (int rep = 0; rep < iterations; rep++) {

    inc r10d
    cmp r10d, 3000              ; 00000bb8H
    jl  SHORT $LL4@funcA

; 23   :                     return;
; 24   :                 }
; 25   :             }
; 26   :         }
; 27   :     }
; 28   :     //auto end = chrono::high_resolution_clock::now();
; 29   :     //cout << "FuncA took: " << chrono::duration_cast<chrono::microseconds>(end - start).count() << " uS" << endl;
; 30   : }

    ret 0
$LN20@funcA:

; 22   :                     cout << "This should never print";

    mov rcx, QWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
    lea rdx, OFFSET FLAT:??_C@_0BI@DPJNIIKO@This?5should?5never?5print@
    jmp ??$?6U?$char_traits@D@std@@@std@@YAAEAV?$basic_ostream@DU?$char_traits@D@std@@@0@AEAV10@PEBD@Z ; std::operator<<<std::char_traits<char> >
?funcA@@YAXXZ ENDP                  ; funcA

Here is the assembly code for FuncB(). I also removed the timer here too:

_TEXT   SEGMENT
?funcB@@YAXXZ PROC                  ; funcB, COMDAT

; 33   :     //auto start = chrono::high_resolution_clock::now();
; 34   :     for (int rep = 0; rep < iterations; rep++) {

    mov r9, QWORD PTR ?decobj@@3PEAUDecoderObj@@EA
    xor r8d, r8d
    npad    6
$LL4@funcB:

; 35   :         if (decobj != nullptr) {

    xor edx, edx
    test    r9, r9
    je  SHORT $LL13@funcB
    npad    9
$LL7@funcB:

; 37   :                 for (int j = 0; j < 999; j++) {

    xor eax, eax
$LL10@funcB:

; 38   :                     int a = decobj->decode(i, j);
; 39   :                     if (a > 99999) {

    lea ecx, DWORD PTR [rax+rdx]
    cmp ecx, 99999              ; 0001869fH
    jg  SHORT $LN30@funcB

; 37   :                 for (int j = 0; j < 999; j++) {

    inc eax
    cmp eax, 999                ; 000003e7H
    jl  SHORT $LL10@funcB

; 36   :             for (int i = 0; i < 999; i++) {

    inc edx
    cmp edx, 999                ; 000003e7H
    jl  SHORT $LL7@funcB

; 40   :                         cout << "This should never print";
; 41   :                         return;
; 42   :                     }
; 43   :                 }
; 44   :             }
; 45   :         }

    jmp SHORT $LN2@funcB
$LL13@funcB:

; 48   :                 for (int j = 0; j < 999; j++) {

    xor ecx, ecx
    mov eax, edx
$LL16@funcB:

; 49   :                     int a = i - j;
; 50   :                     if (a > 99999) {

    cmp eax, 99999              ; 0001869fH
    jg  SHORT $LN30@funcB

; 48   :                 for (int j = 0; j < 999; j++) {

    inc ecx
    dec eax
    cmp ecx, 999                ; 000003e7H
    jl  SHORT $LL16@funcB

; 46   :         else { //decobj is NULL
; 47   :             for (int i = 0; i < 999; i++) {

    inc edx
    cmp edx, 999                ; 000003e7H
    jl  SHORT $LL13@funcB
$LN2@funcB:

; 33   :     //auto start = chrono::high_resolution_clock::now();
; 34   :     for (int rep = 0; rep < iterations; rep++) {

    inc r8d
    cmp r8d, 3000               ; 00000bb8H
    jge SHORT $LN3@funcB
    jmp SHORT $LL4@funcB
$LN30@funcB:

; 51   :                         cout << "This should never print";
; 52   :                         return;
; 53   :                     }
; 54   :                 }
; 55   :             }
; 56   :         }
; 57   :     }
; 58   :    // auto end = chrono::high_resolution_clock::now();
; 59   :     //cout << "FuncB took: " << chrono::duration_cast<chrono::microseconds>(end - start).count() << " uS" << endl;
; 60   : }

    mov rcx, QWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
    lea rdx, OFFSET FLAT:??_C@_0BI@DPJNIIKO@This?5should?5never?5print@
    jmp ??$?6U?$char_traits@D@std@@@std@@YAAEAV?$basic_ostream@DU?$char_traits@D@std@@@0@AEAV10@PEBD@Z ; std::operator<<<std::char_traits<char> >
$LN3@funcB:
    ret 0
?funcB@@YAXXZ ENDP                  ; funcB
_TEXT   ENDS

The code for the two functions is very different even though I think the compiler should have generated the same code for both. It should have realized that decobj will always be NULL and never generated code for that branch. Both funcA and funcB test the decobj pointer for NULL.

I also ran the Performance Profiler and it also shows funcA() spending time on the decode obj NULL check:

enter image description here

Here is the profile for FuncB():

enter image description here

PentiumPro200
  • 641
  • 7
  • 23
  • 1
    What CPU do you have exactly? (e.g. i7-6700k Skylake). Code alignment effects might be doing something unfortunate for the "decobj is NULL" loop, e.g. spanning a 32B boundary for a loop that would otherwise be 1 iteration/clock. Or [32-byte aligned routine does not fit the uops cache](https://stackoverflow.com/a/61016915) - JCC erratum mitigation in microcode introducing a performance pothole your compiler probably doesn't work around yet. – Peter Cordes Dec 13 '20 at 01:57
  • Did you look at the assembly output, to see what the compiler is doing? – ChrisMM Dec 13 '20 at 03:27
  • @PeterCordes I am running this on a Core i5-8350U CPU. My thought is that either way, with "decobj != nullptr" statement or without it, the compiler should realize that the "if (decobj != nullptr) " will never get executed and therefore will never generate code for it. I added that line to funcA() as well, but was surprised that it slowed down funcB() instead of sped up funcA(). The compiler should have been able to easily optimize both functions with that line present. Should have been able to optimize without the line, but definitely with it. – PentiumPro200 Dec 13 '20 at 04:25
  • Yes you'd hope that would be obvious even to MSVC, especially a version that recent, but you'd have to check the asm to be sure. e.g. put it up on https://godbolt.org/ or ask the compiler locally to output asm instead of machine code. (Or disassemble after compiling). First order of business is to figure out exactly why it's slow, whether a microarchitecural pothole, or just a bunch of work, and then if that work should have been optimized away. – Peter Cordes Dec 13 '20 at 06:50
  • 2
    Clang has same perfomance for both methods [here](https://quick-bench.com/q/5zkF4JHJgvnpsPNYuKNVmaN2tig) as it generates same assembly... – Jarod42 Dec 13 '20 at 09:01
  • Weird, apparently MSVC doesn't bother to analyze the whole compilation unit and see that nothing modifies `decobj` (even possibly indirectly by letting a pointer to it escape this compilation unit), and also fails at other optimizations. MSVC is better than it used to be, but still not on the level of GCC and clang for optimization. https://www.agner.org/optimize/blog/read.php?i=1015 – Peter Cordes Dec 13 '20 at 17:35

0 Answers0