I have two functions below which should be identical in terms of execution. The only difference is that in the second function, funcB(), I moved the branch on (decobj == NULL) outside the loops; decobj is always NULL anyway. I thought that the compiler would easily optimize funcA() so that its execution time would be identical to that of funcB(), but that's not the case, and there is a big difference in execution time:
FuncA took: 3644530 uS
FuncB took: 1664598 uS
FuncB took: 1626528 uS
FuncA took: 3108780 uS
Finished!
I am using Visual Studio 2019 (Preview), compiled x64 with full optimization enabled:
Does anyone know how to get Visual Studio to optimize funcA() to make it as fast as funcB()?
#include <iostream>
#include <chrono>
using namespace std;
// Minimal stand-in for a decoder object used by the benchmark functions.
struct DecoderObj {
    // Returns the sum of i and j.
    int decode(int i, int j) {
        const int sum = i + j;
        return sum;
    }
};
// Decoder pointer consulted by the benchmark functions. It is initialized to
// null and nothing in the code shown here ever assigns it.
static DecoderObj* decobj{nullptr};
// Number of times each benchmark repeats its 999 x 999 nested loops.
static constexpr int iterations{3000};
// Benchmark variant A: tests `decobj` for null inside the innermost loop, on
// every (rep, i, j) iteration.
// Prints the elapsed time in microseconds, measured with
// chrono::high_resolution_clock. Returns early, without printing a time, if a
// computed value exceeds 99999.
void funcA() {
auto start = chrono::high_resolution_clock::now();
for (int rep = 0; rep < iterations; rep++) {
for (int i = 0; i < 999; i++) {
for (int j = 0; j < 999; j++) {
// `decobj` is not modified inside these loops, yet the null test is written
// per iteration here; this is the form being compared against funcB().
int a = decobj ? decobj->decode(i, j) : i - j;
// With i and j both below 999, neither i + j nor i - j can exceed 99999.
// NOTE(review): presumably this check only exists so that `a` is used and
// the loops are not optimized away -- confirm.
if (a > 99999) {
cout << "This should never print";
return;
}
}
}
}
auto end = chrono::high_resolution_clock::now();
cout << "FuncA took: " << chrono::duration_cast<chrono::microseconds>(end - start).count() << " uS" << endl;
}
// Benchmark variant B: performs the same computation as funcA(), but tests
// `decobj` once per `rep` iteration, outside the i/j loops, with a separate
// copy of the loops for the non-null and the null case.
// Prints the elapsed time in microseconds, measured with
// chrono::high_resolution_clock. Returns early, without printing a time, if a
// computed value exceeds 99999.
void funcB() {
auto start = chrono::high_resolution_clock::now();
for (int rep = 0; rep < iterations; rep++) {
// Null test hoisted out of the i/j loops.
if (decobj != nullptr) {
for (int i = 0; i < 999; i++) {
for (int j = 0; j < 999; j++) {
int a = decobj->decode(i, j);
// i + j is at most 1996 here, so this condition cannot hold.
if (a > 99999) {
cout << "This should never print";
return;
}
}
}
}
else { //decobj is NULL
for (int i = 0; i < 999; i++) {
for (int j = 0; j < 999; j++) {
int a = i - j;
// i - j is at most 998 here, so this condition cannot hold.
if (a > 99999) {
cout << "This should never print";
return;
}
}
}
}
}
auto end = chrono::high_resolution_clock::now();
cout << "FuncB took: " << chrono::duration_cast<chrono::microseconds>(end - start).count() << " uS" << endl;
}
// Runs the benchmarks in the order A, B, B, A and then prints a completion line.
int main() {
    using Benchmark = void (*)();
    const Benchmark schedule[] = {funcA, funcB, funcB, funcA};
    for (const Benchmark run : schedule) {
        run();
    }
    std::cout << "Finished!\n";
    return 0;
}
The other odd thing I noticed is that if I add decobj = nullptr; to the beginning of funcB(), the function actually slows down and takes about twice as long. I'm not sure why that would be the case: the assignment should make it even more obvious to the compiler that it can eliminate the first branch completely and generate code only for the second branch, so I don't see why it would slow down function execution.
Here is the assembly code for funcA() (I removed the timers):
; Compiler listing for funcA() with the timer code commented out in the source.
; Register roles as used below:
;   r9   = decobj (loaded once, before the loops)
;   r10d = rep, r8d = i, eax = j
;   edx  = i - j (starts at i, decremented each time j is incremented)
;   ecx  = a
; 17 : for (int rep = 0; rep < iterations; rep++) {
mov r9, QWORD PTR ?decobj@@3PEAUDecoderObj@@EA
xor r10d, r10d
npad 6
$LL4@funcA:
; 18 : for (int i = 0; i < 999; i++) {
xor r8d, r8d
npad 13
$LL7@funcA:
; 19 : for (int j = 0; j < 999; j++) {
xor eax, eax
mov edx, r8d
$LL10@funcA:
; 20 : int a = decobj ? decobj->decode(i, j) : i - j;
; ecx = i + j (decode() inlined). The null test on r9 sits inside the innermost
; loop; when decobj is null, ecx is overwritten with i - j from edx.
lea ecx, DWORD PTR [rax+r8]
test r9, r9
jne SHORT $LN14@funcA
mov ecx, edx
$LN14@funcA:
; 21 : if (a > 99999) {
cmp ecx, 99999 ; 0001869fH
jg SHORT $LN20@funcA
; 19 : for (int j = 0; j < 999; j++) {
inc eax
dec edx
cmp eax, 999 ; 000003e7H
jl SHORT $LL10@funcA
; 18 : for (int i = 0; i < 999; i++) {
inc r8d
cmp r8d, 999 ; 000003e7H
jl SHORT $LL7@funcA
; 16 : //auto start = chrono::high_resolution_clock::now();
; 17 : for (int rep = 0; rep < iterations; rep++) {
inc r10d
cmp r10d, 3000 ; 00000bb8H
jl SHORT $LL4@funcA
; 23 : return;
; 24 : }
; 25 : }
; 26 : }
; 27 : }
; 28 : //auto end = chrono::high_resolution_clock::now();
; 29 : //cout << "FuncA took: " << chrono::duration_cast<chrono::microseconds>(end - start).count() << " uS" << endl;
; 30 : }
ret 0
$LN20@funcA:
; 22 : cout << "This should never print";
; The print-and-return path ends with a jmp into operator<< rather than a call followed by ret.
mov rcx, QWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
lea rdx, OFFSET FLAT:??_C@_0BI@DPJNIIKO@This?5should?5never?5print@
jmp ??$?6U?$char_traits@D@std@@@std@@YAAEAV?$basic_ostream@DU?$char_traits@D@std@@@0@AEAV10@PEBD@Z ; std::operator<<<std::char_traits<char> >
?funcA@@YAXXZ ENDP ; funcA
Here is the assembly code for funcB(). I removed the timers here too:
; Compiler listing for funcB() with the timer code commented out in the source.
; Register roles as used below:
;   r9  = decobj (loaded once, before the loops)
;   r8d = rep, edx = i
;   non-null loop ($LL7/$LL10): eax = j, ecx = i + j
;   null loop ($LL13/$LL16):    ecx = j, eax = i - j
_TEXT SEGMENT
?funcB@@YAXXZ PROC ; funcB, COMDAT
; 33 : //auto start = chrono::high_resolution_clock::now();
; 34 : for (int rep = 0; rep < iterations; rep++) {
mov r9, QWORD PTR ?decobj@@3PEAUDecoderObj@@EA
xor r8d, r8d
npad 6
$LL4@funcB:
; 35 : if (decobj != nullptr) {
; The null test on r9 runs once per rep iteration and selects one of two
; separate i/j loop nests; neither nest tests r9 again.
xor edx, edx
test r9, r9
je SHORT $LL13@funcB
npad 9
$LL7@funcB:
; 37 : for (int j = 0; j < 999; j++) {
xor eax, eax
$LL10@funcB:
; 38 : int a = decobj->decode(i, j);
; 39 : if (a > 99999) {
lea ecx, DWORD PTR [rax+rdx]
cmp ecx, 99999 ; 0001869fH
jg SHORT $LN30@funcB
; 37 : for (int j = 0; j < 999; j++) {
inc eax
cmp eax, 999 ; 000003e7H
jl SHORT $LL10@funcB
; 36 : for (int i = 0; i < 999; i++) {
inc edx
cmp edx, 999 ; 000003e7H
jl SHORT $LL7@funcB
; 40 : cout << "This should never print";
; 41 : return;
; 42 : }
; 43 : }
; 44 : }
; 45 : }
jmp SHORT $LN2@funcB
$LL13@funcB:
; 48 : for (int j = 0; j < 999; j++) {
xor ecx, ecx
mov eax, edx
$LL16@funcB:
; 49 : int a = i - j;
; 50 : if (a > 99999) {
cmp eax, 99999 ; 0001869fH
jg SHORT $LN30@funcB
; 48 : for (int j = 0; j < 999; j++) {
inc ecx
dec eax
cmp ecx, 999 ; 000003e7H
jl SHORT $LL16@funcB
; 46 : else { //decobj is NULL
; 47 : for (int i = 0; i < 999; i++) {
inc edx
cmp edx, 999 ; 000003e7H
jl SHORT $LL13@funcB
$LN2@funcB:
; 33 : //auto start = chrono::high_resolution_clock::now();
; 34 : for (int rep = 0; rep < iterations; rep++) {
inc r8d
cmp r8d, 3000 ; 00000bb8H
jge SHORT $LN3@funcB
jmp SHORT $LL4@funcB
$LN30@funcB:
; 51 : cout << "This should never print";
; 52 : return;
; 53 : }
; 54 : }
; 55 : }
; 56 : }
; 57 : }
; 58 : // auto end = chrono::high_resolution_clock::now();
; 59 : //cout << "FuncB took: " << chrono::duration_cast<chrono::microseconds>(end - start).count() << " uS" << endl;
; 60 : }
; Shared print-and-return path for both loop nests; ends with a jmp into operator<<.
mov rcx, QWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
lea rdx, OFFSET FLAT:??_C@_0BI@DPJNIIKO@This?5should?5never?5print@
jmp ??$?6U?$char_traits@D@std@@@std@@YAAEAV?$basic_ostream@DU?$char_traits@D@std@@@0@AEAV10@PEBD@Z ; std::operator<<<std::char_traits<char> >
$LN3@funcB:
ret 0
?funcB@@YAXXZ ENDP ; funcB
_TEXT ENDS
The code for the two functions is very different even though I think the compiler should have generated the same code for both. It should have realized that decobj will always be NULL and never generated code for that branch. Both funcA and funcB test the decobj pointer for NULL.
I also ran the Performance Profiler, and it shows funcA() spending time on the decobj NULL check:
Here is the profile for funcB():