visual studio c++ release mode slower than debug mode for a loop incrementing a volatile

Question

I have a very simple C++ program:

int main()  // micro-benchmark: 1,000,000,000 x 50 increments of a volatile int
{
    volatile int index = 0;  // volatile: both listings below keep every index++ as load / inc / store
    for (int i = 0; i < 1000000000; i++) {  // outer loop: 1,000,000,000 passes
        for(int j = 0; j < 50; j++)  // inner loop: 50 increments per outer pass
        {
            index++;  // NOTE(review): 5e10 increments in total exceed INT_MAX, so this signed ++ overflows (formally undefined behavior)
        }
    }
}  // no return statement; the debug listing shows eax being zeroed before ret

I compiled the code in release and debug mode and checked the assembly codes.

// Release mode assembly
; Release build of main (MASM / Intel syntax, x64).
; Both loop counters are kept in registers and count DOWN to zero:
;   rdx = remaining outer iterations, rax = remaining inner iterations.
; Only `index` is in memory, addressed as index$[rsp]; no stack frame is set up.
main    PROC                        ; COMDAT

; 9    :    volatile int index = 0;

    mov DWORD PTR index$[rsp], 0

; 10   :    for (int i = 0; i < 1000000000; i++) {

; Writing edx zero-extends into rdx, so the 64-bit `sub rdx, 1` below
; operates on exactly this trip count.
    mov edx, 1000000000             ; 3b9aca00H
; npad emits padding so the next label is aligned (the machine-code listing
; later in this post shows $LL4@main at offset 00010).
    npad    3
$LL4@main:

; 11   :        for(int j = 0; j < 50; j++)

; Inner trip count is reloaded on every outer pass; eax write zero-extends into rax.
    mov eax, 50                 ; 00000032H
; Padding so that $LL7@main lands at offset 00020 in the machine-code listing.
    npad    11
$LL7@main:

; 12   :        {
; 13   :            index++;

; Hot loop: 5 instructions per iteration.
; The load reads the slot written by the previous iteration's store, so
; consecutive iterations are chained through memory (store -> reload).
    mov ecx, DWORD PTR index$[rsp]
    inc ecx
    mov DWORD PTR index$[rsp], ecx
; Down-counter: jne loops until the subtraction produces zero.
    sub rax, 1
    jne SHORT $LL7@main

; 10   :    for (int i = 0; i < 1000000000; i++) {

    sub rdx, 1
    jne SHORT $LL4@main

; 14   :        }
; 15   :    }
; 16   : }

; There is no explicit `xor eax, eax` here: the inner loop only exits when
; rax has reached 0 -- presumably the compiler reuses that as main's return value.
    ret 0
main    ENDP
_TEXT   ENDS
END

// Debug mode assembly
; Debug build of main (MASM / Intel syntax, x64).
; Nothing is kept in registers across statements: index, i and j all live in
; the stack frame (addressed relative to rbp) and every update is a
; load / inc / store on that slot.
main    PROC                        ; COMDAT
; 8    : {

$LN9:
; Prologue: save rbp and rdi, reserve 328 bytes, and point rbp 32 bytes above
; the new rsp. rdi is saved and restored even though the visible body never writes it.
    push    rbp
    push    rdi
    sub rsp, 328                ; 00000148H
    lea rbp, QWORD PTR [rsp+32]
; rcx (first argument register in the Microsoft x64 convention) receives the
; address of the per-file flag byte passed to the Just-My-Code check.
    lea rcx, OFFSET FLAT:__38206C9F_Source@cpp
    call    __CheckForDebuggerJustMyCode

; 9    :    volatile int index = 0;

    mov DWORD PTR index$[rbp], 0

; 10   :    for (int i = 0; i < 1000000000; i++) {

; Outer loop layout: init i, jump to the test ($LN4); $LN2 is the i++ step,
; $LN3 is the loop exit.
    mov DWORD PTR i$1[rbp], 0
    jmp SHORT $LN4@main
$LN2@main:
; i++ through memory.
    mov eax, DWORD PTR i$1[rbp]
    inc eax
    mov DWORD PTR i$1[rbp], eax
$LN4@main:
; Loop test compares the in-memory i against the immediate (signed jge).
    cmp DWORD PTR i$1[rbp], 1000000000      ; 3b9aca00H
    jge SHORT $LN3@main

; 11   :        for(int j = 0; j < 50; j++)

; Inner loop layout mirrors the outer one: $LN5 = j++ step, $LN7 = test,
; $LN6 = exit.
    mov DWORD PTR j$2[rbp], 0
    jmp SHORT $LN7@main
$LN5@main:
; j++ through memory: a second store/reload chain, independent of the one on index.
    mov eax, DWORD PTR j$2[rbp]
    inc eax
    mov DWORD PTR j$2[rbp], eax
$LN7@main:
    cmp DWORD PTR j$2[rbp], 50          ; 00000032H
    jge SHORT $LN6@main

; 12   :        {
; 13   :            index++;

; Same load / inc / store sequence as the release build, using eax and an
; rbp-relative slot instead of ecx and an rsp-relative one.
    mov eax, DWORD PTR index$[rbp]
    inc eax
    mov DWORD PTR index$[rbp], eax

; 14   :        }

; Back edge of the inner loop: 9 instructions execute per inner iteration
; ($LN5 step, test, body, this jmp).
    jmp SHORT $LN5@main
$LN6@main:

; 15   :    }

    jmp SHORT $LN2@main
$LN3@main:

; 16   : }

; Epilogue: return 0 in eax. rbp was set to rsp+32, so rbp+296 equals the
; rsp value from before `sub rsp, 328`; the pops then undo the two pushes.
    xor eax, eax
    lea rsp, QWORD PTR [rbp+296]
    pop rdi
    pop rbp
    ret 0
main    ENDP
_TEXT   ENDS
END

Based on the assembly, the variables i and j are kept in registers in release mode, while in debug mode they live in memory and every change to them is written back immediately.

So I expected release mode to be faster than debug mode, but it was actually twice as slow. I ran the program outside Visual Studio and timed it with a stopwatch: release mode took 1 minute to finish, while debug mode took only 30 seconds.

I'm really curious why it is slower!

details:

release mode machine code

; Listing generated by Microsoft (R) Optimizing Compiler Version 19.30.30706.0 

include listing.inc

INCLUDELIB OLDNAMES

EXTRN   __security_check_cookie:PROC
PUBLIC  main
; Function compile flags: /Ogtpy
;   COMDAT main
_TEXT   SEGMENT
; Release build of main, as a listing: columns are code offset (hex),
; encoded bytes, then the instruction. Long encodings wrap onto continuation lines.
; index$ is the byte offset of `index` from rsp. With no stack adjustment in
; this function, [rsp+8] is the slot directly above the return address --
; presumably the caller-provided shadow space of the Microsoft x64 convention.
index$ = 8
main    PROC                        ; COMDAT
; File D:\Visual Studio Projects\MultiThread_Test\Source.cpp
; Line 9
  00000 c7 44 24 08 00
    00 00 00     mov     DWORD PTR index$[rsp], 0
; Line 10
  00008 ba 00 ca 9a 3b   mov     edx, 1000000000        ; 3b9aca00H
; 3 padding bytes (0f 1f 00) bring the outer-loop label to offset 00010.
  0000d 0f 1f 00     npad    3
$LL4@main:
; Line 11
  00010 b8 32 00 00 00   mov     eax, 50            ; 00000032H
; 11 padding bytes bring the inner-loop label to offset 00020.
  00015 66 66 66 0f 1f
    84 00 00 00 00
    00       npad    11
$LL7@main:
; Line 13
; Inner loop occupies offsets 00020-0002f: 16 bytes starting on a 16-byte boundary.
; The load at 00020 reads the slot stored at 00026 on the previous iteration.
  00020 8b 4c 24 08  mov     ecx, DWORD PTR index$[rsp]
  00024 ff c1        inc     ecx
  00026 89 4c 24 08  mov     DWORD PTR index$[rsp], ecx
  0002a 48 83 e8 01  sub     rax, 1
  0002e 75 f0        jne     SHORT $LL7@main
; Line 10
  00030 48 83 ea 01  sub     rdx, 1
  00034 75 da        jne     SHORT $LL4@main
; Line 16
; No instruction sets eax before the ret; rax is 0 here because the inner
; loop only falls through once its counter reaches zero.
  00036 c3       ret     0
main    ENDP
_TEXT   ENDS
END

Debug mode machine code
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.30.30706.0 

include listing.inc

INCLUDELIB MSVCRTD
INCLUDELIB OLDNAMES

msvcjmc SEGMENT
__38206C9F_Source@cpp DB 01H
msvcjmc ENDS
PUBLIC  main
PUBLIC  __JustMyCode_Default
EXTRN   _RTC_InitBase:PROC
EXTRN   _RTC_Shutdown:PROC
EXTRN   __CheckForDebuggerJustMyCode:PROC
;   COMDAT pdata
pdata   SEGMENT
$pdata$main DD  imagerel $LN9
    DD  imagerel $LN9+107
    DD  imagerel $unwind$main
pdata   ENDS
;   COMDAT rtc$TMZ
rtc$TMZ SEGMENT
_RTC_Shutdown.rtc$TMZ DQ FLAT:_RTC_Shutdown
rtc$TMZ ENDS
;   COMDAT rtc$IMZ
rtc$IMZ SEGMENT
_RTC_InitBase.rtc$IMZ DQ FLAT:_RTC_InitBase
rtc$IMZ ENDS
;   COMDAT voltbl
voltbl  SEGMENT
_volmd  DB  01bH
    DB  053H
    DB  058H
voltbl  ENDS
;   COMDAT xdata
xdata   SEGMENT
$unwind$main DD 025050f01H
    DD  010a230fH
    DD  070030029H
    DD  05002H
xdata   ENDS
; Function compile flags: /Odt
;   COMDAT __JustMyCode_Default
_TEXT   SEGMENT
; Empty procedure: its entire body is a single ret.
__JustMyCode_Default PROC               ; COMDAT
  00000 c2 00 00     ret     0
__JustMyCode_Default ENDP
_TEXT   ENDS
; Function compile flags: /Odtp /RTCsu /ZI
;   COMDAT main
_TEXT   SEGMENT
; Debug build of main, as a listing: columns are code offset (hex), encoded
; bytes, then the instruction. Long encodings wrap onto continuation lines.
; The three symbols below are byte offsets from rbp for the locals
; (index at rbp+4, i at rbp+36, j at rbp+68).
index$ = 4
i$1 = 36
j$2 = 68
main    PROC                        ; COMDAT
; File D:\Visual Studio Projects\MultiThread_Test\Source.cpp
; Line 8
$LN9:
; Prologue: save rbp/rdi, reserve 328 bytes, set rbp = rsp + 32.
  00000 40 55        push    rbp
  00002 57       push    rdi
  00003 48 81 ec 48 01
    00 00        sub     rsp, 328       ; 00000148H
  0000a 48 8d 6c 24 20   lea     rbp, QWORD PTR [rsp+32]
; Displacement and call target are shown as zeros in this listing (unresolved).
  0000f 48 8d 0d 00 00
    00 00        lea     rcx, OFFSET FLAT:__38206C9F_Source@cpp
  00016 e8 00 00 00 00   call    __CheckForDebuggerJustMyCode
; Line 9
  0001b c7 45 04 00 00
    00 00        mov     DWORD PTR index$[rbp], 0
; Line 10
; Outer loop: init i, jump to the test at $LN4; $LN2 is the i++ step.
  00022 c7 45 24 00 00
    00 00        mov     DWORD PTR i$1[rbp], 0
  00029 eb 08        jmp     SHORT $LN4@main
$LN2@main:
  0002b 8b 45 24     mov     eax, DWORD PTR i$1[rbp]
  0002e ff c0        inc     eax
  00030 89 45 24     mov     DWORD PTR i$1[rbp], eax
$LN4@main:
  00033 81 7d 24 00 ca
    9a 3b        cmp     DWORD PTR i$1[rbp], 1000000000 ; 3b9aca00H
  0003a 7d 23        jge     SHORT $LN3@main
; Line 11
; Inner loop: init j, jump to the test at $LN7; $LN5 is the j++ step.
  0003c c7 45 44 00 00
    00 00        mov     DWORD PTR j$2[rbp], 0
  00043 eb 08        jmp     SHORT $LN7@main
$LN5@main:
; Steady-state inner loop spans offsets 00045-0005c (24 bytes, 9 instructions)
; and carries two separate store/reload chains: one on j, one on index.
  00045 8b 45 44     mov     eax, DWORD PTR j$2[rbp]
  00048 ff c0        inc     eax
  0004a 89 45 44     mov     DWORD PTR j$2[rbp], eax
$LN7@main:
  0004d 83 7d 44 32  cmp     DWORD PTR j$2[rbp], 50 ; 00000032H
  00051 7d 0a        jge     SHORT $LN6@main
; Line 13
  00053 8b 45 04     mov     eax, DWORD PTR index$[rbp]
  00056 ff c0        inc     eax
  00058 89 45 04     mov     DWORD PTR index$[rbp], eax
; Line 14
  0005b eb e8        jmp     SHORT $LN5@main
$LN6@main:
; Line 15
  0005d eb cc        jmp     SHORT $LN2@main
$LN3@main:
; Line 16
; Epilogue: eax = 0 as the return value; rbp+296 equals the rsp value from
; before `sub rsp, 328` (rbp was rsp+32), then the two pushes are undone.
  0005f 33 c0        xor     eax, eax
  00061 48 8d a5 28 01
    00 00        lea     rsp, QWORD PTR [rbp+296]
  00068 5f       pop     rdi
  00069 5d       pop     rbp
  0006a c3       ret     0
main    ENDP
_TEXT   ENDS
END

I can't reproduce your results, you should show us the two compiler command lines that VS is actually using. Compiling from the command line, I got no optimisation flags: 1:34, /Ox: 1:19. The two timings are similar because the generated assembly for the 'hot' inner loop is not really that different between the two versions. My tests conducted with MSVC Compiler Version 19.16.27045 for x86 on a Core i3 8100. — Paul Sanders, Mar 27 '22 at 11:42
Could you post the actual machine code showing `nop` paddings and instruction addresses? — xiver77, Mar 27 '22 at 12:39
That information is necessary to test your code on another machine without the same version of MSVC. — xiver77, Mar 27 '22 at 12:41
We have to see what exactly `npad` is producing. That is not an actual instruction. It's a macro specific to the assembler embedded in MSVC. — xiver77, Mar 27 '22 at 12:53
It's not slower. This type of microbenchmark is pointless; you are not measuring what you think you're measuring. — Cody Gray - on strike, Mar 27 '22 at 13:28
@PaulSanders I compiled using default flags in visual studio 2022 for both debug and release for x64 : for release:(/GS /GL /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /sdl /Zc:inline /fp:precise /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /Gd /Oi /MD /FC /EHsc /nologo /FAs) , for Debug : (/GS /W3 /Zc:wchar_t /ZI /Gm- /Od /sdl /Zc:inline /fp:precise /D "_DEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /RTC1 /Gd /MDd /FC /EHsc /nologo /FAs) — Arshia Tahayori, Mar 27 '22 at 13:33
@CodyGray I don't think this is pointless. I can't test it now, but if it is reproducible there is a reason for it being slow. Such a reason is interesting rather than pointless, at least to me. — xiver77, Mar 27 '22 at 16:19
Looks like the usual Sandybridge-family store-forwarding latency issue, @CodyGray. Note the `volatile`. It's a valid asm question, but not an interesting C question. I fixed the title. — Peter Cordes, Mar 27 '22 at 16:31
@ArshiaTahayori: Without weird CPU-architecture effects, I would have expected both versions to be the same speed, bottlenecked on store/reload latency. (On `index` because it's `volatile`, and in debug mode also on `j++`, which does its own store/reload in parallel.) — Peter Cordes, Mar 27 '22 at 19:50

visual studio c++ release mode slower than debug mode for a loop incrementing a volatile

0 Answers0