Why does godbolt generate different asm output than my actual asm code in Visual Studio?

Question

Here's the code generated by godbolt.

Here's the same code as generated by Visual Studio in my main.asm file (enabled via Project->C/C++->Output Files, setting the Assembler Output field to Assembly With Source Code (/FAs)):

; Listing generated by Microsoft (R) Optimizing Compiler Version 19.15.26732.1 

    TITLE   c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    .686P
    .XMM
    include listing.inc
    .model  flat

INCLUDELIB OLDNAMES

EXTRN   __imp____std_terminate:PROC
EXTRN   @__security_check_cookie@4:PROC
EXTRN   __imp____CxxFrameHandler3:PROC
PUBLIC  ??$?RABNABN@?$less@X@std@@QBE_NABN0@Z       ; std::less<void>::operator()<double const &,double const &>
PUBLIC  ??$clamp@NU?$less@X@std@@@std@@YAABNABN00U?$less@X@0@@Z ; std::clamp<double,std::less<void> >
PUBLIC  ??$clamp@N@std@@YAABNABN00@Z            ; std::clamp<double>
PUBLIC  _main
PUBLIC  ?ProcessOptimized@MyPlugin@@QAEXH@Z     ; MyPlugin::ProcessOptimized
PUBLIC  ?Process@MyPlugin@@QAEXH@Z          ; MyPlugin::Process
PUBLIC  ??1MyPlugin@@QAE@XZ             ; MyPlugin::~MyPlugin
PUBLIC  ??0MyPlugin@@QAE@XZ             ; MyPlugin::MyPlugin
PUBLIC  ?ProcessOptimized@Param@@QAEXHH@Z       ; Param::ProcessOptimized
PUBLIC  ?Process@Param@@QAEXHH@Z            ; Param::Process
PUBLIC  ??0Param@@QAE@XZ                ; Param::Param
PUBLIC  __real@3ff0000000000000
PUBLIC  __real@400921fb54442d18
PUBLIC  __real@4024000000000000
PUBLIC  __real@406fe00000000000
PUBLIC  __xmm@00000003000000020000000100000000
PUBLIC  __xmm@400921fb54442d18400921fb54442d18
PUBLIC  __xmm@406fe00000000000406fe00000000000
EXTRN   __chkstk:PROC
EXTRN   ___security_cookie:DWORD
EXTRN   __fltused:DWORD
; Read-only floating-point / vector constants referenced by the code below.
; Each one sits in its own COMDAT CONST segment, and the symbol name spells
; out the constant's bit pattern in hex.
;   COMDAT __xmm@406fe00000000000406fe00000000000
CONST   SEGMENT
; 16 bytes: the 8-byte pattern 406fe00000000000 (255.0, see __real@406fe...)
; twice, listed as little-endian bytes ('o' = 6fH, '@' = 40H).
__xmm@406fe00000000000406fe00000000000 DB 00H, 00H, 00H, 00H, 00H, 0e0H, 'o'
    DB  '@', 00H, 00H, 00H, 00H, 00H, 0e0H, 'o@'
CONST   ENDS
;   COMDAT __xmm@400921fb54442d18400921fb54442d18
CONST   SEGMENT
; 16 bytes: the 8-byte pattern 400921fb54442d18 (3.14159..., see
; __real@400921fb...) twice, little-endian ('-DT' = 2dH 44H 54H, '!' = 21H).
__xmm@400921fb54442d18400921fb54442d18 DB 018H, '-DT', 0fbH, '!', 09H, '@'
    DB  018H, '-DT', 0fbH, '!', 09H, '@'
CONST   ENDS
;   COMDAT __xmm@00000003000000020000000100000000
CONST   SEGMENT
; 16 bytes: four little-endian 32-bit integers 0, 1, 2, 3 (lowest lane first).
__xmm@00000003000000020000000100000000 DB 00H, 00H, 00H, 00H, 01H, 00H, 00H
    DB  00H, 02H, 00H, 00H, 00H, 03H, 00H, 00H, 00H
CONST   ENDS
;   COMDAT __real@406fe00000000000
CONST   SEGMENT
; Scalar double constants; the trailing comment is the decimal value.
__real@406fe00000000000 DQ 0406fe00000000000r   ; 255
CONST   ENDS
;   COMDAT __real@4024000000000000
CONST   SEGMENT
__real@4024000000000000 DQ 04024000000000000r   ; 10
CONST   ENDS
;   COMDAT __real@400921fb54442d18
CONST   SEGMENT
__real@400921fb54442d18 DQ 0400921fb54442d18r   ; 3.14159
CONST   ENDS
;   COMDAT __real@3ff0000000000000
CONST   SEGMENT
__real@3ff0000000000000 DQ 03ff0000000000000r   ; 1
CONST   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ??0Param@@QAE@XZ
; Param::Param() -- default constructor.
; In:   ecx = this
; Out:  eax = this
; Writes five doubles at 16-byte strides from this: 0.0 at +0 and +16,
; 10.0 at +32, 1.0 at +48 and +64. Clobbers xmm0.
_TEXT   SEGMENT
??0Param@@QAE@XZ PROC                   ; Param::Param, COMDAT
; _this$ = ecx

; 23   :    Param() { }

    ; xmm0 = 0.0
    xorps   xmm0, xmm0
    ; eax = this (value left in eax on return)
    mov eax, ecx
    movsd   QWORD PTR [ecx], xmm0
    movsd   QWORD PTR [ecx+16], xmm0
    ; xmm0 = 10.0
    movsd   xmm0, QWORD PTR __real@4024000000000000
    movsd   QWORD PTR [ecx+32], xmm0
    ; xmm0 = 1.0, stored to both +48 and +64
    movsd   xmm0, QWORD PTR __real@3ff0000000000000
    movsd   QWORD PTR [ecx+48], xmm0
    movsd   QWORD PTR [ecx+64], xmm0
    ret 0
??0Param@@QAE@XZ ENDP                   ; Param::Param
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?Process@Param@@QAEXHH@Z
; Param::Process(int voiceIndex, int blockSize) -- scalar version.
; In:   ecx = this, [ebp+8] = voiceIndex. blockSize ([ebp+12]) is marked
;       $dead$ and never read: the loop count is the constant 256.
; Does: accumulates, for 256 samples,
;         clamp(mRadiansPerSample * (bp0 * pB[i] + pC[i]), 0.0, PI)
;       into the double at [this+0] (mPhase in the source annotations).
; Regs: xmm1 = running phase, xmm2 = PI, xmm3 = bp0, xmm4 = [this+64],
;       xmm5 = 0.0, edx = &pC[i] (pB[i] is 2048 bytes below), ecx = counter.
; Stack temporaries $T1/$T2/$T3 hold PI, 0.0 and the unclamped value; the
; clamp result is picked by address and then loaded ($T1 is written but never
; read back here).
; esi/edi are saved and restored; the callee pops its 8 bytes of arguments.
_TEXT   SEGMENT
$T1 = -24                       ; size = 8
$T3 = -16                       ; size = 8
$T2 = -8                        ; size = 8
_voiceIndex$ = 8                    ; size = 4
_blockSize$dead$ = 12                   ; size = 4
?Process@Param@@QAEXHH@Z PROC               ; Param::Process, COMDAT
; _this$ = ecx

; 25   :    inline void Process(int voiceIndex, int blockSize) {

    push    ebp
    mov ebp, esp
    sub esp, 24                 ; 00000018H

; 26   :        double *pB = b[voiceIndex];

    mov eax, DWORD PTR _voiceIndex$[ebp]
    ; xmm5 = 0.0 (lower clamp bound)
    xorps   xmm5, xmm5

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    ; xmm2 = PI (upper clamp bound)
    movsd   xmm2, QWORD PTR __real@400921fb54442d18
    push    esi
    mov esi, ecx
    ; eax = voiceIndex * 2048 (byte offset of one voice's row)
    shl eax, 11                 ; 0000000bH
    push    edi
    movsd   QWORD PTR $T1[ebp], xmm2
    ; ecx = loop trip count
    mov ecx, 256                ; 00000100H
    movsd   QWORD PTR $T2[ebp], xmm5
    movsd   xmm3, QWORD PTR [esi+48]
    lea edx, DWORD PTR [esi+2128]
    ; xmm1 = initial phase, loaded from [this+0]
    movsd   xmm1, QWORD PTR [esi]
    ; edx = this + 2128 + voiceIndex*2048 = &pC[0]
    add edx, eax
    ; xmm3 = [this+48] * [this+32] = bp0
    mulsd   xmm3, QWORD PTR [esi+32]
    movsd   xmm4, QWORD PTR [esi+64]
    ; npad (macro from listing.inc) emits padding bytes before the loop label,
    ; presumably to align the loop head.
    npad    11
$LL4@Process:
    ; xmm0 = ([edx-2048] * bp0 + [edx]) * xmm4, i.e. the unclamped value
    movsd   xmm0, QWORD PTR [edx-2048]
    mulsd   xmm0, xmm3
    addsd   xmm0, QWORD PTR [edx]
    mulsd   xmm0, xmm4
    ; compare value with PI; movsd does not change the flags, so the jbe
    ; below still tests this comparison
    comisd  xmm0, xmm2
    movsd   QWORD PTR $T3[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    ; value <= PI: go check the lower bound; otherwise the result is PI
    jbe SHORT $LN10@Process
    movaps  xmm0, xmm2
    jmp SHORT $LN11@Process
$LN10@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287  :        return (static_cast<_Ty1&&>(_Left)

    ; compare 0.0 with value
    comisd  xmm5, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    ; eax = &0.0, edi = &value; if 0.0 is below-or-equal to value select
    ; &value, else keep &0.0, then load the selected double
    lea eax, DWORD PTR $T2[ebp]
    lea edi, DWORD PTR $T3[ebp]
    cmovbe  eax, edi
    movsd   xmm0, QWORD PTR [eax]
$LN11@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp

; 31   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {

    ; advance to the next double in both arrays
    add edx, 8

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    ; phase += clamped value
    addsd   xmm1, xmm0
    sub ecx, 1
    jne SHORT $LL4@Process

; 35   :        }
; 36   : 
; 37   :        mPhase = phase;
; 38   :    }

    pop edi
    ; write the accumulated phase back to [this+0]
    movsd   QWORD PTR [esi], xmm1
    pop esi
    mov esp, ebp
    pop ebp
    ret 8
?Process@Param@@QAEXHH@Z ENDP               ; Param::Process
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?ProcessOptimized@Param@@QAEXHH@Z
; Param::ProcessOptimized(int voiceIndex, int blockSize) -- SSE2 intrinsics
; version, two doubles per vector.
; In:   ecx = this, [ebx+8] = voiceIndex (arguments are addressed through ebx
;       because ebp points into a realigned frame). blockSize is marked
;       $dead$ and never read: the loop count is the constant 128.
; Does: runs the vector loop from source lines 64-85 and stores the high
;       lane of v_phase to [this+16] (mPhaseOptimized in the annotations).
; Regs inside the loop:
;       xmm0 = v_phase           xmm5 = v_radiansPerSampleBp0 (broadcast)
;       xmm1 = v_pC0 / v_phaseAcc1   xmm6 = v_radiansPerSample (broadcast)
;       xmm2 = v_pC1 / v_phaseAcc2   xmm7 = 0.0 (v_boundLower)
;       xmm3 = v_pB1             eax  = &pC[i+1]; pB is 2048 bytes below pC
;       xmm4 = v_pB0             ecx  = remaining iterations
; All 16-byte array loads are emitted as movups, including those written as
; _mm_load_pd in the source.
_TEXT   SEGMENT
_v_phase$ = -16                     ; size = 16
_voiceIndex$ = 8                    ; size = 4
_blockSize$dead$ = 12                   ; size = 4
?ProcessOptimized@Param@@QAEXHH@Z PROC          ; Param::ProcessOptimized, COMDAT
; _this$ = ecx

; 39   :    inline void ProcessOptimized(int voiceIndex, int blockSize) {

    ; Stack-realigning prologue: ebx keeps the incoming stack pointer (used
    ; to reach the arguments), esp is rounded down to a 16-byte boundary, and
    ; a copy of the return address ([ebx+4]) is placed above the saved ebp.
    ; After "mov ebp, esp" ebp is 16-byte aligned, so _v_phase$[ebp] can be
    ; written with movaps.
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -16                ; fffffff0H
    add esp, 4
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov DWORD PTR [esp+4], ebp
    mov ebp, esp

; 40   :        double *pB = b[voiceIndex];

    mov eax, DWORD PTR _voiceIndex$[ebx]
    ; edx = this (ecx is reused as the loop counter)
    mov edx, ecx
    ; eax = voiceIndex * 2048 (byte offset of one voice's row)
    shl eax, 11                 ; 0000000bH
    xorps   xmm3, xmm3
    xorps   xmm2, xmm2
    ; reserve the 16-byte _v_phase$ slot at [ebp-16]
    sub esp, 16                 ; 00000010H
    xorps   xmm7, xmm7
    mov ecx, 128                ; 00000080H

; 41   :        double *pC = c[voiceIndex];
; 42   :        double phase = mPhaseOptimized;
; 43   :        double bp0 = mNoteFrequency * mHostPitch;

    ; xmm5 = [this+48] * [this+32] = bp0
    movsd   xmm5, QWORD PTR [edx+48]
    mulsd   xmm5, QWORD PTR [edx+32]

; 44   : 
; 45   :        __m128d v_boundLower = _mm_set1_pd(0.0);
; 46   :        __m128d v_boundUpper = _mm_set1_pd(PI);
; 47   :        __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);

    movsd   xmm6, QWORD PTR [edx+64]

; 48   :        __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49   : 
; 50   :        __m128d v_pB0 = _mm_load_pd(pB);
; 51   :        v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 52   :        __m128d v_pC0 = _mm_load_pd(pC);
; 53   :        v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 54   : 
; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    ; pB starts at this + 80 + voiceIndex*2048, pC at this + 2128 + voiceIndex*2048
    movsd   xmm0, QWORD PTR [eax+edx+80]
    movups  xmm4, XMMWORD PTR [eax+edx+80]
    movups  xmm1, XMMWORD PTR [eax+edx+2128]
    ; xmm5 = mRadiansPerSample * bp0 (still scalar)
    mulsd   xmm5, xmm6
    ; xmm3 = (0.0, pB[0])
    unpcklpd xmm3, xmm0

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    movsd   xmm0, QWORD PTR [eax+edx+2128]
    add eax, 2136               ; 00000858H
    ; xmm2 = (0.0, pC[0])
    unpcklpd xmm2, xmm0
    ; eax = this + voiceIndex*2048 + 2136 = &pC[1]
    add eax, edx

; 58   :        v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
; 59   : 
; 60   :        __m128d v_phaseAcc1;
; 61   :        __m128d v_phaseAcc2;
; 62   :        __m128d v_phase = _mm_set1_pd(phase);

    movsd   xmm0, QWORD PTR [edx+16]
    ; broadcast the two scale factors to both lanes
    unpcklpd xmm5, xmm5
    unpcklpd xmm6, xmm6
    ; pre-scale the four initial vectors
    mulpd   xmm4, xmm5
    mulpd   xmm1, xmm6
    mulpd   xmm3, xmm5
    mulpd   xmm2, xmm6
    ; xmm0 = v_phase = (phase, phase)
    unpcklpd xmm0, xmm0
    npad    2
$LL4@ProcessOpt:

; 63   : 
; 64   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65   :            // some other code (that will use phase, like sin(phase))
; 66   : 
; 67   :            v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);

    ; The loads for the next iteration are interleaved with the arithmetic
    ; of the current one.
    addpd   xmm1, xmm4

; 68   :            v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69   :            v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70   :            v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71   :            v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72   :            v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75   : 
; 76   :            v_pB0 = _mm_load_pd(pB + 2);

    ; [eax-2040] = pB + 2
    movups  xmm4, XMMWORD PTR [eax-2040]
    addpd   xmm2, xmm3

; 77   :            v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78   :            v_pC0 = _mm_load_pd(pC + 2);
; 79   :            v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80   : 
; 81   :            v_pB1 = _mm_loadu_pd(pB + 1);

    ; [eax-2048] = pB + 1
    movups  xmm3, XMMWORD PTR [eax-2048]
    ; clamp both accumulators to [0.0, PI]
    maxpd   xmm1, xmm7
    maxpd   xmm2, xmm7
    minpd   xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    minpd   xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    addpd   xmm0, xmm1
    ; [eax+8] = pC + 2
    movups  xmm1, XMMWORD PTR [eax+8]
    addpd   xmm0, xmm2

; 82   :            v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83   :            v_pC1 = _mm_loadu_pd(pC + 1);

    ; [eax] = pC + 1
    movups  xmm2, XMMWORD PTR [eax]
    ; advance both arrays by two doubles
    add eax, 16                 ; 00000010H
    ; v_phase is spilled on every iteration; the slot is read back after the loop
    movaps  XMMWORD PTR _v_phase$[ebp], xmm0
    mulpd   xmm4, xmm5
    mulpd   xmm1, xmm6
    mulpd   xmm3, xmm5

; 84   :            v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

    mulpd   xmm2, xmm6
    sub ecx, 1
    jne SHORT $LL4@ProcessOpt

; 85   :        }
; 86   : 
; 87   :        mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];

    ; The lane index is fixed here: the high lane (+8) is always read.
    movsd   xmm0, QWORD PTR _v_phase$[ebp+8]
    movsd   QWORD PTR [edx+16], xmm0

; 88   :    }

    ; undo the realigned frame, then restore the caller's esp/ebx
    mov esp, ebp
    pop ebp
    mov esp, ebx
    pop ebx
    ret 8
?ProcessOptimized@Param@@QAEXHH@Z ENDP          ; Param::ProcessOptimized
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ??0MyPlugin@@QAE@XZ
; MyPlugin::MyPlugin() -- constructor.
; In:   ecx = this
; Out:  eax = this
; Does: (1) stores the five member defaults at this+0/+16/+32/+48/+64,
;       (2) fills 256 doubles starting at this+80 with sampleIndex / 255.0,
;           four elements per loop iteration,
;       (3) zero-fills 2048 bytes starting at this+2128 with rep stosd.
; The source's outer voiceIndex loops do not appear as loops here: each fill
; is emitted once.
; esi/edi are saved and restored. Clobbers ecx, edx, xmm0-xmm3.
_TEXT   SEGMENT
??0MyPlugin@@QAE@XZ PROC                ; MyPlugin::MyPlugin, COMDAT
; _this$ = ecx

; 97   :        // fill b
; 98   :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    ; xmm2 = integer lanes (0, 1, 2, 3); xmm3 = (255.0, 255.0)
    movaps  xmm2, XMMWORD PTR __xmm@00000003000000020000000100000000
    xorps   xmm0, xmm0
    movaps  xmm3, XMMWORD PTR __xmm@406fe00000000000406fe00000000000
    ; edx = sampleIndex = 0
    xor edx, edx
    push    esi
    mov esi, ecx
    push    edi

; 14   :    alignas(16) double mPhase = 0.0;

    movsd   QWORD PTR [esi], xmm0

; 97   :        // fill b
; 98   :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    ; ecx = output cursor; it is bumped by 32 before the stores below, which
    ; use offsets -40..-16, so the first store lands at this+80
    lea ecx, DWORD PTR [esi+88]

; 15   :    alignas(16) double mPhaseOptimized = 0.0;

    movsd   QWORD PTR [esi+16], xmm0

; 16   :    alignas(16) double mNoteFrequency = 10.0;

    movsd   xmm0, QWORD PTR __real@4024000000000000
    movsd   QWORD PTR [esi+32], xmm0

; 17   :    alignas(16) double mHostPitch = 1.0;

    movsd   xmm0, QWORD PTR __real@3ff0000000000000
    movsd   QWORD PTR [esi+48], xmm0

; 18   :    alignas(16) double mRadiansPerSample = 1.0;

    movsd   QWORD PTR [esi+64], xmm0
$LL7@MyPlugin:

; 100  :                double value = (sampleIndex / ((double)bufferSize - 1));

    ; First pair: broadcast sampleIndex, add the low two lanes (0, 1),
    ; convert the low two integers to double and divide by 255.0.
    movd    xmm0, edx
    ; eax = sampleIndex + 2, base index of the second pair
    lea eax, DWORD PTR [edx+2]
    pshufd  xmm1, xmm0, 0
    lea ecx, DWORD PTR [ecx+32]
    movq    xmm0, xmm2
    add edx, 4
    paddd   xmm1, xmm0
    cvtdq2pd xmm0, xmm1
    divpd   xmm0, xmm3

; 101  : 
; 102  :                mParam1.b[voiceIndex][sampleIndex] = value;

    movlpd  QWORD PTR [ecx-40], xmm0
    movhpd  QWORD PTR [ecx-32], xmm0
    ; Second pair: same computation starting from eax (sampleIndex + 2).
    movd    xmm0, eax
    pshufd  xmm1, xmm0, 0
    movq    xmm0, xmm2
    paddd   xmm1, xmm0
    cvtdq2pd xmm0, xmm1
    divpd   xmm0, xmm3
    movlpd  QWORD PTR [ecx-24], xmm0
    movhpd  QWORD PTR [ecx-16], xmm0
    cmp edx, 256                ; 00000100H
    jl  SHORT $LL7@MyPlugin

; 103  :            }
; 104  :        }
; 105  : 
; 106  :        // fill c
; 107  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    ; The zero-fill loop is emitted as a block store: 512 dwords (2048 bytes)
    ; of zero starting at this+2128.
    lea edi, DWORD PTR [esi+2128]
    xor eax, eax
    mov ecx, 512                ; 00000200H
    rep stosd

; 109  :                double value = 0.0;
; 110  : 
; 111  :                mParam1.c[voiceIndex][sampleIndex] = value;
; 112  :            }
; 113  :        }
; 114  :    }

    pop edi
    ; eax = this (value left in eax on return)
    mov eax, esi
    pop esi
    ret 0
??0MyPlugin@@QAE@XZ ENDP                ; MyPlugin::MyPlugin
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ??1MyPlugin@@QAE@XZ
; MyPlugin::~MyPlugin() -- empty destructor: the body is a bare return and
; the this pointer in ecx is unused (marked $dead$).
_TEXT   SEGMENT
??1MyPlugin@@QAE@XZ PROC                ; MyPlugin::~MyPlugin, COMDAT
; _this$dead$ = ecx

; 115  :    ~MyPlugin() { }

    ret 0
??1MyPlugin@@QAE@XZ ENDP                ; MyPlugin::~MyPlugin
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?Process@MyPlugin@@QAEXH@Z
; MyPlugin::Process(int blockSize).
; In:   ecx = this. blockSize ([ebp+8]) is marked $dead$ and never read.
; Does: contains no call; the source annotations (main.cpp lines 26-37) show
;       the body of Param::Process emitted inline. A single pass of 256
;       samples is generated, with the pC pointer starting at this+2128 and
;       pB 2048 bytes below it.
; Regs: xmm1 = running phase, xmm2 = PI, xmm3 = bp0, xmm4 = [this+64],
;       xmm5 = 0.0, ecx = &pC[i], edx = remaining iterations.
; Stack temporaries $T2/$T3/$T4 hold PI, 0.0 and the unclamped value
; ($T2 is written but never read back here).
; esi/edi are saved and restored; the callee pops its 4 bytes of arguments.
_TEXT   SEGMENT
$T2 = -28                       ; size = 8
$T4 = -20                       ; size = 8
$T3 = -12                       ; size = 8
_blockSize$dead$ = 8                    ; size = 4
?Process@MyPlugin@@QAEXH@Z PROC             ; MyPlugin::Process, COMDAT
; _this$ = ecx

; 117  :    void Process(int blockSize) {

    push    ebp
    mov ebp, esp
    sub esp, 28                 ; 0000001cH

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    ; xmm2 = PI (upper bound), xmm5 = 0.0 (lower bound)
    movsd   xmm2, QWORD PTR __real@400921fb54442d18
    xorps   xmm5, xmm5

; 117  :    void Process(int blockSize) {

    push    esi
    mov esi, ecx

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T2[ebp], xmm2

; 117  :    void Process(int blockSize) {

    push    edi

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T3[ebp], xmm5
    ; edx = loop trip count
    mov edx, 256                ; 00000100H
    movsd   xmm3, QWORD PTR [esi+48]

; 27   :        double *pC = c[voiceIndex];

    lea ecx, DWORD PTR [esi+2128]

; 28   :        double phase = mPhase;
; 29   :        double bp0 = mNoteFrequency * mHostPitch;

    movsd   xmm1, QWORD PTR [esi]
    ; xmm3 = [this+48] * [this+32] = bp0
    mulsd   xmm3, QWORD PTR [esi+32]
    movsd   xmm4, QWORD PTR [esi+64]
    npad    3
$LL9@Process:

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    ; xmm0 = ([ecx-2048] * bp0 + [ecx]) * xmm4, i.e. the unclamped value
    movsd   xmm0, QWORD PTR [ecx-2048]
    mulsd   xmm0, xmm3
    addsd   xmm0, QWORD PTR [ecx]
    mulsd   xmm0, xmm4
    ; compare value with PI; the movsd store leaves the flags intact for jbe
    comisd  xmm0, xmm2
    movsd   QWORD PTR $T4[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    ; value <= PI: go check the lower bound; otherwise the result is PI
    jbe SHORT $LN15@Process
    movaps  xmm0, xmm2
    jmp SHORT $LN16@Process
$LN15@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287  :        return (static_cast<_Ty1&&>(_Left)

    comisd  xmm5, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    ; eax = &0.0, edi = &value; if 0.0 is below-or-equal to value select
    ; &value, else keep &0.0, then load the selected double
    lea eax, DWORD PTR $T3[ebp]
    lea edi, DWORD PTR $T4[ebp]
    cmovbe  eax, edi
    movsd   xmm0, QWORD PTR [eax]
$LN16@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp

; 31   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {

    add ecx, 8

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    ; phase += clamped value
    addsd   xmm1, xmm0
    sub edx, 1
    jne SHORT $LL9@Process

; 118  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119  :            mParam1.Process(voiceIndex, blockSize);
; 120  :        }
; 121  :    }

    pop edi

; 37   :        mPhase = phase;

    movsd   QWORD PTR [esi], xmm1

; 118  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119  :            mParam1.Process(voiceIndex, blockSize);
; 120  :        }
; 121  :    }

    pop esi
    mov esp, ebp
    pop ebp
    ret 4
?Process@MyPlugin@@QAEXH@Z ENDP             ; MyPlugin::Process
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?ProcessOptimized@MyPlugin@@QAEXH@Z
; MyPlugin::ProcessOptimized(int blockSize).
; In:   ecx = this. blockSize ([ebx+8]) is marked $dead$ and never read.
; Does: contains no call; the source annotations (main.cpp lines 40-87) show
;       the body of Param::ProcessOptimized emitted inline. A single pass of
;       128 vector iterations is generated, with pB starting at this+80 and
;       pC at this+2128; the high lane of v_phase is stored to [this+16].
; Regs inside the loop:
;       xmm5 = v_phase           xmm6 = v_radiansPerSampleBp0 (broadcast)
;       xmm1 = v_pC0 / v_phaseAcc1   xmm7 = v_radiansPerSample (broadcast)
;       xmm2 = v_pC1 / v_phaseAcc2   xmm0 = 0.0 (v_boundLower)
;       xmm3 = v_pB1             eax  = &pC[i+1]; pB is 2048 bytes below pC
;       xmm4 = v_pB0             ecx  = remaining iterations
_TEXT   SEGMENT
_v_phase$31 = -16                   ; size = 16
_blockSize$dead$ = 8                    ; size = 4
?ProcessOptimized@MyPlugin@@QAEXH@Z PROC        ; MyPlugin::ProcessOptimized, COMDAT
; _this$ = ecx

; 122  :    void ProcessOptimized(int blockSize) {

    ; Stack-realigning prologue: ebx keeps the incoming stack pointer, esp is
    ; rounded down to a 16-byte boundary, and a copy of the return address
    ; ([ebx+4]) is placed above the saved ebp. After "mov ebp, esp" ebp is
    ; 16-byte aligned, so _v_phase$31[ebp] can be written with movaps.
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -16                ; fffffff0H
    add esp, 4
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov DWORD PTR [esp+4], ebp
    mov ebp, esp
    ; edx = this (ecx is reused as the loop counter)
    mov edx, ecx
    xorps   xmm3, xmm3
    xorps   xmm2, xmm2
    ; reserve the 16-byte _v_phase$31 slot at [ebp-16]
    sub esp, 16                 ; 00000010H

; 40   :        double *pB = b[voiceIndex];

    mov ecx, 128                ; 00000080H
    movsd   xmm6, QWORD PTR [edx+48]
    ; eax = this + 2136 = &pC[1]
    lea eax, DWORD PTR [edx+2136]
    ; xmm6 = [this+48] * [this+32] = bp0
    mulsd   xmm6, QWORD PTR [edx+32]

; 41   :        double *pC = c[voiceIndex];
; 42   :        double phase = mPhaseOptimized;
; 43   :        double bp0 = mNoteFrequency * mHostPitch;
; 44   : 
; 45   :        __m128d v_boundLower = _mm_set1_pd(0.0);
; 46   :        __m128d v_boundUpper = _mm_set1_pd(PI);
; 47   :        __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);

    movsd   xmm7, QWORD PTR [edx+64]

; 54   : 
; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    movsd   xmm0, QWORD PTR [edx+80]
    ; xmm5 = phase, loaded from [this+16]
    movsd   xmm5, QWORD PTR [edx+16]
    movups  xmm4, XMMWORD PTR [edx+80]
    movups  xmm1, XMMWORD PTR [edx+2128]
    ; xmm6 = mRadiansPerSample * bp0 (still scalar)
    mulsd   xmm6, xmm7
    ; xmm3 = (0.0, pB[0])
    unpcklpd xmm3, xmm0

; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    movsd   xmm0, QWORD PTR [edx+2128]
    ; broadcast the two scale factors to both lanes
    unpcklpd xmm7, xmm7
    unpcklpd xmm6, xmm6
    ; xmm2 = (0.0, pC[0])
    unpcklpd xmm2, xmm0
    ; xmm0 = 0.0 in both lanes, used as the lower bound in the loop
    xorps   xmm0, xmm0

; 48   :        __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49   : 
; 50   :        __m128d v_pB0 = _mm_load_pd(pB);
; 51   :        v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);

    mulpd   xmm4, xmm6

; 52   :        __m128d v_pC0 = _mm_load_pd(pC);
; 53   :        v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);

    mulpd   xmm1, xmm7

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);

    mulpd   xmm3, xmm6

; 58   :        v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

    mulpd   xmm2, xmm7

; 59   : 
; 60   :        __m128d v_phaseAcc1;
; 61   :        __m128d v_phaseAcc2;
; 62   :        __m128d v_phase = _mm_set1_pd(phase);

    unpcklpd xmm5, xmm5
    npad    13
$LL9@ProcessOpt:

; 63   : 
; 64   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65   :            // some other code (that will use phase, like sin(phase))
; 66   : 
; 67   :            v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);

    ; The loads for the next iteration are interleaved with the arithmetic
    ; of the current one.
    addpd   xmm1, xmm4

; 68   :            v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69   :            v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70   :            v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71   :            v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72   :            v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75   : 
; 76   :            v_pB0 = _mm_load_pd(pB + 2);

    ; [eax-2040] = pB + 2
    movups  xmm4, XMMWORD PTR [eax-2040]
    addpd   xmm2, xmm3

; 77   :            v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78   :            v_pC0 = _mm_load_pd(pC + 2);
; 79   :            v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80   : 
; 81   :            v_pB1 = _mm_loadu_pd(pB + 1);

    ; [eax-2048] = pB + 1
    movups  xmm3, XMMWORD PTR [eax-2048]
    ; clamp both accumulators to [0.0, PI]
    maxpd   xmm1, xmm0
    maxpd   xmm2, xmm0
    minpd   xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    minpd   xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    addpd   xmm5, xmm1
    ; [eax+8] = pC + 2
    movups  xmm1, XMMWORD PTR [eax+8]
    addpd   xmm5, xmm2

; 82   :            v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83   :            v_pC1 = _mm_loadu_pd(pC + 1);

    ; [eax] = pC + 1
    movups  xmm2, XMMWORD PTR [eax]
    ; advance both arrays by two doubles
    add eax, 16                 ; 00000010H
    ; v_phase is spilled on every iteration; the slot is read back after the loop
    movaps  XMMWORD PTR _v_phase$31[ebp], xmm5
    mulpd   xmm4, xmm6
    mulpd   xmm1, xmm7
    mulpd   xmm3, xmm6

; 84   :            v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

    mulpd   xmm2, xmm7
    sub ecx, 1
    jne SHORT $LL9@ProcessOpt

; 85   :        }
; 86   : 
; 87   :        mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];

    ; The lane index is fixed here: the high lane (+8) is always read.
    movsd   xmm0, QWORD PTR _v_phase$31[ebp+8]
    movsd   QWORD PTR [edx+16], xmm0

; 123  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 124  :            mParam1.ProcessOptimized(voiceIndex, blockSize);
; 125  :        }
; 126  :    }

    ; undo the realigned frame, then restore the caller's esp/ebx
    mov esp, ebp
    pop ebp
    mov esp, ebx
    pop ebx
    ret 4
?ProcessOptimized@MyPlugin@@QAEXH@Z ENDP        ; MyPlugin::ProcessOptimized
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT _main
_TEXT   SEGMENT
_counterProcessing$1$ = -4304               ; size = 4
_counterProcessing$ = -4304             ; size = 8
_bp0$1$ = -4296                     ; size = 8
_v_radiansPerSample$1$ = -4288              ; size = 16
$T3 = -4264                     ; size = 8
_v_phase$38 = -4256                 ; size = 16
$T4 = -4256                     ; size = 8
$T2 = -4232                     ; size = 8
tv1040 = -4224                      ; size = 16
tv1039 = -4208                      ; size = 16
_myPlugin$ = -4192                  ; size = 4176
__$ArrayPad$ = -4                   ; size = 4
_main   PROC                        ; COMDAT

; 129  : int main() {

    push    ebp
    mov ebp, esp
    and esp, -16                ; fffffff0H
    mov eax, 4312               ; 000010d8H
    call    __chkstk
    mov eax, DWORD PTR ___security_cookie
    xor eax, esp
    mov DWORD PTR __$ArrayPad$[esp+4312], eax

; 16   :    alignas(16) double mNoteFrequency = 10.0;

    movsd   xmm0, QWORD PTR __real@4024000000000000

; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    lea ecx, DWORD PTR _myPlugin$[esp+4392]
    movsd   xmm1, QWORD PTR __real@406fe00000000000
    xorps   xmm2, xmm2

; 16   :    alignas(16) double mNoteFrequency = 10.0;

    movsd   QWORD PTR _myPlugin$[esp+4344], xmm0

; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    xor eax, eax

; 17   :    alignas(16) double mHostPitch = 1.0;

    movsd   xmm0, QWORD PTR __real@3ff0000000000000

; 129  : int main() {

    push    esi
    push    edi

; 14   :    alignas(16) double mPhase = 0.0;

    movsd   QWORD PTR _myPlugin$[esp+4320], xmm2

; 15   :    alignas(16) double mPhaseOptimized = 0.0;

    movsd   QWORD PTR _myPlugin$[esp+4336], xmm2

; 17   :    alignas(16) double mHostPitch = 1.0;

    movsd   QWORD PTR _myPlugin$[esp+4368], xmm0

; 18   :    alignas(16) double mRadiansPerSample = 1.0;

    movsd   QWORD PTR _myPlugin$[esp+4384], xmm0
$LL11@main:
    movd    xmm0, eax

; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    lea ecx, DWORD PTR [ecx+8]

; 100  :                double value = (sampleIndex / ((double)bufferSize - 1));

    cvtdq2pd xmm0, xmm0
    inc eax
    divsd   xmm0, xmm1

; 101  : 
; 102  :                mParam1.b[voiceIndex][sampleIndex] = value;

    movsd   QWORD PTR [ecx-8], xmm0
    cmp eax, 256                ; 00000100H
    jl  SHORT $LL11@main

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm6, QWORD PTR __real@400921fb54442d18

; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    lea edi, DWORD PTR _myPlugin$[esp+6448]
    mov ecx, 512                ; 00000200H

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T2[esp+4320], xmm6

; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    xor eax, eax

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T3[esp+4320], xmm2

; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    rep stosd
    movsd   xmm3, QWORD PTR _myPlugin$[esp+4352]
    xorps   xmm0, xmm0
    mulsd   xmm3, QWORD PTR _myPlugin$[esp+4368]

; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    movaps  xmm4, xmm2
    movsd   xmm1, QWORD PTR _myPlugin$[esp+4384]

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    movsd   xmm5, QWORD PTR _myPlugin$[esp+4336]

; 130  :    MyPlugin myPlugin;
; 131  : 
; 132  :    long long numProcessing = 5;
; 133  :    long long counterProcessing = 0;

    movlpd  QWORD PTR _counterProcessing$[esp+4320], xmm0

; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    movsd   xmm0, QWORD PTR _myPlugin$[esp+4400]
    movaps  xmm7, xmm3
    mulsd   xmm7, QWORD PTR _myPlugin$[esp+4384]

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    mov edi, DWORD PTR _counterProcessing$[esp+4324]
    mov esi, DWORD PTR _counterProcessing$[esp+4320]
    unpcklpd xmm4, xmm0
    movsd   xmm0, QWORD PTR _myPlugin$[esp+6448]
    movups  XMMWORD PTR tv1040[esp+4320], xmm4
    movaps  xmm4, xmm2
    unpcklpd xmm1, xmm1
    unpcklpd xmm4, xmm0
    movups  XMMWORD PTR tv1039[esp+4320], xmm4
    movsd   xmm4, QWORD PTR _myPlugin$[esp+4320]
    movsd   QWORD PTR _bp0$1$[esp+4320], xmm3
    unpcklpd xmm7, xmm7
    movaps  XMMWORD PTR _v_radiansPerSample$1$[esp+4320], xmm1
    npad    8
$LL2@main:

; 134  : 
; 135  :    // I'll only process once block, just for analysis
; 136  :    while (counterProcessing++ < numProcessing) {

    add esi, 1

; 26   :        double *pB = b[voiceIndex];

    lea ecx, DWORD PTR _myPlugin$[esp+6448]

; 134  : 
; 135  :    // I'll only process once block, just for analysis
; 136  :    while (counterProcessing++ < numProcessing) {

    mov DWORD PTR _counterProcessing$1$[esp+4320], esi

; 26   :        double *pB = b[voiceIndex];

    mov edx, 256                ; 00000100H

; 134  : 
; 135  :    // I'll only process once block, just for analysis
; 136  :    while (counterProcessing++ < numProcessing) {

    adc edi, 0
    npad    10
$LL29@main:

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm0, QWORD PTR [ecx-2048]
    mulsd   xmm0, xmm3
    addsd   xmm0, QWORD PTR [ecx]
    mulsd   xmm0, QWORD PTR _myPlugin$[esp+4384]
    comisd  xmm0, xmm6
    movsd   QWORD PTR $T4[esp+4320], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    jbe SHORT $LN35@main
    movaps  xmm0, xmm6
    jmp SHORT $LN36@main
$LN35@main:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287  :        return (static_cast<_Ty1&&>(_Left)

    comisd  xmm2, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    lea eax, DWORD PTR $T3[esp+4320]
    lea esi, DWORD PTR $T4[esp+4320]
    cmovbe  eax, esi
    movsd   xmm0, QWORD PTR [eax]

// ...

(Note: I've removed some lines because StackOverflow limits the post length.)

It's pretty different. Also, the code generated by VS looks a bit redundant: for example, search for the string phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI); and you will find it appears many times.

Which settings am I missing? I've matched the same MSVC version (19.15) on an x86 build, and I've also specified the same optimization options I actually use.

The compilation command in godbolt is not even close to a typical VS project command; most of the essential options (such as `/EHsc`, `/Zc`, `/MD`) are missing. Also, the compiler version on godbolt is a bit different. — user7860670, Jan 04 '19 at 08:27
godbolt.org filters out unused labels and assembler directives by default, but you can disable these filters in the compiler panel. — cpplearner, Jan 04 '19 at 09:10
Also, AFAIK, Microsoft does not guarantee that the VC-compiler is consistent between compilations. It may generate different (but functionally equivalent) code on subsequent invocations. — Johan, Jan 04 '19 at 10:14
@Johan: really? Why? :D Where can I get more details about this? — markzzz, Jan 05 '19 at 12:15

score 2 · Answer 1 · answered Jul 21 '19 at 06:18

It doesn't seem that you're using the same compiler flags. The assembly dump from Visual Studio shows that each function was optimized with the flags /Ogtp, which are used internally when you specify /Og in the command line. On the other hand, in the godbolt version, you used /Ot /O2, which internally correspond to /Ogtpy. If I manually add the /Oy flag, the code becomes slightly different, but still not the same as the one generated by Visual Studio.

I realize that the compiler versions are not exactly the same, but the difference between 19.15.26726.0 and 19.15.26732.1 is very minor and probably only includes bug fixes. I think there are other flags that are different. You can go to the Property Pages of your project and find all the compiler options that have been used in the "All Options" and "Additional Options" panes. In the Release build, many options are used other than /arch:SSE2 /Ot /O2. Note that /arch:SSE2 is the default, so you don't have to explicitly specify it. Also, /O2 implies /Ot. So /arch:SSE2 /Ot /O2 is equivalent to /O2.

Would using the -deterministic option influence any of the assembly or just the binary output? I found this article: https://learn.microsoft.com/en-us/dotnet/csharp/language-reference/compiler-options/deterministic-compiler-option — Robert Houghton, Jul 23 '19 at 15:38
@InstructionPointer This is a C# compiler option, not MSVC. Anyway, I don't think that option influences the generated CIL code (the C# compiler generates CIL code, not native code). — Hadi Brais, Jul 23 '19 at 15:52

Robert Houghton · Answer 2 · 2019-07-20T23:03:55.787

0

There are multiple paths to a destination.

Roger Orr gave a good talk at an ACCU conference on the quirks of compiling. For example, a simple "hello world" will generate 98 lines of asm in GCC but 6,704 in MSVC.

To quickly and simply answer your question: in your Godbolt link the version is 19.15.26726.0, while in your main.asm file it is 19.15.26732.1.

Close, but maybe enough to make this difference?

19.15.26726.0

19.15.26732.1

MSVC is particularly weird: you can output asm with GCC, then pass that asm through GCC again and get the same machine code. You can't with MSVC. So maybe even if the versions were exactly the same, you would still get different asm. That'd be a fun experiment to run; this article shows you how to run two different versions of MSVC side by side in Visual Studio.

edited Jul 20 '19 at 23:03

answered Jul 20 '19 at 20:56

Robert Houghton

1,202
16
28

*that the disassembler might take a second to resynchronize between instructions and data.* That doesn't make sense. First of all, normal Godbolt output is the compiler's asm output, not disassembly of a binary. 2nd, compiler-generated object files have metadata that separates code from data, and symbol names for the tops of functions. Godbolt fires off batch jobs that compile and then disassemble to tmp files, not one shared filename, so you can't get into a situation where the disassembler is reading a file while it's being modified by a compiler. You're always getting some snapshot. – Peter Cordes Jul 20 '19 at 21:22
thank you @PeterCordes as always for your guidance. I got carried away so just deleted that whole last part. I need to learn more about this metadata in the object files. – Robert Houghton Jul 20 '19 at 23:06
1

MSVC is weird in that its asm output includes (stand-alone definitions for) other related functions. And yeah, I think I've read it doesn't always exactly match the machine code you'd get if you don't ask for asm. Anyway, I think you can't reliably just assemble its asm output to get a usable `.obj`. You can with GCC; in fact GCC *always* compiles to asm and then assembles that separately; it doesn't have a built-in assembler. (Running `gcc -c foo.s` makes the gcc front-end run the `.s` through `as`.) – Peter Cordes Jul 20 '19 at 23:38
2

While MSVC produces asm output files, they aren't [assemble-able](https://stackoverflow.com/questions/21182665/compile-64-bit-hello-world-assembly-output-from-msvc2012): *we made the call to no longer support assembling C/C++ generated listing files. In other words, the listing files are for informational purposes only.* – David Wohlferd Jul 21 '19 at 00:18

Why godbolt generate different asm output than my actual asm code in Visual Studio?

2 Answers

Linked