This code was generated by .NET Core 3.0 JIT, for my manually vectorized C# code:
00007FFE6C7D2103 vmovdqu xmm5,xmmword ptr [rcx]
00007FFE6C7D2107 vmovdqu xmm6,xmmword ptr [rcx+10h]
00007FFE6C7D210C vmovdqu xmm7,xmmword ptr [rcx+20h]
00007FFE6C7D2111 vmovdqu xmm8,xmmword ptr [rcx+30h]
00007FFE6C7D2116 vpand xmm9,xmm5,xmm0
00007FFE6C7D211A vpand xmm10,xmm6,xmm0
00007FFE6C7D211E vpackusdw xmm9,xmm9,xmm10
00007FFE6C7D2123 vpslldq xmm9,xmm9,1
00007FFE6C7D2129 vpand xmm10,xmm5,xmm1
00007FFE6C7D212D vpand xmm11,xmm6,xmm1
00007FFE6C7D2131 vpackusdw xmm10,xmm10,xmm11
00007FFE6C7D2136 vpsrldq xmm5,xmm5,1
00007FFE6C7D213B vpsrldq xmm6,xmm6,1
00007FFE6C7D2140 vpand xmm5,xmm5,xmm1
00007FFE6C7D2144 vpand xmm6,xmm6,xmm1
00007FFE6C7D2148 vpackusdw xmm5,xmm5,xmm6
var low = brightness( r, g, b, redMul, greenMul, blueMul );
00007FFE6C7D214D vpmulhuw xmm9,xmm9,xmm2
00007FFE6C7D2151 vpmulhuw xmm10,xmm10,xmm3
00007FFE6C7D2155 vpmulhuw xmm5,xmm5,xmm4
00007FFE6C7D2159 vpaddusw xmm6,xmm9,xmm10
00007FFE6C7D215E vpaddusw xmm5,xmm6,xmm5
00007FFE6C7D2162 vpsrlw xmm5,xmm5,8
00007FFE6C7D2167 vpand xmm6,xmm7,xmm0
00007FFE6C7D216B vpand xmm9,xmm8,xmm0
00007FFE6C7D216F vpackusdw xmm6,xmm6,xmm9
00007FFE6C7D2174 vpslldq xmm9,xmm6,1
00007FFE6C7D2179 vpand xmm6,xmm7,xmm1
00007FFE6C7D217D vpand xmm10,xmm8,xmm1
00007FFE6C7D2181 vpackusdw xmm10,xmm6,xmm10
00007FFE6C7D2186 vpsrldq xmm6,xmm7,1
00007FFE6C7D218B vpsrldq xmm7,xmm8,1
00007FFE6C7D2191 vpand xmm6,xmm6,xmm1
00007FFE6C7D2195 vpand xmm7,xmm7,xmm1
00007FFE6C7D2199 vpackusdw xmm6,xmm6,xmm7
var hi = brightness( r, g, b, redMul, greenMul, blueMul );
00007FFE6C7D219E vpmulhuw xmm7,xmm9,xmm2
00007FFE6C7D21A2 vpmulhuw xmm8,xmm10,xmm3
00007FFE6C7D21A6 vpmulhuw xmm6,xmm6,xmm4
00007FFE6C7D21AA vpaddusw xmm7,xmm7,xmm8
00007FFE6C7D21AF vpaddusw xmm6,xmm7,xmm6
00007FFE6C7D21B3 vpsrlw xmm6,xmm6,8
00007FFE6C7D21B8 vpackuswb xmm5,xmm5,xmm6
Sse2.Store( dst, bytes );
00007FFE6C7D21BC vmovdqu xmmword ptr [rdx],xmm5
src += 64;
00007FFE6C7D21C0 add rcx,40h
dst += 16;
00007FFE6C7D21C4 add rdx,10h
while( src < srcEnd )
00007FFE6C7D21C8 cmp rcx,rax
00007FFE6C7D21CB jb 00007FFE6C7D2103
This code was generated by VC++ 2015, when compiling my manually vectorized C++.
{
VecInteger r, g, b;
loadRgb( src, r, g, b );
00007FF735AD11C0 vmovdqu xmm6,xmmword ptr [rcx-10h]
00007FF735AD11C5 vmovdqu xmm7,xmmword ptr [rcx-20h]
loadRgb( src + 2, r, g, b );
00007FF735AD11CA vmovdqu xmm9,xmmword ptr [rcx]
00007FF735AD11CE vmovdqu xmm8,xmmword ptr [rcx+10h]
{
VecInteger r, g, b;
loadRgb( src, r, g, b );
00007FF735AD11D3 vpand xmm3,xmm10,xmm6
00007FF735AD11D7 vpand xmm1,xmm11,xmm6
00007FF735AD11DB vpand xmm0,xmm11,xmm7
00007FF735AD11DF vpackusdw xmm1,xmm0,xmm1
00007FF735AD11E4 vpslldq xmm2,xmm1,1
const auto low = brightness( r, g, b );
00007FF735AD11E9 vpmulhuw xmm4,xmm2,xmm12
00007FF735AD11EE vpand xmm0,xmm10,xmm7
00007FF735AD11F2 vpackusdw xmm1,xmm0,xmm3
const auto low = brightness( r, g, b );
00007FF735AD11F7 vpmulhuw xmm2,xmm1,xmm13
00007FF735AD11FC vpaddusw xmm5,xmm4,xmm2
{
VecInteger r, g, b;
loadRgb( src, r, g, b );
00007FF735AD1200 vpsrldq xmm0,xmm6,1
00007FF735AD1205 vpand xmm3,xmm0,xmm10
00007FF735AD120A vpsrldq xmm1,xmm7,1
00007FF735AD120F vpand xmm2,xmm1,xmm10
00007FF735AD1214 vpackusdw xmm0,xmm2,xmm3
const auto low = brightness( r, g, b );
00007FF735AD1219 vpmulhuw xmm3,xmm0,xmm14
00007FF735AD121E vpaddusw xmm1,xmm5,xmm3
00007FF735AD1222 vpsrlw xmm6,xmm1,8
loadRgb( src + 2, r, g, b );
00007FF735AD1227 vpand xmm2,xmm11,xmm8
00007FF735AD122C vpand xmm0,xmm11,xmm9
00007FF735AD1231 vpackusdw xmm1,xmm0,xmm2
00007FF735AD1236 vpslldq xmm2,xmm1,1
const auto hi = brightness( r, g, b );
00007FF735AD123B vpmulhuw xmm4,xmm2,xmm12
loadRgb( src + 2, r, g, b );
00007FF735AD1240 vpand xmm0,xmm10,xmm9
00007FF735AD1245 vpand xmm3,xmm10,xmm8
00007FF735AD124A vpackusdw xmm1,xmm0,xmm3
const auto hi = brightness( r, g, b );
00007FF735AD124F vpmulhuw xmm2,xmm1,xmm13
00007FF735AD1254 vpaddusw xmm5,xmm4,xmm2
loadRgb( src + 2, r, g, b );
00007FF735AD1258 vpsrldq xmm1,xmm9,1
00007FF735AD125E vpand xmm2,xmm1,xmm10
00007FF735AD1263 vpsrldq xmm0,xmm8,1
00007FF735AD1269 vpand xmm3,xmm0,xmm10
00007FF735AD126E vpackusdw xmm0,xmm2,xmm3
const auto hi = brightness( r, g, b );
00007FF735AD1273 vpmulhuw xmm3,xmm0,xmm14
00007FF735AD1278 vpaddusw xmm1,xmm5,xmm3
00007FF735AD127C vpsrlw xmm2,xmm1,8
src += 4;
00007FF735AD1281 lea rcx,[rcx+40h]
const auto bytes = packus_epi16( low, hi );
00007FF735AD1285 vpackuswb xmm0,xmm6,xmm2
VecInteger* dest = (VecInteger*)destinationBytes;
while( src < srcEnd )
00007FF735AD1289 lea rax,[rcx-20h]
storeu_all( dest, bytes );
00007FF735AD128D vmovdqu xmmword ptr [rdx],xmm0
dest++;
00007FF735AD1291 lea rdx,[rdx+10h]
00007FF735AD1295 cmp rax,r8
00007FF735AD1298 jb Sse::convertToGrayscale+80h (07FF735AD11C0h)
Both snippets above include only the main loop of the program. As you can see, they consist of nearly identical instructions, yet the C# version is about twice as slow as the C++ version.
Specifically, when tested with 511M pixels on my PC (AMD Ryzen 5 3600), the C++ code takes 221 ms, while the C# code takes 410 ms.
Why?
For the C# source, see "Why is C# twice as slow as C++ even though the generated machine code is nearly identical?".
C++ source code: https://github.com/Const-me/IntelIntrinsics/blob/master/CppDemo/brightness.cpp https://github.com/Const-me/IntelIntrinsics/blob/master/CppDemo/brightness.inl