I noticed something strange when compiling this code on Godbolt with MSVC:
#include <intrin.h>
#include <cstdint>
// Advances pSrc based on the 32 bytes it currently points at.
// Reads 32 bytes starting at pSrc and builds a mask from the top bit of each byte.
// If no top bit is set, pSrc moves forward by one byte; otherwise it moves forward
// by the index of the first byte whose top bit is set.
// pSrc is taken by reference and is updated in place; nothing is returned.
void test(unsigned char*& pSrc) {
// Unaligned 256-bit load of the 32 bytes starting at pSrc.
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(pSrc));
// One bit per byte: bit i of mask is the most significant bit of byte i.
int32_t mask = _mm256_movemask_epi8(data);
if (!mask) {
// No byte has its top bit set: step forward by a single byte.
++pSrc;
}
else {
unsigned long v;
// v receives the index of the lowest set bit; mask is non-zero on this path.
_BitScanForward(&v, mask);
// Jump to the first byte whose top bit is set (no movement when v == 0).
pSrc += v;
}
}
I get this resulting assembly:
; MSVC x64 listing generated for test(unsigned char*&).
; On entry rcx holds the address of pSrc (first integer argument, Microsoft x64 ABI).
; Both exits store the updated pointer back through rcx.
pSrc$ = 8 ; assembler symbol emitted for the parameter; not referenced by the instructions below
void test(unsigned char * &) PROC ; test, COMDAT
mov rdx, QWORD PTR [rcx] ; rdx = pSrc (load the pointer through the reference)
vmovdqu ymm0, YMMWORD PTR [rdx] ; unaligned 32-byte load from pSrc
vpmovmskb eax, ymm0 ; eax = mask built from the top bit of each of the 32 bytes
test eax, eax
jne SHORT $LN2@test ; mask != 0 -> take the bit-scan path
mov eax, 1 ; mask == 0: the step is a single byte
add rax, rdx ; rax = pSrc + 1
mov QWORD PTR [rcx], rax ; write the advanced pointer back to pSrc
vzeroupper ; Why is this being inserted?
ret 0
$LN2@test:
bsf eax, eax ; eax = index of the lowest set bit of mask (mask is non-zero here)
add rax, rdx ; rax = pSrc + index
mov QWORD PTR [rcx], rax ; write the advanced pointer back to pSrc
vzeroupper ; Why is this being inserted?
ret 0
void test(unsigned char * &) ENDP ; test
Why is `vzeroupper` being inserted at the end of each scope? I heard that it's because of switching between SSE and AVX, but I'm not doing that here. I'm using exclusively AVX code.
I was wondering, does this pose a performance problem?