This is one of those n00b questions where I'm doing something wrong but don't yet fully understand what.
The xxhash32 algorithm has a nice 16-byte inner loop that can be made faster with SIMD, so, as an exercise to myself, that is what I'm trying to do.
The body of the loop looks like this (numBytes is some multiple of 16):
// C# that gets auto-vectorized. uint4 is a vector of 4 elements
uint4 state = new uint4(Prime1 + Prime2, Prime2, 0, (uint)-Prime1) + seed;
int count = numBytes >> 4;
for (int i = 0; i < count; ++i) {
    state += *p++ * Prime2;
    state = (state << 13) | (state >> 19);
    state *= Prime1;
}
hash = rol(state.x, 1) + rol(state.y, 7) + rol(state.z, 12) + rol(state.w, 18);
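For reference, the per-lane operation this vectorizes is the standard xxHash32 round; a scalar C++ sketch of it (rotl32 and round32 are names I'm using just for this question, the constants are the published xxHash32 primes) would be:

#include <cstdint>

static const uint32_t kPrime1 = 2654435761u; // xxHash32 PRIME32_1
static const uint32_t kPrime2 = 2246822519u; // xxHash32 PRIME32_2

static inline uint32_t rotl32(uint32_t v, int n) {
    return (v << n) | (v >> (32 - n));
}

// One xxHash32 round: mix 4 bytes of input into one accumulator.
static inline uint32_t round32(uint32_t acc, uint32_t input) {
    acc += input * kPrime2;
    acc = rotl32(acc, 13);
    acc *= kPrime1;
    return acc;
}

The uint4/SSE versions just apply this round to four accumulators at once.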
I've translated this into the following SSE2/SSE4.1 intrinsics:
auto prime1 = _mm_set1_epi32(kPrime1);
auto prime2 = _mm_set1_epi32(kPrime2);
auto state = _mm_set_epi32(seed - kPrime1, seed, seed + kPrime2, seed + kPrime1 + kPrime2); // _mm_set_epi32 takes the highest lane first, so lane 0 is seed + kPrime1 + kPrime2
int32_t count = size >> 4; // size / 16
for (int32_t i = 0; i < count; i++) {
    state = _mm_add_epi32(state, _mm_mullo_epi32(_mm_loadu_si128(p128++), prime2));
    state = _mm_or_si128(_mm_sll_epi32(state, _mm_cvtsi32_si128(13)), _mm_srl_epi32(state, _mm_cvtsi32_si128(19)));
    state = _mm_mullo_epi32(state, prime1);
}
uint32_t temp[4];
_mm_storeu_si128(reinterpret_cast<__m128i*>(temp), state);
hash = _lrotl(temp[0], 1) + _lrotl(temp[1], 7) + _lrotl(temp[2], 12) + _lrotl(temp[3], 18);
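In case a complete snippet is useful, here is the whole thing as one self-contained function (the primes are the published xxHash32 constants; HashStripes and the scaffolding around the loop are names I made up just for this question):

#include <cstdint>
#include <stdlib.h>     // _lrotl (MSVC)
#include <smmintrin.h>  // SSE4.1: _mm_mullo_epi32

static const uint32_t kPrime1 = 2654435761u; // xxHash32 PRIME32_1
static const uint32_t kPrime2 = 2246822519u; // xxHash32 PRIME32_2

// Hashes `size` bytes (a multiple of 16) starting at `data`.
uint32_t HashStripes(const uint8_t* data, int32_t size, uint32_t seed) {
    auto prime1 = _mm_set1_epi32(kPrime1);
    auto prime2 = _mm_set1_epi32(kPrime2);
    // _mm_set_epi32 takes the highest lane first; lane 0 holds seed + kPrime1 + kPrime2.
    auto state = _mm_set_epi32(seed - kPrime1, seed, seed + kPrime2, seed + kPrime1 + kPrime2);
    auto p128 = reinterpret_cast<const __m128i*>(data);

    int32_t count = size >> 4; // size / 16
    for (int32_t i = 0; i < count; i++) {
        state = _mm_add_epi32(state, _mm_mullo_epi32(_mm_loadu_si128(p128++), prime2));
        state = _mm_or_si128(_mm_sll_epi32(state, _mm_cvtsi32_si128(13)), _mm_srl_epi32(state, _mm_cvtsi32_si128(19)));
        state = _mm_mullo_epi32(state, prime1);
    }

    uint32_t temp[4];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(temp), state);
    return _lrotl(temp[0], 1) + _lrotl(temp[1], 7) + _lrotl(temp[2], 12) + _lrotl(temp[3], 18);
}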
Here's the disassembly of the inner loop body:
mov rax,qword ptr [p128]
mov qword ptr [rsp+88h],rax
mov rax,qword ptr [rsp+88h]
movdqu xmm0,xmmword ptr [rax]
movdqa xmmword ptr [rsp+90h],xmm0
movdqa xmm0,xmmword ptr [rsp+90h]
movdqa xmmword ptr [rsp+120h],xmm0
mov rax,qword ptr [p128]
add rax,10h
mov qword ptr [p128],rax
movdqa xmm0,xmmword ptr [prime2]
movdqa xmmword ptr [rsp+140h],xmm0
movdqa xmm0,xmmword ptr [rsp+120h]
movdqa xmmword ptr [rsp+130h],xmm0
movdqa xmm0,xmmword ptr [rsp+130h]
pmulld xmm0,xmmword ptr [rsp+140h]
movdqa xmmword ptr [rsp+150h],xmm0
movdqa xmm0,xmmword ptr [rsp+150h]
movdqa xmmword ptr [rsp+160h],xmm0
movdqa xmm0,xmmword ptr [rsp+160h]
movdqa xmmword ptr [rsp+170h],xmm0
movdqa xmm0,xmmword ptr [rsp+20h]
movdqa xmmword ptr [rsp+100h],xmm0
movdqa xmm0,xmmword ptr [rsp+100h]
paddd xmm0,xmmword ptr [rsp+170h]
movdqa xmmword ptr [rsp+180h],xmm0
movdqa xmm0,xmmword ptr [rsp+180h]
movdqa xmmword ptr [rsp+190h],xmm0
movdqa xmm0,xmmword ptr [rsp+190h]
movdqa xmmword ptr [rsp+20h],xmm0
movdqa xmm0,xmmword ptr [rsp+20h]
movdqa xmmword ptr [rsp+1A0h],xmm0
mov eax,13h
movd xmm0,eax
movdqa xmmword ptr [rsp+1B0h],xmm0
movdqa xmm0,xmmword ptr [rsp+1A0h]
psrld xmm0,xmmword ptr [rsp+1B0h]
movdqa xmmword ptr [rsp+1C0h],xmm0
movdqa xmm0,xmmword ptr [rsp+1C0h]
movdqa xmmword ptr [rsp+200h],xmm0
movdqa xmm0,xmmword ptr [rsp+20h]
movdqa xmmword ptr [rsp+1D0h],xmm0
mov eax,0Dh
movd xmm0,eax
movdqa xmmword ptr [rsp+1E0h],xmm0
movdqa xmm0,xmmword ptr [rsp+1D0h]
pslld xmm0,xmmword ptr [rsp+1E0h]
movdqa xmmword ptr [rsp+1F0h],xmm0
movdqa xmm0,xmmword ptr [rsp+1F0h]
movdqa xmmword ptr [rsp+210h],xmm0
movdqa xmm0,xmmword ptr [rsp+200h]
movdqa xmmword ptr [rsp+230h],xmm0
movdqa xmm0,xmmword ptr [rsp+210h]
movdqa xmmword ptr [rsp+220h],xmm0
movdqa xmm0,xmmword ptr [rsp+220h]
por xmm0,xmmword ptr [rsp+230h]
movdqa xmmword ptr [rsp+240h],xmm0
movdqa xmm0,xmmword ptr [rsp+240h]
movdqa xmmword ptr [rsp+250h],xmm0
movdqa xmm0,xmmword ptr [rsp+250h]
movdqa xmmword ptr [rsp+20h],xmm0
movdqa xmm0,xmmword ptr [prime1]
movdqa xmmword ptr [rsp+280h],xmm0
movdqa xmm0,xmmword ptr [rsp+20h]
movdqa xmmword ptr [rsp+270h],xmm0
movdqa xmm0,xmmword ptr [rsp+270h]
pmulld xmm0,xmmword ptr [rsp+280h]
movdqa xmmword ptr [rsp+290h],xmm0
movdqa xmm0,xmmword ptr [rsp+290h]
movdqa xmmword ptr [rsp+2A0h],xmm0
movdqa xmm0,xmmword ptr [rsp+2A0h]
movdqa xmmword ptr [rsp+20h],xmm0
Some questions about the disassembly:
- Why so many movdqa instructions? I thought the point of intrinsics was that they map to specific hardware instructions.
- Why is only xmm0 used? It looks to me like data is being shuffled between memory and the vector pipeline at every step (I was expecting more of the xmmN registers to be used).
This is compiled with Visual C++ 2017; I haven't enabled any additional optimizations.
When I run these two snippets over a block of 64 MiB, many times over, the scalar code is about 3 times faster. This is not what I expected to happen; what have I missed?
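In case the measurement method matters, this is roughly how I'm timing the SSE version (simplified; HashStripes is the wrapper name from the snippet above, and the iteration count here is arbitrary):

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <vector>

uint32_t HashStripes(const uint8_t* data, int32_t size, uint32_t seed); // the SSE loop above

int main() {
    const size_t kSize = 64u * 1024u * 1024u;   // 64 MiB, a multiple of 16
    std::vector<uint8_t> buffer(kSize, 0xAB);   // contents don't matter for the timing

    uint32_t sink = 0;                          // keep the result live
    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < 100; ++i)
        sink ^= HashStripes(buffer.data(), (int32_t)kSize, 0);
    auto stop = std::chrono::steady_clock::now();

    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count();
    printf("hash %08x, %lld ms\n", sink, (long long)ms);
    return 0;
}

The scalar version is timed the same way over the same buffer.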