In my code, I set a 128-bit variable to zero, but I don't understand why this translates to two move instructions in the assembly code.
__m128i zeros = reinterpret_cast<__m128i>(_mm_setzero_pd());
The corresponding assembly code has two moves, back and forth between %xmm0 and 0x40(%rsp).
00709658: 0F 57 C0 xorps %xmm0, %xmm0
0070965B: 66 0F 29 44 24 40 movapd %xmm0, 0x40(%rsp)
00709661: 66 0F 28 44 24 40 movapd 0x40(%rsp), %xmm0
My compiler is Clang 10.0, and no optimization was turned on when I asked the question.
Here is a minimal implementation of my code.
// Copies one 16-byte chunk from *srcp to *dstp if the chunk contains no zero
// byte, then advances both pointers by LEN and returns true.
// If at least one byte of the chunk is zero, returns false without storing
// anything and without advancing either pointer.
//
// LEN  - number of bytes by which *srcp and *dstp are advanced on success.
//        The load and the store below are always 16 bytes wide, whatever LEN is.
// srcp - in/out: pointer to the current read position (no alignment required).
// dstp - in/out: pointer to the current write position (no alignment required).
template <int LEN>
bool SSEEncodeChunk(const char** srcp, char** dstp) {
// Unaligned 128-bit load from the current source position.
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(*srcp));
// All-zero vector, produced as a double vector and reinterpreted as an integer vector.
// NOTE(review): reinterpret_cast between __m128d and __m128i appears to rely on
// compiler vector-type extensions; _mm_setzero_si128() or _mm_castpd_si128() is the
// usual spelling - confirm before changing.
__m128i zeros = reinterpret_cast<__m128i>(_mm_setzero_pd());
// Byte-wise compare against zero: each result byte is 0xFF where the data byte is 0, else 0x00.
__m128i zero_bytes = _mm_cmpeq_epi8(data, zeros);
// _mm_testz_si128(a, a) yields 1 exactly when a is all zero bits. So, despite its
// name, all_zeros is true when the compare mask is empty, i.e. data has NO zero byte.
bool all_zeros = _mm_testz_si128(zero_bytes, zero_bytes);
// Reject the chunk if any zero byte was found.
if ((!all_zeros)) {
return false;
}
// Unaligned 128-bit store of the unmodified chunk to the current destination position.
_mm_storeu_si128(reinterpret_cast<__m128i*>(*dstp), data);
// Advance both cursors by LEN bytes.
*dstp += LEN;
*srcp += LEN;
return true;
}
Update on June 28th: clang version 10.0.0-4ubuntu1, Ubuntu 20.04.
Here is the assembly for my code at -O0. I also checked the options: it is built with -fomit-frame-pointer.
There is indeed more than one move back and forth between an %xmm register and offset(%rsp).
My first post only showed part of them.
(lldb) disassemble --name SSEEncodeChunk
index_type_traits_test`SSEEncodeChunk<16>:
[0x709630] <+0>: subq $0x58, %rsp
[0x709634] <+4>: movq %rdi, -0x38(%rsp)
[0x709639] <+9>: movq %rsi, -0x40(%rsp)
[0x70963e] <+14>: movq -0x38(%rsp), %rax
[0x709643] <+19>: movq (%rax), %rax
[0x709646] <+22>: movq %rax, -0x28(%rsp)
[0x70964b] <+27>: movq -0x28(%rsp), %rax
[0x709650] <+32>: movups (%rax), %xmm0
[0x709653] <+35>: movaps %xmm0, -0x50(%rsp)
[0x709658] <+40>: xorps %xmm0, %xmm0
[0x70965b] <+43>: movapd %xmm0, 0x40(%rsp) ; this is the point I originally asked about
[0x709661] <+49>: movapd 0x40(%rsp), %xmm0
[0x709667] <+55>: movapd %xmm0, -0x60(%rsp)
[0x70966d] <+61>: movaps -0x50(%rsp), %xmm0
[0x709672] <+66>: movaps -0x60(%rsp), %xmm1
[0x709677] <+71>: movaps %xmm0, 0x30(%rsp)
[0x70967c] <+76>: movaps %xmm1, 0x20(%rsp)
[0x709681] <+81>: movaps 0x30(%rsp), %xmm0
[0x709686] <+86>: movaps 0x20(%rsp), %xmm1
[0x70968b] <+91>: pcmpeqb %xmm1, %xmm0
[0x70968f] <+95>: movdqa %xmm0, -0x70(%rsp)
[0x709695] <+101>: movdqa -0x70(%rsp), %xmm0
[0x70969b] <+107>: movdqa -0x70(%rsp), %xmm1
[0x7096a1] <+113>: movdqa %xmm0, 0x10(%rsp)
[0x7096a7] <+119>: movdqa %xmm1, (%rsp)
[0x7096ac] <+124>: movdqa 0x10(%rsp), %xmm0
[0x7096b2] <+130>: movdqa (%rsp), %xmm1
[0x7096b7] <+135>: ptest %xmm1, %xmm0
[0x7096bc] <+140>: sete %cl
[0x7096bf] <+143>: movzbl %cl, %edx
[0x7096c2] <+146>: cmpl $0x0, %edx
[0x7096c5] <+149>: setne %cl
[0x7096c8] <+152>: andb $0x1, %cl
[0x7096cb] <+155>: movb %cl, -0x71(%rsp)
[0x7096cf] <+159>: movb -0x71(%rsp), %cl
[0x7096d3] <+163>: xorb $-0x1, %cl
[0x7096d6] <+166>: testb $0x1, %cl
[0x7096d9] <+169>: jne 0x7096e4
[0x7096df] <+175>: jmp 0x7096ee
[0x7096e4] <+180>: movb $0x0, -0x29(%rsp)
[0x7096e9] <+185>: jmp 0x70973f
[0x7096ee] <+190>: movq -0x40(%rsp), %rax
[0x7096f3] <+195>: movq (%rax), %rax
[0x7096f6] <+198>: movdqa -0x50(%rsp), %xmm0
[0x7096fc] <+204>: movq %rax, -0x8(%rsp)
[0x709701] <+209>: movdqa %xmm0, -0x20(%rsp)
[0x709707] <+215>: movdqa -0x20(%rsp), %xmm0
[0x70970d] <+221>: movq -0x8(%rsp), %rax
[0x709712] <+226>: movdqu %xmm0, (%rax)
[0x709716] <+230>: movq -0x40(%rsp), %rax
[0x70971b] <+235>: movq (%rax), %rcx
[0x70971e] <+238>: addq $0x10, %rcx
[0x709725] <+245>: movq %rcx, (%rax)
[0x709728] <+248>: movq -0x38(%rsp), %rax
[0x70972d] <+253>: movq (%rax), %rcx
[0x709730] <+256>: addq $0x10, %rcx
[0x709737] <+263>: movq %rcx, (%rax)
[0x70973a] <+266>: movb $0x1, -0x29(%rsp)
[0x70973f] <+271>: movb -0x29(%rsp), %al
[0x709743] <+275>: andb $0x1, %al
[0x709745] <+277>: movzbl %al, %eax
[0x709748] <+280>: addq $0x58, %rsp
[0x70974c] <+284>: retq
After turning on -O2, the back-and-forth moves do disappear:
0051E8D5: F3 0F 6F 08 movdqu (%rax), %xmm1
0051E8D9: 66 0F EF C0 pxor %xmm0, %xmm0
0051E8DD: 66 0F 6F D1 movdqa %xmm1, %xmm2
0051E8E1: 66 0F 74 D0 pcmpeqb %xmm0, %xmm2
0051E8E5: 66 0F 38 17 D2 ptest %xmm2, %xmm2
0051E8EA: 0F 85 97 03 00 00 jne 0x51ec87
0051E8F0: F3 0F 7F 0C 3B movdqu %xmm1, (%rbx,%rdi)
...
0051E967: 48 01 F8 addq %rdi, %rax
0051E96A: 48 83 C0 10 addq $0x10, %rax
0051E96E: 48 01 FB addq %rdi, %rbx
0051E971: 48 83 C3 10 addq $0x10, %rbx