I have the following snippet, which sums all the elements of an array (the size is hardcoded to 32):
// Returns the sum of the 32 ints starting at `a`.
// Four 8-lane vectors are loaded and added lane-wise into a single vector;
// that vector is written to a stack buffer and its 8 lanes are summed by a
// scalar loop.
// `a` must point to at least 32 readable ints (elements a[0]..a[31] are loaded).
static unsafe int F(int* a)
{
// Load a[0..7], a[8..15], a[16..23] and a[24..31] as four 256-bit vectors.
Vector256<int> ymm0 = Avx2.LoadVector256(a + 0);
Vector256<int> ymm1 = Avx2.LoadVector256(a + 8);
Vector256<int> ymm2 = Avx2.LoadVector256(a + 16);
Vector256<int> ymm3 = Avx2.LoadVector256(a + 24);
// Lane-wise tree reduction: (ymm0 + ymm1) + (ymm2 + ymm3), result in ymm0.
ymm0 = Avx2.Add(ymm0, ymm1);
ymm2 = Avx2.Add(ymm2, ymm3);
ymm0 = Avx2.Add(ymm0, ymm2);
// s = number of 32-bit lanes in a 256-bit vector (8).
const int s = 256 / 32;
// Scratch buffer on the stack, large enough for exactly one vector.
int* t = stackalloc int[s];
// Write the 8 partial sums to the buffer so they can be read individually.
Avx2.Store(t, ymm0);
// Horizontal sum: add the 8 partial sums with a scalar loop.
int r = 0;
for (int i = 0; i < s; ++i)
r += t[i];
return r;
}
This generates the following ASM:
Program.F(Int32*)
; JIT output for F. rcx is the base of the four vector loads at L002c-L003a,
; i.e. it carries the argument `a`; the result is left in eax before `ret`.
;
; --- Prologue: reserve 0x28 (40) bytes of stack ---
L0000: sub rsp, 0x28
L0004: vzeroupper ; Question #1
; --- Zero the 32 bytes at [rsp]..[rsp+0x1f] with two 16-byte stores ---
; These are the same 32 bytes that receive the vector store at L004f,
; so this region is the stackalloc'd buffer `t`.
L0007: vxorps xmm4, xmm4, xmm4
L000b: vmovdqa [rsp], xmm4 ; Question #2
L0010: vmovdqa [rsp+0x10], xmm4 ; Question #2
; --- Initialise the 8-byte slot at [rsp+0x20], directly above the buffer ---
; The slot is first zeroed, then overwritten with the constant 0x7d847bd1f9ce.
; The same constant is compared against this slot at L0067-L0076 before returning.
; NOTE(review): this has the shape of a stack-guard (canary) check protecting
; against writes past the end of `t` - confirm against the runtime documentation.
L0016: xor eax, eax ; Question #3
L0018: mov [rsp+0x20], rax
L001d: mov rax, 0x7d847bd1f9ce ; Question #4
L0027: mov [rsp+0x20], rax
; --- Load a[0..7], a[8..15], a[16..23], a[24..31] (32 bytes each) ---
L002c: vmovdqu ymm0, [rcx]
L0030: vmovdqu ymm1, [rcx+0x20]
L0035: vmovdqu ymm2, [rcx+0x40]
L003a: vmovdqu ymm3, [rcx+0x60]
; --- Lane-wise 32-bit adds: ymm0 = (ymm0 + ymm1) + (ymm2 + ymm3) ---
L003f: vpaddd ymm0, ymm0, ymm1
L0043: vpaddd ymm2, ymm2, ymm3
L0047: vpaddd ymm0, ymm0, ymm2
; --- rax = address of `t` (the buffer at rsp); store the 8 partial sums there ---
L004b: lea rax, [rsp] ; Question #5
L004f: vmovdqu [rax], ymm0
; --- Scalar loop: edx = r (accumulator), ecx = i (loop counter), both start at 0 ---
L0053: xor edx, edx ; Question #5
L0055: xor ecx, ecx ; Question #5
; Loop body: sign-extend i to 64 bits for addressing, then r += t[i]; repeat while i < 8.
L0057: movsxd r8, ecx
L005a: add edx, [rax+r8*4]
L005e: inc ecx
L0060: cmp ecx, 8
L0063: jl short L0057
; --- Move r into eax as the return value ---
L0065: mov eax, edx
; --- Re-check the slot at [rsp+0x20] against the constant stored at L0027 ---
; If it still matches, skip the call; otherwise the call at L0078 is taken.
; NOTE(review): the call target is presumably a runtime failure helper for a
; corrupted guard value rather than anything in the user's code - TODO confirm.
L0067: mov rcx, 0x7d847bd1f9ce ; Question #4
L0071: cmp [rsp+0x20], rcx
L0076: je short L007d
L0078: call 0x00007ffc9de2d430 ; Question #6
; --- Epilogue: release the 0x28 bytes reserved at L0000 and return ---
L007d: nop
L007e: vzeroupper
L0081: add rsp, 0x28
L0085: ret
Questions
- Why do we need `VZEROUPPER` at the beginning? Wouldn't it be perfectly fine without it?
- What do the `VMOVDQA`s do at the beginning? Or rather, why are they there?
- Zeroing out the `EAX` register: why? It is probably related to the next line, `MOV [RSP+0x20], RAX`, but I still can't understand it.
- What does this mysterious value (`0x7d847bd1f9ce`) do?
- There are also lines in between whose purpose I cannot understand (see the "Question #5" comments in the code).
- I'm assuming this line (`L0078: call 0x00007ffc9de2d430`) throws an exception. Is there a function or something in my code that can throw an exception?
I know there are a lot of questions, but I can't separate them because I think they are related to each other. TO BE CRYSTAL CLEAR: I'm just trying to understand the generated ASM here. I'm not a professional in this area.
Note
- In case you're wondering what GCC (O2) generates, here is the result:
/*
 * Returns the sum of the 32 int32_t values starting at `a`.
 * Four 256-bit vectors are loaded and added lane-wise into one vector,
 * which is stored to a local array and summed with a scalar loop.
 *
 * NOTE(review): _mm256_load_si256 is the aligned-load intrinsic (GCC emits
 * vmovdqa for it in the listing that follows), so this assumes `a` is
 * 32-byte aligned - TODO confirm with callers.
 */
int32_t
f(int32_t *a) {
__m256i ymm0;
__m256i ymm1;
__m256i ymm2;
__m256i ymm3;
/* Load a[0..7], a[8..15], a[16..23] and a[24..31]. */
ymm0 = _mm256_load_si256((__m256i*)(a + 0));
ymm1 = _mm256_load_si256((__m256i*)(a + 8));
ymm2 = _mm256_load_si256((__m256i*)(a + 16));
ymm3 = _mm256_load_si256((__m256i*)(a + 24));
/* Lane-wise 32-bit adds: ymm0 = (ymm0 + ymm1) + (ymm2 + ymm3). */
ymm0 = _mm256_add_epi32(ymm0, ymm1);
ymm2 = _mm256_add_epi32(ymm2, ymm3);
ymm0 = _mm256_add_epi32(ymm0, ymm2);
/* Scratch buffer holding the 8 partial sums (one per 32-bit lane). */
/* NOTE(review): the declaration does not itself request 32-byte alignment, */
/* yet the store below is the aligned form - verify this is guaranteed. */
int32_t t[8];
_mm256_store_si256((__m256i*)t, ymm0);
/* Horizontal sum of the 8 partial sums. */
int32_t r;
r = 0;
for (int i = 0; i < 8; ++i)
r += t[i];
return r;
}
And the generated ASM:
; GCC output for f (Intel syntax). rdi is the base of all four vector
; accesses below, i.e. it carries the argument `a`; the result is returned in eax.
f:
; --- Prologue: set up rbp as frame pointer and round rsp down to a multiple of 32 ---
; r8d is the accumulator `r`, cleared here.
push rbp
xor r8d, r8d
mov rbp, rsp
and rsp, -32
; --- rax = start of the 32-byte buffer `t` at [rsp-32]; rdx = its end (rsp) ---
; The buffer sits below rsp without rsp being moved.
; NOTE(review): presumably relies on the System V red zone - confirm target ABI.
lea rax, [rsp-32]
mov rdx, rsp
; --- Vector sum: a[24..31] + a[16..23] + a[8..15] + a[0..7], lane-wise ---
; Three of the four loads are folded into vpaddd memory operands, and the
; additions are chained rather than paired as written in the C source.
vmovdqa ymm1, YMMWORD PTR [rdi+96]
vpaddd ymm0, ymm1, YMMWORD PTR [rdi+64]
vpaddd ymm0, ymm0, YMMWORD PTR [rdi+32]
vpaddd ymm0, ymm0, YMMWORD PTR [rdi]
; --- Store the 8 partial sums into `t` (aligned store) ---
vmovdqa YMMWORD PTR [rsp-32], ymm0
; --- Scalar loop: walk rax through `t` 4 bytes at a time until it reaches rdx ---
; (8 iterations), adding each element into r8d.
.L2:
add r8d, DWORD PTR [rax]
add rax, 4
cmp rax, rdx
jne .L2
; --- Return r in eax; clear upper ymm halves; restore rsp/rbp and return ---
mov eax, r8d
vzeroupper
leave
ret
I think it optimized my code here (maybe heavily), but whatever.