Here are two Godbolt examples (the links are replaced by the actual code below). In one case I sum the elements of an array declared on the stack, and in the other I sum the elements of an array allocated on the heap (please ignore the incorrect size passed to malloc).
Heap (-std=c++17 -O3 -mavx2):
#include <stdio.h>
#include <stdlib.h>
// Adds up 10000 ints read from a freshly malloc'ed buffer and returns the total.
// This is a codegen test case, not production code:
//  - `sum` has no initializer, so the accumulation starts from an indeterminate value;
//  - the buffer is read without ever being written;
//  - the buffer is never freed and the malloc result is not checked for NULL.
// NOTE(review): reading the uninitialized `sum` and buffer contents is presumably
// undefined behavior in C++ — fine for inspecting the generated loop, but do not
// reuse this as-is.
int access(){
// Accumulator; declared without an initial value.
int sum;
// Stack variant of the buffer; enable this line (and drop the malloc line) to
// get the stack-array version of the function.
//int vec[10000];
// Heap variant of the buffer: room for 10000 ints.
int *vec = (int*)malloc(10000*sizeof(int));
// Walk all 10000 elements and accumulate them.
for(int i=0;i<10000;i++)
sum+=vec[i];
return sum;
}
Assembly for heap:
# access() -- heap variant, compiler-generated (Intel syntax).
# Sums 10000 32-bit ints from a malloc'ed buffer, 8 at a time in a ymm
# accumulator, then reduces the 8 lanes to one value returned in eax.
# Register roles in the loop:
#   rax  = current read pointer (starts at the malloc result)
#   rdx  = end pointer (buffer start + 40000 bytes)
#   ymm1 = 8 x 32-bit running partial sums
#   ymm0 = the 32 bytes loaded in this iteration (xmm2 = its low half)
access():
# Frame setup; edi = 10000 is loaded here and is still in place at the call
# (presumably the first integer argument under the SysV AMD64 ABI -- confirm).
# Note it is 10000 bytes, while the loop below walks 40000 bytes.
push rbp
mov edi, 10000
mov rbp, rsp
# Round rsp down to a multiple of 32.
and rsp, -32
call malloc
# Zero the accumulator (a VEX-encoded write to xmm1 also clears the upper
# half of ymm1).
vpxor xmm1, xmm1, xmm1
# rdx = one past the last byte to read: 10000 elements * 4 bytes = 40000.
lea rdx, [rax+40000]
.L2:
# The next two instructions together load 32 CONTIGUOUS bytes into ymm0:
#   xmm2           <- bytes [rax .. rax+15]   (unaligned 16-byte load)
#   ymm0 low half  <- xmm2
#   ymm0 high half <- bytes [rax+16 .. rax+31]
# Nothing is skipped; xmm2 is only a temporary for the low half.
# NOTE(review): the 32-byte load is split in two presumably because the
# pointer is not known to be 32-byte aligned and the default GCC tuning
# prefers split unaligned 256-bit loads (-mavx256-split-unaligned-load);
# confirm by rebuilding with e.g. -mtune=haswell or -march=haswell.
vmovdqu xmm2, XMMWORD PTR [rax]
vinserti128 ymm0, ymm2, XMMWORD PTR [rax+16], 0x1
# Advance by the 32 bytes just loaded.
add rax, 32
# Lane-wise 32-bit add: ymm1[i] += ymm0[i] for the 8 lanes.
vpaddd ymm1, ymm1, ymm0
# Loop until the read pointer reaches the end pointer.
cmp rdx, rax
jne .L2
# Horizontal reduction of the 8 partial sums in ymm1 down to one value:
#   8 lanes -> 4: add the high 128-bit half to the low half.
vmovdqa xmm0, xmm1
vextracti128 xmm1, ymm1, 0x1
vpaddd xmm1, xmm0, xmm1
#   4 lanes -> 2: shift right by 8 bytes and add.
vpsrldq xmm0, xmm1, 8
vpaddd xmm0, xmm1, xmm0
#   2 lanes -> 1: shift right by 4 bytes and add; the total is in lane 0.
vpsrldq xmm1, xmm0, 4
vpaddd xmm0, xmm0, xmm1
# Return value: low 32 bits of xmm0.
vmovd eax, xmm0
# Clear the upper ymm halves before returning, then tear down the frame.
vzeroupper
leave
ret
For the stack version of the C code, use the declaration of vec on the stack that is commented out in the C code above, instead of the malloc call.
Assembly for stack:
# access() -- stack variant, compiler-generated (Intel syntax).
# Sums 40000 bytes (10000 32-bit ints) of a stack buffer, 8 ints at a time in
# a ymm accumulator, then reduces the 8 lanes to one value returned in eax.
# Register roles in the loop:
#   rax  = current read pointer
#   rdx  = end pointer
#   ymm0 = 8 x 32-bit running partial sums
access():
push rbp
# Zero the accumulator (a VEX-encoded write to xmm0 also clears the upper
# half of ymm0).
vpxor xmm0, xmm0, xmm0
mov rbp, rsp
# Round rsp down to a multiple of 32, then reserve 39880 bytes.
and rsp, -32
sub rsp, 39880
# Buffer bounds: start = rsp-120, end = rsp+39880, i.e. 40000 bytes in total.
# The start is 32-byte aligned: rsp was a multiple of 32 before the sub,
# 39880 = 32*1246 + 8, and 8 + 120 = 128 is again a multiple of 32.
# NOTE(review): the first 120 bytes lie below rsp, presumably relying on the
# SysV AMD64 red zone (128 bytes below rsp, leaf functions) -- confirm.
lea rax, [rsp-120]
lea rdx, [rsp+39880]
.L2:
# Load 32 bytes from [rax] as a memory operand and add them lane-wise
# (8 x 32-bit) into the accumulator; a single instruction does load + add.
vpaddd ymm0, ymm0, YMMWORD PTR [rax]
# Advance by the 32 bytes just consumed and loop until the end pointer.
add rax, 32
cmp rdx, rax
jne .L2
# Horizontal reduction of the 8 partial sums in ymm0 down to one value:
#   8 lanes -> 4: add the high 128-bit half to the low half.
vmovdqa xmm1, xmm0
vextracti128 xmm0, ymm0, 0x1
vpaddd xmm0, xmm1, xmm0
#   4 lanes -> 2: shift right by 8 bytes and add.
vpsrldq xmm1, xmm0, 8
vpaddd xmm0, xmm0, xmm1
#   2 lanes -> 1: shift right by 4 bytes and add; the total is in lane 0.
vpsrldq xmm1, xmm0, 4
vpaddd xmm0, xmm0, xmm1
# Return value: low 32 bits of xmm0.
vmovd eax, xmm0
# Clear the upper ymm halves before returning, then tear down the frame.
vzeroupper
leave
ret
Any idea why the heap case needs more instructions? It also seems to skip 16 bytes of data, which it stores in the xmm2 register.
If we change the compiler argument from -mavx2 to -mavx512f, or remove it completely, the code looks similar. Is there an issue with the GCC trunk version?
Clang seems to unroll the loop and generates code that makes sense.