I'm attempting to generate arrays of __m256i values to reuse in another computation. When I attempt to do that (even with a minimal test case), I get a segmentation fault - but only if the code is compiled with g++ or clang. If I compile the code with the Intel compiler (version 16.0), no segmentation fault occurs. Here is a test case I created:
// Minimal reproducer: heap-allocate an array of __m256i and assign an
// all-zero vector to one element. Nothing is read back and the array is
// never released.
int main() {
// Array obtained from plain new[].
// NOTE(review): whether this storage is 32-byte aligned depends on the
// operator new[] in use - confirm for the toolchain being tested.
__m256i *table = new __m256i[10000];
// Vector whose four 64-bit lanes are all 0.
__m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
// Whole-vector assignment to element 99. In the listings quoted in this
// post, clang lowers this to vmovaps (aligned form) while icc uses vmovdqu
// (unaligned form).
table[99] = zeroes;
}
When the above is compiled with clang 3.6 or g++ 4.8 and then run, a segmentation fault occurs.
Here's the assembly generated by the Intel compiler (from https://gcc.godbolt.org/, icc 13.0):
# icc output for main(), AT&T syntax. The trailing "#L.C" tags appear to be
# line.column positions in the C++ test case - confirm against the compiler docs.
#
# Prologue: save %rbx and keep the incoming %rsp in it, round %rsp down to a
# multiple of 32, then build an %rbp frame below the aligned address. The
# quadword at 8(%rbx) (the slot just above the saved %rbx, i.e. the return
# address) is copied to 8(%rsp) so it sits directly above the saved %rbp.
pushq %rbx #3.12
movq %rsp, %rbx #3.12
andq $-32, %rsp #3.12
pushq %rbp #3.12
pushq %rbp #3.12
movq 8(%rbx), %rbp #3.12
movq %rbp, 8(%rsp) #3.12
movq %rsp, %rbp #3.12
subq $112, %rsp #3.12
# Call operator new[] with a size of 3200 bytes in %rdi.
# NOTE(review): 3200 = 100 * 32, so this listing seems to come from a
# 100-element variant of the test case rather than the 10000-element one shown.
movl $3200, %eax #4.38
vzeroupper #4.38
movq %rax, %rdi #4.38
call operator new[](unsigned long) #4.38
# Spill the returned pointer to -112(%rbp), then copy it to -104(%rbp).
movq %rax, -112(%rbp) #4.38
movq -112(%rbp), %rax #4.38
movq %rax, -104(%rbp) #4.20
# Zero %ymm0 and round-trip it through two stack slots (-80 and -48 off %rbp).
vxorps %ymm0, %ymm0, %ymm0 #5.22
vmovdqu %ymm0, -80(%rbp) #5.22
vmovdqu -80(%rbp), %ymm0 #5.22
vmovdqu %ymm0, -48(%rbp) #5.20
# %rax = pointer + 3168 (3168 = 99 * 32), then store the zero vector there.
# The store is vmovdqu, the unaligned form, so it does not depend on the
# alignment of the pointer returned by operator new[].
movl $3168, %eax #6.17
addq -104(%rbp), %rax #6.5
vmovdqu -48(%rbp), %ymm0 #6.17
vmovdqu %ymm0, (%rax) #6.5
# Epilogue: return value 0 in %eax, tear down the %rbp frame, restore the
# original %rsp from %rbx, and restore %rbx.
movl $0, %eax #7.1
vzeroupper #7.1
leave #7.1
movq %rbx, %rsp #7.1
popq %rbx #7.1
ret #7.1
And here's the assembly from clang 3.7:
# clang output for main(), AT&T syntax.
#
# Prologue: set up an %rbp frame, round %rsp down to a multiple of 32 and
# reserve 192 bytes (a multiple of 32, so %rsp stays 32-byte aligned).
pushq %rbp
movq %rsp, %rbp
andq $-32, %rsp
subq $192, %rsp
# %eax = 0 is spilled to 28(%rsp) and reloaded just before the epilogue as the
# return value. operator new[] is called with a size of 3200 bytes in %edi.
# NOTE(review): 3200 = 100 * 32, so this listing seems to come from a
# 100-element variant of the test case rather than the 10000-element one shown.
xorl %eax, %eax
movl $3200, %ecx # imm = 0xC80
movl %ecx, %edi
movl %eax, 28(%rsp) # 4-byte Spill
callq operator new[](unsigned long)
# Save the returned pointer at 88(%rsp).
movq %rax, 88(%rsp)
# Write four zero quadwords to the stack slots at 144..168(%rsp).
movq $0, 168(%rsp)
movq $0, 160(%rsp)
movq $0, 152(%rsp)
movq $0, 144(%rsp)
# Assemble the 256-bit value in %ymm2 from those slots:
#   %xmm0 = { qword at 160(%rsp), qword at 168(%rsp) }  -> upper 128 bits
#   %xmm1 = { zero, qword at 152(%rsp) }                -> lower 128 bits
# The slot at 144(%rsp) is never read back in this listing; the low quadword
# is the zero shifted in by vpslldq.
vmovq 168(%rsp), %xmm0 # xmm0 = mem[0],zero
vmovq 160(%rsp), %xmm1 # xmm1 = mem[0],zero
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vmovq 152(%rsp), %xmm1 # xmm1 = mem[0],zero
vpslldq $8, %xmm1, %xmm1 # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
vmovaps %xmm1, %xmm2
vinserti128 $1, %xmm0, %ymm2, %ymm2
# Copy the vector to two stack slots. Offsets 96 and 32 are multiples of 32
# from the 32-byte-aligned %rsp, so these aligned stores are to aligned addresses.
vmovaps %ymm2, 96(%rsp)
vmovaps %ymm2, 32(%rsp)
# Reload the heap pointer and store the vector at offset 3168 (= 99 * 32).
# vmovaps is the aligned form: with a %ymm operand the address must be
# 32-byte aligned. Since 3168 is a multiple of 32, that holds only if the
# pointer returned by operator new[] is itself 32-byte aligned.
movq 88(%rsp), %rax
vmovaps %ymm2, 3168(%rax)
# Epilogue: reload the return value 0, restore %rsp and %rbp, and return.
movl 28(%rsp), %eax # 4-byte Reload
movq %rbp, %rsp
popq %rbp
vzeroupper
retq
Am I running into a compiler bug in clang/g++? Or am I simply doing something wrong?