Consider the following C program.
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
static void do_stuff(void)
{
const int n = 256;
int *ar = malloc(n * sizeof(int));
for (int i = 0; i < n; i++)
ar[i] = random();
}
int main(void)
{
do_stuff();
__m256i sm = _mm256_setzero_si256();
int sum = 0;
int *vcadd = (int*)&sm;
for (size_t l = 0; l < 8; l++)
sum += vcadd[l];
printf("sum = %d\n", sum);
return 0;
}
I expected this program to print sum = 0
, but when I compile it with gcc -mavx2 src.c -O2
, it sometimes prints sum = 0
, sometimes sum = 18
.
When compiled with -O1
or -O0
, the programs works as expected. It also seems to work fine with -O2
and the do_stuff();
call commented out.
Assembly generated for main
with -O1
(+ comments from me of what I think the instructions do):
main:
.LFB5513:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %fs:40, %rax
movq %rax, 56(%rsp)
xorl %eax, %eax
movl $1024, %edi
call malloc@PLT
movq %rax, %rbx
leaq 1024(%rax), %r12
.L2:
call random@PLT
movl %eax, (%rbx)
addq $4, %rbx
cmpq %r12, %rbx
jne .L2
vpxor %xmm0, %xmm0, %xmm0 ; zero out %ymm0
vmovdqa %ymm0, (%rsp) ; store these zeros at %rsp
movq %rsp, %rax ; add up the 8 ints stored at %rsp,..., %rsp + 32 (upper bound exclusive)
leaq 32(%rsp), %rcx ; ^
movl $0, %edx ; ^
.L3: ; ^
addl (%rax), %edx ; ^
addq $4, %rax ; ^
cmpq %rcx, %rax ; ^
jne .L3 ; ^
leaq .LC0(%rip), %rsi
movl $1, %edi
movl $0, %eax
call __printf_chk@PLT
movq 56(%rsp), %rax
subq %fs:40, %rax
jne .L8
movl $0, %eax
leaq -16(%rbp), %rsp
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L8:
.cfi_restore_state
call __stack_chk_fail@PLT
.cfi_endproc
and with -O2
:
main:
.LFB5513:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movl $1024, %edi
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %fs:40, %rax
movq %rax, 56(%rsp)
xorl %eax, %eax
call malloc@PLT
movq %rax, %rbx
leaq 1024(%rax), %r12
.p2align 4,,10
.p2align 3
.L2:
call random@PLT
addq $4, %rbx
movl %eax, -4(%rbx)
cmpq %r12, %rbx
jne .L2
movq %rsp, %rax ; just add up %rsp,..., %rsp + 32 without setting that memory to zero
leaq 32(%rsp), %rcx ; ^
xorl %edx, %edx ; ^
.p2align 4,,10 ; ^
.p2align 3 ; ^
.L3: ; ^
addl (%rax), %edx ; ^
addq $4, %rax ; ^
cmpq %rcx, %rax ; ^
jne .L3 ; ^
xorl %eax, %eax
leaq .LC0(%rip), %rsi
movl $1, %edi
call __printf_chk@PLT
movq 56(%rsp), %rax
subq %fs:40, %rax
jne .L9
leaq -16(%rbp), %rsp
xorl %eax, %eax
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L9:
.cfi_restore_state
call __stack_chk_fail@PLT
.cfi_endproc
So my question is: Why can the compiler do this optimization? Shouldn't the output always be sum = 0
?
I'm using
gcc (Ubuntu 11.2.0-7ubuntu2) 11.2.0
Solution based on comments
(all below compiled with -O2
)
Using memcpy
as
__m256i sm = _mm256_setzero_si256();
int ar[8];
memcpy(ar, &sm, 32);
copies the data, although in a somewhat convoluted way (?)
vpxor %xmm0, %xmm0, %xmm0
leaq 48(%rsp), %rax
leaq 80(%rsp), %rcx
xorl %edx, %edx
vmovdqa %ymm0, (%rsp)
vmovdqa 16(%rsp), %xmm2
vmovdqa %xmm0, 48(%rsp)
vmovdqa %xmm2, 64(%rsp)
A union
union conv
{
__m256i val;
int ar[8];
};
union conv c;
c.val = _mm256_setzero_si256();
// access c.ar
works too by producing
vpxor %xmm0, %xmm0, %xmm0
leaq 4(%rsp), %rax
leaq 32(%rsp), %rsi
xorl %ecx, %ecx
vmovdqa %ymm0, (%rsp)
Another option is to compile with -fno-strict-aliasing
. In that case, the original code works as I expected.