In this example I'm adding two arrays using AVX2. If I declare the arrays on the stack, it all works as expected. However, when the memory is allocated on the heap it compiles, but throws a Segmentation Fault at runtime.
Compilation succeeds but throws the following warning:
AVX vector return without AVX enabled changes the ABI [-Wpsabi]
I'm trying this on an i7-8550U with AVX2 support, compiling with the flags -march=native -mavx2.
Here is a minimal working example using stack-allocated (automatic) arrays:
#include <immintrin.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Adds two 8-element uint32_t arrays with a single AVX2 instruction and
 * prints, one lane per line, both operands followed by their sum.
 *
 * All three buffers are automatic (stack) arrays explicitly aligned to
 * 32 bytes, so viewing them through __m256i pointers is safe.
 *
 * Returns 0 on completion.
 */
int main(void)
{
    enum { LANES = 8 }; /* 8 x 32-bit elements fill one 256-bit vector */

    __attribute__((aligned(32))) uint32_t buffer1[LANES];
    __attribute__((aligned(32))) uint32_t buffer2[LANES];
    __attribute__((aligned(32))) uint32_t buffer3[LANES];

    /* Vector views of the arrays; valid because the arrays are 32-byte aligned. */
    __m256i *buffer1_s = (__m256i *)buffer1;
    __m256i *buffer2_s = (__m256i *)buffer2;
    __m256i *buffer3_s = (__m256i *)buffer3;

    /* Fill both operands with small pseudo-random values in [0, 63]. */
    for (size_t i = 0; i < LANES; ++i) {
        buffer1[i] = (uint32_t)(rand() % 64);
        buffer2[i] = (uint32_t)(rand() % 64);
    }

    /* Lane-wise 32-bit addition of all eight elements at once. */
    *buffer3_s = _mm256_add_epi32(*buffer1_s, *buffer2_s);

    /* PRIu32 is the matching conversion for uint32_t; "%d" expects an int. */
    for (size_t i = 0; i < LANES; ++i) {
        printf("%" PRIu32 " ", buffer1[i]);
        printf("%" PRIu32 " ", buffer2[i]);
        printf("%" PRIu32 " ", buffer3[i]);
        printf("\n");
    }

    return 0;
}
And here is a broken one that allocates the memory dynamically:
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Broken variant: same computation as the stack example, but the three
 * buffers come from malloc. This is the version reported to crash with a
 * segmentation fault at runtime; it is kept as-is to illustrate the problem.
 */
int main()
{
/*
 * NOTE: the aligned attribute here is attached to the pointer variables
 * (buffer1, buffer2, buffer3), not to the memory malloc returns, so it does
 * not make the allocated blocks 32-byte aligned.
 * NOTE(review): the malloc results are also used without a NULL check.
 */
__attribute__((aligned(32))) uint32_t* buffer1 = malloc(8 * sizeof(uint32_t));
__attribute__((aligned(32))) uint32_t* buffer2 = malloc(8 * sizeof(uint32_t));
__attribute__((aligned(32))) uint32_t* buffer3 = malloc(8 * sizeof(uint32_t));
/* Vector views of the heap blocks; nothing guarantees these addresses are
 * multiples of 32. */
__m256i* buffer1_s = (__m256i*)buffer1;
__m256i* buffer2_s = (__m256i*)buffer2;
__m256i* buffer3_s = (__m256i*)buffer3;
/* Fill both operands with small pseudo-random values in [0, 63]. */
for (uint32_t i = 0; i < 8; ++i) {
buffer1[i] = rand() % 64;
buffer2[i] = rand() % 64;
}
/*
 * Dereferencing __m256i pointers assumes 32-byte-aligned addresses. With
 * plain malloc'd memory that assumption may not hold, which is the cause of
 * the reported segmentation fault.
 */
*buffer3_s = _mm256_add_epi32(*buffer1_s, *buffer2_s);
/* Print operand, operand, sum for each of the eight lanes. */
for (uint32_t i = 0; i < 8; ++i) {
printf("%d ", buffer1[i]);
printf("%d ", buffer2[i]);
printf("%d ", buffer3[i]);
printf("\n");
}
free(buffer1);
free(buffer2);
free(buffer3);
}
They are both compiled with:
gcc -O0 -g -march=native -mavx2 -o test_avx2 test_avx2.c
- What is causing the segmentation fault?
- What would be the correct way to use AVX2 on memory that's been malloc'd?
Solution:
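The segmentation fault is caused by misaligned memory. Dereferencing a __m256i* makes the compiler emit an aligned 256-bit load/store, which faults unless the address is a multiple of 32. Placing __attribute__((aligned(32))) on a pointer declaration only aligns the pointer variable itself, not the memory it points to, and malloc only guarantees the alignment needed for the standard types (typically 16 bytes on x86-64). The memory therefore needs to be allocated with the correct alignment explicitly; the compiler attribute is not enough to make that happen.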
__attribute__((aligned(32))) uint32_t* buffer1 = malloc(8 * sizeof(uint32_t));
should instead be
uint32_t* buffer1 = aligned_alloc(32, 8 * sizeof(uint32_t));