I'm developing a high throughput low latency real-time program that involves several matrix operations.
I have decided to use AVX2 or AVX512 to boost the performance of the system. This is my first attempt to use the AVX instruction set, or SIMD in general.
I'm using the AVX Intrinsics functions available in g++.
The problem I am facing is that when I use the _mm256_load_ps
function I get a segmentation fault, but when I use _mm256_set_ps
the program runs.
I was told _mm256_load_ps
will have better performance than _mm256_set_ps
in my application.
What am I doing wrong?
This is a program to use AVX2 to add 2 matrices.
Code
#include <immintrin.h>
#include <string.h>
#include <cstdint>
/// Number of float elements in each of the arrays processed by the program.
constexpr std::uint64_t MAX_COUNT{100000};
int main()
{
float mat1[MAX_COUNT], mat2[MAX_COUNT], rslt[MAX_COUNT];
for(int i = 0; i < MAX_COUNT; i++){
mat1[i] = i;
mat2[i] = 100-i;
}
for(int i = 0; i < MAX_COUNT; i +=8)
{
//Working Properly
//auto avx_a = _mm256_set_ps(mat1[i+7], mat1[i+6], mat1[i+5], mat1[i+4], mat1[i+3], mat1[i+2], mat1[i+1], mat1[i+0]);
//Working Properly
//auto avx_b = _mm256_set_ps(mat2[i+7], mat2[i+6], mat2[i+5], mat2[i+4], mat2[i+3], mat2[i+2], mat2[i+1], mat2[i+0]);
//Resulting in segmentation fault
auto avx_a = _mm256_load_ps(&mat1[i]);
//Resulting in segmentation fault
auto avx_b = _mm256_load_ps(&mat2[i]);
auto avx_c = _mm256_add_ps(avx_a, avx_b);
float *result = (float*)&avx_c;
memcpy(&rslt[i], result, 8*sizeof(float));
}
return 0;
}
Aligning Data
__declspec(align(32)) float mat1[MAX_COUNT]
Error
test_2.cpp: In function ‘int main()’:
test_2.cpp:11:21: error: too few arguments to function ‘void* std::align(std::size_t, std::size_t, void*&, std::size_t&)’
11 | __declspec(align(32)) float mat1[MAX_COUNT];
| ~~~~~^~~~
In file included from /usr/include/c++/11/memory:72,
from /usr/include/x86_64-linux-gnu/c++/11/bits/stdc++.h:82,
from test_2.cpp:2:
/usr/include/c++/11/bits/align.h:62:1: note: declared here
62 | align(size_t __align, size_t __size, void*& __ptr, size_t& __space) noexcept
| ^~~~~
test_2.cpp:11:5: error: ‘__declspec’ was not declared in this scope
11 | __declspec(align(32)) float mat1[MAX_COUNT];
| ^~~~~~~~~~