int MAX_DIM = 100;
float a[MAX_DIM][MAX_DIM]__attribute__ ((aligned(16)));
float b[MAX_DIM][MAX_DIM]__attribute__ ((aligned(16)));
float d[MAX_DIM][MAX_DIM]__attribute__ ((aligned(16)));
/*
* I fill these arrays with some values
*/
for(int i=0;i<MAX_DIM;i+=1){
for(int j=0;j<MAX_DIM;j+=4){
for(int k=0;k<MAX_DIM;k+=4){
__m128 result = _mm_load_ps(&d[i][j]);
__m128 a_line = _mm_load_ps(&a[i][k]);
__m128 b_line0 = _mm_load_ps(&b[k][j+0]);
__m128 b_line1 = _mm_loadu_ps(&b[k][j+1]);
__m128 b_line2 = _mm_loadu_ps(&b[k][j+2]);
__m128 b_line3 = _mm_loadu_ps(&b[k][j+3]);
result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(a_line, a_line, 0x00), b_line0));
result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(a_line, a_line, 0x55), b_line1));
result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(a_line, a_line, 0xaa), b_line2));
result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(a_line, a_line, 0xff), b_line3));
_mm_store_ps(&d[i][j],result);
}
}
}
the above code I made to make matrix multiplication using SSE. the code runs as flows I take 4 elements from row from a multiply it by 4 elements from a column from b and move to the next 4 elements in the row of a and next 4 elements in column b
I get an error Segmentation fault (core dumped)
I don't really know why
I use gcc 5.4.0 on ubuntu 16.04.5
Edit : The segmentation fault was solved by _mm_loadu_ps Also there is something wrong with logic i will be greatfull if someone helps me to find it