I want to speed up matrix multiplication in C. I have tried several techniques to optimize it, such as OpenMP, SIMD, and cache-friendly access patterns, and the speedup now reaches 65x.
The reason the speedup is still relatively low is that I call _mm256_storeu_pd() in the innermost loop, and writing to memory is expensive.
Can anyone give me some ideas on how to avoid the expensive memory writes in the innermost loop so that I can optimize the code further?
/**
 * Dense matrix of doubles.
 *
 * mul_matrix() addresses element (r, c) as data[r * cols + c], i.e. rows are
 * stored one after another.
 */
typedef struct matrix {
    int rows;     /* number of rows */
    int cols;     /* number of columns */
    double *data; /* element storage, indexed as data[r * cols + c] */
} matrix;

/**
 * Accumulate a matrix product into result:  result += mat1 * mat2.
 *
 * The existing contents of result are NOT cleared; every product term is
 * added on top of them, so the caller must zero result->data beforehand to
 * obtain the plain product.
 *
 * The dimensions are taken from mat1->rows (I), mat2->rows (K) and
 * mat2->cols (J). mat1->cols and result->rows/cols are not read, so the
 * caller must guarantee (unchecked here) that
 *   - mat1->data   holds at least I * K doubles,
 *   - mat2->data   holds at least K * J doubles,
 *   - result->data holds at least I * J doubles.
 * If any of I, J, K is <= 0 the function does nothing.
 *
 * When the compiler targets AVX (__AVX__ defined, <immintrin.h> included)
 * four columns are processed per step with unaligned 256-bit loads/stores;
 * the remaining columns -- or all columns on non-AVX builds -- go through the
 * scalar loop. Rows of result are distributed over threads when OpenMP is
 * enabled; each row is written by exactly one loop iteration.
 *
 * @param result  destination matrix, updated in place
 * @param mat1    left operand  (I x K)
 * @param mat2    right operand (K x J)
 */
void mul_matrix(matrix *result, matrix *mat1, matrix *mat2)
{
    const int I = mat1->rows;
    const int J = mat2->cols;
    const int K = mat2->rows;

    if (I <= 0 || J <= 0 || K <= 0) {
        return;
    }

#if defined(__AVX__)
    /* Largest multiple of 4 that is <= J: end of the vectorised column range. */
    const int vec_end = J - J % 4;
#endif

    #pragma omp parallel for
    for (int i = 0; i < I; i++) {
        /* Row offsets are widened before multiplying so that matrices with
         * more than INT_MAX elements do not overflow int arithmetic. */
        double *out_row = result->data + (long long)i * J;
        const double *a_row = mat1->data + (long long)i * K;

        for (int k = 0; k < K; k++) {
            const double a = a_row[k];
            const double *b_row = mat2->data + (long long)k * J;
            int j = 0;

#if defined(__AVX__)
            /* Broadcast mat1[i][k] once and reuse it for the whole row. */
            const __m256d va = _mm256_set1_pd(a);
            for (; j < vec_end; j += 4) {
                __m256d acc = _mm256_loadu_pd(out_row + j);
                const __m256d vb = _mm256_loadu_pd(b_row + j);
                acc = _mm256_add_pd(acc, _mm256_mul_pd(va, vb));
                _mm256_storeu_pd(out_row + j, acc);
            }
#endif
            /* Scalar tail (and complete fallback when AVX is unavailable). */
            for (; j < J; j++) {
                out_row[j] += a * b_row[j];
            }
        }
    }
}