I'm trying to use avx-512 to do matrix transpose. But the matrix still can’t be transposed. When I try to tranpose a 16*16 matrix, the code will run i=0 and j=0~15. I think the problem is related to matrix element address (such as &temp_re[j * rowA + i], &matA_re[i * colA + j]). I hope someone can tell me how to modify the matrix element address, so the function can transpose any size of matrix.
Code:
#define AVX 16
//--------------------
#include <immintrin.h>
// C
#include <complex.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <math.h>
void matrix_transpose_avx2(float *matA_re, float *matA_im, int rowA, int colA)
{
float *temp_re = (float *)malloc(rowA * colA * sizeof(float));
float *temp_im = (float *)malloc(rowA * colA * sizeof(float));
memcpy(temp_re, matA_re, (rowA * colA * sizeof(float)));
memcpy(temp_im, matA_im, (rowA * colA * sizeof(float)));
int remaining = rowA % 16; // Elements that can't be processed in AVX512 vectors
int row_limit = rowA - remaining;
__m512 re_vec, im_vec;
for (int i = 0; i < (row_limit/AVX) ; i++)
{
for (int j = 0; j < colA; ++j)
{
re_vec = _mm512_loadu_ps(&temp_re[j * rowA + i * AVX]);
im_vec = _mm512_loadu_ps(&temp_im[j * rowA + i * AVX]);
_mm512_storeu_ps(&matA_re[i * AVX * colA + j], re_vec);
_mm512_storeu_ps(&matA_im[i * AVX * colA + j], im_vec);
for (int k = 0; k < AVX; k++)
{
printf("matA_re[%d] = %.2f, ", i + j * AVX + k, matA_re[i + j * AVX + k]);
printf("matA_im[%d] = %.2f\n", i + j * AVX + k, matA_im[i + j * AVX + k]);
}
}
}
// Process remaining elements
for (int i = row_limit; i < rowA; ++i)
{
for (int j = 0; j < colA; ++j)
{
matA_re[i * colA + j] = temp_re[j * rowA + i];
matA_im[i * colA + j] = temp_im[j * rowA + i];
}
}
free(temp_re);
free(temp_im);
}
I really need Proficient coder's help. Thanks in advance.