I'm trying to use avx-512 to do matrix transpose. But the matrix can't perfectly transposed, it seems to have memory address problems.
I think the problem is related to the memory address part, such as the [j * rowA+i ] in mt code. Hope someone can tell me how to modify the memory address (for example:j * rowA+i).
Code:
#include <immintrin.h>
#include <complex.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
void print_complex_matrix(float *matA_re, float *matA_im, int rowA, int colA)
{
for (int i = 0; i < rowA; i++)
{
for (int j = 0; j < colA; j++)
{
printf("\t%.2f ", matA_re[colA * i + j]);
printf("+ %.2fi", matA_im[colA * i + j]);
}
printf("\n");
}
}
void matrix_transpose_avx(float *matA_re, float *matA_im, int rowA, int colA)
{
float *temp_re = (float *)malloc(rowA * colA * sizeof(float));
float *temp_im = (float *)malloc(rowA * colA * sizeof(float));
memcpy(temp_re, matA_re, (rowA * colA * sizeof(float)));
memcpy(temp_im, matA_im, (rowA * colA * sizeof(float)));
__m512 re_vec, im_vec;
for (int i = 0; i < ((rowA-1)/AVX)+1; i++)
{
for (int j = 0; j < colA ; j++)
{
re_vec = _mm512_loadu_ps(&temp_re[j * rowA + i ]);
im_vec = _mm512_loadu_ps(&temp_im[j * rowA + i ]);
_mm512_storeu_ps(&matA_re[i * colA + j ], re_vec);
_mm512_storeu_ps(&matA_im[i * colA + j ], im_vec);
}
}
free(temp_re);
free(temp_im);
}
int main()
{
int rowA = 4;
int colA = rowA;
float A_re[] = {1, 2, 4, 0, 0, 5, 0, 3, 6, 2, 4, 0, 0, 5, 0, 3};
float A_im[] = {1, 2, 4, 0, 0, 5, 0, 3, 6, 2, 4, 0, 0, 5, 0, 3};
printf(L_BLUE "\n------------mat A--------------------------\n" CLOSE);
print_complex_matrix(&A_re[0], &A_im[0], rowA, colA);
matrix_transpose_avx(A_re, A_im, rowA, colA);
printf(L_BLUE "\n------------mat A(new)--------------------------\n" CLOSE);
print_complex_matrix(&A_re[0], &A_im[0], rowA, colA);
}
Output
------------mat A--------------------------
1.00 + 1.00i 2.00 + 2.00i 4.00 + 4.00i 0.00 + 0.00i
0.00 + 0.00i 5.00 + 5.00i 0.00 + 0.00i 3.00 + 3.00i
6.00 + 6.00i 2.00 + 2.00i 4.00 + 4.00i 0.00 + 0.00i
0.00 + 0.00i 5.00 + 5.00i 0.00 + 0.00i 3.00 + 3.00i
------------mat A(new)--------------------------
1.00 + 5.00i 0.00 + 0.00i 6.00 + 3.00i 0.00 + 0.00i
5.00 + 5.00i 0.00 + 0.00i 3.00 + 3.00i 0.00 + 0.00i
0.00 + 0.00i 0.00 + 0.00i 0.00 + 0.00i 1.00 + 0.00i
2.00 + 0.00i 4.00 + 0.00i 0.00 + 0.00i 0.00 + 0.00i
*** stack smashing detected ***: <unknown> terminated
Aborted (core dumped)
I really need Proficient coder's help. Thanks in advance.