I wanted to implement transposition of a matrix by dividing the input matrix into blocks and then transposing them. I referred to the corresponding post A Cache Efficient Matrix Transpose Program? and wrote my code like this:
#include<iostream>
#include<stdlib.h>
#define m 4
#include<sys/time.h>
#include<time.h>
#include<malloc.h>
using namespace std;
int **a, **b, **c;
int count = 0;
clock_t t1, t2;
int blocksize = 2;
int main(){
a = (int **)malloc(m*sizeof(int *));
for(int i = 0;i<m;i++){
a[i] = (int *)malloc(m*sizeof(int));
}
b = (int **)malloc(m*sizeof(int *));
for(int i = 0;i<m;i++){
b[i] = (int *)malloc(m*sizeof(int));
}
for(int i=0; i<m; i++){
for(int j =0; j<m; j++){
a[i][j]=(2*i)+(3*j);
}
}
for(int i=0; i<m; i++){
for(int j =0; j<m; j++){
cout << a[i][j] << "\t";
}
cout << "\n";
}
cout << "\n";
t1 = clock();
// MAIN BLOCK TRANSPOSE CODE
for (int i = 0; i < m; i += blocksize) {
for (int j = 0; j < m; j += blocksize) {
for (int k = i; k < i + blocksize; ++k) {
for (int l = j; l < j + blocksize; ++l) {
b[k + l*m] = a[l + k*m];
}
}
}
}
t2 = clock();
for(int i=0; i<m; i++){
for(int j =0; j<m; j++){
cout << b[i][j] << "\t";
}
cout << "\n";
}
free(a);
free(b);
cout << "\n";
cout << (double)(t2-t1)/CLOCKS_PER_SEC << "\n";
return 0;
}
However, the code is not working as expected. I implemented the code that is said to be working in the corresponding post. Please help if possible.
Input Array:
0 3 6 9
2 5 8 11
4 7 10 13
6 9 12 15
Expected Output Array:
0 2 4 6
3 5 7 9
6 8 10 12
9 11 13 15
Obtained Result:
0 3 6 9
Segmentation fault