I have been trying to implement a distributed matrix transpose program. The main idea is to give each processor a p x q template, split the matrix among the processors according to that template, and then use a block-cyclic distribution once the blocks have been assigned to the processors. What I am trying to do is described in the short paper at the link below, ...
I have already checked this answer from your site:
sending blocks of 2D array in C using MPI
It seems fine, since it builds the blocks with the MPI type-creation routines. In my code I went with MPI_Cart_create and a Cartesian topology instead, but I got stuck halfway because I don't understand what they did in the paper: how did they distribute the blocks among the different processors, i.e. how do I program the 2D block-cyclic distribution over them?
So, my questions, if you can help me:
How exactly do I code a 2D block-cyclic distribution (say a 12x12 matrix, with each processor having a 3x4 template)? My current understanding of the index mapping is sketched right after these questions.
Can you look at the link above and explain how they distribute the blocks among the processors? And finally, should I keep going with the Cartesian topology? I will take any help I can get; I am desperate.
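To make the first question concrete, here is how I currently understand the ScaLAPACK-style 2D block-cyclic index mapping, as pure index arithmetic with no MPI yet. I am assuming a 2x2 process grid (since I have 4 processors) and 3x4 blocks; the constants and the little test program are only my own illustration, so please correct me if the mapping itself is wrong:

/* My understanding of the 2D block-cyclic mapping: for a global index g,
 * block size nb and P processes in that dimension,
 *   owner = (g / nb) % P
 *   local = ((g / nb) / P) * nb + g % nb
 */
#include <stdio.h>

#define N      12   /* global matrix is N x N           */
#define NB_R    3   /* block height (template rows)     */
#define NB_C    4   /* block width (template columns)   */
#define P_ROWS  2   /* assumed process-grid rows        */
#define P_COLS  2   /* assumed process-grid columns     */

int main(void)
{
    int i, j;
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            int prow = (i / NB_R) % P_ROWS;                      /* owning process row    */
            int pcol = (j / NB_C) % P_COLS;                      /* owning process column */
            int li   = ((i / NB_R) / P_ROWS) * NB_R + i % NB_R;  /* local row index       */
            int lj   = ((j / NB_C) / P_COLS) * NB_C + j % NB_C;  /* local column index    */
            printf("A[%2d][%2d] -> process (%d,%d), local [%d][%d]\n",
                   i, j, prow, pcol, li, lj);
        }
    }
    return 0;
}

If that is right, each process ends up owning several non-adjacent blocks of the global matrix, and the part I cannot figure out is how to express that layout with the Cartesian topology and MPI datatypes.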
Below is (part of) my code; I don't know what the next step should be. A sketch of what I think the next step might look like comes at the very end, after the code.
#include "mpi.h"
#include <stdio.h>
#define NP 4 // number of processors
#define M_ROW 4 //template of processor row
#define M_COL 3 //template of processor col
int main(int argc, char *argv[])
{
int myid, numprocs;
MPI_Comm comm;
int dim[2], period[2], reorder;
int coord[2];
int A[8][6], array_P[M_ROW][M_COL]; //, AT[8][6];
int n =0, Temp;
int TT[8][6];
int iv, jv, rankid; // for coordinates of each processor in the Cartesian matrix
int k, y, i,j;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
// First: building the matrix
for (i = 0; i < 8; i++)
for (j = 0; j < 6; j++)
{
A[i][j] = n;
n++;
}
//Second to the virtual matrix with each processor having cartesian Coord.
dim[0]= 2; // dimension
dim[1]= 2; // dimensions assign for Cartesian
period[0]=1; period[1]=1; //row periodic + col periodic (each column/row forms a ring)
reorder=1; // here is false in meaning to allow the reordering of the processors
MPI_Cart_create(MPI_COMM_WORLD, 2, dim, period, reorder, &comm);
MPI_Comm_rank(comm, &rankid); // creating rank for each processor location
MPI_Cart_coords(comm, rankid, 2, coord); // creating coordinates for each Prc.
MPI_Barrier(MPI_COMM_WORLD);
iv = coord[0];
jv = coord[1];
//printf("Processor Rank %d receive dimensions (iv,jv)-> iv: %d ,jv: %d \n", myid, coord[0], coord[1]);
for (k=0; k<M_ROW; k++)
{
for (y=0; y<M_COL; y++)
{
i = k + iv*M_ROW;
j = y + jv*M_COL;
//array_P[k][y] = i*10 + j;
array_P[k][y] = A[i][j];
}
}//end loop of filling data
//("Processor %d: Before Transpose:\n", myid);
if(myid == 3)
{
for (k=0; k<M_ROW; k++) // 3 ?? NBLK_R;
{
j = k + iv*M_ROW;
for (y=0; y<M_COL; y++) // 2 ?
{
i = y + jv*M_COL;
printf(" %d ", A[j][i]);
}
printf("\n");
}
}
printf("\n");
//MPI_Alltoall(TT, M_ROW*M_COL, MPI_INT, TT, M_ROW*M_COL, MPI_INT, MPI_COMM_WORLD);
/*
if(myid == 2)
{
for (k=0; k<M_ROW; k++) // 3 ?? NBLK_R;
{
// = k + iv*M_ROW;
for (y=0; y<M_COL; y++) // 2 ?
{
//i = y + jv*M_COL;
//Final[j][i] = array_PT[x][y];// check the arraypt ?
printf(" %d ", array_P[k][y]);
}
printf("\n");
}
} */
//Fourth - transposing the original matrix
for (k=0; k<M_ROW; k++)
{
for (y=0; y<M_COL; y++)
{
i = k + iv*M_ROW;
j = y + jv*M_COL;
Temp = A[i][j];
A[i][j] = A[j][i];
A[j][i] = Temp;
}
}
printf("\n \n");
if(myid == 3)
{
for (k=0; k<M_ROW; k++) // 3 ?? NBLK_R;
{
j = k + iv*M_ROW;
for (y=0; y<M_COL; y++) // 2 ?
{
i = y + jv*M_COL;
printf(" %d ", A[j][i]);
}
printf("\n");
}
}
printf("\n");
//MPI_Barrier(comm);
// send to main process - process 0 in our case - all the array_PT transposed
// ml*nl -> 2*3
//MPI_Send(array_PT,M_COL*M_ROW , MPI_INT, 0, 1, comm);
//MPI_Isend(array_PT,M_COL*M_ROW , MPI_INT, 0, 1, comm, &request);
//MPI_Barrier(MPI_COMM_WORLD);
//int iv_tt , jv_tt;
//******************************
MPI_Finalize();
return 0;
}
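For completeness, this is roughly what I took away from the linked answer as the likely next step: describe one 4x3 block of the 8x6 matrix with a derived datatype and hand one block to each rank with MPI_Scatterv. The constants, the rank-to-block mapping (rank r takes block (r/2, r%2)) and the resized-extent trick are my own guesses for my example, not something taken from the paper, and I have not tested this:

/* Sketch only: scatter one 4x3 block of an 8x6 matrix to each of 4 ranks.
 * A vector type describes a block inside the row-major global matrix, and
 * resizing its extent to one block row (BC ints) lets the displacements be
 * given in block-column units.
 */
#include "mpi.h"
#include <stdio.h>

#define GR 8   /* global rows    */
#define GC 6   /* global columns */
#define BR 4   /* block rows     */
#define BC 3   /* block columns  */

int main(int argc, char *argv[])
{
    int rank, nprocs, i, j;
    int A[GR][GC];       /* full matrix, only meaningful on rank 0 */
    int local[BR][BC];   /* this rank's block                      */
    MPI_Datatype blk, blkresized;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    if (nprocs != 4) {   /* this sketch assumes exactly 4 processes */
        if (rank == 0) printf("run with 4 processes\n");
        MPI_Finalize();
        return 1;
    }

    if (rank == 0)
        for (i = 0; i < GR; i++)
            for (j = 0; j < GC; j++)
                A[i][j] = i*GC + j;

    /* a BR x BC block strided inside the GR x GC row-major matrix */
    MPI_Type_vector(BR, BC, GC, MPI_INT, &blk);
    /* shrink the extent to BC ints so displacements count block columns */
    MPI_Type_create_resized(blk, 0, BC*sizeof(int), &blkresized);
    MPI_Type_commit(&blkresized);

    /* rank r takes block (r/2, r%2); block (bi,bj) starts bi*BR*GC + bj*BC
       ints into A, which is bi*BR*(GC/BC) + bj extents of BC ints */
    int counts[4], displs[4];
    for (i = 0; i < 4; i++) {
        int bi = i / 2, bj = i % 2;
        counts[i] = 1;
        displs[i] = bi*BR*(GC/BC) + bj;
    }

    MPI_Scatterv(A, counts, displs, blkresized,
                 local, BR*BC, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 3) {   /* quick check of one block */
        for (i = 0; i < BR; i++) {
            for (j = 0; j < BC; j++)
                printf(" %2d", local[i][j]);
            printf("\n");
        }
    }

    MPI_Type_free(&blkresized);
    MPI_Type_free(&blk);
    MPI_Finalize();
    return 0;
}

If this is the right direction, I assume the transpose itself would then be a local transpose of each block plus a communication step that exchanges block (i,j) with block (j,i), but that exchange is exactly the part of the paper I do not understand.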