I am reading "Using MPI" and try to execute the code myself. There is a grid decomposition code in Chapter 6.3. It compiles with no warnings or errors, and runs with small number processes, but fails with larger numbers, say 30, on my laptop. My laptop is 4 core, hyperthreaded, and 8G RAM. Both versions of la_grid_2d_new
do not work, but the first one tolerate a little larger number, say 35, but fails for 40 processes. I am not sure why. Could you help me please? Thanks a lot.
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>
typedef struct
{
int P, Q;
int p, q;
MPI_Comm grid_comm;
MPI_Comm row_comm;
MPI_Comm col_comm;
} LA_Grid_2d;
LA_Grid_2d *la_grid_2d_new(MPI_Comm comm, int P, int Q)
{
LA_Grid_2d *grid;
MPI_Comm row, col;
int my_rank, p, q;
MPI_Comm_rank(comm, &my_rank);
p=my_rank/Q;
q=my_rank%Q;
MPI_Comm_split(comm, p, q, &row);
MPI_Comm_split(comm, q, p, &col);
grid=(LA_Grid_2d *)malloc(sizeof(LA_Grid_2d));
grid->grid_comm=comm;
grid->row_comm=row;
grid->col_comm=col;
grid->P=P;
grid->Q=Q;
grid->p=p;
grid->q=q;
return grid;
}
LA_Grid_2d *la_grid_2d_new_II(MPI_Comm comm, int P, int Q)
{
LA_Grid_2d *grid;
MPI_Comm comm_2d, row, col;
int my_rank, p, q;
int dims[2]={P,Q}, local[2], period[2]={0,0}, remain_dims[2];
MPI_Cart_create(comm, 2, dims, period, 1, &comm_2d);
MPI_Comm_rank(comm, &my_rank);
MPI_Cart_coords(comm_2d, my_rank, 2, local);
p=local[0];
q=local[1];
remain_dims[0]=0;
remain_dims[1]=1;
MPI_Cart_sub(comm_2d, remain_dims, &row);
remain_dims[0]=1;
remain_dims[1]=0;
MPI_Cart_sub(comm_2d, remain_dims, &col);
grid=(LA_Grid_2d *)malloc(sizeof(LA_Grid_2d));
grid->grid_comm=comm;
grid->row_comm=row;
grid->col_comm=col;
grid->P=P;
grid->Q=Q;
grid->p=p;
grid->q=q;
return grid;
}
void la_grid_2d_delete(LA_Grid_2d *grid)
{
free(grid);
}
int main(int argc, char **argv)
{
LA_Grid_2d *pgrid;
int size, rank, dims[2]={0,0}, row, col;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if(rank==0)
printf("size=%d rank=%d\n", size, rank);
MPI_Dims_create(size, 2, dims);
// pgrid=la_grid_2d_new(MPI_COMM_WORLD, dims[0], dims[1]);
pgrid=la_grid_2d_new_II(MPI_COMM_WORLD, dims[0], dims[1]);
if(rank==0)
printf("dims[0]=%d dims[1]=%d\n", dims[0], dims[1]);
MPI_Reduce(&rank, &row, 1, MPI_INT, MPI_SUM, 0, pgrid->row_comm);
MPI_Reduce(&rank, &col, 1, MPI_INT, MPI_SUM, 0, pgrid->col_comm);
la_grid_2d_delete(pgrid);
MPI_Finalize();
if(rank==0)
printf("row=%d col=%d\n", row, col);
return 0;
}
The error messages are:
shuang@phoebe:~/usingMPI$ mpiexec -n 20 ./grid
size=20 rank=0
dims[0]=5 dims[1]=4
row=6 col=40
shuang@phoebe:~/usingMPI$ mpiexec -n 30 ./grid
size=30 rank=0
dims[0]=6 dims[1]=5
[phoebe:14939] *** Process received signal ***
[phoebe:14939] Signal: Floating point exception (8)
[phoebe:14939] Signal code: Integer divide-by-zero (1)
[phoebe:14939] Failing at address: 0x7fb1e599e6f7
[phoebe:14939] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0xfcb0) [0x7fb1e5714cb0]
[phoebe:14939] [ 1] /usr/lib/libmpi.so.0(mca_topo_base_cart_coords+0x57) [0x7fb1e599e6f7]
[phoebe:14939] [ 2] /usr/lib/libmpi.so.0(mca_topo_base_cart_sub+0x166) [0x7fb1e599ec36]
[phoebe:14939] [ 3] /usr/lib/libmpi.so.0(PMPI_Cart_sub+0xba) [0x7fb1e596f34a]
[phoebe:14939] [ 4] ./grid(la_grid_2d_new_II+0xd6) [0x400df6]
[phoebe:14939] [ 5] ./grid(main+0x98) [0x400f07]
[phoebe:14939] [ 6] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xed) [0x7fb1e536776d]
[phoebe:14939] [ 7] ./grid() [0x400b99]
[phoebe:14939] *** End of error message ***
--------------------------------------------------------------------------
mpiexec noticed that process rank 22 with PID 14939 on node phoebe exited on signal 8 (Floating point exception).
--------------------------------------------------------------------------