0

I am reading "Using MPI" and try to execute the code myself. There is a grid decomposition code in Chapter 6.3. It compiles with no warnings or errors, and runs with small number processes, but fails with larger numbers, say 30, on my laptop. My laptop is 4 core, hyperthreaded, and 8G RAM. Both versions of la_grid_2d_new do not work, but the first one tolerate a little larger number, say 35, but fails for 40 processes. I am not sure why. Could you help me please? Thanks a lot.

#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>

typedef struct
{
    int P, Q;
    int p, q;
    MPI_Comm grid_comm;
    MPI_Comm row_comm;
    MPI_Comm col_comm;
} LA_Grid_2d;

LA_Grid_2d *la_grid_2d_new(MPI_Comm comm, int P, int Q)
{
    LA_Grid_2d *grid;
    MPI_Comm row, col;
    int my_rank, p, q;
    MPI_Comm_rank(comm, &my_rank);
    p=my_rank/Q;
    q=my_rank%Q;
    MPI_Comm_split(comm, p, q, &row);
    MPI_Comm_split(comm, q, p, &col);
    grid=(LA_Grid_2d *)malloc(sizeof(LA_Grid_2d));
    grid->grid_comm=comm;
    grid->row_comm=row;
    grid->col_comm=col;
    grid->P=P;
    grid->Q=Q;
    grid->p=p;
    grid->q=q;
    return grid;
}

LA_Grid_2d *la_grid_2d_new_II(MPI_Comm comm, int P, int Q)
{
    LA_Grid_2d *grid;
    MPI_Comm comm_2d, row, col;
    int my_rank, p, q;
    int dims[2]={P,Q}, local[2], period[2]={0,0}, remain_dims[2];

    MPI_Cart_create(comm, 2, dims, period, 1, &comm_2d);
    MPI_Comm_rank(comm, &my_rank);
    MPI_Cart_coords(comm_2d, my_rank, 2, local);
    p=local[0];
    q=local[1];
    remain_dims[0]=0;
    remain_dims[1]=1;
    MPI_Cart_sub(comm_2d, remain_dims, &row);
    remain_dims[0]=1;
    remain_dims[1]=0;
    MPI_Cart_sub(comm_2d, remain_dims, &col);
    grid=(LA_Grid_2d *)malloc(sizeof(LA_Grid_2d));
    grid->grid_comm=comm;
    grid->row_comm=row;
    grid->col_comm=col;
    grid->P=P;
    grid->Q=Q;
    grid->p=p;
    grid->q=q;
    return grid;
}

void la_grid_2d_delete(LA_Grid_2d *grid)
{
    free(grid);
}
int main(int argc, char **argv)
{
    LA_Grid_2d *pgrid;
    int size, rank, dims[2]={0,0}, row, col;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if(rank==0)
        printf("size=%d rank=%d\n", size, rank);
    MPI_Dims_create(size, 2, dims);
//  pgrid=la_grid_2d_new(MPI_COMM_WORLD, dims[0], dims[1]);
    pgrid=la_grid_2d_new_II(MPI_COMM_WORLD, dims[0], dims[1]);
    if(rank==0)
        printf("dims[0]=%d dims[1]=%d\n", dims[0], dims[1]);
    MPI_Reduce(&rank, &row, 1, MPI_INT, MPI_SUM, 0, pgrid->row_comm);
    MPI_Reduce(&rank, &col, 1, MPI_INT, MPI_SUM, 0, pgrid->col_comm);
    la_grid_2d_delete(pgrid);
    MPI_Finalize();
    if(rank==0)
        printf("row=%d col=%d\n", row, col);
    return 0;
}

The error messages are:

shuang@phoebe:~/usingMPI$ mpiexec -n 20 ./grid
size=20 rank=0
dims[0]=5 dims[1]=4
row=6 col=40

shuang@phoebe:~/usingMPI$ mpiexec -n 30 ./grid
size=30 rank=0
dims[0]=6 dims[1]=5
[phoebe:14939] *** Process received signal ***
[phoebe:14939] Signal: Floating point exception (8)
[phoebe:14939] Signal code: Integer divide-by-zero (1)
[phoebe:14939] Failing at address: 0x7fb1e599e6f7
[phoebe:14939] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0xfcb0) [0x7fb1e5714cb0]
[phoebe:14939] [ 1] /usr/lib/libmpi.so.0(mca_topo_base_cart_coords+0x57) [0x7fb1e599e6f7]
[phoebe:14939] [ 2] /usr/lib/libmpi.so.0(mca_topo_base_cart_sub+0x166) [0x7fb1e599ec36]
[phoebe:14939] [ 3] /usr/lib/libmpi.so.0(PMPI_Cart_sub+0xba) [0x7fb1e596f34a]
[phoebe:14939] [ 4] ./grid(la_grid_2d_new_II+0xd6) [0x400df6]
[phoebe:14939] [ 5] ./grid(main+0x98) [0x400f07]
[phoebe:14939] [ 6] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xed) [0x7fb1e536776d]
[phoebe:14939] [ 7] ./grid() [0x400b99]
[phoebe:14939] *** End of error message ***
--------------------------------------------------------------------------
mpiexec noticed that process rank 22 with PID 14939 on node phoebe exited on signal 8 (Floating point exception).
--------------------------------------------------------------------------
Sean
  • 2,649
  • 3
  • 21
  • 27
  • 1
    What do you mean by "it fails"? – suszterpatt Aug 09 '14 at 10:21
  • what version of mpi are you using? And what is the error? How do you execute the program and the mpi? – askmish Aug 09 '14 at 10:46
  • 1
    Have you decomposed your grid across too many processes ? Perhaps ended up with a `0` from an integer division where you expect a positive integer, and get a positive integer on fewer processes. – High Performance Mark Aug 09 '14 at 11:12
  • @suszterpatt I modified the question. The error was at the bottom of the code. – Sean Aug 09 '14 at 18:12
  • @askmish The version is openmpi 1.4.3 and you can see the error messages now. – Sean Aug 09 '14 at 18:15
  • @HighPerformanceMark I am not sure why the system "ended up with a 0 from an integer division where you expect a positive integer"? – Sean Aug 09 '14 at 18:18
  • @Sean, if you're not sure what Mark is talking about, then you should probably go find out. It's a very important thing to know if you're doing any kind of programming and shows that you might not be ready for more advanced topics yet. – Wesley Bland Aug 11 '14 at 14:26
  • @WesleyBland I double checked the parameters for MPI_Cart_sub and they are all right. Do you have suggesting materials to read on what Mark talks about? – Sean Aug 13 '14 at 04:55
  • Google will turn up hundreds of articles about integer division. – Wesley Bland Aug 13 '14 at 05:52
  • @WesleyBland OK. Thanks, but the only variable involving in division is Q, and it is never zero since I printed all the variables out for checking. I am not sure why zero could become the denominator and causes an exception. – Sean Aug 13 '14 at 06:22
  • @Sean Have you considered trying another build of OpenMPI? I was able to run your code with 250 processes using a RedHat build of OpenMPI 1.5.4. I'm on a RHEL6 box with an i7 with HT disabled and 8GB RAM. When I tried 260 or higher I would start getting messages about exceeding system limitations on pipes or children of a process. – chuck Aug 14 '14 at 19:22
  • @chuck Thank you for testing my code. Unfortunately, all I have now are Ubuntu machines, and OpenMPI 1.4.3 seems to be the one in the app store. I tried on a 4 core non-hyperthreading machine and it works for 40 processes and does not work for more. It is probably not the problem of the code itself, but the setup, I guess. – Sean Aug 18 '14 at 17:43

1 Answers1

2

@Sean If you want to try another OpenMPI you can normally download it and compile with something like

./configure --prefix=/opt/ompi-[version]
make
sudo make install

Since this will install to a non-standard location(for easy removal later) you will need to set LD_LIBRARY_PATH=/opt/ompi-[version]/lib and specify the full path to mpicc and mpirun to ensure you call the right version. Somewhere in the build process it will remind you about setting LD_LIBRARY_PATH.

chuck
  • 735
  • 3
  • 4
  • Thanks for your answer. I downloaded and installed openmpi 1.8.1 and it works for 250 processes and fails for 260 processes, just as you said. Thanks again! – Sean Aug 25 '14 at 07:53