MPI_Gatherv hangs when recomposing a matrix

Question

I have to decompose and recompose a matrix in MPI (I'm using MPICH), and I'm using Scatterv and Gatherv as in the example from this question. Everything works well for small matrices, but when the matrix size increases (starting from 800x800), the program hangs when it reaches MPI_Gatherv. By printing debug messages, I can see that every process passes the call to Gatherv, except the one with rank 0 (the root process in the Gatherv call). Any suggestion? Here's the code:

#include <iostream>
#include <cstring>
#include <fstream>
#include <cstdlib>
#include "mpi.h"
using namespace std;

#define TOP_ROW_TAG 1
#define BOTTOM_ROW_TAG 2
#define LEFT_COL_TAG 3
#define RIGHT_COL_TAG 4

int main(int argc, char ** argv) {
    int me, nproc, width, height, wloc, hloc;
    double k, d,c, wdouble, hdouble, discr, delta_t, t;
    char* initial, end;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &me); 
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm cart_top;

    wdouble = atof(argv[1]);
    hdouble = atof(argv[2]);
    discr = atof(argv[3]);
    k = atof(argv[4]);
    d = atof(argv[5]);
    c = atof(argv[6]);
    delta_t = atof(argv[7]);
    t = atof(argv[8]);
    initial = argv[9];
    end = argv[10];
    double p = k/(d*c);
    double dsc = delta_t/(discr*discr);

    width = wdouble / discr;
    height = hdouble / discr; 
    const int NPROWS=4;  /* number of rows in _decomposition_ */
    const int NPCOLS=4;  /* number of cols in _decomposition_ */
    const int BLOCKROWS = width/NPROWS;  /* number of rows in _block_ */
    const int BLOCKCOLS = height/NPCOLS;

    const int dims[2] = {NPROWS, NPCOLS};
    const int periods[2] = {0,0};
    int* mycoords = new int[2];

    int locsz = (width*height)/nproc;

    double* T, *Tnew, *local, *locnew;
    local = new double[BLOCKROWS*BLOCKCOLS];
    locnew = new double[BLOCKROWS*BLOCKCOLS];
    T = new double[width * height];
    Tnew = new double[width * height];

    ifstream infile;
    infile.open(initial);
    if(me==0) {
        cout<<"BLOCKROWS: "<<BLOCKROWS;
        cout<<"BLOCKCOLS: "<<BLOCKCOLS<<endl;
        cout<<"width: "<<width;
        cout<<"height: "<<height<<endl;
        int idx, jdx, temp;
        for (int i=0; i<width*height; i++) {
            string currline;
            getline(infile, currline);
            idx = atoi(strtok(currline.c_str(), " "));
            jdx = atoi(strtok(NULL, " "));
            temp = atof(strtok(NULL, " "));
            T[idx*height+jdx] = temp;
            infile.close();
        }

        MPI_Datatype blocktype;
        MPI_Datatype blocktype2;
        MPI_Datatype coltype, coltype2;


        MPI_Type_vector(BLOCKROWS, 1, BLOCKCOLS, MPI_DOUBLE, &coltype);
        MPI_Type_create_resized( coltype, 0, sizeof(double), &coltype2);
        MPI_Type_commit(&coltype2);


        MPI_Type_vector(BLOCKROWS, BLOCKCOLS, height, MPI_DOUBLE, &blocktype2);
        MPI_Type_create_resized( blocktype2, 0, sizeof(double), &blocktype);
        MPI_Type_commit(&blocktype);

        int disps[NPROWS*NPCOLS];
        int counts[NPROWS*NPCOLS];
        for (int ii=0; ii<NPROWS; ii++) {
            for (int jj=0; jj<NPCOLS; jj++) {
                disps[ii*NPCOLS+jj] = ii*height*BLOCKROWS+jj*BLOCKCOLS;
                counts [ii*NPCOLS+jj] = 1;
            }
        }

        int myrank, lb_i, lb_j, ub_i, ub_j;
        lb_i=0;
        lb_j=0;
        ub_i=BLOCKROWS;
        ub_j=BLOCKCOLS;
        /*
           0= left neighbor;
           1= right neighbor;
           2=top neighbor;
           3=bottom neighbor;
         */
        int neighs[4] = {};
        double* leftcol, *rightcol, *myleftcol, *myrightcol, *toprow, *bottomrow;
        leftcol = new double[BLOCKROWS];
        rightcol= new double[BLOCKROWS];
        myleftcol = new double[BLOCKROWS];
        myrightcol= new double[BLOCKROWS];
        toprow = new double[BLOCKCOLS];
        bottomrow = new double[BLOCKCOLS];

        //Create topology and get neighbor's rank
        MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &cart_top);
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Comm_rank(cart_top, &myrank);

        MPI_Cart_shift(cart_top, 0, -1, &myrank, &neighs[0]);
        MPI_Cart_shift(cart_top, 0, 1, &myrank, &neighs[1]);
        MPI_Cart_shift(cart_top, 1, 1, &myrank, &neighs[2]);
        MPI_Cart_shift(cart_top, 1, -1, &myrank, &neighs[3]);
        MPI_Scatterv(T, counts, disps, blocktype, local, BLOCKROWS*BLOCKCOLS, 
                MPI_DOUBLE, 0, cart_top);
        double curr_t=0;
        for(double curr_t = 0; curr_t < t; curr_t+=delta_t) {
            MPI_Barrier(cart_top);

            //Send border columns to neighbors
            if(neighs[2] != MPI_PROC_NULL) {
                MPI_Send(&local[BLOCKCOLS-1], 1, coltype2, neighs[2], LEFT_COL_TAG+(int)(curr_t*1000), cart_top);
            }
            if(neighs[3] != MPI_PROC_NULL) {
                MPI_Send(local, 1, coltype2, neighs[3], RIGHT_COL_TAG+(int)(curr_t*1000), cart_top);
            }
            if(neighs[0] != MPI_PROC_NULL) {
                MPI_Send(local, BLOCKCOLS, MPI_DOUBLE, neighs[0], TOP_ROW_TAG+(int)(curr_t*1000), cart_top);
            }
            if(neighs[1] != MPI_PROC_NULL) {
                MPI_Send(&local[(BLOCKROWS-1)*BLOCKCOLS], BLOCKCOLS, MPI_DOUBLE, neighs[1], BOTTOM_ROW_TAG+(int)(curr_t*1000), cart_top);
            }

            if(neighs[3] != MPI_PROC_NULL) {
                MPI_Recv(leftcol, BLOCKROWS, MPI_DOUBLE, neighs[3], LEFT_COL_TAG+(int)(curr_t*1000), cart_top, MPI_STATUS_IGNORE);
            }
            if(neighs[2] != MPI_PROC_NULL) {
                MPI_Recv(rightcol, BLOCKROWS, MPI_DOUBLE, neighs[2], RIGHT_COL_TAG+(int)(curr_t*1000), cart_top, MPI_STATUS_IGNORE);
            }
            if(neighs[1] != MPI_PROC_NULL) {
                MPI_Recv(bottomrow, BLOCKCOLS, MPI_DOUBLE, neighs[1], TOP_ROW_TAG+(int)(curr_t*1000), cart_top, MPI_STATUS_IGNORE);
            }
            if(neighs[0] != MPI_PROC_NULL) {
                MPI_Recv(toprow, BLOCKCOLS, MPI_DOUBLE, neighs[0], BOTTOM_ROW_TAG+(int)(curr_t*1000), cart_top, MPI_STATUS_IGNORE);
            }

            MPI_Barrier(cart_top);
            double* aux;
            //cout<<" t in process "<<me<<" is " <<t<<endl;
            int i, j;
            MPI_Comm_rank(cart_top, &myrank);
            MPI_Barrier(cart_top);

            for(i=lb_i; i<ub_i; i++) {
                for(j=lb_j; j<ub_j; j++) {
                    double curr,c1,c2,c3,c4;
                    curr = local[i*BLOCKCOLS+j];
                    c1 = i==0 ? toprow[j] : local[(i-1)*BLOCKCOLS+j];
                    c2 = i==BLOCKROWS-1 ? bottomrow[j] : local[(i+1)*BLOCKCOLS+j];
                    c3 = j==0 ? leftcol[i] : local[i*BLOCKCOLS+(j-1)];
                    c4 = j==BLOCKCOLS-1 ? rightcol[i] : local[i*BLOCKCOLS+(j+1)];

                    locnew[i*BLOCKCOLS+j] = curr*(1-4*dsc*p) + dsc*p*(c1+c2+c3+c4);
                    /*if(i==0) locnew[i*BLOCKCOLS+j] = toprow[j];
                      else if(i==BLOCKROWS-1) locnew[i*BLOCKCOLS+j] = bottomrow[j];
                      if(j==0) locnew[i*BLOCKCOLS+j] = leftcol[i];
                      else if(j==BLOCKCOLS-1) locnew[i*BLOCKCOLS+j] = rightcol[i];
                      if(i!=0 && i!=BLOCKROWS-1 && j!=0 && j!=BLOCKCOLS-1)    locnew[i*BLOCKCOLS+j] = local[i*BLOCKCOLS+j];*/
                    /*if(i==0) locnew[i*BLOCKCOLS+j] = (double)5000;
                      else if(i==BLOCKROWS-1) locnew[i*BLOCKCOLS+j] = (double)5000;
                      if(j==0) locnew[i*BLOCKCOLS+j] = (double)5000;
                      else if(j==BLOCKCOLS-1) locnew[i*BLOCKCOLS+j] = (double)5000;
                      if(i!=0 && i!=BLOCKROWS-1 && j!=0 && j!=BLOCKCOLS-1)    locnew[i*BLOCKCOLS+j] = local[i*BLOCKCOLS+j];*/
                }
            }

            aux = local;
            local = locnew;
            locnew = aux;

            MPI_Barrier(cart_top);

            /*  aux = T;
                T=Tnew;
                Tnew = aux;*/

        }
        MPI_Gatherv(local, BLOCKROWS*BLOCKCOLS, MPI_DOUBLE, Tnew, counts, disps, blocktype, 0,cart_top);
        if(me == 0) {
            ofstream outfile;
            outfile.open(argv[10]);
            for(int i=0; i<width; i++) {
                for(int j=0; j<height; j++) {
                    outfile<< i<<" " <<j<<" "<<Tnew[i*height+j]<<endl;
                }

            }
            outfile.close();
        }
        MPI_Finalize();

    }

`disps` and `counts` should be `int[nproc]`, and populated accordingly. are you also sure `T` is large enough ? at first glance, it looks a bit odd its size does not depend on `nproc` — Gilles Gouaillardet, Jan 14 '18 at 12:20
Sorry, I didn't specify it. Nproc is 16, obtained by NPROWS*NPCOLS — DevOlly, Jan 14 '18 at 13:20
you should have `MPI_Type_vector(BLOCKROWS, BLOCKCOLS, height, MPI_DOUBLE, &blocktype2);` (instead of `&blocktype`). please edit your question with the full program that compiles and run. how do you increment the matrix size ? is the provided code crashing ? — Gilles Gouaillardet, Jan 14 '18 at 13:43
Your code here doesn't compile. It looks like you didn't past the full function. Is it possible to shrink this down to a [MVCE](https://stackoverflow.com/help/mcve)? — Wesley Bland, Jan 17 '18 at 19:41

MPI_Gatherv hangs when recomposing a matrix

0 Answers0