
I can't seem to get this algorithm to work, and I believe it may be due to a race condition, but I could be wrong:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>

#define BILLION     1000000000L

double f(double, double);
double g(double, double);

int main(int argc, char** argv){
    FILE *myA, *myB;    
    int rank, size;
    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (rank == 0){
        myA = fopen("myA.py", "w");
        myB = fopen("myB.py", "w");
    }

    int m = 255;            // Max number of x values
    int n = 255;            // Max number of y values
    int Tmax = 5;//10000;   // Max number of time steps

    double a = 0, b = 2.5;      // starting and ending points along x-axis
    double c = 0, d = 2.5;      // starting and ending points along y-axis

    double dx = (b - a)/m;      // x partition width
    double dy = (d - c)/n;      // y partition width
    double dt = 1.;             // t partition width

    double D_u = 0.00002;   // diffusion coefficient
    double alpha_u = D_u*dt/(dx*dx), gamma_u = D_u*dt/(dy*dy), beta_u = 1 - 2*alpha_u - 2*gamma_u; // coeffs for fwd Euler method

    double D_v = 0.00001;   // diffusion coefficient
    double alpha_v = D_v*dt/(dx*dx), gamma_v = D_v*dt/(dy*dy), beta_v = 1 - 2*alpha_v - 2*gamma_v; // coeffs for fwd Euler method

    // Parameters:
    double F = 0.040;
    double K = 0.063;

    // Domain:
    double u[m+1][n+1];     // soln to the diffusion equation
    double utmp[m+1][n+1];  // temp storage

    double v[m+1][n+1];     // soln to the diffusion equation
    double vtmp[m+1][n+1];  // temp storage

    int i, j, k;

    // initialize time variables
    struct timespec begin, end;
    double time_lapsed;

    // seed rand
    srand(time(NULL));
    double noise;
    double lowest = -1./100.;
    double highest = 1./100.;
    double range = (highest - lowest);

    // divide up the domain evenly among groups
    int Np = floor((double)m/size);  // Number of rows per processor
    //int res = m % size/2;  // in case extra row in subgroup
    //int bigres = n % 2;  // in case extra row overall

    int istart = rank*Np;
    int iend;

    if (rank == 0){
        istart = 1;
        iend = (rank + 1)*Np;
    }

    else if (rank == size-1){
        iend = m;
    }

    else {
        iend = (rank + 1)*Np;
    }

    if (rank == 0){
        fprintf(myA,"from numpy import array\n");
        fprintf(myA,"\ndef myAi():\n");
        fprintf(myA,"\treturn array([ ");
        clock_gettime(CLOCK_MONOTONIC, &begin); // start timing u
    }

    // Initialization for u:
    for (i = 0; i <= m; i += 1){
        if (rank == 0){
            fprintf(myA,"[ ");
        }
        for (j = 0; j <= n; j += 1){
            // create square
            if ((i >= 117 && i <= 137) && (j >= 117 && j <= 137)){
                noise = (lowest + range*rand()/(RAND_MAX + 1.0));
                if (fabs(noise) > 0.01){   // fabs, not abs: noise is a double
                    printf("noise: %f\n",noise);
                }

                utmp[i][j] = 1./2 + noise*(1./2.);//f(a + i*dx,c + j*dy);
                u[i][j] = utmp[i][j];
            }
            else{
                utmp[i][j] = 1.;//f(a + i*dx,c + j*dy);
                u[i][j] = utmp[i][j];
            }
            if (rank == 0){

                // print matrix entries
                if (j != n){
                    fprintf(myA,"%f, ",utmp[i][j]);
                }
                else{
                    fprintf(myA,"%f ",utmp[i][j]);
                }
            }
        }
        if (rank == 0){
            if (i != m){
                fprintf(myA,"],\n");
            }
            else{
                fprintf(myA,"]");
            }
        }
    MPI_Bcast(&u[i][0],(n+1),MPI_DOUBLE,0,MPI_COMM_WORLD);
    MPI_Bcast(&utmp[i][0],(n+1),MPI_DOUBLE,0,MPI_COMM_WORLD);
    }

    if (rank == 0){
        fprintf(myA,"])\n");
        clock_gettime(CLOCK_MONOTONIC, &end);
        time_lapsed = (end.tv_sec - begin.tv_sec) + (double)(end.tv_nsec - begin.tv_nsec)/BILLION;
        printf("\nprint 'Time to initialize u:',%f,'seconds.'\n",time_lapsed);

        clock_gettime(CLOCK_MONOTONIC, &begin); // start timing v

        fprintf(myB,"from numpy import array\n");
        fprintf(myB,"\ndef myBi():\n");
        fprintf(myB,"\treturn array([ ");
    }

    // Initialization for v:
    for (i = 0; i <= m; i += 1){
        if (rank == 0){
            fprintf(myB,"[ ");
        }
        for (j = 0; j <= n; j += 1){
            // create square
            if ((i >= 117 && i <= 137) && (j >= 117 && j <= 137)){
                noise = (lowest + range*rand()/(RAND_MAX + 1.0));
                vtmp[i][j] = 1./4 + noise*(1./4.);//g(a + i*dx,c + j*dy);
                if (fabs(noise) > 0.01){   // fabs, not abs: noise is a double
                    printf("noise: %f\n",noise);
                }

                v[i][j] = vtmp[i][j];
            }
            else{
                vtmp[i][j] = 0.;//g(a + i*dx,c + j*dy);
                v[i][j] = vtmp[i][j];
            }

            if (rank == 0){
                // print matrix entries
                if (j != n){
                    fprintf(myB,"%f, ",vtmp[i][j]);
                }
                else{
                    fprintf(myB,"%f ",vtmp[i][j]);
                }
            }
        }
        if (rank == 0){
            if (i != m){
                fprintf(myB,"],\n");
            }
            else{
                fprintf(myB,"]");
            }
        }
    MPI_Bcast(&v[i][0],(n+1),MPI_DOUBLE,0,MPI_COMM_WORLD);
    MPI_Bcast(&vtmp[i][0],(n+1),MPI_DOUBLE,0,MPI_COMM_WORLD);
    }

    if (rank == 0){
        fprintf(myB,"])\n");
        clock_gettime(CLOCK_MONOTONIC, &end);
        time_lapsed = (end.tv_sec - begin.tv_sec) + (double)(end.tv_nsec - begin.tv_nsec)/BILLION;
        printf("\nprint 'Time to initialize v:',%f,'seconds.'\n",time_lapsed);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    // All together now...

    if (iend > m/2){
        if (rank == size-1){
            for (k = 1; k <= Tmax; k++){
                i = istart;
                for (i = istart; i < iend-1; i++){
                    for (j = 1; j < n-1; j++){

                        // Do usual computation with u_i,j = alpha * (u_i-1,j + u_i+1,j) + 
                        u[i][j] = alpha_u*(utmp[i-1][j] + utmp[i+1][j]) + beta_u*utmp[i][j] + gamma_u*(utmp[i][j-1] + utmp[i][j+1]) - u[i][j]*v[i][j]*v[i][j] + F*(1. - u[i][j]);
                        v[i][j] = alpha_v*(vtmp[i-1][j] + vtmp[i+1][j]) + beta_v*vtmp[i][j] + gamma_v*(vtmp[i][j-1] + vtmp[i][j+1]) + u[i][j]*v[i][j]*v[i][j] - (F+K)*v[i][j];
                    }

                    // left-right Periodic boundary conditions:
                    u[i][n-1] = alpha_u*(utmp[i-1][n-1] + utmp[i+1][n-1]) + beta_u*utmp[i][n-1] + gamma_u*(utmp[i][n-2] + utmp[i][0]) - u[i][n-1]*v[i][n-1]*v[i][n-1] + F*(1. - u[i][n-1]);
                    v[i][n-1] = alpha_v*(vtmp[i-1][n-1] + vtmp[i+1][n-1]) + beta_v*vtmp[i][n-1] + gamma_v*(vtmp[i][n-2] + vtmp[i][0]) + u[i][j]*v[i][n-1]*v[i][n-1] - (F+K)*v[i][n-1];
                }

                // top-bottom Periodic Boundary conditions:
                for (j = 1; j < n-1; j++){
                    u[iend-1][j] = alpha_u*(utmp[iend-2][j] + utmp[0][j]) + beta_u*utmp[iend-1][j] + gamma_u*(utmp[iend-1][j-1] + utmp[iend-1][j+1]) - u[iend-1][j]*v[iend-1][j]*v[iend-1][j] + F*(1. - u[iend-1][j]);
                    v[iend-1][j] = alpha_v*(vtmp[iend-2][j] + vtmp[0][j]) + beta_v*vtmp[iend-1][j] + gamma_v*(vtmp[iend-1][j-1] + vtmp[iend-1][j+1]) + u[iend-1][j]*v[iend-1][j]*v[iend-1][j] - (F+K)*v[iend-1][j];
                }

                // top-bottom & left-right Periodic Boundary Conditions
                u[iend-1][n-1] = alpha_u*(utmp[iend-2][n-1] + utmp[0][n-1]) + beta_u*utmp[iend-1][n-1] + gamma_u*(utmp[iend-1][n-2] + utmp[iend-1][0]) - u[iend-1][n-1]*v[iend-1][n-1]*v[iend-1][n-1] + F*(1. - u[iend-1][n-1]);
                v[iend-1][n-1] = alpha_v*(vtmp[iend-2][n-1] + vtmp[0][n-1]) + beta_v*vtmp[iend-1][n-1] + gamma_v*(vtmp[iend-1][n-2] + vtmp[iend-1][0]) + u[iend-1][n-1]*v[iend-1][n-1]*v[iend-1][n-1] - (F+K)*v[iend-1][n-1];

                i = istart;
                for (i = istart; i <= iend; i++){   //istart; i <= iend; i++){
                    for (j = 0; j <= n; j++){
                        utmp[i][j] = u[i][j];
                        vtmp[i][j] = v[i][j];
                    }
                }
            }
        }
        else{
            for (k = 1; k <= Tmax; k++){
                i = istart;
                for (i = istart; i <= iend-1; i++){
                    for (j = 1; j < n-1; j++){

                        // Do usual computation with u_i,j = alpha * (u_i-1,j + u_i+1,j) + 
                        u[i][j] = alpha_u*(utmp[i-1][j] + utmp[i+1][j]) + beta_u*utmp[i][j] + gamma_u*(utmp[i][j-1] + utmp[i][j+1]) - u[i][j]*v[i][j]*v[i][j] + F*(1. - u[i][j]);
                        v[i][j] = alpha_v*(vtmp[i-1][j] + vtmp[i+1][j]) + beta_v*vtmp[i][j] + gamma_v*(vtmp[i][j-1] + vtmp[i][j+1]) + u[i][j]*v[i][j]*v[i][j] - (F+K)*v[i][j];
                    }

                    // left-right Periodic boundary conditions:
                    u[i][n-1] = alpha_u*(utmp[i-1][n-1] + utmp[i+1][n-1]) + beta_u*utmp[i][n-1] + gamma_u*(utmp[i][n-2] + utmp[i][0]) - u[i][n-1]*v[i][n-1]*v[i][n-1] + F*(1. - u[i][n-1]);
                    v[i][n-1] = alpha_v*(vtmp[i-1][n-1] + vtmp[i+1][n-1]) + beta_v*vtmp[i][n-1] + gamma_v*(vtmp[i][n-2] + vtmp[i][0]) + u[i][j]*v[i][n-1]*v[i][n-1] - (F+K)*v[i][n-1];
                }

                i = istart;
                for (i = istart; i <= iend; i++){
                    for (j = 0; j <= n; j++){
                        utmp[i][j] = u[i][j];
                        vtmp[i][j] = v[i][j];
                    }
                }
            }
        }
    }

    else {
        int count;
        for (k = 1; k <= Tmax; k++){
            count = iend-1;
            while (count >= istart){
                //printf("i = %d\n",i);
                for (j = 1; j < n-1; j++){

                    // Do usual computation with u_i,j = alpha * (u_i-1,j + u_i+1,j) + 
                    u[count][j] = alpha_u*(utmp[count-1][j] + utmp[count+1][j]) + beta_u*utmp[count][j] + gamma_u*(utmp[count][j-1] + utmp[count][j+1]) - u[count][j]*v[count][j]*v[count][j] + F*(1. - u[count][j]);
                    v[count][j] = alpha_v*(vtmp[count-1][j] + vtmp[count+1][j]) + beta_v*vtmp[count][j] + gamma_v*(vtmp[count][j-1] + vtmp[count][j+1]) + u[count][j]*v[count][j]*v[count][j] - (F+K)*v[count][j];
                }

                // left-right Periodic boundary conditions:
                u[count][n-1] = alpha_u*(utmp[count-1][n-1] + utmp[count+1][n-1]) + beta_u*utmp[count][n-1] + gamma_u*(utmp[count][n-2] + utmp[count][0]) - u[count][n-1]*v[count][n-1]*v[count][n-1] + F*(1. - u[count][n-1]);
                v[count][n-1] = alpha_v*(vtmp[count-1][n-1] + vtmp[count+1][n-1]) + beta_v*vtmp[count][n-1] + gamma_v*(vtmp[count][n-2] + vtmp[count][0]) + u[count][j]*v[count][n-1]*v[count][n-1] - (F+K)*v[count][n-1];

                count = count-1;    
            }

            i = istart;
            for (i = istart; i <= iend; i++){
                for (j = 0; j <= n; j++){
                    utmp[i][j] = u[i][j];
                    vtmp[i][j] = v[i][j];
                }
            }
        }
    }

    if (rank == 0){

        clock_gettime(CLOCK_MONOTONIC, &end);
        time_lapsed = (end.tv_sec - begin.tv_sec) + (double)(end.tv_nsec - begin.tv_nsec)/BILLION;
        printf("\nprint 'Time for algorithm to complete:',%f,'seconds.'\n",time_lapsed);

        fprintf(myA,"\n");
        fprintf(myA,"\ndef myAf():\n");
        fprintf(myA,"\treturn array([ ");

        for (i = 0; i <= m; i++){
            fprintf(myA,"[ ");
            for (j = 0; j <= n; j++){
                if (j != n){
                    fprintf(myA,"%f, ",utmp[i][j]);
                }
                else{
                    fprintf(myA,"%f ",utmp[i][j]);
                }
            }
            if (i != m){
                fprintf(myA,"],\n");
            }
            else{
                fprintf(myA,"]");
             }
        }

        fprintf(myA,"])\n");

        fprintf(myB,"\ndef myBf():\n");
        fprintf(myB,"\treturn array([");

        for (i = 0; i <= m; i++){
            fprintf(myB,"[ ");
            for (j = 0; j <= n; j++){
                if (j != n){
                    fprintf(myB,"%f, ",vtmp[i][j]);
                }
                else{
                    fprintf(myB,"%f ",vtmp[i][j]);
                }
            }
            if (i != m){
                fprintf(myB,"],\n");
            }
            else{
                fprintf(myB,"]");
             }
        }
        fprintf(myB,"])\n");


        fclose(myA);
        fclose(myB);

    }

    MPI_Finalize();

    return 0;
}

// For experimentation with different initial conditions

double f(double x, double y){
    return x - x*x + y - y*y; //sin(x*x + y*y);
//exp(20*(x-1./2)*(x-1./2) - 20*(y-1./2)*(y-1./2));//x - x*x + y - y*y;
}

double g(double x, double y){
    return sin(x*x + y*y); //sin(x*x + y*y);
//exp(20*(x-1./2)*(x-1./2) - 20*(y-1./2)*(y-1./2));//x - x*x + y - y*y;
}

The algorithm is the forward Euler method on a periodic 2D domain (the 2D arrays); for clarity I left out a lot of the parts, but I can add more if needed. The initial and final results are written to a file by the master process (rank 0 in the code), ready to be plotted.

The idea I have in mind is to divide the domain among the processors into chunks of (# of rows)/(# of processors), with the first half of the processors handling the top half of the domain (working from the center up to the top) and the other half of the processors handling the bottom half (working from the center down to the bottom).

However, only the bottom half of the domain is being updated, which leads me to believe that some sort of race condition is going on.

--EDIT--

The full original code is now posted instead.

I think I know what the problem is. Each process has its own 'local' copy of the domain that it is updating. Hence, when rank 0 prints to the file, it prints only its own 'local' version of the domain, so with two processes I would see only half of the 'entire picture'.

However, what I want is for each process to update its piece of the domain and then have process 0 print the entire updated domain to the file. How might I go about doing this, if that is indeed the issue?

  • I'm not convinced your problem has anything to do with MPI or a race condition, really. It looks more likely to be a simple bug in the management of loops and indexes (or even wrong interleaving of brackets, `for` loops and `if` and `else` conditions). At the moment, your snippet exposes exactly that, but it might be a copy/paste/edit issue. Could you please post the exact code instead? – Gilles Dec 13 '15 at 07:46
  • A piece of gratuitous optimisation advice: currently, all ranks execute the loops that fill `u`, `v`, `utmp`, and `vtmp`, which makes the use of `MPI_Bcast` redundant. You can either do the filling in rank 0 only and then broadcast each entire array by providing a pointer to element `[0][0]` and a count of `(n+1)*(m+1)`, or get rid of the broadcasts altogether and let each process fill the arrays on its own. – Hristo Iliev Dec 13 '15 at 11:47
  • @Gilles it will be more lengthy but I suppose it can be ignored if necessary. – BudRoz Dec 13 '15 at 13:31
  • @HristoIliev, I thought I could leave out the broadcasts but I wasn't sure. I will keep that in mind. – BudRoz Dec 13 '15 at 14:14
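
(For reference, a minimal, untested sketch of the single-call broadcast suggested in the comment above. It assumes the arrays and the m, n sizes from the posted code, that only rank 0 runs the filling/printing loops, and that the per-row broadcasts inside those loops are removed; the same two calls would be repeated for v and vtmp.)

    // One collective call per array replaces the per-row broadcasts:
    // the arrays are contiguous, so all (m+1)*(n+1) doubles go out at once.
    if (rank == 0){
        /* ... fill u[i][j] and utmp[i][j] for all i, j ... */
    }
    MPI_Bcast(&u[0][0],    (m+1)*(n+1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Bcast(&utmp[0][0], (m+1)*(n+1), MPI_DOUBLE, 0, MPI_COMM_WORLD);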

1 Answer


OK, I finally had the chance to read the code, and yes, you're correct in your analysis: since process #0 only has its own share of the domain updated, what it prints is only relevant for this part, not for the whole domain. Therefore, you have two options for your code:

  1. To keep its current structure and have process #0 collect all the data from the other processes prior to printing the result to the file (a sketch of this follows right after this list); or
  2. To go for a fully parallel approach and have all processes generate their own data, do the computation, and finally print their own share in parallel.
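
For the first option, here is a rough, hypothetical sketch of what the collection step could look like. It assumes each rank owns a contiguous, non-overlapping, inclusive row range istart..iend of u (the counts/displs bookkeeping is mine), that rank 0's own rows are already in place, and the same calls would be repeated for v:

    // Gather every rank's block of rows into rank 0's full array before printing.
    int nrows   = iend - istart + 1;        // rows owned by this rank (assumed inclusive)
    int sendcnt = nrows * (n + 1);          // number of doubles this rank contributes
    int offset  = istart * (n + 1);         // where those doubles start in the full array

    int *counts = NULL, *displs = NULL;
    if (rank == 0){
        counts = malloc(size * sizeof(int));
        displs = malloc(size * sizeof(int));
    }
    MPI_Gather(&sendcnt, 1, MPI_INT, counts, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Gather(&offset,  1, MPI_INT, displs, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0){
        // MPI_IN_PLACE: rank 0's rows are already at the right place inside u.
        MPI_Gatherv(MPI_IN_PLACE, 0, MPI_DOUBLE,
                    &u[0][0], counts, displs, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        free(counts); free(displs);
    }
    else{
        MPI_Gatherv(&u[istart][0], sendcnt, MPI_DOUBLE,
                    NULL, NULL, NULL, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    }

Note that the row ranges must not overlap between ranks for this to reconstruct the whole domain correctly, which is not quite the case in your current index computation.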

Needless to say, the second approach is far better and far more effective too. Moreover, it does away with the useless master-slave structure you have at the moment (sorry, I couldn't resist mentioning that).

So, supposing you'd go for approach number two, what would it translate into? Well,

  1. You would compute your index limits pretty much the way you do now, but allocate only the share of the compute arrays the current process is responsible for, plus an extra layer surrounding it, called the ghost layer. This means that if you go to larger domains by increasing the number of compute nodes, you also increase the overall amount of data you can handle; the code becomes scalable in that regard. And since your algorithm looks suspiciously like a 5-point stencil, your ghost layers only need to be one cell wide, which doesn't represent a lot of data transfer.
  2. You would initialise your data and start computing. The only addition is that after each iteration of the k time loop, you'd have to exchange the ghost layers between neighbouring processes, to propagate what comes from the adjacent sub-domains (a sketch of this exchange follows the list). NB: this is an issue your current code already has, since you skipped this mandatory stage, which makes it wrong irrespective of the final printing issue.
  3. Finally, all processes would write their own share of the domain in parallel, using MPI-IO. This is done by creating a per-process "view" of the output file, into which each piece of data fits to assemble the overall result (see the second sketch below).
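
To illustrate points 1 and 2, here is a rough, untested sketch of a per-process allocation with one-cell-wide ghost rows, and the exchange that would run after every time step. The names nloc, uloc, up and down are mine, the row count is assumed to divide evenly, the modulo arithmetic gives the periodic wrap-around in the row direction, and the same exchange is needed for v:

    // Each rank owns nloc interior rows, stored in rows 1..nloc of uloc;
    // rows 0 and nloc+1 are the ghost rows coming from the neighbours.
    int nloc = (m + 1) / size;                      // assumes (m+1) divisible by size
    double (*uloc)[n + 1] = malloc((nloc + 2) * sizeof *uloc);

    int up   = (rank - 1 + size) % size;            // neighbour owning the rows above
    int down = (rank + 1) % size;                   // neighbour owning the rows below

    // After each Euler step: everyone sends its first interior row up and receives
    // its bottom ghost row from below, then the same the other way around.
    MPI_Sendrecv(&uloc[1][0],        n + 1, MPI_DOUBLE, up,   0,
                 &uloc[nloc + 1][0], n + 1, MPI_DOUBLE, down, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    MPI_Sendrecv(&uloc[nloc][0],     n + 1, MPI_DOUBLE, down, 1,
                 &uloc[0][0],        n + 1, MPI_DOUBLE, up,   1,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);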
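
And for point 3, a hypothetical sketch of the per-process "view" with MPI-IO, each rank writing its nloc interior rows (from the previous sketch) at its own offset of one shared binary file. The file name is mine, the ranks are assumed to own contiguous blocks in rank order, and the Python text output of your current code would have to be replaced by this binary layout or produced by post-processing:

    MPI_File fh;
    MPI_File_open(MPI_COMM_WORLD, "u_final.bin",
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);

    // The view makes the file start, for this rank, at its own block of rows.
    MPI_Offset disp = (MPI_Offset)rank * nloc * (n + 1) * sizeof(double);
    MPI_File_set_view(fh, disp, MPI_DOUBLE, MPI_DOUBLE, "native", MPI_INFO_NULL);

    // Collective write of the interior rows only (ghost rows 0 and nloc+1 are skipped).
    MPI_File_write_all(fh, &uloc[1][0], nloc * (n + 1), MPI_DOUBLE, MPI_STATUS_IGNORE);
    MPI_File_close(&fh);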

Altogether, this represents a fair bit of work (more than I can allocate to answering a SO question). However, even if you decide not to go for this option, fixing your code (notably managing the data exchanges at each time step) will be almost as much work. So I really encourage you to "think parallel" and get it right from all points of view.

Alternatively, if you don't want to take the long road of full MPI parallelisation, and assuming memory consumption is not an issue for you, you can try an OpenMP parallelisation, which should be quite straightforward starting from the sequential version of the code. However, since your code is very likely memory-bound, don't expect too much benefit from it.
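
For what it's worth, a minimal sketch of what the OpenMP version of the interior update could look like, using the variable names of your code (compile with -fopenmp). Note that the reaction terms here read utmp/vtmp, i.e. the previous time step, so that the iterations stay independent; this differs slightly from your posted code, where the freshly updated u[i][j] feeds into the v update:

    // Rows are distributed among threads; every (i, j) cell only reads the previous
    // time step (utmp, vtmp) and writes the new one (u, v), so there are no races.
    #pragma omp parallel for private(j) schedule(static)
    for (i = 1; i < m; i++){
        for (j = 1; j < n; j++){
            u[i][j] = alpha_u*(utmp[i-1][j] + utmp[i+1][j]) + beta_u*utmp[i][j]
                    + gamma_u*(utmp[i][j-1] + utmp[i][j+1])
                    - utmp[i][j]*vtmp[i][j]*vtmp[i][j] + F*(1. - utmp[i][j]);
            v[i][j] = alpha_v*(vtmp[i-1][j] + vtmp[i+1][j]) + beta_v*vtmp[i][j]
                    + gamma_v*(vtmp[i][j-1] + vtmp[i][j+1])
                    + utmp[i][j]*vtmp[i][j]*vtmp[i][j] - (F+K)*vtmp[i][j];
        }
    }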
