
I am trying hard to get my MPI code to run; the goal is matrix multiplication.

My code works like this:

  1. There are two matrices A and B

  2. Scatter the rows of A

  3. Broadcast matrix B

  4. Compute

  5. Gather

I have written the code, but it is not running right: I am getting a segmentation fault.

I have no idea why this is happening. I have tried tweaking the code a lot, but it seems something is always wrong.

Could someone go over this code and tell me why it is not working?

I have added comments such as "Scattering matrices" and "Gathering answers", so even if you could just go through the scatter part of the program and tell me why it is not right, I would be thankful!

#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#include "mpi.h"

#define N 512

void print_results(char *prompt, float result[N][N]);

int main(int argc, char *argv[])
{
    int i, j, k;
    MPI_Status status;
    int process_rank;    // rank of a process
    int no_of_processes; // no. of processes

    int Master_To_Slave = 0;
    int Slave_To_Master = 5;

    float a[N][N], b[N][N], c[N][N];
    char *usage = "Usage: %s file\n";
    FILE *fd;
    double elapsed_time;
    struct timeval tv1, tv2;

    if (argc < 2) {
        fprintf (stderr, usage, argv[0]);
        return -1;
    }
    if ((fd = fopen (argv[1], "r")) == NULL) {
        fprintf (stderr, "%s: Cannot open file %s for reading.\n",
                 argv[0], argv[1]);
        fprintf (stderr, usage, argv[0]);
        return -1;
    }
    // Read input from file for matrices a and b.
    // The I/O is not timed because this I/O needs
    // to be done regardless of whether this program
    // is run sequentially on one processor or in
    // parallel on many processors. Therefore, it is
    // irrelevant when considering speedup.
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            fscanf (fd, "%f", &a[i][j]);

    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            fscanf (fd, "%f", &b[i][j]);

    int num_of_rows_A = N;
    int num_of_cols_A = N;

    int num_of_rows_B = N;
    int num_of_cols_B = N;

    int lower_index_of_A;
    int upper_index_of_A;

    MPI_Init(&argc, &argv);      // initialize MPI operations
    MPI_Barrier(MPI_COMM_WORLD); // barrier prior to the time stamp

    // Take a time stamp
    gettimeofday(&tv1, NULL);

    // Scatter the input matrices a and b.
    MPI_Comm_rank(MPI_COMM_WORLD, &process_rank);    // get the rank
    MPI_Comm_size(MPI_COMM_WORLD, &no_of_processes); // get number of processes

    if (process_rank == 0) {
        fprintf (stderr, "Main process started");
        fprintf (stderr, "No. of process %d", no_of_processes);
        fprintf (stderr, "\n\n");

        // "Scattering matrices": send each worker its row range and its rows of A
        for (i = 1; i < no_of_processes; i++)
        {
            int rows_per_process = num_of_rows_A / (no_of_processes - 1);

            lower_index_of_A = (i - 1) * rows_per_process;

            if (i + 1 == no_of_processes && ((num_of_rows_A % (no_of_processes - 1)) != 0))
            {
                // the last worker also takes the leftover rows
                upper_index_of_A = num_of_rows_A;
            }
            else
            {
                upper_index_of_A = lower_index_of_A + rows_per_process;
            }
            fprintf (stderr, "Lower index of A %d", lower_index_of_A);
            fprintf (stderr, "Upper index of A %d", upper_index_of_A);
            fprintf (stderr, "\n\n");

            MPI_Send(&lower_index_of_A, 1, MPI_INT, i, Master_To_Slave, MPI_COMM_WORLD);     // send lower index
            MPI_Send(&upper_index_of_A, 1, MPI_INT, i, Master_To_Slave + 1, MPI_COMM_WORLD); // send upper index
            MPI_Send(&a[lower_index_of_A][0], (upper_index_of_A - lower_index_of_A) * num_of_cols_A,
                     MPI_DOUBLE, i, Master_To_Slave + 2, MPI_COMM_WORLD);                    // send rows of A
            fprintf (stderr, "Scatter done");
        }
        MPI_Bcast(&b, num_of_rows_A * num_of_cols_B, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        fprintf (stderr, "Broadcast done");
    }
    else
    {
        // worker: receive the row range and the corresponding rows of A
        MPI_Recv(&lower_index_of_A, 1, MPI_INT, 0, Master_To_Slave, MPI_COMM_WORLD, &status);
        MPI_Recv(&upper_index_of_A, 1, MPI_INT, 0, Master_To_Slave + 1, MPI_COMM_WORLD, &status);
        MPI_Recv(&a[lower_index_of_A], (upper_index_of_A - lower_index_of_A) * num_of_cols_A,
                 MPI_DOUBLE, 0, Master_To_Slave + 2, MPI_COMM_WORLD, &status);

        // compute this worker's block of C = A x B
        for (i = lower_index_of_A; i < upper_index_of_A; i++)
            for (j = 0; j < num_of_cols_B; j++)
                for (k = 0; k < num_of_rows_B; k++)
                    c[i][j] += a[i][k] * b[k][j];

        // send the row range and the computed rows of C back to the master
        MPI_Send(&lower_index_of_A, 1, MPI_INT, 0, Slave_To_Master, MPI_COMM_WORLD);
        MPI_Send(&upper_index_of_A, 1, MPI_INT, 0, Slave_To_Master + 1, MPI_COMM_WORLD);
        MPI_Send(&c[lower_index_of_A], (upper_index_of_A - lower_index_of_A) * num_of_cols_B,
                 MPI_DOUBLE, 0, Slave_To_Master + 2, MPI_COMM_WORLD);
    }




    // "Gathering answers": collect each worker's rows of C on the master
    if (process_rank == 0)
    {
        for (i = 1; i < no_of_processes; i++)
        {
            // receive lower bound from a slave
            MPI_Recv(&lower_index_of_A, 1, MPI_INT, i, Slave_To_Master, MPI_COMM_WORLD, &status);
            // receive upper bound from a slave
            MPI_Recv(&upper_index_of_A, 1, MPI_INT, i, Slave_To_Master + 1, MPI_COMM_WORLD, &status);
            // receive processed data from a slave
            MPI_Recv(&c[lower_index_of_A][0], (upper_index_of_A - lower_index_of_A) * num_of_cols_B,
                     MPI_DOUBLE, i, Slave_To_Master + 2, MPI_COMM_WORLD, &status);
        }
    }




    // Take a time stamp. This won't happen until after the master
    // process has gathered all the results from the other processes.
    gettimeofday(&tv2, NULL);
    elapsed_time = (tv2.tv_sec - tv1.tv_sec) +
                   ((tv2.tv_usec - tv1.tv_usec) / 1000000.0);
    printf ("elapsed_time=\t%lf (seconds)\n", elapsed_time);

    // print results
    print_results("C = ", c);

    MPI_Finalize();
    return 0;
}
void print_results(char *prompt, float result[N][N])
{
    int i, j;

    printf ("\n\n%s\n", prompt);
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            printf (" %.2f", result[i][j]);
        }
        printf ("\n");
    }
    printf ("\n\n");
}
Nikhil
  • You might have more success using the MPI functions `MPI_Scatter` and `MPI_Gather`. And this question is becoming something of an FAQ on SO, here's one other relevant Q&A -- http://stackoverflow.com/questions/5512245/mpi-scatter-sending-columns-of-2d-array – High Performance Mark Jan 28 '13 at 09:03

1 Answer

Whenever this type of thing happens, I fire up a debugger.

I always recommend the parallel debugger Allinea DDT (I am biased, as I am one of the team developing it), because it helps find exactly this kind of bug. You can also try GDB, but that will require more manual intervention to handle the multiple processes.
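
For example, one common low-tech way to do that with Open MPI (assuming you have X forwarding available, and substituting the name of your own binary for ./your_program) is to launch every rank in its own xterm running gdb:

mpirun -np 4 xterm -e gdb ./your_program

Each rank then gets its own gdb session, so you can set breakpoints and step through the ranks independently.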

In your case, the code you've posted doesn't segfault in my MPI (Open MPI); it actually hangs, with rank 0 stuck at the MPI_Bcast and the remaining ranks stuck at the MPI_Sends in their branch. This is because ranks 1 and above never call MPI_Bcast: they need to call it too, in order to match the sender, rank 0, and receive the data.

Why not download a debugger and see for yourself? Once you've fixed this broadcast mismatch, the debugger will halt your program as soon as the segmentation fault you are looking for occurs, and it will show you where the fault lies.
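
As a reference point, here is a minimal sketch of the collective-based approach suggested in the comments (MPI_Scatter for the rows of A, MPI_Bcast for B, MPI_Gather for C). It is not a drop-in fix for your program: for brevity it assumes the number of processes divides N evenly (MPI_Scatterv/MPI_Gatherv handle the uneven case), and it allocates the matrices on the heap, since three 512x512 float arrays already occupy about 3 MB of stack. Note also that your buffers are float, so the matching MPI datatype is MPI_FLOAT; passing MPI_DOUBLE with a count of N*N makes MPI read twice as many bytes as a float buffer actually holds, which is itself a recipe for the segmentation fault you describe.

#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"

#define N 512

int main(int argc, char *argv[])
{
    int rank, nprocs, i, j, k;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    /* heap allocation: three N x N float arrays are ~3 MB on the stack */
    float *a = malloc(N * N * sizeof(float));
    float *b = malloc(N * N * sizeof(float));
    float *c = malloc(N * N * sizeof(float));

    if (rank == 0) {
        /* ... read a and b from the input file here, as in your code ... */
    }

    /* every rank makes the same MPI_Bcast call; this is the call
       your worker ranks never make, hence the hang */
    MPI_Bcast(b, N * N, MPI_FLOAT, 0, MPI_COMM_WORLD);

    /* assumes nprocs divides N evenly */
    int rows = N / nprocs;
    float *a_part = malloc(rows * N * sizeof(float));
    float *c_part = calloc(rows * N, sizeof(float)); /* zeroed, since we accumulate into it */

    /* every rank, including rank 0, receives a block of rows of A */
    MPI_Scatter(a, rows * N, MPI_FLOAT,
                a_part, rows * N, MPI_FLOAT, 0, MPI_COMM_WORLD);

    /* compute this rank's block of C = A x B */
    for (i = 0; i < rows; i++)
        for (j = 0; j < N; j++)
            for (k = 0; k < N; k++)
                c_part[i * N + j] += a_part[i * N + k] * b[k * N + j];

    /* collect the blocks of C back on rank 0 */
    MPI_Gather(c_part, rows * N, MPI_FLOAT,
               c, rows * N, MPI_FLOAT, 0, MPI_COMM_WORLD);

    /* rank 0 now holds the full product in c */
    free(a); free(b); free(c); free(a_part); free(c_part);
    MPI_Finalize();
    return 0;
}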

David