
I am just starting to learn MPI, and I am parallelizing matrix multiplication with it. The following is my code:

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>

#define CLK CLOCK_MONOTONIC
/* Return end - start as a struct timespec, handling the nanosecond borrow. */
struct timespec diff(struct timespec start, struct timespec end){
        struct timespec temp;
        if((end.tv_nsec - start.tv_nsec) < 0){
                temp.tv_sec = end.tv_sec - start.tv_sec - 1;
                temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
        }
        else{
                temp.tv_sec = end.tv_sec - start.tv_sec;
                temp.tv_nsec = end.tv_nsec - start.tv_nsec;
        }
        return temp;
}

int main(int argc, char* argv[])
{
    struct timespec start_e2e, end_e2e, start_alg, end_alg, e2e, alg;
    /* Should start before anything else */
    clock_gettime(CLK, &start_e2e);

    /* Check if enough command-line arguments are taken in. */
    if(argc < 3) {
        printf("Usage: %s n p\n", argv[0]);
        return -1;
    }

    MPI_Init(NULL, NULL);

    const int n = atoi(argv[1]);
    const int p = atoi(argv[2]);

    /* Labels for the timing printf below; assumed placeholder values,
       since these identifiers were used but never defined in the snippet. */
    const char *problem_name = "matmul";
    const char *approach_name = "mpi_send_recv";

    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    int** matA = (int **) malloc(n * sizeof(int *));
    int** matB = (int **) malloc(n * sizeof(int *));
    int** matC = (int **) malloc(n * sizeof(int *));
    int i, j;   
    for(i = 0; i < n; i++)
    {
        matA[i] = (int *) malloc(n * sizeof(int));
        matB[i] = (int *) malloc(n * sizeof(int));
        matC[i] = (int *) malloc(n * sizeof(int));
        for(j = 0; j < n; j++)
        {
            matB[i][j] = 1; // Initialize
            matC[i][j] = 0; // Initialize
        }   
    }   

    // Total number of processors
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    if(world_rank == 0)
    {
        for(i = 0; i < n; i++)
        {
            for(j = 0; j < n; j++)
                matA[i][j] = 2;
        }
        int destination;
        double start = MPI_Wtime();

        clock_gettime(CLK, &start_alg); /* Start the algo timer */

        for(destination = 1; destination < world_size; destination++)
        {
            /* first_row/last_row avoid shadowing the timing doubles above */
            int first_row = destination * (n / world_size);
            int last_row = (destination + 1) * (n / world_size);
            if(destination == world_size - 1)
                last_row = n;
            int offset = first_row;
            int rows = (last_row - first_row);
            MPI_Send(&offset, 1, MPI_INT, destination, 1, MPI_COMM_WORLD); // Send offset
            MPI_Send(&rows, 1, MPI_INT, destination, 2, MPI_COMM_WORLD); // Send number of rows
            MPI_Send(&matA[offset][0], rows * n, MPI_INT, destination, 3, MPI_COMM_WORLD); // Send portion of matrix A
        }
        double sending = MPI_Wtime();

        // Do matrix multiplication specific to master processor
        int k;
        int rows = n / world_size;
        for(i = 0; i < rows; i++) 
        {
            for(j = 0; j < n; j++) 
            {
                for(k = 0; k < n; k++)
                    matC[i][j] += (matA[i][k] * matB[k][j]);
            }
        }

        // Wait for other processors to complete and combine their results
        double receiving = MPI_Wtime();
        int source;
        for(source = 1; source < world_size; source++) 
        {       
            int offset, rows;
            MPI_Recv(&offset, 1, MPI_INT, source, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Receive offset
            MPI_Recv(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Receive number of rows
            MPI_Recv(&matC[offset][0], rows * n, MPI_INT, source, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Receive portion of matrix C
        }
        double end = MPI_Wtime();
        clock_gettime(CLK, &end_alg); /* End the algo timer */
        clock_gettime(CLK, &end_e2e);
        e2e = diff(start_e2e, end_e2e);
        alg = diff(start_alg, end_alg);
        printf("%s,%s,%d,%d,%ld,%ld,%ld,%ld\n", problem_name, approach_name,
               n, p, (long)e2e.tv_sec, e2e.tv_nsec, (long)alg.tv_sec, alg.tv_nsec);
    }
    else 
    {
        int offset;
        int rows;
        MPI_Recv(&offset, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);     // Receive offset
        MPI_Recv(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);       // Receive number of rows
        MPI_Recv(&matA[offset][0], rows * n, MPI_INT, 0, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);  // Receive portion of matrix A

        int k;

        // Do matrix multiplication
        for(i = offset; i < offset + rows; i++) {
            for(j = 0; j < n; j++) {
                for(k = 0; k < n; k++) {
                    matC[i][j] += (matA[i][k] * matB[k][j]);
                }
            }
        }
        MPI_Send(&offset, 1, MPI_INT, 0, 1, MPI_COMM_WORLD); // Send offset
        MPI_Send(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD); // Send number of rows
        MPI_Send(&matC[offset][0], rows * n, MPI_INT, 0, 3, MPI_COMM_WORLD); // Send portion of matrix C
    }
    for(i = 0; i < n; i++) {
        free(matA[i]);
        free(matB[i]);
        free(matC[i]);
    }
    free(matA);
    free(matB);
    free(matC);
    printf("End:%d\n", world_rank);
    MPI_Finalize();
    return 0;
}

When I run the program on a cluster with 4 nodes, each having 16 cores, the code initially works without any errors. But after some random number of runs it throws a segmentation fault, and then it runs without any error again. Even when I get the seg fault, the printf statement before MPI_Finalize() is executed by all the processes, and all the rows of the output are correctly calculated and received, so I don't see why it fails. Also, on my laptop with only 2 physical cores, when I run the code with the same n and p that gave a seg fault on the cluster, it runs perfectly fine without any seg faults at all.

This is the error trace. Sorry for the low-quality image; I didn't have any other way of extracting the trace.

Thanks in advance.

Edit: Expected output: simple matrix multiplication of two matrices, matA and matB, stored in matC. matA has all entries equal to 2 and matB has all entries equal to 1, so every entry of matC should be 2n (each entry is the dot product of a row of 2s with a column of 1s, i.e. n * 2 * 1 = 2n), where n x n is the dimension of matA, matB and matC.

Edit: Error test cases: the code gave a seg fault for the following n (dimension) and p (number of cores). I think it is random, but I list them to make the question clearer:
1. n = 2048 p = 12
2. n = 64 p = 16
3. n = 1024 p = 28
4. n = 2048 p = 16 and so on
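
Edit: From the comments, it sounds like the likely cause is that each row of matA, matB and matC comes from its own malloc, so &matA[offset][0] only points at a single row of n ints, while my MPI_Send/MPI_Recv calls read and write rows * n ints as if the rows were contiguous. Below is a minimal sketch of a contiguous allocation I could switch to (the helper name alloc_matrix is my own and this is untested on the cluster):

/* Allocate an n x n matrix as one contiguous block of elements plus an
   array of row pointers into that block, so that any run of consecutive
   rows occupies adjacent memory and can be sent with a single MPI_Send. */
int **alloc_matrix(int n)
{
    int *data = (int *) malloc(n * n * sizeof(int)); /* contiguous elements */
    int **mat = (int **) malloc(n * sizeof(int *));  /* row pointers */
    int i;
    for(i = 0; i < n; i++)
        mat[i] = data + i * n; /* row i starts at data[i * n] */
    return mat;
}

With this layout, MPI_Send(&mat[offset][0], rows * n, MPI_INT, ...) touches exactly the rows * n ints it is supposed to, and freeing becomes free(mat[0]); free(mat); instead of freeing each row separately.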

m.shah
  • You're posting a screenshot of a text-based window. Tell me, was that _easy_? – Sourav Ghosh Dec 15 '17 at 09:43
  • No. I didn't have any other option. @SouravGhosh – m.shah Dec 15 '17 at 09:52
  • Welcome to Stack Overflow! _Questions seeking debugging help (why isn't this code working?) must include the desired behavior, a specific problem or error and the shortest code necessary to reproduce it in the question itself. Questions without a clear problem statement are not useful to other readers. See: How to create a [mcve]._ – Sourav Ghosh Dec 15 '17 at 09:53
  • Can't you access that machine via Putty or similar that allows copying text from your console? – Gerhardh Dec 15 '17 at 10:01
  • I mean I had forgotten to copy-paste it, just took a photo and didn't have access to cluster later. So this image is all I have right now @Gerhardh – m.shah Dec 15 '17 at 10:09
  • @SouravGhosh updated question. Thanks for the help as I am new to the community. – m.shah Dec 15 '17 at 10:12
  • You need to allocate matrices as 2D arrays, and not as arrays of arrays. – Gilles Gouaillardet Dec 15 '17 at 11:15
  • If you want to send multiple rows of a 2D-matrix with a `MPI_Send` like that, they must be contiguous in memory. See also MPI_Scatter for 2D matrices, there are many questions about that here... In addition to the abysmal presentation of the error message, your code example is **NOT** a [mcve] on many levels. – Zulan Dec 15 '17 at 13:50

0 Answers