I am just learning MPI and am parallelizing matrix multiplication with it. The following is my code:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>
#define CLK CLOCK_MONOTONIC
struct timespec diff(struct timespec start, struct timespec end) {
    struct timespec temp;
    if ((end.tv_nsec - start.tv_nsec) < 0) {
        temp.tv_sec = end.tv_sec - start.tv_sec - 1;
        temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
    }
    else {
        temp.tv_sec = end.tv_sec - start.tv_sec;
        temp.tv_nsec = end.tv_nsec - start.tv_nsec;
    }
    return temp;
}
int main(int argc, char* argv[])
{
    struct timespec start_e2e, end_e2e, start_alg, end_alg, e2e, alg;
    /* Should start before anything else */
    clock_gettime(CLK, &start_e2e);

    /* Check if enough command-line arguments are taken in. */
    if(argc < 3) {
        printf("Usage: %s n p \n", argv[0]);
        return -1;
    }

    MPI_Init(NULL, NULL);
    const int n = atoi(argv[1]);
    const int p = atoi(argv[2]);

    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    int** matA = (int **) malloc(n * sizeof(int *));
    int** matB = (int **) malloc(n * sizeof(int *));
    int** matC = (int **) malloc(n * sizeof(int *));
    int i, j;
    for(i = 0; i < n; i++)
    {
        matA[i] = (int *) malloc(n * sizeof(int));
        matB[i] = (int *) malloc(n * sizeof(int));
        matC[i] = (int *) malloc(n * sizeof(int));
        for(j = 0; j < n; j++)
        {
            matB[i][j] = 1; // Initialize
            matC[i][j] = 0; // Initialize
        }
    }

    // Total number of processors
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    if(world_rank == 0)
    {
        for(i = 0; i < n; i++)
        {
            for(j = 0; j < n; j++)
                matA[i][j] = 2;
        }

        int destination;
        double start = MPI_Wtime();
        clock_gettime(CLK, &start_alg); /* Start the algo timer */
        for(destination = 1; destination < world_size; destination++)
        {
            int start = destination * (n / world_size);
            int end = (destination + 1) * (n / world_size);
            if(destination == world_size - 1)
                end = n;
            int offset = start;
            int rows = (end - start);
            MPI_Send(&offset, 1, MPI_INT, destination, 1, MPI_COMM_WORLD); // Send offset
            MPI_Send(&rows, 1, MPI_INT, destination, 2, MPI_COMM_WORLD);   // Send number of rows
            MPI_Send(&matA[offset][0], rows * n, MPI_INT, destination, 3, MPI_COMM_WORLD); // Send portion of matrix A
        }
        double sending = MPI_Wtime();

        // Do matrix multiplication specific to master processor
        int k;
        int rows = n / world_size;
        for(i = 0; i < rows; i++)
        {
            for(j = 0; j < n; j++)
            {
                for(k = 0; k < n; k++)
                    matC[i][j] += (matA[i][k] * matB[k][j]);
            }
        }

        // Wait for other processors to complete and combine their results
        double receiving = MPI_Wtime();
        int source;
        for(source = 1; source < world_size; source++)
        {
            int offset, rows;
            MPI_Recv(&offset, 1, MPI_INT, source, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Receive offset
            MPI_Recv(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);   // Receive number of rows
            MPI_Recv(&matC[offset][0], rows * n, MPI_INT, source, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Receive portion of matrix C
        }
        double end = MPI_Wtime();

        clock_gettime(CLK, &end_alg); /* End the algo timer */
        clock_gettime(CLK, &end_e2e);
        e2e = diff(start_e2e, end_e2e);
        alg = diff(start_alg, end_alg);
        // problem_name and approach_name are declared elsewhere in my setup (omitted here);
        // tv_sec is a long on my system, hence %ld
        printf("%s,%s,%d,%d,%ld,%ld,%ld,%ld\n", problem_name, approach_name, n, p,
               e2e.tv_sec, e2e.tv_nsec, alg.tv_sec, alg.tv_nsec);
    }
    else
    {
        int offset;
        int rows;
        MPI_Recv(&offset, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Receive offset
        MPI_Recv(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);   // Receive number of rows
        MPI_Recv(&matA[offset][0], rows * n, MPI_INT, 0, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Receive portion of matrix A

        int k;
        // Do matrix multiplication
        for(i = offset; i < offset + rows; i++) {
            for(j = 0; j < n; j++) {
                for(k = 0; k < n; k++) {
                    matC[i][j] += (matA[i][k] * matB[k][j]);
                }
            }
        }

        MPI_Send(&offset, 1, MPI_INT, 0, 1, MPI_COMM_WORLD); // Send offset
        MPI_Send(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD);   // Send number of rows
        MPI_Send(&matC[offset][0], rows * n, MPI_INT, 0, 3, MPI_COMM_WORLD); // Send portion of matrix C
    }

    for(i = 0; i < n; i++) {
        free(matA[i]);
        free(matB[i]);
        free(matC[i]);
    }

    printf("End:%d\n", world_rank);
    MPI_Finalize();
}
When I run the program on a cluster with 4 nodes (16 cores each), the code initially works without any errors, but after some random number of runs it throws a segmentation fault, and then it runs without any error again. Even when I get the seg fault, the printf statement before MPI_Finalize() is executed by all the processes, and all the rows of the output are correctly calculated and received, so I don't see why it fails. Also, on my laptop, which has only 2 physical cores, the same n and p that gave me a seg fault on the cluster run perfectly fine, with no seg faults at all.
This is the error trace (sorry for the low-quality image; I didn't have any other way of extracting the trace).
Thanks in advance.
Edit: Expected output: a simple matrix multiplication of the two matrices matA and matB, stored in matC. matA has all entries equal to 2 and matB has all entries equal to 1, so every entry of matC should be 2n, where n x n is the dimension of matA, matB, and matC.
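To make that concrete, here is a minimal serial sketch (separate from my MPI program; the small size N = 4 is just for illustration) of the result I expect:
#include <stdio.h>

#define N 4 /* small example size, just for illustration */

int main(void)
{
    int a[N][N], b[N][N], c[N][N];
    int i, j, k;

    /* matA is all 2s, matB is all 1s, matC starts at 0 */
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++) {
            a[i][j] = 2;
            b[i][j] = 1;
            c[i][j] = 0;
        }

    /* plain triple-loop multiplication: each entry is the sum over k of 2*1 = 2*N */
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            for (k = 0; k < N; k++)
                c[i][j] += a[i][k] * b[k][j];

    /* every entry should equal 2*N (8 here) */
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            if (c[i][j] != 2 * N)
                printf("mismatch at (%d,%d): %d\n", i, j, c[i][j]);

    printf("expected value in every entry: %d\n", 2 * N);
    return 0;
}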
Edit: Failing test cases: the code gave a seg fault for the following n (dimension) and p (number of cores). I think the failures are random, but I am listing them to make the question clearer. (A small standalone sketch of how the rows get split for these cases follows the list.)
1. n = 2048, p = 12
2. n = 64, p = 16
3. n = 1024, p = 28
4. n = 2048, p = 16, and so on
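For reference, this standalone sketch just repeats the offset/rows computation from the send loop in my program, so you can see how the rows are split for the cases above (here world_size is read from the command line instead of coming from MPI):
#include <stdio.h>
#include <stdlib.h>

/* Prints the offset and row count each rank would get, using the same
 * formula as the send loop in my MPI program (last rank takes the leftover rows). */
int main(int argc, char* argv[])
{
    if (argc < 3) {
        printf("Usage: %s n p\n", argv[0]);
        return -1;
    }
    const int n = atoi(argv[1]);          /* matrix dimension */
    const int world_size = atoi(argv[2]); /* number of processes */
    int rank;

    for (rank = 0; rank < world_size; rank++) {
        int start = rank * (n / world_size);
        int end = (rank + 1) * (n / world_size);
        if (rank == world_size - 1)
            end = n; /* last rank picks up the remainder */
        printf("rank %d: offset %d, rows %d\n", rank, start, end - start);
    }
    return 0;
}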