I have written a code for Matrix-Vector multiplication. The matrix is divided into blocks of rows based on the number of threads and each block is multiplied by the vector and the vector is stored in an array private to the thread. But my speedup is very poor. For matrix of size 16 X 16, it is below 1.
Can this be due to the fact that i declare my matrix and vector outside as shared variables and that maybe causing race condition/false sharing when each thread tries to read the value from the matrix and vector?
I am bit confused between false sharing and race condition.
#include <stdio.h>
#include <omp.h>
#include <stdlib.h>
#define SIZE 128 // The size should be divisible by thenumber of threads
int main(int argc, char *argv[]) {
int thread_count = strtol(argv[1],NULL,10);
// Declare the variables
int i,j;
long A[SIZE][SIZE], b[SIZE],V[SIZE]={0};
//long Vect[SIZE]={0};
double start, end;
// Generate a matrix of size mxm
for (i=0; i<SIZE; i++)
{ for (j=0; j<SIZE; j++)
A[i][j] = i+j;
}
printf("The Matrix is:\n");
// Print the Matrix
for (i=0; i<SIZE; i++)
{ for (j=0; j<SIZE; j++)
{
printf("%12ld", A[i][j]);
}
printf("\n");
}
// Generate a vector of size m
for (i=0; i<SIZE; i++)
b[i] = i;
printf("The vector is: \n");
// Print a vector
for (i=0; i<SIZE; i++)
printf("%12ld\n", b[i]);
start = omp_get_wtime();
//omp_set_num_threads(NUM_THREADS);
#pragma omp parallel num_threads(thread_count)
{
int i,j,k, id, nthrds;
long Vect[SIZE]={0};
id = omp_get_thread_num();
nthrds = omp_get_num_threads();
for (i=id*SIZE/nthrds; i<(id*SIZE/nthrds + SIZE/nthrds); i++)
{ Vect[i] = 0;
{
for (j=0; j<SIZE; j++)
Vect[i] += A[i][j]*b[j];
}
}
#pragma omp critical
{
for (k=0; k<SIZE; k++)
V[k] += Vect[k];
}
}
end = omp_get_wtime();
printf("The vector obtained after multiplication is:\n");
for (i=0; i<SIZE; i++)
printf("%12ld\n", V[i]);
printf("The time taken for calculation is: %lf\n", end - start);
return 0;
}