I'm new to OpenMP, and I was trying to parallelize Gaussian elimination, but I'm having trouble with performance. I'm compiling the code below with:
gcc -o gaussian_elimination gaussian_elimination.c -lm -lgsl -lgslcblas -fopenmp -Wall
and setting the number of threads in the terminal with export OMP_NUM_THREADS=<n>.
My problem is that the parallel version of this code runs way slower than the serial version. I believe this is because I put #pragma omp parallel for inside the outer loop, which would force OpenMP to create and destroy threads at each iteration, which would be incredibly costly; but I haven't seen any other clear way to do the same kind of operation, and I don't think I can exchange the outer loop with the inner parallel ones.
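For concreteness, the restructuring I have in mind (but haven't managed to get working) would look roughly like the untested sketch below: a single parallel region hoisted outside the outer loop, the pivot search inside an omp single, and the three inner loops as omp for worksharing constructs, relying on their implicit barriers for ordering. The function name gauss_jordan_hoisted and the shared scalars i_p and pivot are things I introduced just for this sketch; from what I've read, most OpenMP runtimes keep a thread pool alive between regions anyway, so the per-iteration cost may really be the fork/join barriers rather than thread creation.

#include <math.h>
#include <omp.h>

/* Untested sketch: one parallel region for the whole elimination.
   A is the N x (N+1) augmented matrix, D = N+1, as in the full code below. */
void gauss_jordan_hoisted(double *A, int N)
{
    int D = N + 1;
    int i_p;      /* shared: pivot row picked by one thread */
    double pivot; /* shared: pivot value, published before the swap */
    #pragma omp parallel shared(A, i_p, pivot)
    {
        for (int i = 0; i < N; i++) { /* every thread runs the outer loop */
            #pragma omp single
            {
                i_p = i; /* serial partial-pivot search */
                for (int j = i + 1; j < N; j++)
                    if (fabs(A[j*D + i]) > fabs(A[i_p*D + i]))
                        i_p = j;
                pivot = A[i_p*D + i]; /* this value lands on the diagonal after the swap */
            } /* implicit barrier: all threads now see i_p and pivot */
            #pragma omp for
            for (int j = i; j < D; j++) { /* swap rows i and i_p */
                double sw = A[i*D + j];
                A[i*D + j] = A[i_p*D + j];
                A[i_p*D + j] = sw;
            } /* implicit barrier */
            #pragma omp for
            for (int j = 0; j < D; j++) /* normalize the pivot row */
                A[i*D + j] /= pivot;
            /* implicit barrier */
            #pragma omp for
            for (int j = i + 1; j < N + i; j++) { /* clear column i in every other row */
                int k = j % N;
                double tmp = A[k*D + i];
                for (int l = 0; l < D; l++)
                    A[k*D + l] -= tmp * A[i*D + l];
            } /* implicit barrier before the next outer iteration */
        }
    }
}

Whether this actually wins over per-loop parallel for should depend on how much of the per-region overhead is thread startup versus the three barriers per outer iteration, which this version still pays.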
I'm probably missing something, but I haven't found any other threads here on this particular problem. As far as correctness goes, the code seems to work fine; the problem is purely performance.
Thanks in advance.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <stdbool.h>
#include <time.h>
#include <gsl/gsl_linalg.h>
#include <gsl/gsl_rng.h>
#define DEBUG_MODE false
int random_matrix(double *A, int N,long long int seed);
int print_matrix(double *A, int N);
int print_vector(double *b,int N);
int main(int argc, char **argv){
int N=1000;
int i,j,k,l,i_p,s,err,D=N+1;
long long int seed=9089123498274; // fixed seed so runs are reproducible
double *A,pivot,sw,tmp,begin,end,time_spent;
double *Aref,*bref;
gsl_matrix_view gsl_m;
gsl_vector_view gsl_b;
gsl_vector *gsl_x;
gsl_permutation *gsl_p;
/* Input */
//scanf("%d",&N);
A = (double*)malloc(N*(N+1)*sizeof(double));
if(A==NULL){
printf("Matrix A not allocated\n");
return 1;
}
Aref = (double*)malloc(N*N*sizeof(double));
if(Aref==NULL){
printf("Matrix A not allocated\n");
return 1;
}
bref = (double*)malloc(N*sizeof(double));
if(bref==NULL){
printf("Vector B not allocated\n");
return 2;
}
/* optional: read the augmented system from stdin instead of generating it
for(i=0;i<N;i+=1)
for(j=0;j<N;j+=1)
scanf("%lf",&(A[i*D+j]));
for(i=0;i<N;i+=1)
scanf("%lf",&(A[i*D+N]));
*/
err= random_matrix(A,N,seed);
if(err!=0)
return err;
for(i=0;i<N;i++)
for(j=0;j<N;j+=1)
Aref[i*N+j]= A[i*D+j];
for(i=0;i<N;i+=1)
bref[i]= A[i*D+N]; /* the augmented column is the right-hand side */
printf("GSL reference:\n");
gsl_m = gsl_matrix_view_array (Aref, N, N);
gsl_b = gsl_vector_view_array (bref, N);
gsl_x = gsl_vector_alloc (N);
gsl_p = gsl_permutation_alloc(N);
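/* clock() is fine for this part: the GSL LU solve (linked against the serial gslcblas) runs single-threaded */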
begin = clock();
gsl_linalg_LU_decomp(&gsl_m.matrix, gsl_p, &s);
gsl_linalg_LU_solve(&gsl_m.matrix, gsl_p, &gsl_b.vector, gsl_x);
end = clock();
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("gsl matrix solver: %lf s\n",time_spent);
if(DEBUG_MODE==true)
gsl_vector_fprintf(stdout,gsl_x,"%f");
gsl_permutation_free(gsl_p);
gsl_vector_free(gsl_x);
begin = omp_get_wtime();
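/* Gauss-Jordan elimination with partial pivoting on the augmented N x (N+1) system */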
for(i=0;i<N;i+=1){
i_p = i;
pivot = fabs(A[i*D+i]);
for(j=i;j<N;j+=1)
if(pivot<fabs(A[j*D+i])){
pivot = fabs(A[j*D+i]);
i_p = j;
}
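/* swap rows i and i_p from column i onward */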
#pragma omp parallel for shared(i,N,A,i_p) private(j,sw)
for(j=i;j<D;j+=1){
sw = A[i*D+j];
A[i*D+j] = A[i_p*D+j];
A[i_p*D+j] = sw;
}
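/* divide row i by the pivot so the diagonal entry becomes 1 */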
pivot=A[i*D+i];
#pragma omp parallel for shared(i,D,pivot,A) private(j)
for(j=0;j<D;j++)
A[i*D+j]=A[i*D+j]/pivot;
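/* subtract multiples of the pivot row from every other row; k=j%N wraps past the end so rows above i are cleared too (full Gauss-Jordan, no back substitution needed) */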
#pragma omp parallel for shared(i,A,N,D) private(tmp,j,k,l)
for(j=i+1;j<N+i;j++){
k=j%N;
tmp=A[k*D+i];
for(l=0;l<D;l+=1)
A[k*D+l]=A[k*D+l]-tmp*A[i*D+l];
}
}
end = omp_get_wtime();
time_spent = (end - begin);
printf("omp matrix solver: %lf s\n",time_spent);
/* Output */
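/* after elimination the solution vector sits in the augmented column A[i*D+N] */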
if(DEBUG_MODE==true){
printf("\nCalculated: \n");
for(i=0;i<N;i+=1)
printf("%.6f \n",A[i*(N+1)+N]);
printf("\n");
}
free(A);
free(Aref);
free(bref);
return 0;
}
int random_matrix(double *A, int N,long long int seed){
int i,j;
const gsl_rng_type * T;
gsl_rng *r;
gsl_rng_env_setup();
T = gsl_rng_default;
r = gsl_rng_alloc (T);
gsl_rng_set(r,(unsigned long int)seed); /* apply the fixed seed */
for(i=0;i<N;i++)
for(j=0;j<=N;j++)
A[i*(N+1)+j]= gsl_rng_uniform (r);
gsl_rng_free (r);
return 0;
}
int print_matrix(double *A, int N){
int i,j;
for(i=0;i<N;i++)
for(j=0;j<=N+1;j++){
if(j==0 || j==N || j==N+1)
printf(" | ");
printf("%.2f ",A[i*(N+1)+j]);
if(j==N+1)
printf("\n");
}
return 0;
}
int print_vector(double *b,int N){
int i;
for(i=0;i<N;i+=1)
printf("%f\n", b[i]);
return 0;
}
I updated the code above to use omp_get_wtime(), and now the measured time does go down as I add more threads, so it behaves as it should, though not as cleanly as I'd like (clock() returns CPU time summed across all threads, not wall time, which is why the parallel runs looked slower before).
For 1000 x 1000 matrices I get 0.25 s for the GSL solver, 4.4 s for the serial OMP run and 1.5 s for the 4-thread run.
For 3000 x 3000 matrices I get ~9 s for the GSL solver, ~117 s for the serial OMP run and ~44 s for the 4-thread run, so adding more threads does indeed speed up the program!
Thanks a lot, everyone.