I know a thread on OpenMP performance already exists, but my example here is very simple.
C code:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <omp.h>

int MaFunc(size_t szGlobalWorkSize)
{
    int iGID = 0;
    float *pfResult = (float *)calloc(szGlobalWorkSize * 100, sizeof(float));
    float fValue = 0.5f;
    struct timeval tim;

    gettimeofday(&tim, NULL);
    double tLaunch1 = tim.tv_sec + (tim.tv_usec / 1000000.0);

    #pragma omp parallel for
    for (iGID = 0; iGID < (int)szGlobalWorkSize * 100; iGID++)
    {
        pfResult[iGID] = fValue;
        // printf("Element %d processed by thread %d\n", iGID, omp_get_thread_num());
    }

    gettimeofday(&tim, NULL);
    double tLaunch2 = tim.tv_sec + (tim.tv_usec / 1000000.0);
    printf("%.6lf Time OMP\n", tLaunch2 - tLaunch1);

    free(pfResult);
    return 0;
}
The timing of this example increases when I use OpenMP: 0.015 s without OpenMP versus 0.045 s with OpenMP (szGlobalWorkSize = 131072).
I compile with this gcc line: gcc -march=native -fopenmp -O3 MyCode.c -lm
gcc (GCC) 4.8.2 20140120 (Red Hat 4.8.2-15)
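For completeness, here is a minimal self-contained driver to reproduce the comparison. The main() wrapper and the serial loop are my sketch, not the real harness; omp_get_wtime() replaces the gettimeofday() arithmetic:

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

/* Sketch of a reproduction harness: runs the same fill loop
   serially and with OpenMP, timing each with omp_get_wtime(). */
int main(void)
{
    size_t szGlobalWorkSize = 131072;
    size_t n = szGlobalWorkSize * 100;
    float *pfResult = (float *)calloc(n, sizeof(float));
    float fValue = 0.5f;
    int iGID;

    double t0 = omp_get_wtime();
    for (iGID = 0; iGID < (int)n; iGID++)
        pfResult[iGID] = fValue;
    printf("%.6f s serial\n", omp_get_wtime() - t0);

    t0 = omp_get_wtime();
    #pragma omp parallel for
    for (iGID = 0; iGID < (int)n; iGID++)
        pfResult[iGID] = fValue;
    printf("%.6f s OpenMP\n", omp_get_wtime() - t0);

    free(pfResult);
    return 0;
}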
Edit 1:
// Needs <unistd.h> for getpagesize(); szGlobalWorkSize is now passed in.
int MyFunc2(size_t szGlobalWorkSize)
{
    int iGID = 0;
    int j = 0;
    //float *pfResult = (float *)calloc(szGlobalWorkSize * 100, sizeof(float));
    float *pfResult = (float *)valloc(szGlobalWorkSize * 100 * sizeof(float));
    float fValue = 0.5f;
    struct timeval tim;

    gettimeofday(&tim, NULL);
    double tLaunch1 = tim.tv_sec + (tim.tv_usec / 1000000.0);
    double time = omp_get_wtime();

    int iChunk = getpagesize();
    int iSize = ((int)szGlobalWorkSize * 100) / iChunk;

    // #pragma omp parallel
    #pragma omp parallel for private(j) // j must be private or the threads race on it
    for (iGID = 0; iGID < iSize; iGID++)
    {
        for (j = 0; j < iChunk; j++)
        {
            pfResult[iGID * iChunk + j] = fValue;
            //pfResult[iGID] = fValue;
        }
        // printf("Element %d processed by thread %d\n", iGID, omp_get_thread_num());
    }

    time = omp_get_wtime() - time;
    gettimeofday(&tim, NULL);
    double tLaunch2 = tim.tv_sec + (tim.tv_usec / 1000000.0);
    printf("%.6lf Time OMP\n", tLaunch2 - tLaunch1);
    printf("Pagesize=%d\n", getpagesize());
    printf("%.6lf Time OMP2\n", time);

    free(pfResult);
    return 0;
}
I also get the same time with chunking and with memalign.
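The aligned-allocation variant I tried looked roughly like this (a sketch using posix_memalign; alloc_aligned is just an illustrative helper name):

#include <stdlib.h>
#include <unistd.h>

/* Sketch of the page-aligned allocation variant. */
float *alloc_aligned(size_t nElements)
{
    void *p = NULL;
    /* posix_memalign requires the alignment to be a power of two
       and a multiple of sizeof(void *); the page size satisfies both. */
    if (posix_memalign(&p, (size_t)getpagesize(), nElements * sizeof(float)) != 0)
        return NULL;
    return (float *)p;
}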
Edit 2: timing per thread
#pragma omp parallel private(dLocalTime)
{
    pdTime[omp_get_thread_num()] = omp_get_wtime();
    printf("Thread Begin %d Time %f\n", omp_get_thread_num(), pdTime[omp_get_thread_num()]);

    #pragma omp for
    for (iGID = 0; iGID < iSize; iGID++)
    {
        // for (j = 0; j < iChunk; j++)
        {
            // pfResult[iGID * iChunk + j] = fValue;
            pfResult[iGID] = fValue;
        }
    }

    //dLocalTime = (omp_get_wtime() - dLocalTime);
    pdTime[omp_get_thread_num()] = (omp_get_wtime() - pdTime[omp_get_thread_num()]);
    printf("Thread End %d Time %f\n", omp_get_thread_num(), pdTime[omp_get_thread_num()]);
    // printf("End Element %d processed by thread %d\n", 0, tid);
}
Each thread takes 0.015 s, for a total of 0.045 s, so there is a fixed OpenMP overhead of about 0.03 s. It is strange that even with huge dimensions we still see this fixed cost, and that a thread with less work takes the same time as processing the whole size (48 threads here).
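To isolate that fixed part, here is a minimal sketch (my own test idea, not part of the code above) that times an empty parallel region, so only the fork/join overhead is measured:

#include <stdio.h>
#include <omp.h>

/* Sketch: measure the cost of spinning up a parallel region
   with no work inside, to separate thread-startup overhead
   from the actual fill loop. */
int main(void)
{
    double t0 = omp_get_wtime();
    #pragma omp parallel
    {
        /* empty region: only fork/join overhead is measured */
    }
    double tFirst = omp_get_wtime() - t0;

    t0 = omp_get_wtime();
    #pragma omp parallel
    {
        /* second empty region, after the thread pool exists */
    }
    double tSecond = omp_get_wtime() - t0;

    printf("first region:  %.6f s\n", tFirst);
    printf("second region: %.6f s\n", tSecond);
    return 0;
}

At least with GCC's libgomp, the worker threads are usually created on the first parallel region and reused afterwards, so the first region should be noticeably more expensive than the second.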
Thanks