I did a program that does matrices multiplications (without optimization).
for(i=0; i<a_r; i++)
for(j=0;j<b_c; j++)
for(k=0;k<a_c; k++)
c[i][j]=c[i][j]+a[i][k]*b[k][j];
Depending on how I allocate the memory, the time of computation is different.
It is important to notice three points:
- I record only the time of computation and not the time of allocation (plus the time of allocation is negligible compared with the time of computation).
- I initialize the matrices with random numbers before computing. I use huge matrices (2500 int*2500 int). I choose this size to use the RAM memory but not the swap.
- This phenomenon disappears if it doesn't use RAM.
I test this program with three different allocations:
Test 1: Global allocation
Int a[2500][2500], b[2500][2500], c[2500][2500];
Main {matrix multiplication a*b=c}
The computing time is quite constant (around 41s).
Test 2: dynamic allocation (arrays)
Main{
int **a,**b,**c;
a=(int **) malloc(sizeof(int)*2500);
for( i=0;i<a_c; i++)
a[i]=(int *) malloc(sizeof(int)*2500);
…
matrix multiplication a*b=c
}
When I execute the program several times in a raw I obtain these running times: 260 seconds, 180, 110, 110… If I wait around 5 seconds and launch again the program I obtain the same results.
Test 3: dynamic allocation (lines)
Main{
int *a, *b, *c;
a=(int *) malloc(sizeof(int)*2500*2500);
… (same for b and c)
matrix multiplication a*b=c
}
The computing time is quite constant (around 44s).
I think test 2 is less efficient because of the way the data are stored in memory. Like explain in this article(the website is out today) or in this question. Some way to go through the memory are more efficient and as a result some way to allocate memory give you a more efficient program.
But (in test 2), I have no clue to figure why the program is faster over time. Does anyone have an idea to explain this phenomenon? Thanks in advance.
P.S. I did these tests on an Intel Xeon E5-2625 with CentOS 6.3 and kernel Linux 3.11.1. EDIT : Frequency scaling is disabled on my computer. The frequency is constant.
Here is the code:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <float.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/resource.h>
/****************************************************
* Random generate a random int between _Min and _Max
*/
int Random(int _iMin,int _iMax)
{
return (_iMin + (rand()%(_iMax-_iMin+1)));
}
/****************************************************
* Return the struct of getrusage.
*/
struct rusage give_ru()
{
struct rusage ru;
getrusage(RUSAGE_SELF, &ru);
//getrusage(RUSAGE_CHILDREN, &ru);
return ru;
}
/****************************************************
* Do a matrix multiplication a*b=c
* This program isn't optimized.
*/
int main()
{
/*
* Initialize variables and array sizes.
*/
int **a,**b,**c;
printf("\nenter Taille de la matrice : 2500");
int a_r=2500,a_c=2500,b_r=2500,b_c=2500;
clock_t start,end;
struct rusage start1,end1;
/* variables to store time difference between
start of matrix multiplication and end of multiplication. */
double dif; /*variable to calculate the time difference between the parallelization */
int i,j,k;
if(a_c!=b_r )
{
printf("\ncan not multiply");
goto again;
}
/*
* Memory allocation.
*/
start =clock();
a=(int **) malloc(sizeof(int *)*a_r);
if (a == NULL)
{
printf("Allocation of matrix a failed \n");
exit(0);
}
for( i=0;i<a_c; i++)
{
a[i]=(int *) malloc(sizeof(int)*a_c);
if (a[i] == NULL)
{
printf("Allocation of matrix a[%d] failed \n",i);
exit(0);
}
}
b=(int **) malloc(sizeof(int *)*b_r);
if (b == NULL)
{
printf("Allocation of matrix b failed \n");
exit(0);
}
for( i=0;i<b_c; i++)
{
b[i]=(int *) malloc(sizeof(int)*b_c);
if (b[i] == NULL)
{
printf("Allocation of matrix b[%d] failed \n",i);
exit(0);
}
}
c=(int **) malloc(sizeof(int *)*a_r);
if (c == NULL)
{
printf("Allocation of matrix c failed \n");
exit(0);
}
for( i=0;i< b_c; i++)
{
c[i]=(int *) malloc(sizeof(int)*b_c);
if (c[i] == NULL)
{
printf("Allocation of matrix c[%d] failed \n",i);
exit(0);
}
}
/* Memory is lock*/
printf("Starting mlockall.\n");
if(mlockall(MCL_CURRENT | MCL_FUTURE)<0) {
perror("mlockall");
return 1;
}
/*
* Matrix initialization (with random integer).
*/
printf("Initializing matrices...\n");
//initializing first matrix
srand(time(NULL));
for(i=0;i<a_r; i++)
{
for(j=0;j<a_c; j++)
{
a[i][j] = Random(0,100);//i+j;
//printf("%d \n", a[i][j]);
}
}
// initializing second matrix
srand(time(NULL));
for(i=0;i<b_r; i++)
{
for(j=0;j<b_c; j++)
{
b[i][j] = Random(0,100);//i*j;
}
}
/*initialize product matrix */
srand(time(NULL));
for(i=0;i<a_r; i++)
{
for(j=0;j< b_c; j++)
{
c[i][j]= Random(0,100);//0;
}
}
end= clock(); //end the timer
dif = ((double) (end - start)) / CLOCKS_PER_SEC; //store the difference
printf("Malloc and initialization took %f sec. time.\n", dif);
/*
* Matrix Multiplication.
*/
start =clock(); //start the timer
start1 = give_ru(); //start the timer
/* multiply matrix one and matrix two */
for(i=0;i<a_r; i++)
for(j=0;j<a_c; j++)
for(k=0;k<b_c; k++)
c[i][j]=c[i][j]+a[i][k]*b[k][j];
end1 = give_ru(); //end the timer
end = clock(); //end the timer
dif = ((double) (end - start)) / CLOCKS_PER_SEC; //store the difference
struct timeval timeUserStart = start1.ru_utime;
double utimeStart = (double)timeUserStart.tv_sec + (double)timeUserStart.tv_usec / 1000000.0;
struct timeval timeSystemStart = start1.ru_stime;
double stimeStart = (double)timeSystemStart.tv_sec + (double)timeSystemStart.tv_usec / 1000000.0;
struct timeval timeUserEnd = end1.ru_utime;
double utimeEnd = (double)timeUserEnd.tv_sec + (double)timeUserEnd.tv_usec / 1000000.0;
struct timeval timeSystemEnd = end1.ru_stime;
double stimeEnd = (double)timeSystemEnd.tv_sec + (double)timeSystemEnd.tv_usec / 1000000.0;
double difuTime = utimeEnd - utimeStart;
double difsTime = stimeEnd - stimeStart;
long pageReclaims = end1.ru_minflt;//start1.ru_minflt-
long pageFaults = end1.ru_majflt;//start1.ru_majflt-
printf("Parallelization took %f sec. time.\n", dif);
printf("Time User : %f .\n", difuTime );
printf("Time System : %f .\n", difsTime );
printf("Page reclaims : %ld .\n", end1.ru_minflt);
printf("Page faults : %ld .\n", end1.ru_majflt);
/*
* free memory.
*/
printf("Finished, cleaning up \n");
munlockall();
/*free memory*/
for(i=0;i<a_r; i++)
{
free(a[i]);
a[i] = NULL;
}
free(a);
a = NULL;
for(i=0;i<a_c; i++)
{
free(b[i]);
b[i] = NULL;
}
free(b);
b = NULL;
for(i=0;i<b_c; i++)
{
free(c[i]);
c[i] = NULL;
}
free(c);
c = NULL;
return 0;
}