I have the following piece of code that gives the resulting time from multiplying 2 matrix of 1024x1024 fields:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define NUM 1024
float a[NUM][NUM],b[NUM][NUM],c[NUM][NUM];
void initialize_matrix(float m[][NUM]);
void load_matrix(float m[][NUM]);
int main() {
int i,j,k;
clock_t t_inicial,t_final;
load_matrix(a);
load_matrix(b);
initialize_matrix(c);
printf("Starting matrix multiplication 1024x1024...\n\n");
t_inicial=clock();
for(i=0;i<NUM;i++)
for(j=0;j<NUM;j++)
for(k=0;k<NUM;k++)
c[i][j] =c[i][j] + a[i][k] * b[k][j];
t_final=clock();
printf("Matrix multiplication finished in: %3.6f seconds",((float) t_final- (float)t_inicial)/CLOCKS_PER_SEC);
}
void initialize_matrix(float m[][NUM]) {
int i,j;
for(i=0;i<NUM;i++)
for(j=0;j<NUM;j++)
m[i][j]=0.0;
return;
}
void load_matrix(float m[][NUM]) {
int i,j;
#pragma omp parallel for
for(i=0;i<NUM;i++)
for(j=0;j<NUM;j++)
m[i][j]=(float) 10*rand()/(float) rand();
return;
}
This code takes 24 seconds to solve it, I've been told that there's some problem with the cache memory. How can I improve this code so it takes less time? It's not very nice that it takes so long.