I made a program that multiplies matrices of the same dimension using (p)threads. The program accepts command line flags -N n -M m where n is the size of the matrix arrays and m is the number of threads (computing threshold). The program compiles and runs but I get strange times for elapsed time, USR time, SYS time, and USR+SYS time. I am testing sizes n = {1000,2000,4000} with each threshold m = {1,2,4}.
I should be seeing the elapsed time reduced and a fairly constant USR+SYS time for each value of n but that is not the case. The output will fluctuate but the problem is that a higher threshold doesn't result in a reduction of elapsed time. Am I implementing threads wrong or is there an issue with my timing?
compile with: -pthread
./* -N n -M m
main
#include<stdio.h>
#include<stdlib.h>
#include<unistd.h>
#include<pthread.h>
#include<sys/time.h>
#include<sys/resource.h>
struct matrix {
double **Matrix_A;
double **Matrix_B;
int begin;
int end;
int n;
};
void *calculon(void *mtrx) {
struct matrix *f_mat = (struct matrix *)mtrx;
// transfer data
int f_begin = f_mat->begin;
int f_end = f_mat->end;
int f_n = f_mat->n;
// definition of temp matrix
double ** Matrix_C;
Matrix_C = (double**)malloc(sizeof(double)*f_n);
int f_pholder;
for(f_pholder=0; f_pholder < f_n; f_pholder++)
Matrix_C[f_pholder] = (double*)malloc(sizeof(double)*f_n);
int x, y, z;
for(x = f_begin; x < f_end; x++)
for(y = 0; y < f_n; y++)
for(z = 0; z < f_n; z++)
Matrix_C[x][y] += f_mat->Matrix_A[x][z]*f_mat->Matrix_B[z][y];
for(f_pholder = 0; f_pholder < f_n; f_pholder++)
free(Matrix_C[f_pholder]);
free(Matrix_C);
}
int main(int argc, char **argv) {
char *p;
int c, i, j, x, y, n, m, pholder, n_m, make_thread;
int m_begin = 0;
int m_end = 0;
while((c=getopt(argc, argv, "NM")) != -1) {
switch(c) {
case 'N':
n = strtol(argv[optind], &p, 10);
break;
case 'M':
m = strtol(argv[optind], &p, 10);
break;
default:
printf("\n**WARNING**\nUsage: -N n -M m");
break;
}
}
if(m > n)
printf("\n**WARNING**\nUsage: -N n -M m\n=> m > n");
else if(n%m != 0)
printf("\n**WARNING**\nUsage: -N n -M m\n=> n % m = 0");
else {
n_m = n/m;
// initialize input matrices
double ** thread_matrixA;
double ** thread_matrixB;
// allocate rows onto heap
thread_matrixA=(double**)malloc(sizeof(double)*n);
thread_matrixB=(double**)malloc(sizeof(double)*n);
// allocate columns onto heap
for(pholder = 0; pholder < n; pholder++) {
thread_matrixA[pholder]=(double*)malloc(sizeof(double)*n);
thread_matrixB[pholder]=(double*)malloc(sizeof(double)*n);
}
// populate input matrices with random numbers
for(i = 0; i < n; i++)
for(j = 0; j < n; j++)
thread_matrixA[i][j] = (double)rand()/RAND_MAX+1;
for(x = 0; x < n; x++)
for(y = 0; y < n; y++)
thread_matrixB[x][y] = (double)rand()/RAND_MAX+1;
printf("\n*** Matrix will be of size %d x %d *** \n", n, n);
printf("*** Creating matrix with %d thread(s) ***\n", m);
struct rusage r_usage;
struct timeval usage;
struct timeval time1, time2;
struct timeval cpu_time1, cpu_time2;
struct timeval sys_time1, sys_time2;
struct matrix mat;
pthread_t thread_lord[m];
// begin timing
getrusage(RUSAGE_SELF, &r_usage);
cpu_time1 = r_usage.ru_utime;
sys_time1 = r_usage.ru_stime;
gettimeofday(&time1, NULL);
for(make_thread = 0; make_thread < m; make_thread++) {
m_begin += n_m;
// assign values to struct
mat.Matrix_A = thread_matrixA;
mat.Matrix_B = thread_matrixB;
mat.n = n;
mat.begin = m_begin;
mat.end = m_end;
// create threads
pthread_create(&thread_lord[make_thread], NULL, calculon, (void *)&mat);
m_begin = (m_end + 1);
}
// wait for thread to finish before joining
for(i = 0; i < m; i++)
pthread_join(thread_lord[i], NULL);
// end timing
getrusage(RUSAGE_SELF, &r_usage);
cpu_time2 = r_usage.ru_utime;
sys_time2 = r_usage.ru_stime;
gettimeofday(&time2, NULL);
printf("\nUser time: %f seconds\n", ((cpu_time2.tv_sec * 1000000 + cpu_time2.tv_usec) - (cpu_time1.tv_sec * 1000000 + cpu_time1.tv_usec))/1e6);
printf("System time: %f seconds\n", ((sys_time2.tv_sec * 1000000 + sys_time2.tv_usec) - (sys_time1.tv_sec * 1000000 + sys_time1.tv_usec))/1e6);
printf("Wallclock time: %f seconds\n\n", ((time2.tv_sec * 1000000 + time2.tv_usec) - (time1.tv_sec * 1000000 + time1.tv_usec))/1e6);
// deallocate matrices
for(pholder = 0; pholder < n; pholder++) {
free(thread_matrixA[pholder]);
free(thread_matrixB[pholder]);
}
free(thread_matrixA);
free(thread_matrixB);
}
return 0;
}