The program mentioned below calculates the vector-vector dot product with sequential, CPU parallel (using OpenMP) and GPU parallel (Cuda) approaches. The following code segments show how the each of these functions are invoked and how the elapsed time is calculated.
#define SEQUENTIAL "-s"
#define PARALLEL "-p"
#define CUDA "-c"
#define VERIFY "-v"
#define TEST_AND_COMPARE "-t"
#define GET_TIME(x); if (clock_gettime(CLOCK_MONOTONIC, &(x)) < 0) { perror("clock_gettime( ):");exit(EXIT_FAILURE);}
int main(int argc, char **argv) {
struct timespec t1, t2, t3, t4;
unsigned long sec, nsec;
float comp_time;
//invoking the sequential version
if (!strcmp(argv[1], SEQUENTIAL)) {
GET_TIME(t1);
sequentialVersion();
GET_TIME(t2);
comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
printf("N=%d: Time(ms)=%.5f \n", N, comp_time);
}
//invoking the parallel version
else if (!strcmp(argv[1], PARALLEL)) {
noOfThreads = atoi(argv[2]);
GET_TIME(t1);
parallelVersion();
GET_TIME(t2);
comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
printf("N=%d: Threads=%d: Time(ms)=%.5f \n", N, noOfThreads,
comp_time);
}
//the cuda invoke goes here...
//comparing the answers received by each method of calculation
else if (!strcmp(argv[1], TEST_AND_COMPARE)) {
precision answer1, answer2, answer3;
GET_TIME(t1);
answer1 = sequentialVersion();
GET_TIME(t2);
comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
printf("%-10s\tN=%d: Ans=%f: Time(ms)=%.5f \n", "Serial", N, answer1, comp_time);
noOfThreads = atoi(argv[2]);
GET_TIME(t3);
answer2 = parallelVersion();
GET_TIME(t4);
comp_time = elapsed_time_msec(&t3, &t4, &sec, &nsec);
printf("%-10s\tN=%d: Ans=%f: Time(ms)=%.5f Threads=%d \n", "Parallel", N, answer2, comp_time, noOfThreads);
}
}
float elapsed_time_msec(struct timespec *begin, struct timespec *end,
unsigned long *sec, unsigned long *nsec) {
if (end->tv_nsec < begin->tv_nsec) {
*nsec = 1000000000 - (begin->tv_nsec - end->tv_nsec);
*sec = end->tv_sec - begin->tv_sec - 1;
} else {
*nsec = end->tv_nsec - begin->tv_nsec;
*sec = end->tv_sec - begin->tv_sec;
}
return (float) (*sec) * 1000 + ((float) (*nsec)) / 1000000;
}
The Makefile
for the above mentioned program is as follows.
#specifying single or double precision
ifeq ($(double),)
precision=
else
precision=-D USE_DOUBLES
endif
#specifying the problem size
ifeq ($(N),)
problem-size=-D PROBLEM_SIZE=1000000
else
problem-size=-D PROBLEM_SIZE=${N}
endif
dot:
nvcc dot-product.cu -arch compute_11 -Xcompiler -fopenmp -O3 $(problem-size) $(precision) -o prog
The code is compiled as make dot
with the default N, and run with ./prog -s
the output is shown as
`N=1000000: Time(ms)=0.00010`
But with the same N, when the program is run with ./prog -t 6
the serial time consumption shows expected behaviour as shown below
Serial N=1000000: Ans=2249052.500000: Time(ms)=2.19174
Parallel N=1000000: Ans=2248955.500000: Time(ms)=0.53915 Threads=6
Cuda N=1000000: Ans=2248959.750000: Time(ms)=0.09935
Why is it behaving like this?