#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <time.h>
#include <intrin.h>
#include <stdint.h>
uint64_t rdtsc()
{
    return __rdtsc();   // read the CPU's time-stamp counter
}

void init_matrix(int *a, int size)
{
    // Fill the matrix with 0..size-1 so the output is easy to verify.
    for (int i = 0; i < size; i++)
        a[i] = i;
}
void print_matrix(int *a, int rows, int columns)
{
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < columns; j++) {
            printf("%d ", a[j + i * columns]);
        }
        printf("\n");
    }
}
__global__ void add_matrix(int *c, int *a, int *b, int rows, int columns)
{
    //printf("Thread Launched %d\n", threadIdx.x);
    // Global 2D coordinates of this thread: x indexes columns, y indexes rows.
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x < columns && y < rows) {   // guard in case the grid overshoots the matrix
        int i = x + y * columns;     // row-major linear index
        c[i] = a[i] + b[i];
    }
}
int main()
{
    int rows = 1 << 10, columns = 1 << 10;
    int *h_a, *h_b, *h_c;
    int blockx = 512, blocky = 512;
    size_t num_bytes = rows * columns * sizeof(int);

    h_a = (int *)malloc(num_bytes);
    h_b = (int *)malloc(num_bytes);
    h_c = (int *)malloc(num_bytes);
    init_matrix(h_a, rows * columns);
    init_matrix(h_b, rows * columns);

    int *d_a, *d_b, *d_c;
    dim3 block(blockx, blocky);
    dim3 grid(columns / block.x, rows / block.y);   // grid.x spans columns, grid.y spans rows

    cudaMalloc((void **)&d_a, num_bytes);
    cudaMalloc((void **)&d_b, num_bytes);
    cudaMalloc((void **)&d_c, num_bytes);
    cudaMemcpy(d_a, h_a, num_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, num_bytes, cudaMemcpyHostToDevice);
    uint64_t a, b;
    a = rdtsc();
    add_matrix<<<grid, block>>>(d_c, d_a, d_b, rows, columns);
    b = rdtsc();   // the launch returns immediately; the kernel may still be running here
    printf("Cycles Taken=%llu\n", (unsigned long long)(b - a));
    cudaDeviceSynchronize();
    cudaMemcpy(h_c, d_c, num_bytes, cudaMemcpyDeviceToHost);
    system("pause");

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);
    cudaDeviceReset();
    return 0;
}
Above is a matrix-addition kernel that I am using to measure the kernel's execution time on the GPU in CPU cycles.
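One caveat I am aware of: the kernel launch is asynchronous, so the rdtsc pair above may be measuring mostly launch overhead rather than the kernel itself. For comparison, the same region could also be timed with CUDA events, which record on the GPU (a minimal sketch using the standard cudaEvent_t API; the start/stop/ms names are mine, not from my program):

cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);                    // enqueued on the same stream as the kernel
add_matrix<<<grid, block>>>(d_c, d_a, d_b, rows, columns);
cudaEventRecord(stop);

cudaEventSynchronize(stop);                // block until the kernel and stop event complete
cudaEventElapsedTime(&ms, start, stop);    // elapsed GPU time in milliseconds
printf("Kernel time = %f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);

The numbers below are from the rdtsc version as posted: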
blockx=1,   blocky=1     Cycles = 436343455
blockx=4,   blocky=4     Cycles = 32447213
blockx=32,  blocky=32    Cycles = 8421874
blockx=128, blocky=128   Cycles = 71655
blockx=256, blocky=256   Cycles = 73000
blockx=512, blocky=512   Cycles = 70002
The list above shows the number of cycles the kernel took as the block.x and block.y dimensions change. The measured time is much lower for the larger block dimensions. Can anybody explain why this is? Is it because the warps are not utilized properly at small block sizes?
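I have not added any error checking, so one thing I cannot rule out is that the larger launches fail outright: 128x128 is already 16384 threads per block, well over the 1024-threads-per-block limit I have seen documented. Below is a sketch of the check I could add right after the kernel launch (standard CUDA runtime calls only; none of this is in my current code):

cudaError_t err = cudaGetLastError();   // reports invalid launch configurations
if (err != cudaSuccess)
    printf("Launch failed: %s\n", cudaGetErrorString(err));

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);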
P.S. These results are from an ASUS ROG laptop with an i7-4710HQ and a GTX 860M.
Thanks