I've got NVS 5400M and I'm trying to get reliable time measurement results for cuda addition on matrix (instance 1000 x 1000).
__global__ void MatAdd(int** A, int** B, int** C) {
int i = threadIdx.x;
int j = threadIdx.y;
C[i][j] = A[i][j] + B[i][j]; }
And I'm doing measurement like:
int numBlocks = 1;
dim3 threadsPerBlock(1000, 1000);
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
MatAdd <<<numBlocks, threadsPerBlock>>>(pA, pB, pC);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cout << setprecision(10) << "GPU Time [ms] " << time << endl;
and the result is: 0.001504000043 ms, which is relatively small. My question is am I doing it right?