I wrote a few kernel functions and I wonder how many milliseconds it takes to process them.

#include <iostream>
#include <stdio.h>
#include <stdlib.h>

using namespace std;
#define N 8000

void fillArray(int *data, int count) {
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100;
}

__global__ void add(int* a, int *b) {
    int add = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        add = a[tid] + b[tid];
    }
}

__global__ void subtract(int* a, int *b) {
    int subtract = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        subtract = a[tid] - b[tid];
    }
}

__global__ void multiply(int* a, int *b) {
    int multiply = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        multiply = a[tid] * b[tid];
    }
}

__global__ void divide(int* a, int *b) {
    int divide = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        divide = a[tid] / b[tid];
    }
}

__global__ void modu(int* a, int *b) {
    int modulus = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        modulus = a[tid] % b[tid];
    }
}

__global__ void neg(int *data) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        data[tid] = -data[tid];
    }
}

float duration(int *devA, int *devB, int blocksPerGrid, int threadsPerBlock) {

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devA);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devB);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return elapsedTime;
}

int main(void) {

    int a[N], b[N];
    float dur = 0;

    int *devA, *devB;

    cudaMalloc((void**) &devA, N * sizeof(int));
    cudaMalloc((void**) &devB, N * sizeof(int));

    fillArray(a, N);
    fillArray(b, N);

    cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devA, b, N * sizeof(int), cudaMemcpyHostToDevice);

    dur = duration(a, b, N, 1);

    cout << "Global memory version:\n";
    cout << "Process completed in " << dur;
    cout << " for a data set of " << N << " integers.";

    return 0;
}

The millisecond count always returns zero. Why? What am I missing here? If I remove the neg kernels from the duration function, it returns 0.15687 ms, which seems like too small a number for processing all these functions. What's wrong with this program?

After the edit, I did this:

#include <iostream>
#include <stdio.h>
#include <stdlib.h>

using namespace std;

const int N = 8000;

void fillArray(int *data, int count) {
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100;
}

__global__ void add(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

__global__ void subtract(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] - b[tid];
    }
}

__global__ void multiply(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] * b[tid];
    }
}

__global__ void divide(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] / b[tid];
    }
}

__global__ void modu(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] % b[tid];
    }
}

__global__ void neg(int *data, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = -data[tid];
    }
}

float duration(int *devA, int *devB, int *devC, int blocksPerGrid, int threadsPerBlock) {

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    double hArrayC[N];

    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    neg<<<blocksPerGrid, threadsPerBlock>>>(devA,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    neg<<<blocksPerGrid, threadsPerBlock>>>(devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return elapsedTime;
}

int main(void) {

    int a[N], b[N],c[N];
    float dur = 0;

    int *devA, *devB,*devC;

    cudaMalloc((void**) &devA, N * sizeof(int));
    cudaMalloc((void**) &devB, N * sizeof(int));
    cudaMalloc((void**) &devC, N * sizeof(int));

    fillArray(a, N);
    fillArray(b, N);

    cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devB, b, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devC, c, N * sizeof(int), cudaMemcpyHostToDevice);

    dur = duration(devA, devB, devC, N, 1);

    cout << "Global memory version:\n";
    cout << "Process completed in " << dur;
    cout << " for a data set of " << N << " integers.";



    cudaFree(devA);
    cudaFree(devB);
    return 0;
}

3 Answers


CUDA kernels run on the device without blocking the CPU thread, so a CUDA call blocks only when you try to read computed data from device memory before it is ready, or when you explicitly synchronize the CPU thread with the GPU using a cudaDeviceSynchronize() call. If you want to measure the calculation time, you need to synchronize before stopping the timer.

If you are interested in measuring the memory-copy time, you need to synchronize after the calculation has started and before the copy timer starts; otherwise the calculation time will be reported as copy time.
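
For example, a minimal sketch of timing a kernel and a copy separately (myKernel and hostC are placeholder names, not taken from the question's code):

cudaEvent_t start, stop;
float kernelTime = 0.0f, copyTime = 0.0f;

cudaEventCreate(&start);
cudaEventCreate(&stop);

// Time the kernel: synchronize before stopping the timer so the
// asynchronous launch has actually finished.
cudaEventRecord(start, 0);
myKernel<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&kernelTime, start, stop);

// Time the copy separately: the synchronization above guarantees the
// kernel time is not charged to the copy.
cudaEventRecord(start, 0);
cudaMemcpy(hostC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&copyTime, start, stop);

cudaEventDestroy(start);
cudaEventDestroy(stop);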

You can use the profiler included in the CUDA SDK to measure the time of all CUDA calls.

JustAnotherCurious

Your kernels are not doing anything, since you only store the results in registers. When compiling, you get warnings such as:

kernel.cu(13): warning: variable "add" was set but never used
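
The warning means the computation is dead code, so the compiler can optimize the kernel body away entirely. A minimal sketch of the fix is to store the result in a global-memory output array (the extra parameter `c` is an addition to your original kernel signature):

__global__ void add(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        // Store to global memory so the computation has a visible
        // side effect and cannot be eliminated.
        c[tid] = a[tid] + b[tid];
    }
}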

Also, if you want to see some better timings, use NVIDIA's profiler: either nvprof (CLI) or nvvp (GUI).

$ nvprof ./kernel

======== NVPROF is profiling kernel...
======== Command: kernel
Global memory version: Process completed in 0 for a data set of 8000 integers.
======== Profiling result:
  Time(%)     Time   Calls       Avg       Min       Max  Name
  100.00   18.46us       2    9.23us    6.02us   12.45us  [CUDA memcpy HtoD]
    0.00       0ns       1       0ns       0ns       0ns  multiply(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  add(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  modu(int*, int*)
    0.00       0ns       2       0ns       0ns       0ns  neg(int*)
    0.00       0ns       1       0ns       0ns       0ns  subtract(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  divide(int*, int*)

You are also using N blocks per grid and 1 thread per block. You should consider reading the answers to this question: http://stackoverflow.com/questions/4391162/cuda-determining-threads-per-block-blocks-per-grid
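
A common pattern is to fix the block size and derive the grid size from the data size; a sketch using the three-argument add from the edited code, where 256 threads per block is just an assumed starting value:

// Round the grid size up so that every element gets a thread.
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);

The `if (tid < N)` guard already present in your kernels protects against the few extra threads launched by the rounded-up grid.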

UPDATE

Concerning the vector addition itself (and the other simple operations), you should either study the vectorAdd sample of the CUDA SDK or use Thrust. The first option will teach you how to use CUDA, and the second will show you the kind of high-level operations you can do with Thrust. If I were you, I would do both.
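
As an illustration (a sketch, not the vectorAdd sample itself), the whole addition could look roughly like this with Thrust, assuming `a` and `b` are host arrays of length N:

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>

// Copy the host data to the device.
thrust::device_vector<int> dA(a, a + N);
thrust::device_vector<int> dB(b, b + N);
thrust::device_vector<int> dC(N);

// dC[i] = dA[i] + dB[i] for every i, computed on the GPU.
thrust::transform(dA.begin(), dA.end(), dB.begin(), dC.begin(),
                  thrust::plus<int>());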

BenC
  • So how can I make the kernels work? Do I need to copy the sum values to the host side? – asdasd May 13 '13 at 06:39
  • Your results are consistent with my instructor's. How can I make these kernels work? Do I need to copy? – asdasd May 13 '13 at 06:41
  • The real question is: what are you trying to do? You provide `a` and `b` to the GPU kernels, but you do not store the results, so you should also provide a `c` array which will contain the results of the operations. Once the computation is over, you copy `c` back to the host. If this is not just CUDA exercises and you want to do operations on vectors in a "serious" project, you should consider using [Thrust](http://docs.nvidia.com/cuda/thrust/). It is easier to use (STL-like), faster to develop, and it provides many operations on vectors. – BenC May 13 '13 at 06:42
  • Can you please check the question again? I edited it and did what you said. Is this right? – asdasd May 13 '13 at 06:57
  • I don't see the difference... If you want to see how it is done, check the [VectorAdd](http://docs.nvidia.com/cuda/cuda-samples/index.html#vector-addition) sample of the CUDA SDK. There is everything you need in that example. – BenC May 13 '13 at 07:00
  • If you use `nvprof`/`nvvp`, you will be able to profile your code more accurately. `nvvp` can analyze your program and give you some indications on what you should try to optimize. You will for instance see that you are only using 1 thread per block, and 8000 blocks per grid, which is wrong. Read this question: http://stackoverflow.com/questions/4391162/cuda-determining-threads-per-block-blocks-per-grid – BenC May 13 '13 at 07:13
  • In my assignment the instructor says that while executing the kernels we should launch as many blocks as there are data elements, with only one thread per block. So that is what I did. – asdasd May 13 '13 at 07:21
  • Well, CUDA runs instructions on groups of 32 threads called warps. If you only run 1 thread per block, 1 warp will be assigned, and 31 threads will be idle... All of this is explained in the question I linked. – BenC May 13 '13 at 07:24
  • There are still a few things wrong, I suppose (`double hArrayC[N];` should be an array of `int`, etc.), but the biggest issues have been tackled. For the rest, study the samples in the CUDA SDK, watch the relevant webinars on [this page](https://developer.nvidia.com/gpu-computing-webinars) and you will be fine. – BenC May 13 '13 at 07:31
  • What do you think about this: http://stackoverflow.com/questions/16517513/cuda-performance-always-return-different-values – asdasd May 13 '13 at 08:20

Try using float (or double) variables and arrays instead of int to store all arithmetic variables and operations. Sometimes the time interval is so small that an integer value will always round to zero.

Neoh