This is my code:

#include <iostream>
#include <stdio.h>
#include <stdlib.h>

using namespace std;

const int N = 8000;

void fillArray(int *data, int count) {
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100;
}

__global__ void add(int *a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

__global__ void subtract(int *a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] - b[tid];
    }
}

__global__ void multiply(int *a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] * b[tid];
    }
}

__global__ void divide(int *a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
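        // note: fillArray fills b with rand() % 100, so b[tid] can be 0 here,
        // which makes this (and modu below) risk a device-side division by zero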
        c[tid] = a[tid] / b[tid];
    }
}

__global__ void modu(int *a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] % b[tid];
    }
}

__global__ void neg(int *data, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = -data[tid];
    }
}

float duration(int *devA, int *devB, int *devC, int blocksPerGrid, int threadsPerBlock) {

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    int hArrayC[N];
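    // each synchronous cudaMemcpy below waits for the preceding kernel to
    // finish, so all seven launches run back-to-back inside the timed region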

    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    neg<<<blocksPerGrid, threadsPerBlock>>>(devA, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    neg<<<blocksPerGrid, threadsPerBlock>>>(devB, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return elapsedTime;
}

int main(void) {

    int *a, *b;
    a = new int[N];
    b = new int[N];

    float dur = 0;

    int *devA, *devB, *devC;

    cudaMalloc((void**) &devA, N * sizeof(int));
    cudaMalloc((void**) &devB, N * sizeof(int));
    cudaMalloc((void**) &devC, N * sizeof(int));

    fillArray(a, N);
    fillArray(b, N);

    cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devB, b, N * sizeof(int), cudaMemcpyHostToDevice);


    dur = duration(devA, devB, devC, N, 1);

    cout << "Global memory version:\n";
    cout << "Process completed in " << dur;
    cout << " for a data set of " << N << " integers.";



    cudaFree(devA);
    cudaFree(devB);
    delete [] a;
    delete [] b;

    return 0;
}

What I want to know is the total milliseconds measured in the duration function. But the measurement always returns a different value: sometimes it is 10 ms, sometimes 0.78652 ms, sometimes 30 ms. Why? What is wrong with my code?

asdasd
  • First, life would be easier for you if you used `nvprof`, as I already told you. Second, you should read the comments on this question: http://stackoverflow.com/questions/16500703/execution-time-issue-in-cuda-benchmarks#comment23709370_16500703. Try setting the GPU to persistence mode (`nvidia-smi -pm 1`). – BenC May 13 '13 at 08:24
  • I know, but I HAVE TO do it this way. – asdasd May 13 '13 at 08:25
  • You could try calling a dummy kernel before the kernels that you are timing; that way the overhead of driver loading should not influence your timings. – BenC May 13 '13 at 08:26
  • Can I solve this programmatically? – asdasd May 13 '13 at 08:28
  • Can you give us more details on your system? Which GPU is used, and on which operating system? The only thing wrong with your code is that you start only one thread per block, which does the trick but is a performance hole. Try starting more threads per block, e.g. 125 blocks each invoking 64 threads (see the sketch after these comments). And free devC at the end for a nice cleanup. – Michael Haidl May 13 '13 at 10:29
  • @kronos: these issues were tackled in his other question (http://stackoverflow.com/questions/16515894/cuda-performance-measuring-elapsed-time-returns-zero) – BenC May 14 '13 at 01:27
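
For reference, here is a minimal sketch of the launch configuration suggested in the comments above. The 125 × 64 split assumes N = 8000; computing the block count with a round-up keeps it correct for other sizes:

int threadsPerBlock = 64;   // instead of 1 thread per block
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;   // 125 for N = 8000

dur = duration(devA, devB, devC, blocksPerGrid, threadsPerBlock);

// ... rest of main unchanged ...
cudaFree(devC);   // the cleanup asked for in the comments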

1 Answer


This may be caused by the loading/unloading of the NVIDIA drivers. Think of it as an initialization step for the GPU.

You can either set your GPU to persistence mode:

nvidia-smi -pm 1

Or you could run a dummy kernel before timing your GPU code to trigger the loading of the drivers:

__global__ void dummy()
{
    // This kernel does nothing, this is just a "warm-up"
}

// Before your cudaEventRecord etc.
dummy<<<blocksPerGrid, threadsPerBlock>>>();

Or maybe just use cudaThreadSynchronize() before timing your kernels.
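
For completeness, here is a sketch of how the warm-up could slot into the question's main, right before the timed call. Note that cudaDeviceSynchronize() is the current replacement for the deprecated cudaThreadSynchronize():

dummy<<<1, 1>>>();         // any tiny launch works; it just forces driver/context initialization
cudaDeviceSynchronize();   // make sure the warm-up has finished before timing starts
dur = duration(devA, devB, devC, blocksPerGrid, threadsPerBlock);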

BenC
  • What about the results from `nvprof`? Are they always consistent? – BenC May 13 '13 at 08:43
  • What is your machine config? What kind of GPU are you using? Is this windows or linux? Is the GPU also hosting a display? Which driver and CUDA version are you using? When I run your code, I get 0.76 or so pretty consistently. I never see 10, 20, or 30. Since you are running multiple kernels within your timing sequence, it's possible that display driver is taking the GPU away briefly to perform display tasks, and this could show up in your timing. I think if you time each kernel individually with start and stop points, and sum the elapsed times, you will see less variation. – Robert Crovella May 14 '13 at 01:05
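
A minimal sketch of the per-kernel timing described in the last comment, using the question's add kernel (totalMs is a hypothetical running sum accumulated across all seven launches):

cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);               // block until this one kernel has finished
cudaEventElapsedTime(&ms, start, stop);
totalMs += ms;                            // hypothetical accumulator over the individual kernels

cudaEventDestroy(start);
cudaEventDestroy(stop);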