
I'm using VS2019 and have an NVIDIA GeForce GPU. I tried the code from this link: https://towardsdatascience.com/writing-lightning-fast-code-with-cuda-c18677dcdd5f

However, I want to try using cudaMalloc instead of managed memory with cudaMallocManaged.

I tried the code below:

#include <chrono>
#include <cmath>
#include <iostream>

__global__
void add(int n, float* x, float* y)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
}


int main()
{
    int N = 1 << 20;   

    float* x, * y;
    cudaMalloc(&x, N * sizeof(float));
    cudaMalloc(&y, N * sizeof(float));
    cudaMemset(x,1.0, N * sizeof(float)); //want to set x as an array of 1.0s
    cudaMemset(y,2.0, N * sizeof(float)); //want to set y as an array of 2.0s

    int device = -1;
    cudaGetDevice(&device);

    int blockSize = 1024;
    int numBlocks = (N + blockSize - 1) / blockSize;
    auto t1 = std::chrono::high_resolution_clock::now();
    add << <numBlocks, blockSize >> > (N, x, y);

    cudaDeviceSynchronize();
    auto t2 = std::chrono::high_resolution_clock::now(); 
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count(); 

    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;
    std::cout << "duration CUDA: "<<duration; 

    cudaFree(x);
    cudaFree(y);

    return 0;
}

But I'm getting an unhandled exception error at maxError = fmax(maxError, fabs(y[i] - 3.0f));. I'm guessing it's because I didn't use cudaMemset correctly. How should I modify it?

user5739619
  • You can't substitute device memory for managed memory on the host, and you can't use cudaMemset to set floats – talonmies Feb 07 '20 at 20:38
  • so what am I supposed to do instead then? – user5739619 Feb 07 '20 at 21:27
  • You can't access device memory on the host. You must explicitly copy to a host variable after the kernel, and you should probably initialize host memory and copy to the device. And maybe read some API documentation – talonmies Feb 07 '20 at 21:32
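
In code, the fix the comments describe looks roughly like this (a sketch only; y is the device pointer from the question and yh is a hypothetical std::vector<float> host buffer):

// Copy the device result into a host buffer before reading it on the host.
std::vector<float> yh(N);
cudaMemcpy(yh.data(), y, N * sizeof(float), cudaMemcpyDeviceToHost);

float maxError = 0.0f;
for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(yh[i] - 3.0f));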

1 Answer


In no particular order:

  1. Device memory (i.e. memory allocated with cudaMalloc) can't be accessed directly on the host, so your maxError calculation is illegal because y is a pointer to device memory. To perform the error check, you need to copy y back to a host buffer before running the loop.
  2. cudaMemset sets bytes, not words (just like regular memset). You either need to set the values on the host and copy them to the device, initialize them in another kernel, or use something like thrust::fill_n (see the sketch after this list).
  3. In the spirit of your previous question, there is typically setup latency in the first call of a kernel, so perform a warm-up before timing
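
For point 2, here is a minimal sketch of the thrust::fill_n route, assuming x and y have already been allocated with cudaMalloc exactly as in the question:

#include <thrust/device_ptr.h>
#include <thrust/fill.h>

// Wrap the raw device pointers so Thrust treats them as device iterators,
// then fill each array with the desired float value directly on the GPU.
thrust::fill_n(thrust::device_pointer_cast(x), N, 1.0f);
thrust::fill_n(thrust::device_pointer_cast(y), N, 2.0f);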

Doing these three things gets me this:

#include <chrono>
#include <cmath>
#include <iostream>
#include <vector>

int main()
{
    int N = 1 << 20;   

    // Initialize x and y on the host (all 1.0s and all 2.0s respectively)
    std::vector<float> xh(N, 1.0f);
    std::vector<float> yh(N, 2.0f);

    // Allocate device memory and copy the host data across
    float* x, * y;
    cudaMalloc(&x, N * sizeof(float));
    cudaMemcpy(x, &xh[0], N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMalloc(&y, N * sizeof(float));
    cudaMemcpy(y, &yh[0], N * sizeof(float), cudaMemcpyHostToDevice);

    // Let the CUDA runtime suggest a launch configuration for maximum occupancy
    int blockSize, numBlocks;
    cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, add);

    // Time several launches; the first one absorbs the one-off setup latency
    for(int rep=0; rep<10; rep++) {
        auto t1 = std::chrono::high_resolution_clock::now();
        add << <numBlocks, blockSize >> > (N, x, y);
        cudaDeviceSynchronize();
        auto t2 = std::chrono::high_resolution_clock::now(); 
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count(); 
        std::cout << rep << " duration CUDA: " << duration <<std::endl; 
    }

    // Copy the result back to the host before reading it in host code
    cudaMemcpy(&yh[0], y, N * sizeof(float), cudaMemcpyDeviceToHost);

    // After 10 launches y holds 2.0 + 10 * 1.0 = 12.0, so check against that
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(yh[i] - 12.0f));
    std::cout << "Max error: " << maxError << std::endl;

    cudaFree(x);
    cudaFree(y);

    cudaDeviceReset();

    return 0;
}

And compiling it and running it gets me this:

$ nvcc -arch=sm_52 -std=c++11 -o devmem devmem.cu
$ ./devmem 
0 duration CUDA: 155
1 duration CUDA: 94
2 duration CUDA: 95
3 duration CUDA: 94
4 duration CUDA: 94
5 duration CUDA: 93
6 duration CUDA: 93
7 duration CUDA: 99
8 duration CUDA: 92
9 duration CUDA: 93
Max error: 0

Compared to the timings in my last answer to you, you can see that using device memory provides a speedup over managed memory on my system. As always, your results might vary.

talonmies