
My goal is to write a simple ray tracer with the Phong shading model in CUDA/C++. It is supposed to calculate the appropriate colors and write them into a frame buffer on the GPU; afterwards I write the values from the frame buffer into a .ppm file on the CPU. The image size is 512x512, so for the thread layout in the kernel call I used the following arguments: `dim3 thread_blocks(16, 16)` and `dim3 threads_per_block(32, 32)`.

In theory this gives me (16*16) * (32*32) threads, which is equal to the number of pixels in the image (512 * 512). However, this produces a CUDA error with error code 700 on the `cudaMemcpy` call that copies the data back from the device to the host. Using a smaller number of threads per block, such as `dim3 threads_per_block(16, 16)`, works without an error but of course only renders a quarter of the image.
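
As a sanity check, here is that arithmetic as a minimal standalone sketch (the variable names just mirror the launch arguments and are only illustrative):

// Sanity check: a 16x16 grid of 32x32 blocks yields one thread per pixel
// of a 512x512 image. Compile with nvcc, since dim3 comes from the CUDA headers.
#include <cstdio>

int main() {
    const int WIDTH = 512, HEIGHT = 512;
    const dim3 thread_blocks(16, 16);      // grid dimensions
    const dim3 threads_per_block(32, 32);  // block dimensions

    const int total_x = thread_blocks.x * threads_per_block.x;  // 16 * 32 = 512
    const int total_y = thread_blocks.y * threads_per_block.y;  // 16 * 32 = 512

    printf("%d x %d threads for %d x %d pixels\n", total_x, total_y, WIDTH, HEIGHT);
    return 0;
}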

I have tried other thread layouts as well, and even the ones specifically recommended for a 2D image yielded the same error, so that is where I need help.

The kernel call:

void run_kernel(const int size, Vec3f* fb, Sphere* spheres, Light* light, Vec3f* origin) {
    // empty_kernel<<<dim3(16, 16, 1), dim3(32, 32, 1)>>>();
    // cudaDeviceSynchronize();

    Vec3f* fb_device = nullptr;
    Sphere* spheres_dv = nullptr;
    Light* light_dv = nullptr;
    Vec3f* origin_dv = nullptr;

    checkErrorsCuda(cudaMalloc((void**) &fb_device, sizeof(Vec3f) * size));
    checkErrorsCuda(cudaMemcpy((void*) fb_device, fb, sizeof(Vec3f) * size, cudaMemcpyHostToDevice));

    checkErrorsCuda(cudaMalloc((void**) &spheres_dv, sizeof(Sphere) * 3));
    checkErrorsCuda(cudaMemcpy((void*) spheres_dv, spheres, sizeof(Sphere) * 3, cudaMemcpyHostToDevice));

    checkErrorsCuda(cudaMalloc((void**) &light_dv, sizeof(Light) * 1));
    checkErrorsCuda(cudaMemcpy((void*) light_dv, light, sizeof(Light) * 1, cudaMemcpyHostToDevice));

    checkErrorsCuda(cudaMalloc((void**) &origin_dv, sizeof(Vec3f) * 1));
    checkErrorsCuda(cudaMemcpy((void*) origin_dv, origin, sizeof(Vec3f) * 1, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    float time = 0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    cast_ray<<<dim3(16, 16), dim3(32, 32)>>>(fb_device, spheres_dv, light_dv, origin_dv);
    
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("%f ms\n", time);

    checkErrorsCuda(cudaMemcpy(fb, fb_device, sizeof(Vec3f) * size, cudaMemcpyDeviceToHost));

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    checkErrorsCuda(cudaFree(fb_device));
    checkErrorsCuda(cudaFree(spheres_dv));
    checkErrorsCuda(cudaFree(light_dv));
    checkErrorsCuda(cudaFree(origin_dv));
}
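
Since the kernel launch is asynchronous, an error raised inside cast_ray only surfaces at a later CUDA call; in this code the first checked one is the cudaMemcpy back to the host. A minimal sketch of the extra check I would place right after the launch to narrow this down, reusing the checkErrorsCuda macro from above:

cast_ray<<<dim3(16, 16), dim3(32, 32)>>>(fb_device, spheres_dv, light_dv, origin_dv);

// The launch itself returns immediately; catch invalid-configuration errors first ...
checkErrorsCuda(cudaPeekAtLastError());
// ... then block until the kernel has finished so that runtime errors such as
// code 700 (cudaErrorIllegalAddress) are reported here rather than at the cudaMemcpy.
checkErrorsCuda(cudaDeviceSynchronize());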

The cast_ray function:

__global__ void cast_ray(Vec3f* fb, Sphere* spheres, Light* light, Vec3f* origin) {
    int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    int j = (blockIdx.y * blockDim.y) + threadIdx.y;

    int tid = (j*WIDTH) + i;
    if(i >= WIDTH || j >= HEIGHT) return;

    Vec3f ij(2 * (float((i) + 0.5) / (WIDTH - 1)) - 1, 1 - 2 * (float((j) + 0.5) / (HEIGHT - 1)), -1);
    Vec3f *dir = new Vec3f(ij - *origin);
    Ray r(*origin, *dir);

    float intersections[3];
    int hp = -1;
    for(int ii = 0; ii < 3; ii++) {
        intersections[ii] = r.has_intersection(spheres[ii]);
    }

    int asize = sizeof(intersections) / sizeof(*intersections);
    if(asize == 1) {
        hp = intersections[0] < 0 ? -1 : 0;
    } else {
        if(asize != 0) {
            float min_val = 100.0;
            for (int ii = 0; ii < asize; ii++) {
                if (intersections[ii] < 0.0) continue;
                else if (intersections[ii] < min_val) {
                    min_val = intersections[ii];
                    hp = ii;
                }
            }
        }
    }

    if(hp == -1) {
        fb[tid] = Color(94, 156, 255);
    } else {
        auto color = get_color_at(r, intersections[hp], light, spheres[hp], spheres);
        fb[tid] = color;
    }
}

The error message: CUDA error at ./main.cu::195 with error code 700 for cudaMemcpy(fb, fb_device, sizeof(Vec3f) * size, cudaMemcpyDeviceToHost)(). (The corresponding line is the cudaMemcpy after the printf in the run_kernel function above.)

With cuda-memcheck I get the following information:

========= Error: process didn't terminate successfully
========= Out-of-range Shared or Local Address
=========     at 0x00000100 in __cuda_syscall_mc_dyn_globallock_check
=========     by thread (0,7,0) in block (2,5,0)

(This was tried on an RTX 2060 SUPER.)

  • 1. for questions like this you are supposed to provide a [mcve]. What you have shown is not one. 2. When using `cuda-memcheck` I suggest the method [here](https://stackoverflow.com/questions/27277365). 3. regarding `new` in the kernel: A. shouldn't you have a corresponding `delete`? B. in-kernel `new` is subject to limitations of the [device heap](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dynamic-global-memory-allocation-and-operations). – Robert Crovella Feb 01 '21 at 18:18
  • 4. If you are on a display gpu, you may be running into a kernel duration limit. eg. a [WDDM timeout](https://docs.nvidia.com/gameworks/content/developertools/desktop/timeout_detection_recovery.htm) – Robert Crovella Feb 01 '21 at 18:18
  • @RobertCrovella I apologize for not providing the minimal reproducible example. I do feel silly for another reason though and that is precisely the issue you described with using `new`. After I added the corresponding `delete` it worked though much slower than it should be. Then I realized that the Vec3f pointer I had in the kernel was a relict of older code and that it didn't need to be a pointer anymore and after that, it ran much faster. Thank you anyway! – xkevio Feb 01 '21 at 18:30

1 Answer


Changing `Vec3f *dir = new Vec3f(ij - *origin);` to `Vec3f dir(ij - *origin);` solved the issue! `dir` being a pointer was a remnant of earlier iterations of the code that was no longer needed. But even if you do keep in-kernel allocations, don't forget to `delete` everything you `new`.
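
For completeness, a sketch of the relevant lines in cast_ray after the change (the rest of the kernel is unchanged):

// Build the primary ray with a stack-allocated direction instead of
// allocating it on the limited device heap with new.
Vec3f ij(2 * (float(i + 0.5) / (WIDTH - 1)) - 1, 1 - 2 * (float(j + 0.5) / (HEIGHT - 1)), -1);
Vec3f dir(ij - *origin);   // no new, so nothing to delete
Ray r(*origin, dir);

// If in-kernel new/delete were actually needed, every new would require a matching
// delete, and the device heap (small by default) could be raised with
// cudaDeviceSetLimit(cudaLimitMallocHeapSize, ...) before the launch.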
