Passing an array of different-sized objects to a CUDA kernel

Question

I have an array of pointers to an abstract class A. These pointers point to objects of classes B and C, both of which derive from A. The derived classes do not all have the same size, so each of them has a function called size that returns its size as a size_t.

MCVE:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <vector>

// Abstract base for objects whose concrete size differs per derived type.
// Derived classes report their own byte size through size().
class A {
public:
    // Zero-initialised so a byte-wise copy of the object never reads an
    // indeterminate value.
    float a = 0.0f;

    // Callable from both host and device code, so derived types are free to
    // offer device-side construction as well.
    __host__ __device__ A() {}

    // Virtual so that destroying a derived object through an A* is
    // well-defined.
    __host__ __device__ virtual ~A() {}

    // Returns the size in bytes of the most-derived object.
    __host__ __device__ virtual size_t size() = 0;
};

// Concrete A that carries one additional float.
class B : public A {
public:
    float b;

    B() {}

    // Reports the size of a B object as given by sizeof.
    __host__ __device__ size_t size() override {
        return sizeof(B);
    }
};

// Concrete A that carries two additional floats.
class C : public A {
public:
    float c;
    float d;

    C() {}

    // Reports the size of a C object as given by sizeof.
    __host__ __device__ size_t size() override {
        return sizeof(C);
    }
};

/**
 * Prints the value of size() for every object in `objects`, one per line.
 *
 * Every thread that executes this kernel walks the entire array; there is no
 * per-thread partitioning of the work.
 *
 * @param objects     Array of `numObjects` pointers to A-derived objects; both
 *                    the array and the objects must be readable from device
 *                    code. The CUDA C++ Programming Guide states that invoking
 *                    a virtual function in device code on an object created in
 *                    host code is undefined behavior, so the pointed-to
 *                    objects must have been constructed on the device.
 * @param numObjects  Number of entries in `objects`.
 */
__global__ void testKernel(A** objects, int numObjects) {
    for (int i = 0; i < numObjects; i++) {
        // size() returns size_t; passing it to "%d" is a varargs type
        // mismatch. Device printf documents only the h/l/ll size modifiers,
        // so widen explicitly and print with %llu.
        printf("%llu\n", static_cast<unsigned long long>(objects[i]->size()));
    }
}

/**
 * Host driver: copies one B and one C object to device memory, uploads an
 * array holding their device addresses, and runs testKernel on that array.
 *
 * Exits with -1 on the first CUDA error; returns 0 after the kernel has
 * completed and all device allocations have been released.
 *
 * NOTE(review): the objects are constructed on the host and then byte-copied
 * to the device. The CUDA C++ Programming Guide states that invoking a
 * virtual function in device code on such an object is undefined behavior,
 * so the size() call inside testKernel is not expected to work with this
 * setup. A real fix has to construct the objects in device code.
 */
int main()
{
    // Automatic storage instead of `new`: nothing is leaked and nothing has
    // to be deleted through an A*.
    B host_b;
    C host_c;

    std::vector<A*> host_pointers;
    host_pointers.push_back(&host_b);
    host_pointers.push_back(&host_c);

    cudaError_t cudaStatus;

    // One device allocation per object, because the derived sizes differ.
    std::vector<A*> device_pointers;
    for (auto obj : host_pointers) {
        const size_t bytes = obj->size();
        A* device_pointer = nullptr;

        cudaStatus = cudaMalloc((void**)&device_pointer, bytes);
        if (cudaStatus != cudaSuccess) {
            // %zu is the matching conversion for size_t.
            fprintf(stderr, "cudaMalloc failed for size %zu\n", bytes);
            exit(-1);
        }

        cudaStatus = cudaMemcpy(device_pointer, obj, bytes, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed for size %zu\n", bytes);
            exit(-1);
        }
        device_pointers.push_back(device_pointer);
    }

    // Upload the table of device addresses so the kernel can index it.
    A** array_of_device_pointers = nullptr;
    const size_t table_bytes = device_pointers.size() * sizeof(A*);

    cudaStatus = cudaMalloc((void**)&array_of_device_pointers, table_bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed\n");
        exit(-1);
    }

    cudaStatus = cudaMemcpy(array_of_device_pointers, device_pointers.data(), table_bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy of pointer array failed\n");
        exit(-1);
    }

    // The kernel takes an int count; make the narrowing from size_t explicit.
    testKernel<<<1, 1>>>(array_of_device_pointers, static_cast<int>(device_pointers.size()));

    // Launch-configuration errors are reported here, not by the launch itself.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel failed, reason: %s\n", cudaGetErrorString(cudaStatus));
        exit(-1);
    }

    // Errors raised while the kernel runs surface at this synchronization.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSynchronize failed\n");
        exit(-1);
    }

    // Release every device allocation made above.
    cudaStatus = cudaFree(array_of_device_pointers);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaFree failed\n");
        exit(-1);
    }
    for (A* device_pointer : device_pointers) {
        cudaStatus = cudaFree(device_pointer);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaFree failed\n");
            exit(-1);
        }
    }

    return 0;
}

What I want to do: have objects derived from A in a kernel

How I'm trying to do it: copy the objects to device memory one by one (since their sizes aren't the same), then copy an array of their device pointers to device memory, and then pass a device pointer to that onto a kernel

What I experience: When I run the program with the Nsight debugger, it stops at the line printf("%d\n", objects[i]->size()); which (I'm guessing) means that objects[0] is not a valid pointer.

Not sure if it matters with something this simple, but I'm running this on a GPU with compute capability 8.6, though compiling for compute capability 5.2

A side note: `sizeof(this)` will give you the size of `this` which is a pointer, not the size of your class object. You probably meant `sizeof(*this)` . — wohlstad, Apr 21 '23 at 10:50
"If an object is created in host code, invoking a virtual function for that object in device code has undefined behavior." [CUDA C++ Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#virtual-functions). The pointers in the vtables copied to the device with the objects are probably pointing to the host versions of the virtual functions. — paleonix, Apr 21 '23 at 12:42

Passing an array of different-sized objects to a CUDA kernel

0 Answers