
Please refer to the code below -

#include <iostream>
#include <curand_kernel.h>
#include "cuda_common.cuh"

class textures{
public:
    __device__ virtual int value(int random_info)const = 0;
    __device__ virtual textures* Clone()=0;
    __device__ __host__ virtual void print_info()const = 0;
};

class texture1:public textures{
public:
    __device__ texture1(int inf):info(inf){}
    __device__ virtual textures* Clone()override{
        return new texture1(*this);
    }
    __device__ virtual int value(int random_info)const{
        int _info_=random_info;
        return _info_;
    } 

    __device__ __host__ virtual void print_info()const override{
        printf("\nI am a texture1");
    }

    int info;
};

class texture2:public textures{
public:
    __device__ texture2(int inf,int inf2):info(inf),info2(inf2){}
    __device__ virtual textures* Clone()override{
        return new texture2(*this);
    }
    __device__ virtual int value(int random_info)const{
        int _info_=random_info;
        return _info_+1;
    } 

    __device__ __host__ virtual void print_info()const override{
        printf("\nI am a texture2");
    }

    int info,info2;
};

__global__ void device_init(textures **t_list,int size){

    curandState localState;

    curand_init(1984,0,0,&localState);

    for(int i=0;i<size;i++)
    {
        float rand_no = curand_uniform(&localState);

        if(rand_no<=0.5f)
        {
            t_list[i] = new texture1(3);
        }
        else
        {
            t_list[i] = new texture2(4,3);
        }
    }
}


__global__ void device_show(textures **t_list,int size){
    printf("\n\nFor the device");
    for(int i=0;i<size;i++)
    {
    
        t_list[i]->print_info();
        printf("\n");
    }

}

void host_show(textures **t_list,int size)
{
    printf("\n\nFor the host");
    for(int i=0;i<size;i++)
    {
        t_list[i]->print_info();
        printf("\n");
    }
}



int main(){
    int size = 10;

    textures **t_list;
    cudaMallocManaged(&t_list,size*sizeof(textures*));

    device_init<<<1,1>>>(t_list,size);
    cudaDeviceSynchronize();

    device_show<<<1,1>>>(t_list,size);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());


    host_show(t_list,size);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

}

(I have just started learning CUDA, so please bear with me if some of my assumptions seem nonsensical.)

My question -

Is there a way to copy my double pointer, and the memory it subsequently points to, from device memory into host memory?

I have read many answers and suggestions on copying such data, but the fact that I am instantiating my device pointers inside a kernel, and using them to point to objects of classes derived from an abstract class, makes it very difficult for me to understand what is actually happening in the background.

I have not used cudaMallocManaged() before and am very confused about its functionality in my scenario.

What I have tried so far -

I think I understand why I am facing this problem: because I am dynamically allocating the objects inside my kernel, the double-pointer array itself is stored in unified memory, but the objects it points to actually live in device memory?

I am not at all sure what exactly is happening, but taking that as the likely cause, I tried executing the following code and got an invalid __global__ read at t_list[i]->print_info(); in the device_show function.

The code -

#include <iostream>
#include <curand_kernel.h>
#include "cuda_common.cuh"

class textures{
public:
    __device__ virtual int value(int random_info)const = 0;
    __device__ virtual textures* Clone()=0;
    __device__ __host__ virtual void print_info()const = 0;
};

class texture1:public textures{
public:
    __device__ texture1(int inf):info(inf){}
    __device__ virtual textures* Clone()override{
        return new texture1(*this);
    }
    __device__ virtual int value(int random_info)const{
        int _info_=random_info;
        return _info_;
    } 

    __device__ __host__ virtual void print_info()const override{
        printf("\nI am a texture1");
    }

    int info;
};

class texture2:public textures{
public:
    __device__ texture2(int inf,int inf2):info(inf),info2(inf2){}
    __device__ virtual textures* Clone()override{
        return new texture2(*this);
    }
    __device__ virtual int value(int random_info)const{
        int _info_=random_info;
        return _info_+1;
    } 

    __device__ __host__ virtual void print_info()const override{
        printf("\nI am a texture2");
    }

    int info,info2;
};

__global__ void device_init(textures **t_list,int size){

    curandState localState;

    curand_init(1984,0,0,&localState);

    for(int i=0;i<size;i++)
    {
        float rand_no = curand_uniform(&localState);

        if(rand_no<=0.5f)
        {
            *t_list[i] = texture1(3);
        }
        else
        {
            *t_list[i] = texture2(4,3);
        }
    }
}


__global__ void device_show(textures **t_list,int size){
    printf("\n\nFor the device");
    for(int i=0;i<size;i++)
    {

        t_list[i]->print_info();
        printf("\n");
    }

}

void host_show(textures **t_list,int size)
{
    printf("\n\nFor the host");
    for(int i=0;i<size;i++)
    {
        t_list[i]->print_info();
        printf("\n");
    }
}



int main(){
    int size = 10;

    textures **t_list;
    cudaMallocManaged(&t_list,size*sizeof(textures*));

    for(int i=0;i<size;i++)
        cudaMallocManaged(&t_list[i],sizeof(textures));

    device_init<<<1,1>>>(t_list,size);
    cudaDeviceSynchronize();

    device_show<<<1,1>>>(t_list,size);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());


    host_show(t_list,size);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

}

The Error-

========= Invalid __global__ read of size 8 bytes
=========     at 0x1e0 in C:/Users/sonas/Documents/Capstone 2022-23/Ray-tracing-in-a-distributed-framework/cleanup.cu:590:device_show(textures **, int)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x10 is out of bounds
=========     and is 30,07,31,59,664 bytes before the nearest allocation at 0x700800000 of size 512 bytes
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:cuEventRecordWithFlags [0x7ff9d154e5c8]
=========                in C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_91804b01bce922dd\nvcuda64.dll
=========     Host Frame: [0x28c3]
=========                in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
=========     Host Frame: [0x2786]
=========                in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
=========     Host Frame: [0x2404]
=========                in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
=========     Host Frame: [0x1ccb]
=========                in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
=========     Host Frame: [0x18b7]
=========                in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
=========     Host Frame: [0x119b]
=========                in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
=========     Host Frame: [0x13e4]
=========                in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
=========     Host Frame: [0x155c8]
=========                in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
=========     Host Frame:BaseThreadInitThunk [0x7ffa73c726bd]
=========                in C:\WINDOWS\System32\KERNEL32.DLL
=========     Host Frame:RtlUserThreadStart [0x7ffa7510dfb8]
=========                in C:\WINDOWS\SYSTEM32\ntdll.dll
=========
========= Program hit cudaErrorUnknown (error 999) due to "unknown error" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:cuProfilerStop [0x7ff9d1698935]
=========                in C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_91804b01bce922dd\nvcuda64.dll
=========     Host Frame: [0x1faa]
=========                in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
=========     Host Frame: [0x1405]
=========                in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
=========     Host Frame: [0x155c8]
=========                in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
=========     Host Frame:BaseThreadInitThunk [0x7ffa73c726bd]
=========                in C:\WINDOWS\System32\KERNEL32.DLL
=========     Host Frame:RtlUserThreadStart [0x7ffa7510dfb8]
=========                in C:\WINDOWS\SYSTEM32\ntdll.dll
=========
GPUassert: unknown error 999 cleanup.cu 622
========= Target application returned an error
========= ERROR SUMMARY: 2 errors

But I am still not able to successfully access that memory on the host.

Constraints-

This code is only a representation of a design problem I am facing in my project; due to constraints in my actual project, I must initialize my data inside the kernel itself.

The same goes for how my classes are arranged.

kratia
    in-kernel `new`, `malloc`, and `cudaMalloc` allocate from the device heap, which cannot participate in a host-launched `cudaMemcpy`-type operation. The straightforward refactoring, since there is no variability here, is to pre-allocate what you need in host code. If you have already tried that, then you should provide a complete example of **that**. – Robert Crovella Mar 14 '23 at 16:32
  • @RobertCrovella I have added the code sample – kratia Mar 14 '23 at 17:02
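
As a minimal sketch of the point made in the comment above (the payload struct and make_one kernel here are made-up stand-ins, not part of the original code): storage obtained with in-kernel new comes from the device heap, so a host-launched cudaMemcpy cannot use it as a source, even when the pointer array holding the address lives in managed memory.

    #include <cuda_runtime.h>
    #include <cstdio>

    struct payload { int a, b; };                    // simplified stand-in for a texture object

    __global__ void make_one(payload **slot) {
        *slot = new payload{3, 4};                   // this allocation comes from the device heap
    }

    int main() {
        payload **slot;
        cudaMallocManaged(&slot, sizeof(payload*));  // the pointer itself lives in managed memory
        make_one<<<1, 1>>>(slot);
        cudaDeviceSynchronize();

        payload host_copy;
        // *slot points into the device heap, which cannot serve as the source of a
        // host-launched cudaMemcpy (the limitation described in the comment above)
        cudaError_t err = cudaMemcpy(&host_copy, *slot, sizeof(payload),
                                     cudaMemcpyDeviceToHost);
        printf("cudaMemcpy from a device-heap pointer: %s\n", cudaGetErrorString(err));
        return 0;
    }

The answer below refactors around this by pre-allocating each object's storage from host code and constructing into it on the device with placement new.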

1 Answer


There are several problems.

  1. At least on my system, sizeof(textures), sizeof(texture1), and sizeof(texture2) are not all the same. Therefore this is problematic:

    cudaMallocManaged(&t_list[i],sizeof(textures));
    

    For demonstration, we can make that sizeof(texture2); a more general way to size these slots is sketched just after this list.

  2. From what I can tell, this does not do a full object copy:

    *t_list[i] = texture1(3);
    

    I suspect it is not copying the vtable/vtable pointer.

    There are several ways to fix that; the one I will suggest is placement new:

    new(t_list[i]) texture1(3);
    
  3. Your code is running into a limitation in CUDA that prevents you from initializing an object on the device and then using one of its virtual methods in host code (or initializing an object on the host and then using one of its virtual methods in device code). I don't really have a solution for this. If you want to get crazy you can manually fix up the vtable pointer before using the object on the host (but then you'll have to do the same before using it again on the device). Rather than propose something silly, I would say the bottom line is that wanting to use polymorphic objects on both the host and the device is not a good design pattern for CUDA (and managed memory makes it even easier to step into the issue). Don't do that.
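
As a side note on item 1 (a hedged sketch; the alloc_slots helper below is hypothetical and not part of the original code): if the set of concrete texture types is known up front, each managed slot can be sized for the largest derived type instead of hard-coding sizeof(texture2).

    #include <algorithm>   // std::max

    // Hypothetical helper: size every managed slot for the largest concrete
    // texture type, so whichever derived object the kernel later constructs
    // with placement new is guaranteed to fit. Assumes the texture1/texture2
    // classes shown in the question.
    void alloc_slots(textures **t_list, int size)
    {
        const size_t slot_size = std::max(sizeof(texture1), sizeof(texture2));
        for (int i = 0; i < size; i++)
            cudaMallocManaged(&t_list[i], slot_size);
    }

The placement-new construction in the example below then works unchanged, whichever branch the kernel takes.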

Here's an example that addresses the first 2 items above:

$ cat t2220.cu
#include <iostream>
#include <curand_kernel.h>
#include <new>
#define gpuErrchk(x) x

class textures{
public:
    int tt = 0;
    __device__ virtual int value(int random_info)const = 0;
    __device__ virtual textures* Clone() = 0;
    __device__  virtual void print_info() const = 0;
    __host__ void print_host_info() {printf("tt = %d\n", tt);}
    __device__ textures():tt(0){}
    __device__ textures(int x):tt(x){}
};

class texture1:public textures{
public:
    __device__ texture1(int inf):textures(1), info(inf) {}
    __device__ virtual textures* Clone()override{
        return new texture1(*this);
    }
    __device__ virtual int value(int random_info)const{
        int _info_=random_info;
        return _info_;
    }

    __device__ virtual void print_info()const override{
        printf("\nI am a texture1");
    }
    int info;
};

class texture2:public textures{
public:
    __device__ texture2(int inf,int inf2):textures(2), info(inf),info2(inf2){ }
    __device__ virtual textures* Clone()override{
        return new texture2(*this);
    }
    __device__ virtual int value(int random_info)const{
        int _info_=random_info;
        return _info_+1;
    }

    __device__ virtual void print_info()const override{
        printf("\nI am a texture2");
    }
    int info,info2;
};

__global__ void device_init(textures **t_list,int size){

    curandState localState;

    curand_init(1984,0,0,&localState);

    for(int i=0;i<size;i++)
    {
        float rand_no = curand_uniform(&localState);

        if(rand_no<=0.5f)
        {
            new(t_list[i])texture1(3);
        }
        else
        {
            new(t_list[i])texture2(3,4);
        }
    }
}


__global__ void device_show(textures **t_list,int size){
    printf("\n\nFor the device\n");
    for(int i=0;i<size;i++)
    {

        t_list[i]->print_info();
        printf("\n");
    }

}

void host_show(textures **t_list,int size)
// this won't work for objects initialized on the device
{
    printf("\n\nFor the host");
    for(int i=0;i<size;i++)
    {
// this won't work for objects initialized on the device
        //t_list[i]->print_info();
        t_list[i]->print_host_info();
        printf("\n");
    }
}



int main(){
    int size = 10;
    printf("%lu, %lu, %lu\n", sizeof(textures), sizeof(texture1), sizeof(texture2));
    textures **t_list;
    cudaMallocManaged(&t_list,size*sizeof(textures*));

    for(int i=0;i<size;i++)
        cudaMallocManaged(&t_list[i],sizeof(texture2));

    device_init<<<1,1>>>(t_list,size);
    cudaDeviceSynchronize();

    device_show<<<1,1>>>(t_list,size);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

    host_show(t_list,size);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
}
$ nvcc -arch=sm_35 -lineinfo t2220.cu -o t2220
nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
$ CUDA_VISIBLE_DEVICES="1" cuda-memcheck ./t2220
========= CUDA-MEMCHECK
16, 16, 24


For the device

I am a texture1

I am a texture1

I am a texture1

I am a texture2

I am a texture1

I am a texture1

I am a texture2

I am a texture2

I am a texture1

I am a texture2


For the hosttt = 1

tt = 1

tt = 1

tt = 2

tt = 1

tt = 1

tt = 2

tt = 2

tt = 1

tt = 2

========= ERROR SUMMARY: 0 errors
$

(Yes, I added a few other items to set a base class member according to the derived type. It's not really relevant.)

Robert Crovella