Please refer to the code below -
#include <iostream>
#include <curand_kernel.h>
#include "cuda_common.cuh"

class textures {
public:
    __device__ virtual int value(int random_info) const = 0;
    __device__ virtual textures* Clone() = 0;
    __device__ __host__ virtual void print_info() const = 0;
};

class texture1 : public textures {
public:
    __device__ texture1(int inf) : info(inf) {}
    __device__ virtual textures* Clone() override {
        return new texture1(*this);
    }
    __device__ virtual int value(int random_info) const override {
        int _info_ = random_info;
        return _info_;
    }
    __device__ __host__ virtual void print_info() const override {
        printf("\nI am a texture1");
    }
    int info;
};

class texture2 : public textures {
public:
    __device__ texture2(int inf, int inf2) : info(inf), info2(inf2) {}
    __device__ virtual textures* Clone() override {
        return new texture2(*this);
    }
    __device__ virtual int value(int random_info) const override {
        int _info_ = random_info;
        return _info_ + 1;
    }
    __device__ __host__ virtual void print_info() const override {
        printf("\nI am a texture2");
    }
    int info, info2;
};

// Builds the list on the device: each object is created with device-side new.
__global__ void device_init(textures **t_list, int size) {
    curandState localState;
    curand_init(1984, 0, 0, &localState);
    for (int i = 0; i < size; i++) {
        float rand_no = curand_uniform(&localState);
        if (rand_no <= 0.5f) {
            t_list[i] = new texture1(3);
        } else {
            t_list[i] = new texture2(4, 3);
        }
    }
}

__global__ void device_show(textures **t_list, int size) {
    printf("\n\nFor the device");
    for (int i = 0; i < size; i++) {
        t_list[i]->print_info();
        printf("\n");
    }
}

void host_show(textures **t_list, int size) {
    printf("\n\nFor the host");
    for (int i = 0; i < size; i++) {
        t_list[i]->print_info();
        printf("\n");
    }
}

int main() {
    int size = 10;
    textures **t_list;
    // Only the array of pointers lives in managed memory; the objects
    // themselves are created inside device_init.
    cudaMallocManaged(&t_list, size * sizeof(textures*));
    device_init<<<1, 1>>>(t_list, size);
    cudaDeviceSynchronize();
    device_show<<<1, 1>>>(t_list, size);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
    host_show(t_list, size);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
}
(I have just started learning CUDA, so please bear with me if some of my assumptions seem nonsensical.)
My question -
Is there a way to copy my double pointer, and the memory it ultimately points to, from device memory into host memory?
I have referred to a lot of answers and suggestions on copying such data, but the fact that I am instantiating my device pointers inside a kernel, and on top of that using them to point to objects of classes derived from an abstract class, makes it very difficult for me to understand what is actually happening in the background.
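To make my confusion concrete: the kind of copy those answers describe for plain data would, I think, look roughly like the sketch below (host_list is just a hypothetical name I made up for illustration). But since every t_list[i] was written by device-side new, I do not see how the host could ever dereference the copied pointers.

    // Hypothetical sketch of the "plain" copy I can picture.
    // Copying the pointer array itself seems easy enough...
    textures **host_list = (textures **)malloc(size * sizeof(textures *));
    cudaMemcpy(host_list, t_list, size * sizeof(textures *), cudaMemcpyDeviceToHost);
    // ...but each host_list[i] still holds an address on the device heap
    // (from new inside device_init), so dereferencing it on the host,
    // e.g. host_list[i]->print_info(), cannot be valid as far as I understand.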
I have not used cudaMallocManaged() before and am very confused about its functionality in my scenario.
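For reference, my mental model of cudaMallocManaged() comes from simple cases like the toy sketch below, where a single managed allocation of plain data is written by a kernel and then read directly on the host. I am not sure how far that model carries over to an array of pointers to polymorphic objects.

    // Minimal managed-memory sketch (plain data only) that matches my mental model.
    #include <cstdio>

    __global__ void fill(int *data, int n) {
        for (int i = 0; i < n; i++)
            data[i] = i * i;                     // device writes into managed memory
    }

    int main() {
        int n = 10;
        int *data;
        cudaMallocManaged(&data, n * sizeof(int));  // visible to host and device
        fill<<<1, 1>>>(data, n);
        cudaDeviceSynchronize();                    // wait for the kernel to finish
        for (int i = 0; i < n; i++)
            printf("%d ", data[i]);                 // host reads the same allocation
        cudaFree(data);
        return 0;
    }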
What I have tried till now-
I think I understand why I am facing this problem: the reason could be that, because I am dynamically allocating the objects inside the kernel, the double pointer itself is stored in unified memory but the content it points to actually lives in device memory?
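Put differently, my (possibly wrong) picture of where everything lives in the first version is:

    // t_list      : the array of pointers, allocated with cudaMallocManaged,
    //               so the pointer values themselves are readable on the host.
    // t_list[i]   : a pointer value filled in by device_init.
    // *t_list[i]  : an object created with device-side new, which I believe sits
    //               on the device heap and is therefore not dereferenceable on the host.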
I am not at all sure that this is exactly what is happening, but taking it as the likely problem, I tried executing the following code and got an invalid __global__ read at t_list[i]->print_info(); in the device_show function.
The code -
#include <iostream>
#include <curand_kernel.h>
#include "cuda_common.cuh"

class textures {
public:
    __device__ virtual int value(int random_info) const = 0;
    __device__ virtual textures* Clone() = 0;
    __device__ __host__ virtual void print_info() const = 0;
};

class texture1 : public textures {
public:
    __device__ texture1(int inf) : info(inf) {}
    __device__ virtual textures* Clone() override {
        return new texture1(*this);
    }
    __device__ virtual int value(int random_info) const override {
        int _info_ = random_info;
        return _info_;
    }
    __device__ __host__ virtual void print_info() const override {
        printf("\nI am a texture1");
    }
    int info;
};

class texture2 : public textures {
public:
    __device__ texture2(int inf, int inf2) : info(inf), info2(inf2) {}
    __device__ virtual textures* Clone() override {
        return new texture2(*this);
    }
    __device__ virtual int value(int random_info) const override {
        int _info_ = random_info;
        return _info_ + 1;
    }
    __device__ __host__ virtual void print_info() const override {
        printf("\nI am a texture2");
    }
    int info, info2;
};

// Changed from the first version: construct into the pre-allocated slots by
// assignment (slicing to the base class) instead of device-side new.
__global__ void device_init(textures **t_list, int size) {
    curandState localState;
    curand_init(1984, 0, 0, &localState);
    for (int i = 0; i < size; i++) {
        float rand_no = curand_uniform(&localState);
        if (rand_no <= 0.5f) {
            *t_list[i] = texture1(3);
        } else {
            *t_list[i] = texture2(4, 3);
        }
    }
}

__global__ void device_show(textures **t_list, int size) {
    printf("\n\nFor the device");
    for (int i = 0; i < size; i++) {
        t_list[i]->print_info();
        printf("\n");
    }
}

void host_show(textures **t_list, int size) {
    printf("\n\nFor the host");
    for (int i = 0; i < size; i++) {
        t_list[i]->print_info();
        printf("\n");
    }
}

int main() {
    int size = 10;
    textures **t_list;
    cudaMallocManaged(&t_list, size * sizeof(textures*));
    // Changed from the first version: give every slot its own managed
    // allocation (sizeof(textures) is only the size of the abstract base).
    for (int i = 0; i < size; i++)
        cudaMallocManaged(&t_list[i], sizeof(textures));
    device_init<<<1, 1>>>(t_list, size);
    cudaDeviceSynchronize();
    device_show<<<1, 1>>>(t_list, size);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
    host_show(t_list, size);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
}
The Error-
========= Invalid __global__ read of size 8 bytes
========= at 0x1e0 in C:/Users/sonas/Documents/Capstone 2022-23/Ray-tracing-in-a-distributed-framework/cleanup.cu:590:device_show(textures **, int)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x10 is out of bounds
========= and is 30,07,31,59,664 bytes before the nearest allocation at 0x700800000 of size 512 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:cuEventRecordWithFlags [0x7ff9d154e5c8]
========= in C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_91804b01bce922dd\nvcuda64.dll
========= Host Frame: [0x28c3]
========= in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
========= Host Frame: [0x2786]
========= in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
========= Host Frame: [0x2404]
========= in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
========= Host Frame: [0x1ccb]
========= in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
========= Host Frame: [0x18b7]
========= in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
========= Host Frame: [0x119b]
========= in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
========= Host Frame: [0x13e4]
========= in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
========= Host Frame: [0x155c8]
========= in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
========= Host Frame:BaseThreadInitThunk [0x7ffa73c726bd]
========= in C:\WINDOWS\System32\KERNEL32.DLL
========= Host Frame:RtlUserThreadStart [0x7ffa7510dfb8]
========= in C:\WINDOWS\SYSTEM32\ntdll.dll
=========
========= Program hit cudaErrorUnknown (error 999) due to "unknown error" on CUDA API call to cudaDeviceSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:cuProfilerStop [0x7ff9d1698935]
========= in C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_91804b01bce922dd\nvcuda64.dll
========= Host Frame: [0x1faa]
========= in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
========= Host Frame: [0x1405]
========= in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
========= Host Frame: [0x155c8]
========= in C:\Users\sonas\Documents\Capstone 2022-23\Ray-tracing-in-a-distributed-framework\exefiles\debug.exe
========= Host Frame:BaseThreadInitThunk [0x7ffa73c726bd]
========= in C:\WINDOWS\System32\KERNEL32.DLL
========= Host Frame:RtlUserThreadStart [0x7ffa7510dfb8]
========= in C:\WINDOWS\SYSTEM32\ntdll.dll
=========
GPUassert: unknown error 999 cleanup.cu 622
========= Target application returned an error
========= ERROR SUMMARY: 2 errors
But I am still not able to successfully access my memory from the host.
Constraints-
This code is just a representation of a design problem I am facing in my project. Because of constraints in the actual project, I must initialize my data inside the kernel itself, and the same goes for how my classes are arranged.
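For completeness, the kind of host-side initialization that this constraint rules out would look roughly like the snippet below. (It would not even compile with my current classes, since the constructors are marked __device__; I am showing it only to make the constraint concrete.)

    // Hypothetical host-side initialization that the project constraint rules out.
    // Shown for illustration only; my constructors are __device__.
    for (int i = 0; i < size; i++) {
        if (i % 2 == 0)
            t_list[i] = new texture1(3);      // host new: object lives on the host heap,
        else
            t_list[i] = new texture2(4, 3);   // so the device could not use it either
    }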