I'm going to take a stab at this, with the clarification that I've never done this and this is just my understanding.
`__device__` functions can be called from a kernel (a `__global__` function). You cannot have `__global__` member functions of a class.
What you can have is a `__global__` init function, but it cannot allocate new memory.
If you want to initialize a memory block with a constructor, the best thing to use is placement new:
// A 2D integer point that can be constructed on either the host or the device.
class Point
{
public:
    // Default constructor: zero-initializes both coordinates so that a
    // default-constructed Point never exposes indeterminate values (for
    // example when it is printed after a device-to-host copy that failed).
    __host__ __device__ Point() : x(0), y(0) {}

    // Constructs a point with x = a and y = b.
    __host__ __device__ Point(int a, int b) : x(a), y(b) {}

    int x, y;  // coordinates; public so callers can read them directly
};
// Kernel that constructs a Point(a, b) in place at the address `buffer`,
// using placement new (no memory is allocated by the kernel itself).
//
//   buffer - destination storage for the Point; must be large enough to hold one Point
//   a, b   - values forwarded to the Point(int, int) constructor
//
// The kernel does not use thread or block indices, so every thread of a
// launch performs the same construction at the same address.
__global__ void init_point(void* buffer, int a, int b)
{
    Point* const slot = static_cast<Point*>(buffer);
    new (slot) Point(a, b);
}
#include <cstdio>
#include <iostream>
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Lists the available devices, lets the user pick one, constructs a
// Point(10, 20) on that device via the init_point kernel, copies it back to
// the host and prints its coordinates.
//
// Returns 0 on success and -1 on any failure (no usable device, invalid
// selection, or a failing CUDA runtime call).
int main()
{
    int count = 0;
    if (!cuda_ok(cudaGetDeviceCount(&count), "cudaGetDeviceCount") || count == 0) {
        std::fprintf(stderr, "There is no device.\n");
        return -1;  // was `return false`, i.e. exit status 0 (success)
    }

    // Print every device whose properties can be queried and count those
    // reporting a compute capability major version of at least 1.
    int cuda_count = 0;
    for (int i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                cuda_count++;
            }
            std::cout << "[" << i << "] --" << prop.name << std::endl;
        }
    }
    if (cuda_count == 0) {
        std::fprintf(stderr, "There is no device supporting CUDA.\n");
        return -1;
    }

    std::cout << std::endl << "Select device" << std::endl;
    int selected = -1;
    // Reject unparsable input and indices outside the enumerated range
    // instead of handing an arbitrary value to cudaSetDevice.
    if (!(std::cin >> selected) || selected < 0 || selected >= count) {
        std::fprintf(stderr, "Invalid device index.\n");
        return -1;
    }
    if (!cuda_ok(cudaSetDevice(selected), "cudaSetDevice")) {
        return -1;
    }
    std::printf("CUDA initialized.\n");

    void* buff = NULL;
    if (!cuda_ok(cudaMalloc(&buff, sizeof(Point)), "cudaMalloc")) {
        return -1;
    }

    init_point<<<1, 1>>>(buff, 10, 20);

    Point cpu_point;
    // cudaGetLastError reports launch failures; cudaDeviceSynchronize (the
    // replacement for the deprecated cudaThreadSynchronize) reports errors
    // raised while the kernel was executing.
    bool ok = cuda_ok(cudaGetLastError(), "init_point launch")
           && cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize")
           && cuda_ok(cudaMemcpy(&cpu_point, buff, sizeof(Point), cudaMemcpyDeviceToHost),
                      "cudaMemcpy");

    // Release the device buffer on both the success and the failure path;
    // cudaFree is evaluated first so it always runs.
    ok = cuda_ok(cudaFree(buff), "cudaFree") && ok;
    if (!ok) {
        return -1;
    }

    std::cout << cpu_point.x << std::endl;
    std::cout << cpu_point.y << std::endl;

    // The first getchar consumes the newline left behind by `std::cin >>`;
    // the second waits for a key press (presumably to keep a console window open).
    std::getchar();
    std::getchar();
    return 0;
}
Obviously, this could be expanded so that init_point initializes many points in a multithreaded fashion.
Be warned that an array-of-structures layout is typically much slower than a structure-of-arrays layout on CUDA architectures.