below is a simplified version of a problem that I am trying to solve. Both code snipets compile, but #2 throws an "illegal memory access". Basically, if an array is encapsulated in a structure, passing a pointer to that structure to cudaMalloc creates all kind of problems -- at least the way I do it. I am pretty sure this is due to the fact that the address of dum
in the code below is on the host, and so is not accessible inside the kernel. Problem is, I don't know how to create a device version of dum
... E.g., using cudaMalloc( (void**)&dum , sizeof(dummy) * 1 )
instead of the new dummy
syntax below does not solve the problem. I think I am getting confused with the double pointer used by cudaMalloc
.
Of course it may seem silly in this example to encapsulate an array of double in a structure, in the actual code I really need to do this though.
struct dummy
{
double *arr;
};
void allocate( dummy *dum , int n )
{
cudaMalloc( (double**)&(dum->arr) , sizeof(double) * n );
}
__global__ void test( double val , dummy *dum , int n )
{
printf( "test\n" );
for( int ii = 0 ; ii < n ; ii++ )
dum->arr[ii] = val;
}
__global__ void test2( double val , double *arr , int n )
{
printf( "test\n" );
for( int ii = 0 ; ii < n ; ii++ )
arr[ii] = val;
}
int main()
{
int n = 10;
dummy *dum = new dummy;
/* CODE 1: the piece of code below works */
double *p;
gpu_err_chk( cudaMalloc( &p , sizeof(double) * n ) );
test2<<< 1 , 1 >>>( 123.0 , p , n );
gpu_err_chk( cudaDeviceSynchronize() );
/* CODE 2: the piece of code below does not... */
allocate( dum , n );
test<<< 1 , 1 >>>( 123.0 , dum , n );
gpu_err_chk( cudaDeviceSynchronize() );
return 1;
}