For what you have shown so far, using a `__device__` pointer variable just creates needless complexity. Just use an ordinary dynamic allocation with `cudaMalloc` for device storage, and otherwise follow an approach similar to any of the CUDA sample codes, such as `vectorAdd`. Here is an example:
$ cat t1315.cu
#include <stdio.h>
#define A_LARGE_NUMBER 10
// Plain record holding three integer fields.
struct values{
  int one;
  int two;
  int three;
};
// Host-side array of records; main() allocates it and PopulateWithData() fills it.
values *vals;
// Debug kernel: prints the `one` field of the first record in d_vals.
// There is no thread-index guard, so every launched thread prints one line.
__global__ void myKernel(values *d_vals){
  const int firstOne = d_vals->one;  // same element as d_vals[0].one
  printf("%d\n", firstOne);
}
// Fills all A_LARGE_NUMBER records of the host array `vals` with the fixed
// values one = 1, two = 2, three = 3. `vals` must already be allocated.
void PopulateWithData(){
  values sample;
  sample.one = 1;
  sample.two = 2;
  sample.three = 3;
  int idx = 0;
  while (idx < A_LARGE_NUMBER){
    vals[idx] = sample;  // member-wise copy of the three fields
    ++idx;
  }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what){
  if (err == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return false;
}
// Allocates and fills the host array, copies it to a cudaMalloc'ed device
// buffer, launches myKernel on that buffer with a single thread, and waits
// for it to finish. Returns 0 on success and 1 if the host allocation or any
// CUDA call fails. All allocations are released before returning.
int main(){
  const size_t bytes = A_LARGE_NUMBER * sizeof(values);
  vals = (values*)malloc(bytes);
  if (vals == NULL){
    fprintf(stderr, "host allocation failed\n");
    return 1;
  }
  PopulateWithData(); // fills vals with the fixed values 1, 2, 3

  int status = 1;
  values *d_ptr = NULL;
  if (cudaOk(cudaMalloc((void**)&d_ptr, bytes), "cudaMalloc")){
    if (cudaOk(cudaMemcpy(d_ptr, vals, bytes, cudaMemcpyHostToDevice), "cudaMemcpy")){
      dim3 blocksPerGrid(1, 1);
      dim3 threadsPerBlock(1, 1);
      myKernel<<<blocksPerGrid, threadsPerBlock>>>(d_ptr);
      // A launch does not return an error itself: configuration errors show up
      // via cudaGetLastError(), execution errors at the next synchronizing call.
      if (cudaOk(cudaGetLastError(), "myKernel launch") &&
          cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")){
        status = 0;
      }
    }
    cudaFree(d_ptr);
  }
  free(vals);
  vals = NULL;
  return status;
}
$ nvcc -arch=sm_35 -o t1315 t1315.cu
$ cuda-memcheck ./t1315
========= CUDA-MEMCHECK
1
========= ERROR SUMMARY: 0 errors
$
You had a variety of other basic (non-CUDA) coding errors in what you had shown; I'm not going to try to run through them all.
If you really want to retain your `__device__` pointer variable and use it to point to the device data (an array of structs), then you will still need to use `cudaMalloc`, and the overall process takes additional steps. You can follow the example worked out in the answer here.
Following that example, here's a set of changes to the above code to make it work with a `__device__` pointer variable instead of a pointer passed as a kernel parameter:
$ cat t1315.cu
#include <stdio.h>
#define A_LARGE_NUMBER 10
// Plain record holding three integer fields.
struct values{
  int one;
  int two;
  int three;
};
// Host-side array of records; main() allocates it and PopulateWithData() fills it.
values *vals;
// Device-side global pointer to the array of records. It lives in device
// memory, so the host sets it with cudaMemcpyToSymbol (see main) rather than
// by plain assignment.
__device__ values *d_vals;
// Debug kernel: prints the `one` field of the first record reachable through
// the global d_vals pointer. There is no thread-index guard, so every
// launched thread prints one line.
__global__ void myKernel(){
  const int firstOne = d_vals->one;  // same element as d_vals[0].one
  printf("%d\n", firstOne);
}
// Fills all A_LARGE_NUMBER records of the host array `vals` with the fixed
// values one = 1, two = 2, three = 3. `vals` must already be allocated.
void PopulateWithData(){
  values sample;
  sample.one = 1;
  sample.two = 2;
  sample.three = 3;
  int idx = 0;
  while (idx < A_LARGE_NUMBER){
    vals[idx] = sample;  // member-wise copy of the three fields
    ++idx;
  }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what){
  if (err == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return false;
}
// Allocates and fills the host array, copies it to a cudaMalloc'ed device
// buffer, stores that buffer's address in the __device__ pointer d_vals,
// launches myKernel with a single thread, and waits for it to finish.
// Returns 0 on success and 1 if the host allocation or any CUDA call fails.
// All allocations are released before returning.
int main(){
  const size_t bytes = A_LARGE_NUMBER * sizeof(values);
  vals = (values*)malloc(bytes);
  if (vals == NULL){
    fprintf(stderr, "host allocation failed\n");
    return 1;
  }
  PopulateWithData(); // fills vals with the fixed values 1, 2, 3

  int status = 1;
  values *d_ptr = NULL;
  if (cudaOk(cudaMalloc((void**)&d_ptr, bytes), "cudaMalloc")){
    // First copy the data itself, then copy the *pointer value* into the
    // device symbol: only sizeof(values*) bytes, read from &d_ptr on the host.
    if (cudaOk(cudaMemcpy(d_ptr, vals, bytes, cudaMemcpyHostToDevice), "cudaMemcpy") &&
        cudaOk(cudaMemcpyToSymbol(d_vals, &d_ptr, sizeof(values*)), "cudaMemcpyToSymbol")){
      dim3 blocksPerGrid(1, 1);
      dim3 threadsPerBlock(1, 1);
      myKernel<<<blocksPerGrid, threadsPerBlock>>>();
      // A launch does not return an error itself: configuration errors show up
      // via cudaGetLastError(), execution errors at the next synchronizing call.
      if (cudaOk(cudaGetLastError(), "myKernel launch") &&
          cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")){
        status = 0;
      }
    }
    cudaFree(d_ptr);
  }
  free(vals);
  vals = NULL;
  return status;
}
$ nvcc -arch=sm_35 -o t1315 t1315.cu
$ cuda-memcheck ./t1315
========= CUDA-MEMCHECK
1
========= ERROR SUMMARY: 0 errors
$