I think you have more or less everything correct. You may be confused about array access in general, and how to make different threads access different elements of an array. This code is legal:
// Illustrative kernel: legal CUDA, but every thread dereferences the pointer
// directly (*Dev_Group_Places), so every thread copies the SAME element --
// element [0] -- of the array; no thread can ever reach any other element.
__global__ void AddInts(struct Group_Output_Places *Dev_Group_Places){
struct Group_Output_Places GPU_Group_Places;
GPU_Group_Places = *Dev_Group_Places; // equivalent to Dev_Group_Places[0]
}
but there is no way you could get that code to ever access anything other than the first element of the array (i.e. `Dev_Group_Places[0]`).
In CUDA, kernels usually create a globally unique thread index, and then it is common to use that thread index to index into an array. Something like this:
#include <stdio.h>
const int num_places = 32; // number of elements in the host and device arrays
// One record per "place". NOTE(review): only indices [0] and [1] of each
// member array are populated by main below; element [2] is left uninitialized.
struct Group_Output_Places
{
float Parameter[3]; // float parameters for this place
int Place_ID[3];    // integer IDs for this place
};
// Each thread copies one array element into a thread-local struct and prints
// its first Place_ID. Expects a 1-D launch whose total thread count covers
// num_places; excess threads fall through the guard and do nothing.
__global__ void AddInts(struct Group_Output_Places *Dev_Group_Places, int num_places){
int idx = blockIdx.x * blockDim.x + threadIdx.x; // globally unique thread index
if (idx >= num_places)
    return; // guard the grid tail
struct Group_Output_Places GPU_Group_Places = Dev_Group_Places[idx]; // per-thread element copy
printf("from thread %d, place id: %d\n", idx, GPU_Group_Places.Place_ID[0]);
}
int main(){
    // Populate the host-side array.
    struct Group_Output_Places Group_Places[num_places];
    for (int i = 0; i < num_places; i++){
        Group_Places[i].Parameter[0] = 360.2f; // 'f' suffix keeps the literal a float, not a double
        Group_Places[i].Place_ID[0]  = i+1;
        Group_Places[i].Parameter[1] = 128.4f;
        Group_Places[i].Place_ID[1]  = 2;
    }
    // Allocate device storage and copy the whole array over.
    struct Group_Output_Places *Dev_Group_Places = NULL;
    size_t bytes = sizeof(struct Group_Output_Places) * num_places; // whole array, because it is an array
    cudaError_t err = cudaMalloc((void**)&Dev_Group_Places, bytes);
    if (err != cudaSuccess){
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    err = cudaMemcpy(Dev_Group_Places, Group_Places, bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess){
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        cudaFree(Dev_Group_Places);
        return 1;
    }
    // Ceil-div launch: works for any num_places, not just <= 1024
    // (the old single-block launch silently did nothing for larger sizes).
    int threads = 256;
    int blocks  = (num_places + threads - 1) / threads;
    AddInts<<<blocks, threads>>>(Dev_Group_Places, num_places);
    err = cudaGetLastError(); // catch launch-configuration errors
    if (err != cudaSuccess){
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(Dev_Group_Places);
        return 1;
    }
    err = cudaDeviceSynchronize(); // catch asynchronous execution errors, flush device printf
    if (err != cudaSuccess){
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        cudaFree(Dev_Group_Places);
        return 1;
    }
    cudaFree(Dev_Group_Places);
    return 0;
}
$ nvcc -o t1085 t1085.cu
$ cuda-memcheck ./t1085
========= CUDA-MEMCHECK
from thread 0, place id: 1
from thread 16, place id: 17
from thread 1, place id: 2
from thread 2, place id: 3
from thread 17, place id: 18
from thread 3, place id: 4
from thread 4, place id: 5
from thread 18, place id: 19
from thread 5, place id: 6
from thread 6, place id: 7
from thread 19, place id: 20
from thread 7, place id: 8
from thread 8, place id: 9
from thread 20, place id: 21
from thread 9, place id: 10
from thread 10, place id: 11
from thread 21, place id: 22
from thread 11, place id: 12
from thread 12, place id: 13
from thread 22, place id: 23
from thread 13, place id: 14
from thread 14, place id: 15
from thread 23, place id: 24
from thread 15, place id: 16
from thread 24, place id: 25
from thread 25, place id: 26
from thread 26, place id: 27
from thread 27, place id: 28
from thread 28, place id: 29
from thread 29, place id: 30
from thread 30, place id: 31
from thread 31, place id: 32
========= ERROR SUMMARY: 0 errors
$
Any time you are having trouble with a CUDA code, it's good practice to use proper CUDA error checking. At a minimum, run your code with `cuda-memcheck`, as I have done above.