I'm fairly new to CUDA and i've been looking around to create and array of structs of arrays and i found a couple solutions , but none gives me a clear idea .
here Harrism explained a pass by value for a struct which works fine, but when trying to add this approach to it i get illegal memory access .
what im trying to achieve is an array of structs each struct with a pointer to a dynamically allocated array populated on the host and my kernel to be able to read values of array from desired index of AoS and use it in calculations inside the kernel .
what have I not understood from these 2 codes and how would I be able to join these ideas together ?
what i tried (an attempt with array of 2 structs with 1 array each):
#include <stdio.h>
#include <stdlib.h>
#define N 10
__inline __host__ void gpuAssert(cudaError_t code, char *file, int line,
bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code),
file, line);
if (abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
typedef struct StructA {
int* arr;
} StructA;
__global__ void kernel2(StructA *in)
{
in[0].arr[threadIdx.x] = 0;
in[1].arr[threadIdx.x] = 1;
printf("d_arr = %d , d_arr2 = %d \n",in[0].arr[threadIdx.x],in[1].arr[threadIdx.x]);
}
int main(){
int* h_arr;
int* h_arr2;
h_arr = (int*)malloc(N*sizeof(int));
h_arr2 = (int*)malloc(N*sizeof(int));
StructA *h_a;
h_a = (StructA*)malloc(sizeof(StructA) * 2);
int *d_arr;
int *d_arr2;
h_arr[0]=1;h_arr[1]=2;h_arr[2]=3,h_arr[3]=4,h_arr[4]=5;h_arr[5]=6;h_arr[6]=7;h_arr[7]=8;h_arr[8]=9;h_arr[9]=10;
h_arr2[0]=1;h_arr2[1]=2;h_arr2[2]=3,h_arr2[3]=4,h_arr2[4]=5;h_arr2[5]=6;h_arr2[6]=7;h_arr2[7]=8;h_arr2[8]=9;h_arr2[9]=10;
// 1. Allocate device array.
gpuErrchk(cudaMalloc((void**) &(d_arr), sizeof(int)*N));
gpuErrchk(cudaMalloc((void**) &(d_arr2), sizeof(int)*N));
// 2. Copy array contents from host to device.
gpuErrchk(cudaMemcpy(d_arr, h_arr, sizeof(int)*N, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_arr2, h_arr2, sizeof(int)*N, cudaMemcpyHostToDevice));
// 3. Point to device pointer in host struct.
h_a[0].arr = d_arr;
h_a[1].arr = d_arr2;
// 4. Call kernel with host struct as argument
kernel2<<<1,N>>>(h_a);
gpuErrchk(cudaPeekAtLastError());
//gpuErrchk(cudaDeviceSynchronize());
// 5. Copy pointer from device to host.
gpuErrchk(cudaMemcpy(h_arr, d_arr, sizeof(int)*N, cudaMemcpyDeviceToHost));
// 6. Point to host pointer in host struct
// (or do something else with it if this is not needed)
//h_a.arr = h_arr;
printf("\n%d %d %d %d %d %d %d %d %d %d \n",h_arr[0],h_arr[1],h_arr[2],h_arr[3],h_arr[4],h_arr[5],h_arr[6],h_arr[7],h_arr[8],h_arr[9]);
printf("\n%d %d %d %d %d %d %d %d %d %d \n",h_arr2[0],h_arr2[1],h_arr2[2],h_arr2[3],h_arr2[4],h_arr2[5],h_arr2[6],h_arr2[7],h_arr2[8],h_arr2[9]);
return 0;
}