If you are going to run this code on a compute capability 2.x or 3.x device with a recent version of CUDA, your kernel code is very nearly correct. The C++ new operator is supported in CUDA 4.x and 5.0 on Fermi and Kepler hardware. Note that memory allocated with new or malloc comes from the runtime heap on the device. It has the lifespan of the context in which it was created, but you currently cannot access it directly through the CUDA host API (i.e. via cudaMemcpy or similar).
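One related detail: the device runtime heap which new and malloc draw from has a fixed size (8 MB by default) that is set before the first kernel using it is launched. If your allocations might exceed the default, enlarge the heap from the host with cudaDeviceSetLimit first. A minimal sketch (the 64 MB figure is only an illustrative value, not something your code necessarily needs):
// Must be called before the first kernel launch which uses new or malloc.
// 64 MB is just an example size for illustration.
cudaDeviceSetLimit(cudaLimitMallocHeapSize, size_t(64) * 1024 * 1024);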
I turned your structure and kernel into a simple example which you can try for yourself to see how it works:
#include <cstdio>

struct myStruct {
    float *data;
};

// Fill an array of n floats with the values 0 .. n-1
__device__
void fill(float *x, unsigned int n)
{
    for(unsigned int i=0; i<n; i++) x[i] = (float)i;
}
// Allocate progressively larger arrays (1, 2, 4, ... elements) on the device
// runtime heap and store the pointers in the structure array
__global__
void kernel(myStruct *input, const unsigned int imax)
{
    for(unsigned int i=0, N=1; i<imax; i++, N*=2) {
        float *p = new float[N];
        fill(p, N);
        input[i].data = p;
    }
}
// "Gather" kernel: copy the last element of each runtime heap array into an
// output array allocated with cudaMalloc, so the host can read the results
__global__
void kernel2(myStruct *input, float *output, const unsigned int imax)
{
    for(unsigned int i=0, N=1; i<imax; i++, N*=2) {
        output[i] = input[i].data[N-1];
    }
}
inline void gpuAssert(cudaError_t code, const char *file, int line, bool Abort=true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (Abort) exit(code);
    }
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
int main(void)
{
    const unsigned int nvals = 16;
    struct myStruct *_s;
    float *_f, *f;

    // Device storage for the structure array and for the gathered results
    gpuErrchk( cudaMalloc((void **)&_s, sizeof(struct myStruct) * size_t(nvals)) );
    size_t sz = sizeof(float) * size_t(nvals);
    gpuErrchk( cudaMalloc((void **)&_f, sz) );

    // Host storage for the results
    f = new float[nvals];

    kernel<<<1,1>>>(_s, nvals);
    gpuErrchk( cudaPeekAtLastError() );
    kernel2<<<1,1>>>(_s, _f, nvals);
    gpuErrchk( cudaPeekAtLastError() );

    gpuErrchk( cudaMemcpy(f, _f, sz, cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaDeviceReset() );

    for(int i=0; i<nvals; i++) {
        fprintf(stdout, "%d %f\n", i, f[i]);
    }

    return 0;
}
A few points to note:
- This code will only compile and run with CUDA 4.x or 5.0 on a Fermi or Kepler GPU
- You must pass the correct architecture for your GPU to nvcc when compiling it (for example, I used
nvcc -arch=sm_30 -Xptxas="-v" -o dynstruct dynstruct.cu
to compile for a GTX 670 on Linux)
- The example code uses a "gather" kernel to copy data from the runtime heap allocations referenced by the structure into an allocation which the host API can access, so that the results can be printed out. This is a workaround for the limitation I mentioned earlier, namely that cudaMemcpy cannot copy directly from addresses in runtime heap memory. I was hoping this might be fixed in CUDA 5.0, but the most recent release candidate still has this restriction.
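One final note on cleanup: the example above never frees the arrays allocated with new inside kernel; it simply lets cudaDeviceReset destroy the context (and with it the runtime heap). If you wanted to release those allocations explicitly, it has to be done from device code, since the host API cannot touch the runtime heap. A minimal sketch of such a cleanup kernel (not part of the code above, the name is just illustrative):
// Sketch: release the runtime heap allocations made by kernel() from device
// code, before the context which owns them is destroyed.
__global__
void cleanup(myStruct *input, const unsigned int imax)
{
    for(unsigned int i=0; i<imax; i++) {
        delete [] input[i].data;
        input[i].data = 0;
    }
}
It would be launched the same way as the other kernels, e.g. cleanup<<<1,1>>>(_s, nvals); before the call to cudaDeviceReset.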