I have a structure with arrays of structures inside in C, and I need a copy of that in the GPU. For that I am writing a function that makes some cudaMalloc
and cudaMemcpy
s of the variables in the struct from host to device.
A simple version (the real one has various structs and variables/arrays inside) of the struct is:
struct Node {
float* position;
};
struct Graph{
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
My problem is that I must be doing something wrong in the memory allocation and copy of the struct. When I copy the variables withing Graph
, I can see that they are properly copied (by accessing it in a kernel as in the example below). For example, I can check that graph.nBoundary=3
.
However, I can only see this if I do not allocate and copy the memory of Node *
. If I do, I get -858993460
instead of 3
. Interestingly, Node *
is not wrongly allocated, as I can inspect the value of say graph.node[0].pos[0]
and it has the correct value.
This only happens with the graph.nBoundary
. All the other variables remain with the correct numerical values, but this one gets "wronged" when running the cudaMemcpy
of the Node*
.
What am I doing wrong and why does this happen? How do I fix it?
Let me know if you need more information.
MCVE:
#include <algorithm>
#include <cuda_runtime_api.h>
#include <cuda.h>
// A point, part of some elements
struct Node {
float* position;
};
struct Graph{
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void testKernel(Graph* graph,unsigned int * d_res){
d_res[0] = graph->nBoundary;
};
int main()
{
// Generate some fake data on the CPU
Graph graph;
graph.node = (Node*)malloc(2 * sizeof(Node));
graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
for (int i = 0; i < 3; i++){
graph.boundary[i] = i + 10;
}
graph.nBoundary = 3;
graph.nNode = 2;
for (int i = 0; i < 2; i++){
// They can have different sizes in the original code
graph.node[i].position = (float*)malloc(3 * sizeof(float));
graph.node[i].position[0] = 45;
graph.node[i].position[1] = 1;
graph.node[i].position[2] = 2;
}
// allocate GPU memory
Graph * d_graph = cudaGraphMalloc(&graph);
// some dummy variables to test on GPU.
unsigned int * d_res, *h_res;
cudaMalloc((void **)&d_res, sizeof(unsigned int));
h_res = (unsigned int*)malloc(sizeof(unsigned int));
//Run kernel
testKernel << <1, 1 >> >(d_graph, d_res);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));
printf("%u\n", graph.nBoundary);
printf("%d", h_res[0]);
return 0;
}
Graph* cudaGraphMalloc(const Graph* inGraph){
Graph* outGraph;
gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
//copy constants
gpuErrchk(cudaMemcpy(&outGraph->nNode, &inGraph->nNode, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&outGraph->nBoundary, &inGraph->nBoundary, sizeof(unsigned int), cudaMemcpyHostToDevice));
// copy boundary
unsigned int * d_auxboundary, *h_auxboundary;
h_auxboundary = inGraph->boundary;
gpuErrchk(cudaMalloc((void**)&d_auxboundary, inGraph->nBoundary*sizeof(unsigned int)));
gpuErrchk(cudaMemcpy(d_auxboundary, h_auxboundary, inGraph->nBoundary*sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&outGraph->boundary, d_auxboundary, sizeof(unsigned int *), cudaMemcpyDeviceToDevice));
//Create nodes
Node * auxnode;
gpuErrchk(cudaMalloc((void**)&auxnode, inGraph->nNode*sizeof(Node)));
// Crate auxiliary pointers to grab them from host and pass them to device
float ** d_position, ** h_position;
d_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
h_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
for (int i = 0; i < inGraph->nNode; i++){
// Positions
h_position[i] = inGraph->node[i].position;
gpuErrchk(cudaMalloc((void**)&d_position[i], 3 * sizeof(float)));
gpuErrchk(cudaMemcpy(d_position[i], h_position[i], 3 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&auxnode[i].position, d_position[i], sizeof(float *), cudaMemcpyDeviceToDevice));
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////// If I comment the following section, nBoundary can be read by the kernel
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
gpuErrchk(cudaMemcpy(&outGraph->node, auxnode, inGraph->nNode*sizeof(Node *), cudaMemcpyDeviceToDevice));
return outGraph;
}