
I have a C struct that contains arrays of other structs, and I need a copy of it on the GPU. To do that I am writing a function that performs the cudaMalloc and cudaMemcpy calls needed to move the members of the struct from host to device.

A simplified version of the struct (the real one has several nested structs and variables/arrays inside) is:

struct Node {

    float* position;

};

struct Graph{
    unsigned int nNode;
    Node* node;
    unsigned int nBoundary;
    unsigned int* boundary;
};

My problem is that I must be doing something wrong in the allocation and copy of the struct. When I copy the variables within Graph, I can see that they are properly copied (by accessing them in a kernel as in the example below). For example, I can check that graph.nBoundary=3.

However, I only see this if I do not allocate and copy the memory of Node*. If I do, I get -858993460 instead of 3. Interestingly, Node* itself is not wrongly allocated: I can inspect the value of, say, graph.node[0].position[0] and it has the correct value.

This only happens with graph.nBoundary. All the other variables keep their correct numerical values, but this one gets corrupted when running the cudaMemcpy of the Node*.

What am I doing wrong and why does this happen? How do I fix it?

Let me know if you need more information.


MCVE:

#include <algorithm>
#include <cstdio>
#include <cuda_runtime_api.h>
#include <cuda.h>

// A point, part of some elements
struct Node {

    float* position;

};

struct Graph{
    unsigned int nNode;
    Node* node;
    unsigned int nBoundary;
    unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(Graph* graph,unsigned int * d_res){
    d_res[0] = graph->nBoundary;

};
int main()
{

    // Generate some fake data on the CPU
    Graph graph;
    graph.node = (Node*)malloc(2 * sizeof(Node));
    graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
    for (int i = 0; i < 3; i++){
        graph.boundary[i] = i + 10;
    }
    graph.nBoundary = 3;
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){
        // They can have different sizes in the original code
        graph.node[i].position = (float*)malloc(3 * sizeof(float));
        graph.node[i].position[0] = 45;
        graph.node[i].position[1] = 1;
        graph.node[i].position[2] = 2;
    }

    // allocate GPU memory
    Graph * d_graph = cudaGraphMalloc(&graph);
    // some dummy variables to test on GPU.
    unsigned int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(unsigned int));
    h_res = (unsigned int*)malloc(sizeof(unsigned int));

    //Run kernel
    testKernel << <1, 1 >> >(d_graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));

    printf("%u\n", graph.nBoundary);
    printf("%d", h_res[0]);

    return 0;
}

Graph* cudaGraphMalloc(const Graph* inGraph){
    Graph* outGraph;
    gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));

    //copy constants
    gpuErrchk(cudaMemcpy(&outGraph->nNode, &inGraph->nNode, sizeof(unsigned int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(&outGraph->nBoundary, &inGraph->nBoundary, sizeof(unsigned int), cudaMemcpyHostToDevice));


    // copy boundary
    unsigned int * d_auxboundary, *h_auxboundary;
    h_auxboundary = inGraph->boundary;
    gpuErrchk(cudaMalloc((void**)&d_auxboundary, inGraph->nBoundary*sizeof(unsigned int)));
    gpuErrchk(cudaMemcpy(d_auxboundary, h_auxboundary, inGraph->nBoundary*sizeof(unsigned int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(&outGraph->boundary, d_auxboundary, sizeof(unsigned int *), cudaMemcpyDeviceToDevice));


    //Create nodes 
    Node * auxnode;
    gpuErrchk(cudaMalloc((void**)&auxnode, inGraph->nNode*sizeof(Node)));

    // Create auxiliary pointers to grab them from the host and pass them to the device
    float ** d_position, ** h_position;
    d_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
    h_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));

    for (int i = 0; i < inGraph->nNode; i++){

        // Positions
        h_position[i] = inGraph->node[i].position;
        gpuErrchk(cudaMalloc((void**)&d_position[i], 3 * sizeof(float)));
        gpuErrchk(cudaMemcpy(d_position[i], h_position[i], 3 * sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(&auxnode[i].position, d_position[i], sizeof(float *), cudaMemcpyDeviceToDevice));

    }
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////
    ////////////// If I comment the following section, nBoundary can be read by the kernel
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////

    gpuErrchk(cudaMemcpy(&outGraph->node, auxnode, inGraph->nNode*sizeof(Node *), cudaMemcpyDeviceToDevice));



    return outGraph;
}
    That copy loop at the end of your setup routine is totally broken. Why are you executing hostToDevice copies between device pointers? (and magically taking the host address of those pointers to fix the runtime errors only means you are copying random garbage from the host stack). If you can't keep track of what is a host pointer and what is a device pointer, you would be better off using managed memory as you originally asked about. – talonmies Jul 18 '18 at 18:23
  • In fact, the whole cudaGraphMalloc is full of that same mistake – talonmies Jul 18 '18 at 18:37
  • @talonmies ah, I was so focused on the pointers that my mind slipped. Simple mistake, no need to be rude. If I understand correctly, you are suggesting to change the flag to `cudaMemcpyDeviceToDevice` when I am copying from device pointers to device pointers, which, of course, makes sense. However, I get an "invalid input" error when I do that (on line `gpuErrchk(cudaMemcpy(&outGraph->boundary, &d_auxboundary, sizeof(unsigned int *), cudaMemcpyDeviceToDevice));`), so it does not fix whatever is wrong with the code – Ander Biguri Jul 18 '18 at 18:53
  • There is no rudeness in that comment -- it is a constructive suggestion if you are focussed on writing code which actually works in the simplest possible way. But in what you quote `&d_auxboundary` is wrong. It should just be `d_auxboundary`. Like I said, if you are finding this hard, use managed memory, that is what it was created for. – talonmies Jul 18 '18 at 19:01
  • @talonmies thanks, sorry for misunderstanding. I need to do this for structures that are generated on the CPU, so managed memory does not seem to be the thing for me. Regardless, I am trying to learn CUDA better, so this is a good exercise; I'd rather make it work here. I'll check your suggestions to see if I can fix it – Ander Biguri Jul 18 '18 at 19:05
  • @talonmies sorry to bother, but I still get the same behavior even after fixing the mistakes you spotted. – Ander Biguri Jul 18 '18 at 19:18
  • The last copy is hosing everything because it is copying the wrong size. It should only copy `sizeof(Node *)`. – talonmies Jul 18 '18 at 19:47
  • @talonmies ah, true, thanks for spotting that too. That indeed solves the problem I described in the post. However, I don't seem to be able to access any of the node variables in the kernel, such as `graph->node[0].position[0]`, without an "illegal memory access" error. – Ander Biguri Jul 18 '18 at 20:21
  • Well, you didn't copy it, so surely that isn't a surprise? – talonmies Jul 18 '18 at 22:14
  • @talonmies that said, why does `gpuErrchk(cudaMemcpy(d_position[i], h_position[i], 3 * sizeof(float), cudaMemcpyHostToDevice));` not copy them? – Ander Biguri Jul 18 '18 at 23:26

1 Answer


The problem is in the function cudaGraphMalloc, where you are trying to allocate device memory for the members of outGraph, which has itself already been allocated on the device. In the process of doing so, you are dereferencing a device pointer on the host, which is illegal.

To allocate device memory for the members of a struct that itself lives on the device, we first have to create a temporary host variable of that struct type, then allocate device memory for its members, and finally copy the temporary struct to the one that exists on the device.
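
In other words, the pattern looks like the following minimal sketch, using a hypothetical struct Vec and a hypothetical size of 4 floats (none of these names come from the question; error checking omitted):

struct Vec { float* data; };

float hostData[4] = { 0, 1, 2, 3 };

Vec hVec;                                              // temporary host copy of the struct
cudaMalloc((void**)&hVec.data, 4 * sizeof(float));     // device buffer for the member
cudaMemcpy(hVec.data, hostData, 4 * sizeof(float), cudaMemcpyHostToDevice);

Vec* dVec;                                             // the struct itself on the device
cudaMalloc((void**)&dVec, sizeof(Vec));
cudaMemcpy(dVec, &hVec, sizeof(Vec), cudaMemcpyHostToDevice); // dVec->data is now a valid device pointer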

I have answered a similar question here. Please take a look at it.

The fixed code may look like this:

#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>
#include <cuda.h>

// A point, part of some elements
struct Node {

    float* position;

};

struct Graph {
    unsigned int nNode;
    Node* node;
    unsigned int nBoundary;
    unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(Graph* graph, unsigned int * d_res) {
    d_res[0] = graph->nBoundary;

};
int main()
{

    // Generate some fake data on the CPU
    Graph graph;
    graph.node = (Node*)malloc(2 * sizeof(Node));
    graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
    for (int i = 0; i < 3; i++) {
        graph.boundary[i] = i + 10;
    }
    graph.nBoundary = 3;
    graph.nNode = 2;
    for (int i = 0; i < 2; i++) {
        // They can have different sizes in the original code
        graph.node[i].position = (float*)malloc(3 * sizeof(float));
        graph.node[i].position[0] = 45;
        graph.node[i].position[1] = 1;
        graph.node[i].position[2] = 2;
    }

    // allocate GPU memory
    Graph * d_graph = cudaGraphMalloc(&graph);
    // some dummy variables to test on GPU.
    unsigned int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(unsigned int));
    h_res = (unsigned int*)malloc(sizeof(unsigned int));

    //Run kernel
    testKernel << <1, 1 >> >(d_graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));

    printf("%u\n", graph.nBoundary);
    printf("%u\n", h_res[0]);

    return 0;
}

Graph* cudaGraphMalloc(const Graph* inGraph) 
{
    //Create auxiliary Graph variable on host
    Graph temp;

    //copy constants
    temp.nNode = inGraph->nNode;
    temp.nBoundary = inGraph->nBoundary;

    // copy boundary
    gpuErrchk(cudaMalloc((void**)&(temp.boundary), inGraph->nBoundary * sizeof(unsigned int)));
    gpuErrchk(cudaMemcpy(temp.boundary, inGraph->boundary, inGraph->nBoundary * sizeof(unsigned int), cudaMemcpyHostToDevice));


    //Create nodes 
    size_t nodeBytesTotal = temp.nNode * sizeof(Node);
    gpuErrchk(cudaMalloc((void**)&(temp.node), nodeBytesTotal));

    for (int i = 0; i < temp.nNode; i++)
    {
        //Create auxiliary node on host
        Node auxNodeHost;

        // Allocate device memory for the position member of the auxiliary node
        size_t positionBytes = 3 * sizeof(float);
        gpuErrchk(cudaMalloc((void**)&(auxNodeHost.position), positionBytes));
        gpuErrchk(cudaMemcpy(auxNodeHost.position, inGraph->node[i].position, positionBytes, cudaMemcpyHostToDevice));

        // Copy the auxiliary host node to the device
        Node* dPtr = temp.node + i;
        gpuErrchk(cudaMemcpy(dPtr, &auxNodeHost, sizeof(Node), cudaMemcpyHostToDevice));
    }


    Graph* outGraph;
    gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
    gpuErrchk(cudaMemcpy(outGraph, &temp, sizeof(Graph), cudaMemcpyHostToDevice));

    return outGraph;
}

Be advised that you will have to keep the host copies of the internal device pointers (i.e. the auxiliary host variables). This is because you will have to free the device memory later, and since you will only have a device copy of Graph in the main code, you won't be able to access its members from the host to call cudaFree on them. In this case the variables Node auxNodeHost (one per iteration) and Graph temp are those host copies.

The above code does not do that and is just for demonstration purposes.
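
For illustration, a possible cleanup might look like the sketch below. It assumes (this is not in the code above) that the caller kept Graph temp and stored each per-iteration auxNodeHost in a hypothetical std::vector<Node> auxNodes:

// Hypothetical cleanup; temp, auxNodes and outGraph must have been kept by the caller.
for (size_t i = 0; i < auxNodes.size(); i++)
    gpuErrchk(cudaFree(auxNodes[i].position)); // per-node position buffers
gpuErrchk(cudaFree(temp.node));                // device array of Node
gpuErrchk(cudaFree(temp.boundary));            // device boundary array
gpuErrchk(cudaFree(outGraph));                 // the device Graph itself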

Tested on Windows 10, Visual Studio 2015, CUDA 9.2, NVIDIA Driver 397.44.

  • If I'm not mistaken, `auxNodeHost.position` and `outGraph->node[0].position` (let's assume `nNode==1`) are the same memory address. Why do I need to store `temp` or `auxNodeHost` to `cudaFree`? Can I not `cudaFree` the members of `outGraph`? – Ander Biguri Jul 19 '18 at 09:47
  • @AnderBiguri... Yes, that is correct if `nNode==1`. Both point to the same memory address on the device. You need `temp` and `auxNodeHost` for `cudaFree` because you cannot dereference a device variable from the host. It means that doing `cudaFree(outGraph->boundary);` is illegal. The proper way will be `cudaFree(temp.boundary);`. – sgarizvi Jul 19 '18 at 10:01
  • Ahh, I see. It's the `outGraph->boundary` dereference that is illegal, not the freeing. Thanks! This mistake was the biggest cause of my misconception. – Ander Biguri Jul 19 '18 at 10:14