3

This is the first time I am implementing structures in CUDA. In the following program I am copying a structure to the GPU and performing a basic operation on the data, and copying back the result to the Host.

#include<stdio.h>

inline cudaError_t checkCuda(cudaError_t result)
{
    #if defined(DEBUG) || defined(_DEBUG)
        if (result != cudaSuccess) {
            fprintf(stderr, "CUDA Runtime Error: %sn", cudaGetErrorString(result));
        assert(result == cudaSuccess);
        }
    #endif
    return result;
}

typedef struct myStruct {
    int* a;
    int b;
}MyStruct;

__global__ void structOperation(MyStruct *d_data){
    int idx = threadIdx.x;

    d_data->a[idx] += 10;
}

int main(){
    MyStruct *h_data, *d_data, *out_data;

    size_t structSize = sizeof(MyStruct);
    size_t intSize = sizeof(int);


    h_data = (MyStruct *) malloc(structSize * 1);
    h_data->b = 32;
    h_data->a = (int *)malloc(intSize * h_data->b);

    out_data = (MyStruct *) malloc(structSize * 1);
    out_data->b = 32;
    out_data->a = (int *)malloc(intSize * out_data->b);

    for(int i = 0; i<32; i++){
        h_data->a[i] = i;   
    }

    //Memory allocation for the Struct
    checkCuda(cudaMalloc(&d_data, sizeof(MyStruct) * 1));
    checkCuda(cudaMalloc(&(d_data->a), sizeof(int) * 32));


    checkCuda(cudaMemcpy(&d_data, &h_data, sizeof(MyStruct) * 1, cudaMemcpyHostToDevice));
    checkCuda(cudaMemcpy(&(d_data->a), &(h_data->a), sizeof(int) * 32, cudaMemcpyHostToDevice)); 


    structOperation<<<1,32>>>(d_data);


    checkCuda(cudaMemcpy(&out_data, &d_data, sizeof(myStruct) * 1, cudaMemcpyDeviceToHost));
  //cudaMemcpy(&(out_data->a), &(d_data->a), sizeof(int) * d_data->b, cudaMemcpyDeviceToHost); 

    printf("\nDataElements : ");
    for(int i = 0; i<32; i++){
        printf("    %d",out_data->a[i]);
    }
    printf("\n");
}

I am getting 'Segmentation Fault' as the result of execution. I guess I am operating the structure incorrectly. What is the proper way to implement?

sandeep.ganage
  • 1,409
  • 2
  • 21
  • 47
  • 1
    use [error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) to find out which CUDA call failed – m.s. Jun 30 '15 at 08:45
  • @m.s. Added cudaError checks. Its actually some memory conflict issue which I am not able to understand. Still gives me the Segmentation Fault error. – sandeep.ganage Jun 30 '15 at 08:57
  • What error message? At which line? – pQB Jun 30 '15 at 09:01
  • @pQB As I have written it twice, it gives 'Segmentation Fault' error. – sandeep.ganage Jun 30 '15 at 09:07
  • Oh, I missed it. I was looking for an an error thrown by the CUDA API (my fault). The segmentation fault probably indicates that `d_data->a[idx] +=10` is not a valid memory address. – pQB Jun 30 '15 at 09:17
  • 1
    Your question will be probably marked as duplicated. See http://stackoverflow.com/questions/5666652/how-to-pass-array-of-struct-to-gpu and http://stackoverflow.com/questions/9309195/copying-a-struct-containing-pointers-to-cuda-device. – pQB Jun 30 '15 at 09:20
  • 1
    You are accessing device memory from host when you do this: `&(d_data->a)` in `cudaMalloc` and `cudaMemcpy` calls. – sgarizvi Jun 30 '15 at 09:36

1 Answers1

8

There are several invalid memory access in the provided code.

  1. Accessing device memory (allocated using cudaMalloc) from host like d_data->a will cause undefined behavior (segmentation fault etc.).
  2. cudaMemcpy takes pointers as arguments, not address of pointer. So cudaMemcpy(&d_data, &h_data... should be replaced with cudaMemcpy(d_data, h_data....

Allocating a device object with a device pointer as a member is a bit tricky. It can be achieved as follows:

  1. Allocate a temporary host object (MyStruct temp).
  2. Allocate device memory to the member we want on device (cudaMalloc(&temp.a, bytes)).
  3. Allocate device object (cudaMalloc(&d_data, sizeof(MyStruct)).
  4. Copy temporary host object to the device object (cudaMemcpy(d_data, &temp, sizeof(MyStruct), cudaMemcpyHostToDevice)).

Keep in mind that when you modify the contents of d_data->a on the device, temp.a will also be modified because they are actually pointing to same memory location on device.

Your final main function will look something like this:

int main(){
    MyStruct *h_data, *d_data, *out_data;

    size_t structSize = sizeof(MyStruct);
    size_t intSize = sizeof(int);


    h_data = (MyStruct *) malloc(structSize * 1);
    h_data->b = 32;
    h_data->a = (int *)malloc(intSize * h_data->b);

    out_data = (MyStruct *) malloc(structSize * 1);
    out_data->b = 32;
    out_data->a = (int *)malloc(intSize * out_data->b);

    for(int i = 0; i<32; i++){
        h_data->a[i] = i;   
    }

    //Create temporary MyStruct object on host and allocate memory to its member "a" on device
    MyStruct temp;
    temp.b = h_data->b;
    checkCuda(cudaMalloc(&temp.a, 32 * sizeof(int)));

    //Copy host data to temp.a
    checkCuda(cudaMemcpy(temp.a, h_data->a, 32 * sizeof(int), cudaMemcpyHostToDevice));

    //Memory allocation for the device MyStruct
    checkCuda(cudaMalloc(&d_data, sizeof(MyStruct) * 1));
    //Copy actual object to device
    checkCuda(cudaMemcpy(d_data, &temp, sizeof(MyStruct) * 1, cudaMemcpyHostToDevice));


    structOperation<<<1,32>>>(d_data);

    //temp.a will be updated after kernel launch
    checkCuda(cudaMemcpy(out_data->a, temp.a, 32 * sizeof(int), cudaMemcpyDeviceToHost)); 

    printf("\nDataElements : ");
    for(int i = 0; i<32; i++)
    {
        printf("    %d",out_data->a[i]);
    }
    printf("\n");

    checkCuda(cudaFree(temp.a));
    checkCuda(cudaFree(d_data));

    free(h_data->a);
    free(out_data->a);
    free(h_data); 
    free(out_data);
}
sgarizvi
  • 16,623
  • 9
  • 64
  • 98
  • Now this raises several questions. 1. How can a member of a structure object (temp here) have its memory declared on device memory, though the object is declared on host memory? Its not even pointer to the structure object, its actual structure object. 2. Why cant we directly perform the structure copy from host to device, by just performing the data transfer between h_data and d_data and not using temp? – sandeep.ganage Jul 03 '15 at 09:42
  • @sandeep.ganage The member of host object is just a simple pointer. When we allocate device memory, the pointer itself resides on the host, but the memory region pointed by it is present on the device. To allocate device memory to a member of a structure which does not have a constructor, the structure object should reside on the host, otherwise how would we access its member for allocation? The type of structure (pointer or object) does not have anything to do with the explicit memory allocation of its member. – sgarizvi Jul 06 '15 at 07:17
  • @sandeep.ganage.... Allocating device memory (using `cudaMalloc`) to `d_data` does not initialize its data members. We have to initialize its members explicitly. If we allocate device memory to `d_data`, we cannot access its members on host for allocation of its members. Thats why we need an intermediate host object (`temp`). – sgarizvi Jul 06 '15 at 07:25