Pointer to device array inside host struct

Question

I'm trying to create a struct that will hold both the host and device arrays in one place and should reside on the host. I later intend expand it to be a element of a linked list. The basic struct looks like this:

typedef struct Data{
    double *h;
    double *d;
} Data;

Where *h points to an array of doubles on the host and *d points to an array of doubles on the device.

There are various answers on SO about copying whole structs to the the device (CUDA cudaMemcpy Struct of Arrays) but none of them quite do what I need. I have the following code but keep getting illegal memory access errors.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "cuda.h"

/*
* CUDA Error stuff
*/

static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


#define HANDLE_NULL( a ) {if (a == NULL) { \
                            printf( "Host memory failed in %s at line %d\n", \
                                    __FILE__, __LINE__ ); \
                            exit( EXIT_FAILURE );}}

//malloc error code
int errMsg(const char *message, int errorCode)
{
    printf("%s\n", message);
    return errorCode;
}

typedef struct Data{
    double *h;
    double *d;
} Data;

__global__ void kernel(Data *d)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid<100){
        d->d[tid] = 2;
    }

}

int main()
{
    Data *d;
    d = (Data*)malloc(sizeof(Data));

    d->h = (double*)malloc(sizeof(double)*100);
    HANDLE_ERROR( cudaMalloc((void**) &(d->d), 100*sizeof(double)) );

    for(int i=0; i<100; i++){
        d->h[i] = i;
    }

    HANDLE_ERROR( cudaMemcpy(d->d, d->h, 100*sizeof(double), cudaMemcpyHostToDevice) );

    printf("%f\n", d->h[1]);

    kernel<<<1, 102>>>(d);

    printf("done\n");

    {
    cudaError_t cudaerr = cudaDeviceSynchronize();
    if (cudaerr != cudaSuccess)
        printf("kernel launch failed with error \"%s\"->\n",
               cudaGetErrorString(cudaerr));
    }

    HANDLE_ERROR( cudaMemcpy(d->h, d->d, 100*sizeof(double), cudaMemcpyDeviceToHost) );
    printf("%f\n", d->h[99]);


    return 0;
}

The output I get is:

1.000000
done
kernel launch failed with error "an illegal memory access was encountered"->
an illegal memory access was encountered in linkedListGPU.cu at line 77

I suspect I have just messed up my pointers a bit. The error handling code is from the Wiley introduction to CUDA book, if there code is not allowed on here I'll remove it.

Thanks.

You are passing a host pointer to the device and trying to access it within the kernel. That obviously isn't ever going to work — talonmies, Apr 01 '17 at 19:45
Thank you, just changing the kernel call to (d->d) and tweaking the kernel code fixed it. Sorry about that, I've been getting in a muddle with structs on the device at the moment. — Tom Wilshaw, Apr 01 '17 at 20:46

score 1 · Accepted Answer · edited May 23 '17 at 12:17

The problem is d itself is a pointer to a host-allocated struct (where the d and h pointers are contained. When you pass the d struct pointer to the kernel like so:

kernel<<<1, 102>>>(d);
                   ^
                   this is a pointer to memory on the host

and then attempt to dereference that pointer in device code here:

    d->...;
     ^ 
     This operator dereferences the pointer to the left of it

you get an illegal memory access.

There are at least 2 obvious ways to fix this:

Pass the struct by value instead of by pointer.

Here is an example:

$ cat t1311.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "cuda.h"

/*
* CUDA Error stuff
*/

static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


#define HANDLE_NULL( a ) {if (a == NULL) { \
                            printf( "Host memory failed in %s at line %d\n", \
                                    __FILE__, __LINE__ ); \
                            exit( EXIT_FAILURE );}}

//malloc error code
int errMsg(const char *message, int errorCode)
{
    printf("%s\n", message);
    return errorCode;
}

typedef struct Data{
    double *h;
    double *d;
} Data;

__global__ void kernel(Data d)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid<100){
        d.d[tid] = 2;
    }

}

int main()
{
    Data d;

    d.h = (double*)malloc(sizeof(double)*100);
    HANDLE_ERROR( cudaMalloc((void**) &(d.d), 100*sizeof(double)) );

    for(int i=0; i<100; i++){
        d.h[i] = i;
    }

    HANDLE_ERROR( cudaMemcpy(d.d, d.h, 100*sizeof(double), cudaMemcpyHostToDevice) );

    printf("%f\n", d.h[1]);

    kernel<<<1, 102>>>(d);

    printf("done\n");

    {
    cudaError_t cudaerr = cudaDeviceSynchronize();
    if (cudaerr != cudaSuccess)
        printf("kernel launch failed with error \"%s\"->\n",
               cudaGetErrorString(cudaerr));
    }

    HANDLE_ERROR( cudaMemcpy(d.h, d.d, 100*sizeof(double), cudaMemcpyDeviceToHost) );
    printf("%f\n", d.h[99]);


    return 0;
}
$ nvcc -arch=sm_35 -o t1311 t1311.cu
$ cuda-memcheck ./t1311
========= CUDA-MEMCHECK
1.000000
done
2.000000
========= ERROR SUMMARY: 0 errors
$

Make a device copy of the struct that the d host pointer points to:

Here is an example:

$ cat t1311.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "cuda.h"

/*
* CUDA Error stuff
*/

static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


#define HANDLE_NULL( a ) {if (a == NULL) { \
                            printf( "Host memory failed in %s at line %d\n", \
                                    __FILE__, __LINE__ ); \
                            exit( EXIT_FAILURE );}}

//malloc error code
int errMsg(const char *message, int errorCode)
{
    printf("%s\n", message);
    return errorCode;
}

typedef struct Data{
    double *h;
    double *d;
} Data;

__global__ void kernel(Data *d)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid<100){
        d->d[tid] = 2;
    }

}

int main()
{
    Data *d, *dev_d;
    d = (Data*)malloc(sizeof(Data));
    HANDLE_ERROR(cudaMalloc(&dev_d, sizeof(Data)));
    d->h = (double*)malloc(sizeof(double)*100);
    HANDLE_ERROR( cudaMalloc((void**) &(d->d), 100*sizeof(double)) );

    for(int i=0; i<100; i++){
        d->h[i] = i;
    }

    HANDLE_ERROR( cudaMemcpy(d->d, d->h, 100*sizeof(double), cudaMemcpyHostToDevice) );
    HANDLE_ERROR(cudaMemcpy(dev_d, d, sizeof(Data), cudaMemcpyHostToDevice));
    printf("%f\n", d->h[1]);

    kernel<<<1, 102>>>(dev_d);

    printf("done\n");

    {
    cudaError_t cudaerr = cudaDeviceSynchronize();
    if (cudaerr != cudaSuccess)
        printf("kernel launch failed with error \"%s\"->\n",
               cudaGetErrorString(cudaerr));
    }

    HANDLE_ERROR( cudaMemcpy(d->h, d->d, 100*sizeof(double), cudaMemcpyDeviceToHost) );
    printf("%f\n", d->h[99]);


    return 0;
}
$ nvcc -arch=sm_35 -o t1311 t1311.cu
$ cuda-memcheck ./t1311
========= CUDA-MEMCHECK
1.000000
done
2.000000
========= ERROR SUMMARY: 0 errors
$

As an aside, you can follow the method outlined here to carry your debug process a bit farther.

Pointer to device array inside host struct

1 Answers1