2

To be more clear, what I want is passing the pointers and all the data they pointed to device. To test how I can achieve this goal, I wrote a simple class:

class vecarray{
    public:
        int * vecptr[N];                //array of pointers pointing to array
        int dim[N];                     //store length of each array pointed to
        __device__ __host__ vecarray(); //constructor
        __device__ __host__ int sum();  //sum up all the elements in the array being              
                                       //pointed to
}

vecarray::vecarray(){
    for(int i = 0; i<N; i++)
    {
        vecptr[i] = NULL;
        dim[i] = 0;
    }
}

int vecarray::sum(){
    int i=0, j=0, s=0;
    for (i=0; i<N; i++)
        for(j=0; j < dim[i]; j++)
            s += vecptr[i][j];
    return s;
}

Then I use this class in the following code:

#define N 2
__global__ void addvecarray( vecarray * v, int *s){
    *s = v->sum();
}

int main(){                                 //copy *V to device, do sum() and pass back 
    vecarray *v, *dev_v;                    //the result by dev_v
    v = new vecarray;
    dev_v = new vecarray;
    int a[3] = {1,2,3};                     //initialize v manually
    int b[4] = {4,5,6,7};
    int result = 0;
    int * dev_result;
    v->vecptr[0] = a;
    v->vecptr[1] = b;
    v->dim[0] = 3; v->dim[1] = 4;


    cudaMalloc((void**)&dev_v, sizeof(vecarray));      

    cudaMemcpy(dev_v, v, sizeof(vecarray),cudaMemcpyHostToDevice); //copy class object 

    for(int i = 0; i < N; i++){
        cudaMalloc((void**)&(dev_v->vecptr[i]), v->dim[i]*sizeof(int));
    }

    for(int i = 0; i<N; i++ ){                   //copy arrays
    cudaMemcpy(dev_v->vecptr[i], v->vecptr[i], v->dim[i]*sizeof(int), cudaMemcpyHostToDevice));
    }
    addvecarray<<<1,1>>>(dev_v, dev_result);

    cudaMemcpy(&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    printf("the result is %d\n", result);
}

The code passed nvcc compiler, but failed with segmentation fault when running. I've checked the problem lies in the two cudaMalloc and cudaMemcpy opertation in the for-loop. So my question is how should I pass this object to CUDA? Thanks in advance.

Stone
  • 345
  • 1
  • 6
  • 14
  • I believe this question is a duplicate of [this one](http://stackoverflow.com/questions/14284964/cuda-how-to-allocate-memory-for-data-member-of-a-class/14286341#14286341). In the line of code where you have a for loop that is performing a cudaMalloc operation, you are passing as a pointer to cudaMalloc a pointer that *already lives in device memory*. Instead you need to create a separate set of int pointers on the host, cudaMalloc these, then cudaMemcpy them to the device in the appropriate locations in your vecarray object instantiated at `dev_v`. – Robert Crovella Feb 09 '13 at 21:03

1 Answers1

4

Your code had several errors in it. As I mentioned in the comments, one of the key errors is in how you are allocating memory for the data regions referenced by pointers within the class. The key mistake there is that you are passing a pointer to cudaMalloc that already lives in device memory. We can fix that by creating an extra set of pointers that we will use to allocate the needed device storage for the arrays that are pointed to within the class. In addition there were a few other errors, such as the fact that you had no properly allocated device storage for dev_result. The following code fixes all the errors I could find and I believe gives the correct result. I've also added a reference form of cuda error checking that you may find useful to use in your projects:

#include <stdio.h>

#define N 2
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

using namespace std;

class vecarray{
    public:
        int *vecptr[N];                //array of pointers pointing to array
        int dim[N];                     //store length of each array pointed to

        __device__ __host__ vecarray(); //constructor
        __device__ __host__ int sum();  //sum up all the elements in the array being
                                       //pointed to
};

vecarray::vecarray(){
    for(int i = 0; i<N; i++)
    {
        vecptr[i] = NULL;
        dim[i] = 0;
    }
}

__device__ __host__ int vecarray::sum(){
    int i=0, j=0, s=0;
    for (i=0; i<N; i++)
        for(j=0; j < dim[i]; j++)
            s += vecptr[i][j];
    return s;
}

__global__ void addvecarray( vecarray * v, int *s){
    *s = v->sum();
}

int main(){                                 //copy *V to device, do sum() and pass back
    vecarray *v, *dev_v;                    //the result by dev_v
    v = new vecarray;
    int a[3] = {1,2,3};                     //initialize v manually
    int b[4] = {4,5,6,7};
    int result = 0;
    int *dev_result;
    v->vecptr[0] = a;
    v->vecptr[1] = b;
    v->dim[0] = 3; v->dim[1] = 4;
    int *vptr[N];

    cudaMalloc((void**)&dev_v, sizeof(vecarray));
    cudaCheckErrors("cudaMalloc1 fail");
    cudaMemcpy(dev_v, v, sizeof(vecarray),cudaMemcpyHostToDevice); //copy class object
    cudaCheckErrors("cudaMemcpy1 fail");

    for(int i = 0; i < N; i++){
        cudaMalloc((void**)&(vptr[i]), v->dim[i]*sizeof(int));
        cudaCheckErrors("cudaMalloc2 fail");
        cudaMemcpy(&(dev_v->vecptr[i]), &vptr[i], sizeof(int*), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy2 fail");
    }

    for(int i = 0; i<N; i++ ){                   //copy arrays
        cudaMemcpy(vptr[i], v->vecptr[i], v->dim[i]*sizeof(int), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy3 fail");
    }
    cudaMalloc((void **)&dev_result, sizeof(int));
    cudaCheckErrors("cudaMalloc3 fail");
    addvecarray<<<1,1>>>(dev_v, dev_result);

    cudaMemcpy(&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy4 fail");
    printf("the result is %d\n", result);
    return 0;
}
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • thanks a lot. The code works. I noticed that in the thread of duplicate question you mentioned above, Eric mentioned in the first comment that the cause of this kind of error is that we are trying to dereference the pointer pointing to an device address on host, which is also what I heard at some other places. But here in the first for loop of your code above, the first parameter of cudaMemcpy, &(dev_v->vecptr[i]) also does a dereference operation first inside the bracket as it is equal to &((*dev).vecptr[i]). If this is true, isn't there a conflict here between the statement and the code? – Stone Feb 10 '13 at 08:10
  • Also, may I ask why we can't pass as a pointer to cudaMalloc() a pointer that already pointed to an address? What I have in mind is that this is to prevent memory leak. Because if we move the pointer away pointing to the new address cudaMalloc() returns, we lose track of the variables reside in the previous address pointed by the same pointer, and we can no longer access that memory anymore. Is that correct? Thanks again. – Stone Feb 10 '13 at 08:23
  • Oh, pleas ignore the second question above. I just figured out why, just as you said in the other thread. Sorry about it. – Stone Feb 10 '13 at 08:28
  • 1
    For your dereferencing question, cudaMemcpy is a special case, in that we can pass a device pointer as a target to the copy operation, which will get dereferenced as part of the copy process. Ordinary host code should not dereference device pointers, but cudaMemcpy is a special case. Otherwise if we could not dereference a device pointer in this way, how could we copy data to it from the host? But cudaMalloc doesn't allow this. The address of a pointer passed to cudaMalloc should reside on the host. – Robert Crovella Feb 10 '13 at 13:52
  • @RobertCrovella one question about `int *vptr[N];`, how to do the same if N is a variable?(making array size dynamic) – Mohamed Sakr Jun 15 '14 at 00:01