cudaMemcpy returns cudaErrorInvalidArgument when reading from Device to Host, unclear why

Question

first post here. I'm currently working on a project that requires writing a large 2d array (on the order of 1,000,000x7) into my GPU, doing some computation, and returning it to the Host. Since I want to do so quickly and with such a large array, I attempted to flatten the array to help pass it into the GPU fairly straightforwardly. The array successfully writes (or at least cudaMalloc and cudaMemcpy both return cudaSuccess when I write to the device), but when I try to read it out cudaMemcpy returns an invalid argument error.

I've not been able to figure out why this is, since I think I should be writing a valid 1d array (flattened) onto the device and reading it back out, and I thought I was feeding the right arguments to do this. The only results for this error I've found online involve swapping the dst and src arguments for cudaMemcpy, but I think I've got those right here.

This is a simplified version of my code that reproduces the problem:

#include <iostream>

using namespace std;

// Allocates a rows x cols float matrix as ONE contiguous buffer plus a table
// of row pointers, so it can be indexed as arr[i][j] while arr[0] addresses
// the whole flattened, row-major data (rows * cols floats).
//
//   arr  - out: receives the row-pointer table; only assigned on success.
//   rows - number of rows (0 is allowed and yields an empty data buffer).
//   cols - number of floats per row.
//
// Propagates whatever operator new[] throws; nothing is leaked in that case.
// Release the result with: delete[] arr[0]; delete[] arr;
void alloc2dArray(float ** &arr, unsigned long int rows, unsigned long int cols){

    // Keep at least one slot so arr[0] is a valid element even when rows == 0.
    const unsigned long int slots = (rows == 0) ? 1 : rows;
    float ** table = new float*[slots];

    try {
        table[0] = new float[rows * cols];
    } catch (...) {
        delete[] table;   // do not leak the pointer table if the data buffer fails
        throw;
    }

    // Every later row starts cols floats after the previous one.
    for(unsigned long int i = 1; i < rows; i++) table[i] = table[i - 1] + cols;

    arr = table;
}

// Allocates rows * cols floats on the device and copies the contiguous host
// data starting at arr[0] into it, printing a message to cerr on failure.
//
// NOTE: devPtr is passed BY VALUE. cudaMalloc stores the new device address in
// this function's local copy only, so the caller's pointer is left unchanged
// and the allocation can no longer be reached (or freed) after returning.
// This is the defect the question is about; it is kept here on purpose.
void write2dArrayToGPU(float ** arr, float * devPtr, unsigned long int rows, unsigned long int cols){

    // Writes the device address into the local parameter, not the caller's variable.
    if(cudaSuccess != cudaMalloc((void**)&devPtr, sizeof(float) * rows * cols)) cerr << "cudaMalloc Failed";

    // The upload itself uses the (valid) local copy of the pointer.
    if(cudaSuccess != cudaMemcpy(devPtr, arr[0], sizeof(float) * rows * cols, cudaMemcpyHostToDevice)) cerr << "cudaMemcpy Write Failed";
}

// Copies rows * cols floats from the device buffer devPtr into the contiguous
// host storage that begins at arr[0]; a failed copy is reported on cerr.
void read2dArrayFromGPU(float ** arr, float * devPtr, unsigned long int rows, unsigned long int cols){
    const size_t byteCount = sizeof(float) * rows * cols;
    float * const hostData = arr[0];

    if(cudaMemcpy(hostData, devPtr, byteCount, cudaMemcpyDeviceToHost) != cudaSuccess){
        cerr << "cudaMemcpy Read Failed" << endl;
    }
}

// Round-trip reproduction: fill a host matrix, upload it to the GPU, then try
// to download it into a second host matrix.
int main(){

int R = 100;
int C = 7;

cout << "Allocating an " << R << "x" << C << " array ...";
float ** arrA;
alloc2dArray(arrA, R, C);


cout << "Assigning some values ...";
// Each element is set to its own flattened (row-major) index.
for(int i = 0; i < R; i++){
    for(int j = 0; j < C; j++){
        arrA[i][j] = i*C + j;
    }
}
cout << "Done!" << endl;


cout << "Writing to the GPU ...";
float * Darr = 0;
// Darr is handed over by value, so it is still 0 after this call returns.
write2dArrayToGPU(arrA, Darr, R, C);
cout << " Done!" << endl;

cout << "Allocating second " << R << "x" << C << " array ...";
float ** arrB;
alloc2dArray(arrB, R, C);
cout << "Done!" << endl;

cout << "Reading from the GPU into the new array ...";
// Darr is still 0 here: this is the cudaMemcpy that fails with "invalid argument".
read2dArrayFromGPU(arrB, Darr, R, C);


}

I compile and run this on my laptop with

 $nvcc -arch=sm_30 test.cu -o test
 $optirun cuda-memcheck ./test

and get the result:

========= CUDA-MEMCHECK
Allocating an 100x7 array ...Assigning some values ...Done!
Writing to the GPU ... Done!
Allocating second 100x7 array ...Done!
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemcpy. 
=========     Saved host backtrace up to driver entry point at error
Reading from the GPU into the new array ...=========     Host Frame:/usr/lib64/nvidia-bumblebee/libcuda.so.1 [0x2ef343]
cudaMemcpy Read Failed=========     Host Frame:./test [0x38c6f]
=========     Host Frame:./test [0x2f08]
=========     Host Frame:./test [0x3135]
=========     Host Frame:/usr/lib64/libc.so.6 (__libc_start_main + 0xf1) [0x20401]
=========     Host Frame:./test [0x2c6a]

=========
========= ERROR SUMMARY: 1 error

I'm moderately new to CUDA, and still learning, so any help would be appreciated, thanks!

You can't pass `devPtr` by value as a single pointer argument to a function, do a `cudaMalloc` on that pointer, and then expect the assigned pointer value to show up in the calling environment. This is a common mistake with pass-by-value, and there are certainly other similar questions, such as [this one](https://stackoverflow.com/questions/22826380/cuda-allocation-and-return-array-from-gpu-to-cpu). You might want to study the answer there; your question is arguably a duplicate of that one. — Robert Crovella, Jul 11 '17 at 23:51

tgorororo · Answer 1 · 2017-07-12T15:44:41.170

Thanks to Robert Crovella for pointing me in the right direction with a comment above, and linking a similar question.

The gist is that by passing devPtr by value rather than by pointer or by reference into my GPU write and read functions, the cudaMalloc and cudaMemcpy functions were acting only on a copy in the function scope.

Two solutions - (both of these run without throwing errors for me)

First: pass devPtr by reference into write2dArrayToGPU and read2dArrayFromGPU. The solution then looks like this:

#include <iostream>

using namespace std;


// Allocates a rows x cols float matrix as ONE contiguous buffer plus a table
// of row pointers, so it can be indexed as arr[i][j] while arr[0] addresses
// the whole flattened, row-major data (rows * cols floats).
//
//   arr  - out: receives the row-pointer table; only assigned on success.
//   rows - number of rows (0 is allowed and yields an empty data buffer).
//   cols - number of floats per row.
//
// Propagates whatever operator new[] throws; nothing is leaked in that case.
// Release the result with: delete[] arr[0]; delete[] arr;
void alloc2dArray(float ** &arr, unsigned long int rows, unsigned long int cols){

    // Keep at least one slot so arr[0] is a valid element even when rows == 0.
    const unsigned long int slots = (rows == 0) ? 1 : rows;
    float ** table = new float*[slots];

    try {
        table[0] = new float[rows * cols];
    } catch (...) {
        delete[] table;   // do not leak the pointer table if the data buffer fails
        throw;
    }

    // Every later row starts cols floats after the previous one.
    for(unsigned long int i = 1; i < rows; i++) table[i] = table[i - 1] + cols;

    arr = table;
}

//changed float * devPtr to float *  &devPtr
//
// Allocates rows * cols floats on the device and uploads the contiguous host
// data that starts at arr[0].
//
//   arr    - host row-pointer table; arr[0] must address rows * cols floats.
//   devPtr - out (by reference): receives the device address on success and
//            is reset to 0 on any failure, so the caller can test the outcome.
//   rows, cols - matrix dimensions.
//
// Failures are reported on cerr together with the CUDA error description.
void write2dArrayToGPU(float ** arr, float * &devPtr, unsigned long int rows, unsigned long int cols){

    const size_t bytes = sizeof(float) * rows * cols;

    cudaError_t err = cudaMalloc((void**)&devPtr, bytes);
    if(err != cudaSuccess){
        cerr << "cudaMalloc Failed: " << cudaGetErrorString(err) << endl;
        devPtr = 0;
        return;   // nothing to copy into; a cudaMemcpy here would only add a second error
    }

    err = cudaMemcpy(devPtr, arr[0], bytes, cudaMemcpyHostToDevice);
    if(err != cudaSuccess){
        cerr << "cudaMemcpy Write Failed: " << cudaGetErrorString(err) << endl;
        cudaFree(devPtr);   // do not hand back a buffer whose contents were never written
        devPtr = 0;
    }
}

//changed float * devPtr to float * &devPtr
//
// Downloads rows * cols floats from the device buffer devPtr into the
// contiguous host storage that starts at arr[0].
//
//   arr    - host row-pointer table; arr[0] must have room for rows * cols floats.
//   devPtr - device buffer to read from (not modified by this function).
//   rows, cols - matrix dimensions.
//
// Failures are reported on cerr together with the CUDA error description.
void read2dArrayFromGPU(float ** arr, float * &devPtr, unsigned long int rows, unsigned long int cols){

    const size_t bytes = sizeof(float) * rows * cols;

    // A null device pointer cannot be the source of a non-empty copy; say so
    // directly instead of letting cudaMemcpy answer with "invalid argument".
    if(devPtr == 0 && bytes != 0){
        cerr << "cudaMemcpy Read Failed: device pointer is null" << endl;
        return;
    }

    const cudaError_t err = cudaMemcpy(arr[0], devPtr, bytes, cudaMemcpyDeviceToHost);
    if(err != cudaSuccess) cerr << "cudaMemcpy Read Failed: " << cudaGetErrorString(err) << endl;
}

// Round-trip demo: fill a host matrix, upload it to the GPU, download it into
// a second host matrix, then release every resource that was acquired.
int main(){

    int R = 100;
    int C = 7;

    cout << "Allocating an " << R << "x" << C << " array ...";
    float ** arrA;
    alloc2dArray(arrA, R, C);

    cout << "Assigning some values ...";
    // Each element is set to its own flattened (row-major) index.
    for(int i = 0; i < R; i++){
        for(int j = 0; j < C; j++){
            arrA[i][j] = i*C + j;
        }
    }
    cout << "Done!" << endl;

    cout << "Writing to the GPU ...";
    float * Darr = 0;
    write2dArrayToGPU(arrA, Darr, R, C);   // Darr is taken by reference and updated
    cout << " Done!" << endl;

    cout << "Allocating second " << R << "x" << C << " array ...";
    float ** arrB;
    alloc2dArray(arrB, R, C);
    cout << "Done!" << endl;

    cout << "Reading from the GPU into the new array ...";
    read2dArrayFromGPU(arrB, Darr, R, C);
    cout << " Done!" << endl;   // terminate the last progress line like the others

    // Release the device buffer (cudaFree accepts a null pointer) ...
    if(cudaSuccess != cudaFree(Darr)) cerr << "cudaFree Failed" << endl;

    // ... and both host matrices: the data buffer first, then the row table.
    delete[] arrA[0];
    delete[] arrA;
    delete[] arrB[0];
    delete[] arrB;

    return 0;
}

Second: pass devPtr by pointer, so the solution looks like this:

#include <iostream>

using namespace std;

// Allocates a rows x cols float matrix as ONE contiguous buffer plus a table
// of row pointers, so it can be indexed as arr[i][j] while arr[0] addresses
// the whole flattened, row-major data (rows * cols floats).
//
//   arr  - out: receives the row-pointer table; only assigned on success.
//   rows - number of rows (0 is allowed and yields an empty data buffer).
//   cols - number of floats per row.
//
// Propagates whatever operator new[] throws; nothing is leaked in that case.
// Release the result with: delete[] arr[0]; delete[] arr;
void alloc2dArray(float ** &arr, unsigned long int rows, unsigned long int cols){

    // Keep at least one slot so arr[0] is a valid element even when rows == 0.
    const unsigned long int slots = (rows == 0) ? 1 : rows;
    float ** table = new float*[slots];

    try {
        table[0] = new float[rows * cols];
    } catch (...) {
        delete[] table;   // do not leak the pointer table if the data buffer fails
        throw;
    }

    // Every later row starts cols floats after the previous one.
    for(unsigned long int i = 1; i < rows; i++) table[i] = table[i - 1] + cols;

    arr = table;
}

//changed float * devPtr to float ** devPtr
//
// Allocates rows * cols floats on the device and uploads the contiguous host
// data that starts at arr[0].
//
//   arr    - host row-pointer table; arr[0] must address rows * cols floats.
//   devPtr - out: address of the caller's device pointer. *devPtr receives the
//            device address on success and is reset to 0 on any failure, so
//            the caller can test the outcome. Must not itself be null.
//   rows, cols - matrix dimensions.
//
// Failures are reported on cerr together with the CUDA error description.
void write2dArrayToGPU(float ** arr, float ** devPtr, unsigned long int rows, unsigned long int cols){

    // Without somewhere to store the device address there is nothing useful to do.
    if(devPtr == 0){
        cerr << "cudaMalloc Failed: devPtr is null" << endl;
        return;
    }

    const size_t bytes = sizeof(float) * rows * cols;

    cudaError_t err = cudaMalloc((void**)devPtr, bytes);
    if(err != cudaSuccess){
        cerr << "cudaMalloc Failed: " << cudaGetErrorString(err) << endl;
        *devPtr = 0;
        return;   // nothing to copy into; a cudaMemcpy here would only add a second error
    }

    err = cudaMemcpy(*devPtr, arr[0], bytes, cudaMemcpyHostToDevice);
    if(err != cudaSuccess){
        cerr << "cudaMemcpy Write Failed: " << cudaGetErrorString(err) << endl;
        cudaFree(*devPtr);   // do not hand back a buffer whose contents were never written
        *devPtr = 0;
    }
}

//changed float * devPtr to float ** devPtr
//
// Downloads rows * cols floats from the device buffer *devPtr into the
// contiguous host storage that starts at arr[0].
//
//   arr    - host row-pointer table; arr[0] must have room for rows * cols floats.
//   devPtr - address of the caller's device pointer (only read, never modified).
//   rows, cols - matrix dimensions.
//
// Failures are reported on cerr together with the CUDA error description.
void read2dArrayFromGPU(float ** arr, float ** devPtr, unsigned long int rows, unsigned long int cols){

    const size_t bytes = sizeof(float) * rows * cols;

    // A null pointer at either level cannot be the source of a non-empty copy;
    // say so directly instead of crashing or reporting "invalid argument".
    if(devPtr == 0 || (*devPtr == 0 && bytes != 0)){
        cerr << "cudaMemcpy Read Failed: device pointer is null" << endl;
        return;
    }

    const cudaError_t err = cudaMemcpy(arr[0], *devPtr, bytes, cudaMemcpyDeviceToHost);
    if(err != cudaSuccess) cerr << "cudaMemcpy Read Failed: " << cudaGetErrorString(err) << endl;
}

// Round-trip demo: fill a host matrix, upload it to the GPU, download it into
// a second host matrix, then release every resource that was acquired.
int main(){

    int R = 100;
    int C = 7;

    cout << "Allocating an " << R << "x" << C << " array ...";
    float ** arrA;
    alloc2dArray(arrA, R, C);

    cout << "Assigning some values ...";
    // Each element is set to its own flattened (row-major) index.
    for(int i = 0; i < R; i++){
        for(int j = 0; j < C; j++){
            arrA[i][j] = i*C + j;
        }
    }
    cout << "Done!" << endl;

    cout << "Writing to the GPU ...";
    float * Darr = 0;
    write2dArrayToGPU(arrA, &Darr, R, C); // changed Darr to &Darr
    cout << " Done!" << endl;

    cout << "Allocating second " << R << "x" << C << " array ...";
    float ** arrB;
    alloc2dArray(arrB, R, C);
    cout << "Done!" << endl;

    cout << "Reading from the GPU into the new array ...";
    read2dArrayFromGPU(arrB, &Darr, R, C); // changed Darr to &Darr
    cout << " Done!" << endl;   // terminate the last progress line like the others

    // Release the device buffer (cudaFree accepts a null pointer) ...
    if(cudaSuccess != cudaFree(Darr)) cerr << "cudaFree Failed" << endl;

    // ... and both host matrices: the data buffer first, then the row table.
    delete[] arrA[0];
    delete[] arrA;
    delete[] arrB[0];
    delete[] arrB;

    return 0;
}

cudaMemcpy returns cudaErrorInvalidArgument when reading from Device to Host, unclear why

1 Answers1