0

I'm trying to write a simple wrapper class to move data to/from device memory, but I keep getting "invalid argument" errors in my call to cudaMempcy when I try to copy data back from device to host.

This is my code:

#include <iostream>

#define gpu_chk(ans) { gpu_assert( ( ans ), __FILE__, __LINE__ ); }
inline void gpu_assert( cudaError_t code, const char *file, int line, bool abort=true ) {
   if ( code != cudaSuccess ) {
      fprintf( stderr,"GPUassert: %s %s %d\n", cudaGetErrorString( code ), file, line );
      if( abort ) exit( code );
   }
}

class DevMatrix {
    int nrow;
    int ncol;
    double* dptr;

public:
    DevMatrix( int nrow, int ncol ) : nrow( nrow ), ncol( ncol ) {
        gpu_chk( cudaMalloc( (void**) &dptr, nrow * ncol * sizeof( double ) ) );
    }

    ~DevMatrix() {
        gpu_chk( cudaFree( dptr ) );
    }

    __host__ __device__ double* get() {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0))
        return dptr;
#else
        double* hptr;
        gpu_chk( cudaMemcpy( hptr, dptr, nrow * ncol * sizeof( double ), cudaMemcpyDeviceToHost ) );
        return hptr;
#endif
    }
};

__global__ void akernel( DevMatrix dm ) {
    int i = blockIdx.x;
    int j = threadIdx.x;
    int idx = ( gridDim.x * i ) + j;
    double* d = dm.get();
    d[idx] = -1;
}

#define ROWS 2
#define COLS 2

int main() {
    DevMatrix dm( ROWS, COLS );
    akernel<<<ROWS,COLS>>>( dm );
    double* hptr = dm.get();
    for( int i = 0; i < ROWS; i++ ) {
        for( int j = 0; j < COLS; j++ ) {
            int idx = ( i * ROWS ) + j;
            std::cout << hptr[idx] << std::endl;
        }
    }
    return 0;
}

Following answers to other "invalid argument" questions, I've tried different combinations like hptr, &hptr, etc.

Running the above in cuda-gdb, I can see that hptr and ptr have what I think to be the correct type, namely:

(cuda-gdb) p hptr
$1 = (double *) 0x7fffffffdd30
(cuda-gdb) p dptr
$2 = (double *) 0xb00a80000

But I keep getting the same error every time. What is wrong with the above code?

talonmies
  • 70,661
  • 34
  • 192
  • 269
jtatria
  • 527
  • 3
  • 12
  • 3
    I know nothing about CUDA, but a brief reading of the manual, it seems to me you need to allocate the memory hptr before you memcpy onto it, just like in c – Mike Vine Oct 14 '17 at 18:44
  • 1
    The pass-by-value mechanism for a C/C++ function call invokes a default copy-constructor for the object `dm` passed to the kernel. When that object is no longer required by the kernel, the same mechanism invokes the user-defined destructor. As a result `cudaFree(dptr)` is performed. When the code then arrives at `cudaMemcpy(hptr, dptr, ...` in `dm.get()` in host code after the kernel, `dptr` is no longer a valid device pointer (since it was "shared" between two copies of the `dm` object), and so the invalid argument error is thrown. After that is fixed, the `hptr` issue must be addressed. – Robert Crovella Oct 15 '17 at 14:43
  • As a simple test of this assertion, comment out the line calling `cudaFree(dptr)`, and modify the `hptr` definition line in `get` to read: `double* hptr = (double *) malloc(nrow * ncol * sizeof( double ));` and the code runs without runtime error for me. – Robert Crovella Oct 15 '17 at 14:47

0 Answers0