I'm trying to write a simple wrapper class to move data to/from device memory, but I keep getting "invalid argument" errors in my call to cudaMempcy when I try to copy data back from device to host.
This is my code:
#include <iostream>
#define gpu_chk(ans) { gpu_assert( ( ans ), __FILE__, __LINE__ ); }
inline void gpu_assert( cudaError_t code, const char *file, int line, bool abort=true ) {
if ( code != cudaSuccess ) {
fprintf( stderr,"GPUassert: %s %s %d\n", cudaGetErrorString( code ), file, line );
if( abort ) exit( code );
}
}
class DevMatrix {
int nrow;
int ncol;
double* dptr;
public:
DevMatrix( int nrow, int ncol ) : nrow( nrow ), ncol( ncol ) {
gpu_chk( cudaMalloc( (void**) &dptr, nrow * ncol * sizeof( double ) ) );
}
~DevMatrix() {
gpu_chk( cudaFree( dptr ) );
}
__host__ __device__ double* get() {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0))
return dptr;
#else
double* hptr;
gpu_chk( cudaMemcpy( hptr, dptr, nrow * ncol * sizeof( double ), cudaMemcpyDeviceToHost ) );
return hptr;
#endif
}
};
__global__ void akernel( DevMatrix dm ) {
int i = blockIdx.x;
int j = threadIdx.x;
int idx = ( gridDim.x * i ) + j;
double* d = dm.get();
d[idx] = -1;
}
#define ROWS 2
#define COLS 2
int main() {
DevMatrix dm( ROWS, COLS );
akernel<<<ROWS,COLS>>>( dm );
double* hptr = dm.get();
for( int i = 0; i < ROWS; i++ ) {
for( int j = 0; j < COLS; j++ ) {
int idx = ( i * ROWS ) + j;
std::cout << hptr[idx] << std::endl;
}
}
return 0;
}
Following answers to other "invalid argument" questions, I've tried different combinations like hptr, &hptr, etc.
Running the above in cuda-gdb, I can see that hptr and ptr have what I think to be the correct type, namely:
(cuda-gdb) p hptr
$1 = (double *) 0x7fffffffdd30
(cuda-gdb) p dptr
$2 = (double *) 0xb00a80000
But I keep getting the same error every time. What is wrong with the above code?