0

I'm having trouble tracking down the source of an invalid argument to a cudaMemcpy call, here is the relevant code:

In gpu_memory.cu I declare and allocate memory for device pointers:

#define cudaErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
...
__device__ double* conc;
...
__global__ void pointer_set_kernel(..., double* conc_in...) {
...
   conc = conc_in;
...
}

double* d_conc;
...
//memory initialization
void initialize_gpu_memory(int NUM, int block_size, int grid_size) {
    ...
    cudaErrorCheck(cudaMalloc((void**)&d_conc, NUM * 53 * sizeof(double)));
    ...
    pointer_set_kernel<<<1, 1>>>(...d_conc...);
    cudaErrorCheck( cudaPeekAtLastError() ); // Checks for launch error
    cudaErrorCheck( cudaThreadSynchronize() ); // Checks for execution error
}

Next in a different file (mechanism.cu), I declare the device pointer as an extern to copy data to it:

extern __device__ double* conc;
void write_jacobian_and_rates_output(int NUM, int block_size, int grid_size) {
    ...
    initialize_gpu_memory(NUM, block_size, grid_size);
    ...
    //get address of conc
    double* d_conc;
    cudaErrorCheck(cudaGetSymbolAddress((void **)&d_conc, conc));
    //populate the concentrations on the host
    double conc_host[NSP];
    double* conc_host_full = (double*)malloc(NUM * NSP * sizeof(double));
    //populate the concentrations
    get_concentrations(1.01325e6, y_host, conc_host);
    for (int i = 0; i < NUM; ++i) {
        for (int j = 0; j < NSP; ++j) {
           conc_host_full[i + j * NUM] = conc_host[j];
        }
    }
    //check for errors, and copy over
    cudaErrorCheck( cudaPeekAtLastError() ); // Checks for launch error
    cudaErrorCheck( cudaThreadSynchronize() ); // Checks for execution error
    cudaErrorCheck(cudaMemcpy(d_conc, conc_host_full, NUM * 53 * sizeof(double), cudaMemcpyHostToDevice));
    ...
}

I get the error on the last line, (the Memcpy). It appears that the initialize_gpu_memory function works correctly, this being the cuda-gdb inspection after the malloc and pointer_set_kernel:

p d_conc 
$1 = (double *) 0x1b03236000
p conc
$2 = (@generic double * @global) 0x1b03236000

and in the write_jacobian_and_rates function:

p d_conc
$3 = (double *) 0x1b02e20600
p conc
$4 = (@generic double * @global) 0x1b03236000

I don't know why d_conc in the write function points to a different memory location after the cudaGetSymbolAddress call, or why I'm getting an invalid argument on the memcpy. I'm sure I'm doing something stupid, but for the life of me I can't see it. Would appreciate any help in tracking down the source of this, thanks!

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
arghdos
  • 329
  • 2
  • 14

1 Answers1

1

There is nothing in your code snippet indicating you have extern scope for d_conc, therefore, the two instantiations of d_conc, in two different files, are completely different objects. So,
In this context: ( mechanism.cu )

double* d_conc;  //you create a new variable in this context
cudaErrorCheck(cudaGetSymbolAddress((void **)&d_conc, conc));
//populate the concentrations on the host
double conc_host[NSP];
double* conc_host_full = (double*)malloc(NUM * NSP * sizeof(double));  

No memory has been allocated to d_conc

I see you have allocated memory in the context of gpu_memory.cu for it's variable with the same name, but not here, where the error occurs.

This also would seem to address your question: I don't know why d_conc in the write function points to a different memory location after the cudaGetSymbolAddress call

ryyker
  • 22,849
  • 3
  • 43
  • 87
  • Ok, doing the extern for d_conc fixed the problem. However, I guess I'm unclear of what the point of cudaGetSymbolAddress is then? I thought I could use it so that I could get the address of the extern \__device__ double* conc, and copy it to it that way, instead of having the d_conc pointer be externed. I suppose it doesn't matter having to do it that way versus the way I tried, I'm just curious – arghdos Feb 12 '15 at 17:55
  • 1
    @arghdos - Giving a variable ***[extern](http://stackoverflow.com/questions/1433204/how-do-i-use-extern-to-share-variables-between-source-files-in-c)*** scope, within a set of files is sufficient to access and change that same variable (where it is in scope). In your code sample, you had two completely different variables, sharing nothing but the same symbol name - `d_conc`. (See next comment) – ryyker Feb 12 '15 at 19:44
  • 1
    Once you brought a single instance of that variable into `extern` scope, the correct use of `cudaGetSymbolAddress` was made possible. Here is ***[an example](http://cuda-programming.blogspot.com/2013/03/how-to-avoid-uses-of-cudamalloc-in.html)*** of why using cudaGetSymbolAddress provides an advantage. You did several things right, just missed the `extern` scope thing. – ryyker Feb 12 '15 at 19:45