How do I properly copy memory from device to host in CUDA?

Question

I am trying to simply increment a few matrix values in parallel in CUDA and trying to copy them back to main memory. However when I print them out once the thread function returns, the values are the same. I have even tried running the program with just 1 thread, but have had no luck. Any help would be greatly appreciated.

My code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <cuda.h>

#define BLOCK_SIZE 1024
#define MAX_N       100000000
#define MAX_THREADS     1024

int num_threads;
int count;              // Count of threads that have updated their partition
int size;
//int increment; // VS
int * inc2;
//int my_start;


//Host data
int * thread_ids;

//nvcc -arch=sm_20 -o nbody.exe nbody.cu (compilation)

__global__ void pcyc_red(float * a, float * b, float * c, float * D, float * X, 
                    float * a2, float * b2, float * c2, float * D2,
                    int * inc2_dev, int * size_dev, int * num_threads_dev){

//__threadfence();
int thread_id = threadIdx.x + (blockIdx.x * blockDim.x);
float k1;
float k2;
int i;

int start = 0;
//int end = size_dev-1;
//int inc2_dev = inc2_dev1[0];
//int inc_dev = *inc_dev1;
//int size_dev = size_dev1[0];
int nthreads = num_threads_dev[0];
//Thread work assignment
int chunk_size = size_dev[0]/nthreads;
int my_start = thread_id*(chunk_size);
int my_end = start + ((thread_id + 1)*chunk_size - 1);
//__threadfence();
__syncthreads();
//Forward Reduction
for(i = my_start; i <= my_end; ++i){
    a[i] = a[i]++;
    b[i] = b[i]++;
    c[i] = c[i]++;
    D[i] = D[i]++;
    X[i] = X[i]++;
}

__threadfence();
//__syncthreads();
}//Device Function


float* init_vector(int size){
float* output;
output = (float*) calloc(size, sizeof(float));
int i;
for(i = 0; i < size; ++i){
    output[i] = 2.0;
}
return output;
}

float* init_vector_ac(int s){
//s will be used for size-1 not to be confused for size.
float* output;
output = (float*) calloc(s, sizeof(float));
int i;
for(i = 0; i < s; ++i){
    output[i] = -1.0;
}
return output;
}

// Main program 
int main(int argc, char *argv[]) {

//num_threads -> atoi(argv[argc-1]); 
//struct timeval start, stop; 
float total_time;
int i;

///Host structures
float* a;
float* b;
float* c;
float* D;
float* X;

//increment = 2; // VS
inc2 = (int*) malloc(sizeof(int));
inc2[0] = 1;
//size = (int*) malloc(sizeof(int));
//num_threads = (int*) malloc(sizeof(int));
//my_start = 0;
//wait_flag = false;

///Device Data
//SYSTEM * sys_dev;
float * a_dev;
float * b_dev;
float * c_dev;
float * D_dev;
float * X_dev;

float * a2_dev;
float * b2_dev;
float * c2_dev;
float * D2_dev;
//float * X2_dev;

//int * inc_dev;
int * inc2_dev;
//int * mstart_dev;
int * size_dev;
int * num_threads_dev;
int result_var;

//int final_inc2;

cudaEvent_t start, stop;    // GPU timing variables
//struct timeval cpu_start, cpu_stop; // CPU timing variables
   // float time_array[10]; 

// Timing initializations
cudaEventCreate(&start);
cudaEventCreate(&stop);

if (argc != 3) 
{
    printf("Use: <executable_name> <size> <num_threads>\n"); 
    exit(0);
}
if ((size = atoi(argv[argc-2])) > MAX_N) 
{
    printf("Maximum number of nodes allowed: %d\n", MAX_N);
    exit(0);
}; 

if ((num_threads = atoi(argv[argc-1])) > MAX_THREADS) 
{
    printf("Maximum number of threads allowed: %d.\n", MAX_THREADS);
    exit(0);
}; 

int size_array = (size) * sizeof(float);
int size_array2 = (size - 1) * sizeof(float);

// Initialize host tridiagonal matrix
a = init_vector_ac(size-1); // a[i] = -1.0
b = init_vector(size);      // b[i] = 2.0
c = init_vector_ac(size-1); // c[i] = -1.0
D = init_vector(size);      // D[i] = 2.0
X = init_vector(size);      // X[i] = 2.0

//xs = init_vector_err(size);   

// Shift elements of a by 1
for(i = size-1; i > 0; i--) a[i] = a[i-1];
a[0] = 0.0;


thread_ids = (int*) calloc(num_threads, sizeof(int));

count = 0;

for(i = 0; i < num_threads; ++i){
    thread_ids[i] = i;
}
//Cuda Operation

cudaEventRecord( start, 0);

cudaMalloc((void **) &a_dev, size);
cudaMalloc((void **) &b_dev, size);
cudaMalloc((void **) &c_dev, size);
cudaMalloc((void **) &D_dev, size);
cudaMalloc((void **) &X_dev, size);
cudaMalloc((void **) &a2_dev, size);
cudaMalloc((void **) &b2_dev, size);
cudaMalloc((void **) &c2_dev, size);
cudaMalloc((void **) &D2_dev, size);
//cudaMalloc((void**)&inc_dev, sizeof(int));
cudaMalloc((void**)&inc2_dev, sizeof(int));
//cudaMalloc((void**)&mstart_dev, sizeof(int));
cudaMalloc((void**)&size_dev, sizeof(int));
cudaMalloc((void**)&num_threads_dev, sizeof(int));


cudaMemcpy(a_dev, a, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(c_dev, c, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(D_dev, D, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(X_dev, X, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(a2_dev, a, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(b2_dev, b, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(c2_dev, c, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(D2_dev, D, size_array, cudaMemcpyHostToDevice);

//cudaMemcpy(inc_dev, &increment, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(inc2_dev, inc2, sizeof(int), cudaMemcpyHostToDevice);
//cudaMemcpy(mstart_dev, &my_start, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(size_dev, &size, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(num_threads_dev, &num_threads, sizeof(int), cudaMemcpyHostToDevice);

cudaDeviceSynchronize();
pcyc_red<<<1, num_threads>>>(a_dev, b_dev, c_dev, D_dev, X_dev,
                            a2_dev, b2_dev, c2_dev, D2_dev,
                            inc2_dev, size_dev, num_threads_dev);
cudaDeviceSynchronize();

cudaMemcpy(X, X_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(a, a_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(b, b_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(c, c_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(D, D_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(inc2, inc2_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&result_var, num_threads_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&total_time, start, stop);

printf("Final Var: %d\n\n", inc2[0]);
printf("Num Threads Var: %d\n\n", result_var);

for(i = 0; i < size; ++i){
    printf("a=%8.4f \n", a[i]); 
    printf("b=%8.4f \n", b[i]); 
    printf("c=%8.4f \n", c[i]); 
    printf("D=%8.4f \n", D[i]); 
    printf("X=%8.4f \n", X[i]); 
}

printf("Threads = %d, matrix_size = %d, time = %f\n", 
    num_threads, size, total_time);

cudaFree(a_dev);
cudaFree(b_dev);
cudaFree(c_dev);
cudaFree(D_dev);
cudaFree(X_dev);
//cudaFree(inc_dev);
cudaFree(inc2_dev);
//cudaFree(mstart_dev);
//cudaFree(size_dev);
//cudaFree(num_threads_dev);

}//end of main

start by adding [proper cuda error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) to your code. You can also run your code with `cuda-memcheck` to see what it reports. After that, if you want help, [post a *complete* code](http://stackoverflow.com/help/mcve), that someone could copy, paste, compile, and run, without adding anything or changing anything. — Robert Crovella, Oct 01 '14 at 16:47
I am getting "No Cuda-Memcheck results found" when running memcheck. — Harish Vangavolu, Oct 01 '14 at 17:10
I get an invalid argument on cudaMemcpy error when I run your code with cuda-memcheck and use 1024 32 as the arguments. Add proper cuda error checking. It will narrow it down to the specific cudaMemcpy call that you have not set up properly, or else it will point out a machine configuration issue. — Robert Crovella, Oct 01 '14 at 19:10

score 4 · Accepted Answer · edited May 23 '17 at 11:45

Add proper cuda error checking to your code.

One problem I can see is that your allocation sizes are not matched to your array sizes. To pick just one example:

int size_array = (size) * sizeof(float);
...
cudaMalloc((void **) &b_dev, size);  // size should probably be size_array here
...                          ^^^^
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice);  // this won't work, will throw error
                     ^^^^^^^^^^

The above is certainly an error, and there are several of that type in your code. You may also have a machine configuration issue (CUDA not properly installed, etc.) which the error checking will also indicate.

Turns out the server node that I kept submitting my jobs onto didn't have a GPU device. But thank you this really helps. — Harish Vangavolu, Oct 15 '14 at 22:44

How do I properly copy memory from device to host in CUDA?

1 Answers1