I'm testing a code in which a kernel is meant to perform a simple sum between two values stored in two pointers.
After a call to the kernel "add" I can no longer copy the pointers' data from host to device and from there to host again, even when no operations were performed over the pointers in the kernel. But when I comment the statement in which the function is called, I get the correct results. Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
__global__ void add(int *a, int *b, int *c)
{
*c = *a - *b;
}
int main(void)
{
int result, x_val, y_val; //Store data from device to host in this vars.
int *x_host, *y_host; //Pointers in host
int *tempGPU, *x_dev, *y_dev; //Pointers in device
x_host = (int *)malloc(sizeof(int));
y_host = (int *)malloc(sizeof(int));
*x_host = 8;
*y_host = 4;
x_val = -5;
y_val = -10;
printf("\n x = %d, y = %d\n", *x_host, *y_host);
cudaMalloc( (void **)&tempGPU, sizeof(int) );
//It's wrong to pass this arguments to the function. The problem is in this statement.
add<<<1,1>>> (x_host, y_host, tempGPU);
cudaMemcpy(&result, tempGPU, sizeof(int), cudaMemcpyDeviceToHost);
printf("\n x_host - y_host = %d\n", result);
cudaMalloc( (void **)&x_dev, sizeof(int) );
cudaMalloc( (void **)&y_dev, sizeof(int) );
*x_host = 6;
*y_host = 20;
cudaMemcpy(x_dev, x_host, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(y_dev, y_host, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(&x_val, x_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&y_val, y_dev, sizeof(int), cudaMemcpyDeviceToHost);
printf("\n x_host = %d, y_host = %d\n", *x_host, *y_host);
printf("\n x_val = %d, y_val = %d\n", x_val, y_val);
cudaFree( tempGPU );
printf( "\nCUDA: %s\n", cudaGetErrorString(cudaGetLastError()) );
return 0;
}
I know that the function is expecting pointers allocated in the device, but why such a mistake don't allow me to use cudaMemcpy properly? Why when I comment the line:
add<<<1,1>>> (x_host, y_host, tempGPU);
I get the correct results. Thanks.