I've created a simple but complete program basing on this tutorial: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
//Kernel definition
__global__ void VecAdd(float* A, float* B, float* C,int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < N)
C[i] = A[i] + B[i];
}
//Host code
int main()
{
int N = 1000;
int i;
FILE *f;
size_t size = N * sizeof(float);
//allocate input vectors h_A and h_B in host memory
float *h_A = (float*)malloc(size);
float *h_B = (float*)malloc(size);
float *h_C = (float*)malloc(size);
//Initialize input vectors
f = fopen("A.txt","r");
for(i=0;i<N;i++)
fscanf(f,"%f ",&h_A[i]);
fclose(f);
f = fopen("B.txt","r");
for(i=0;i<N;i++)
fscanf(f,"%f ",&h_B[i]);
fclose(f);
//Allocate vactors in device memory
float *d_A;
gpuErrchk(cudaMalloc(&d_A,size));
float *d_B;
cudaMalloc(&d_B,size);
float *d_C;
cudaMalloc(&d_C,size);
gpuErrchk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
//invoke kernel
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
f = fopen("C.txt","w");
printf("%f \n",h_C[i]);
for(i=0;i<1000;i++)
fprintf(f,"%f ",h_C[i]);
fclose(f);
printf("Zakonczono obliczenia\n");
// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
//Free host memory
free(h_A);
free(h_B);
return 0;
}
It should read two vectors from files, add them on device and then print the output into 'C.txt' file. However, it prints one thousand of zeros.
After a little debugging did I find the culprit- the cudaMalloc function.
(cuda-gdb) n 42 cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); (cuda-gdb) n 43 cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); (cuda-gdb) print d_A[0] $1 = 0 (cuda-gdb) print h_A[0] $2 = 3.66192293
I wonder why it doesn't work, this part of code had been raw copied from the tutorial.