I have written a CUDA test program, because my more complex program was not working. This one isn't working either.
What should it do?
I've written a test program (I think) to add 0.5 to an array of numbers. Or at least that's what it's supposed to do.
Here's the code:
#include <iostream>
#include <cuda.h>
__global__
void cuda_kernel_func(double *in, double *out, int count)
{
int index = blockIdx.x;
if(index < count)
{
out[index] = in[index] + 0.5;
}
}
int main()
{
int num = 10;
double *out;
double *d_out;
double *in;
double *d_in;
out = (double*)malloc(num * sizeof(double));
in = (double*)malloc(num * sizeof(double));
cudaMalloc(&d_out, num * sizeof(double));
cudaMalloc(&d_in, num * sizeof(double));
for(int i = 0; i < num; ++ i)
{
in[i] = (double)i;
}
cudaMemcpy(d_in, in, num * sizeof(double), cudaMemcpyHostToDevice);
cuda_kernel_func<<<num, 1>>>(d_in, d_out, num);
cudaDeviceSynchronize();
cudaMemcpy(out, d_out, num * sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(d_in);
cudaFree(d_out);
for(int i = 0; i < num; ++ i)
{
std::cout << out[i] << " ";
}
std::cout << std::endl;
free(in);
free(out);
return 0;
}
I am fairly new to CUDA, but not to parallelization or C/C++. I think the code is fairly self-explanatory.
Output:
0 0 0 0 0 0 0 0 0 0
Which isn't very exciting.