I am a CUDA newbie, playing with CUDA kernels for the first time. I've got the following kernel that implements convloution (very naively), with a dummy loop that performs a calculation of the same element 1000 times in global memory (see below). The problem is that after the operation, some cells in the result matrix are wrong: starting at certain offset, the values are not a multiple of 1000 as one would expect. My kernel:
__global__ void conv(float *input, float *kernel, float *target)
{
for (long i = 0; i <100; i++)
{
atomicAdd(target+gridDim.y*blockIdx.x+blockIdx.y,input[(blockIdx.x+threadIdx.x)*(blockDim.y+gridDim.y-1)+(blockIdx.y+threadIdx.y)]*kernel[threadIdx.x*blockDim.y+threadIdx.y]);
}
}
The invocation code for the kernel is below:
float image[1024] = {0.0};
float kernel[] =
{
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f
};
float res[784]={0};
for (int i = 0; i < 1024; i++)
{
image[i]=(float)i;
} // Got 32x32 matrix
cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
exit (-1);
}
float *dev_image = 0;
float *dev_kernel = 0;
float *dev_res = 0;
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_image, sizeof(image));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
exit(-10);
}
cudaStatus = cudaMalloc((void**)&dev_kernel, sizeof(kernel));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
exit(-10);
}
cudaStatus = cudaMalloc((void**)&dev_res, sizeof(res));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
exit(-10);
}
cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice);
cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice);
cudaMemset(dev_res,0,sizeof(res));
// Convloving 32x32 matrix with 5x5 kernel, getting 28x28 matrix as a result
dim3 blocks(28,28,1);
dim3 threads(5,5,1);
for (int itr = 0; itr<10; itr++)
{
conv<<<blocks, threads>>>(dev_image,dev_kernel, dev_res);
}
cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost);
printf("res[0]=%f\n",res[0]);
cudaFree(dev_kernel);
cudaFree(dev_image);
cudaFree(dev_res);
exit (0);
It seems that I handled the concurrency issue, so it shouldn't be the root-cause. I appreciate any help.