CUDA atomicAdd() produces wrong result

Question

I am a CUDA newbie, playing with CUDA kernels for the first time. I've got the following kernel that implements convolution (very naively), with a dummy loop that performs a calculation of the same element 1000 times in global memory (see below). The problem is that after the operation, some cells in the result matrix are wrong: starting at a certain offset, the values are not a multiple of 1000 as one would expect. My kernel:

// Naive 2D convolution kernel.
// Every thread of a block adds one input*kernel product into the same output
// cell (the cell is selected by blockIdx only), so the additions go through
// atomicAdd. The product is re-added 100 times per launch.
//   input  - source matrix, row width = blockDim.y + gridDim.y - 1
//   kernel - filter coefficients, row width = blockDim.y
//   target - output matrix, row width = gridDim.y
__global__ void conv(float *input, float *kernel, float *target)
{
    // Output cell owned by this block.
    float *cell = target + gridDim.y * blockIdx.x + blockIdx.y;

    // Input element and filter tap handled by this thread.
    const unsigned int inputWidth = blockDim.y + gridDim.y - 1;
    const unsigned int inputIdx =
        (blockIdx.x + threadIdx.x) * inputWidth + (blockIdx.y + threadIdx.y);
    const unsigned int tapIdx = threadIdx.x * blockDim.y + threadIdx.y;

    long rep = 0;
    while (rep < 100)
    {
        atomicAdd(cell, input[inputIdx] * kernel[tapIdx]);
        ++rep;
    }
}

The invocation code for the kernel is below:

// Host-side driver fragment for conv(): builds the input data, copies it to
// the device, launches the kernel 10 times and prints the first result cell.

// Input matrix storage (1024 elements, filled in below).
float image[1024] = {0.0};
// Filter coefficients: 25 values, all 1.0f.
float kernel[] = 
{ 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f 
};

// Host copy of the output (784 = 28*28 elements).
float res[784]={0};

// image[i] = i, i.e. the values 0..1023.
for (int i = 0; i < 1024; i++)
{
    image[i]=(float)i;
} // Got 32x32 matrix

cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    exit (-1);
}

// Device-side counterparts of image, kernel and res.
float *dev_image = 0;
float *dev_kernel = 0;
float *dev_res = 0;

// Allocate GPU buffers for three vectors (two input, one output).
cudaStatus = cudaMalloc((void**)&dev_image, sizeof(image));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}

cudaStatus = cudaMalloc((void**)&dev_kernel, sizeof(kernel));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}

cudaStatus = cudaMalloc((void**)&dev_res, sizeof(res));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}

// NOTE: the return values of the copies and the memset below are not checked.
cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice);
cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice);

// Zero the accumulators; conv() only ever adds to them.
cudaMemset(dev_res,0,sizeof(res));

    // Convolving 32x32 matrix with 5x5 kernel, getting 28x28 matrix as a result
// One block per output cell, one thread per filter coefficient.
dim3 blocks(28,28,1);
dim3 threads(5,5,1);

// 10 launches; each launch adds into dev_res again, so the results keep
// accumulating across launches. NOTE: no error check follows the launches.
for (int itr = 0; itr<10; itr++)
{
    conv<<<blocks, threads>>>(dev_image,dev_kernel, dev_res);
}

cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost);

// Only the first output cell is printed.
printf("res[0]=%f\n",res[0]);

cudaFree(dev_kernel);
cudaFree(dev_image);
cudaFree(dev_res);

exit (0);

It seems that I handled the concurrency issue, so it shouldn't be the root-cause. I appreciate any help.

Why would the results be multiples of 1000 when your loop has 100 iterations? — Joe, Jun 02 '13 at 12:11
Joe: I run the kernel 10 times, this is where the 1000 comes from. — Sergei Gofman, Jun 02 '13 at 12:14

score 4 · Accepted Answer · edited May 23 '17 at 11:57

You're doing arbitrary arithmetic on float values and expecting perfect accuracy.

float values can store integers exactly only up to a certain magnitude: a float has a 24-bit significand (mantissa), so every integer up to 2^24 = 16,777,216 is representable, but beyond that gaps appear. Once a running sum exceeds that value, float operations begin to become imprecise. Naturally, the values in your result that tend to accumulate to the largest numbers (those towards the end of the res array) will show this effect first.

Let's call the product of the loop count in your kernel and the loop count in your host code around the kernel the total_loops. For a total_loops value up to around 700, I get "precise" results, that is, all results are evenly divisible by total_loops. After that, as you gradually increase total_loops, the errors start to creep in, starting at the end of the res array.

You could switch to double instead of float and your results would be different, except that a version of atomicAdd for double isn't conveniently available (it is built in only on devices of compute capability 6.0 and higher). However, the programming guide shows how to create arbitrary atomic operations, and the example they give just happens to be implementing atomicAdd for double.

So the following modification of your code allows you to explore both ideas:

if you want to see how double fixes the issue, change the define to USE_DOUBLE
instead, if you want to see how reducing the total_loops fixes the issue, change the LOOPS1 define from 100 to 70.
I would also mention that it's good practice to do cuda error checking on all API calls and kernel calls (you're only covering a few, and not the kernel), but it's not an issue in this case.

Here's the code:

#include <stdio.h>
// Number of atomicAdd iterations each thread performs inside conv().
#define LOOPS1 100
// Number of times the host launches conv().
#define LOOPS2 10
// set to USE_DOUBLE or USE_FLOAT
// Only USE_DOUBLE is tested below; any other name (such as USE_FLOAT)
// selects float.
#define USE_FLOAT

// Element type shared by the image, the filter and the result buffers.
#ifndef USE_DOUBLE
typedef float mytype;
#else
typedef double mytype;
#endif

// Software fallback for double-precision atomicAdd, built on atomicCAS.
//
// Devices of compute capability 6.0 and higher provide atomicAdd(double*,
// double) natively, and CUDA 8.0+ declares that overload for the host
// compilation pass as well. Defining it again there is a redefinition error,
// so this fallback is compiled only for device passes targeting older
// architectures (or for toolkits that predate the built-in overload).
//
//   address - location of the double to update
//   val     - value to add
//   returns - the value stored at address before the addition
#if (defined(CUDART_VERSION) && CUDART_VERSION < 8000) || \
    (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600)
__device__ double atomicAdd(double* address, double val)
{
    // atomicCAS has no double overload, so operate on the 64-bit pattern.
    unsigned long long int* address_as_ull =
                              (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        // Publish assumed+val only if nobody changed the cell in between;
        // atomicCAS returns whatever value it actually found there.
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                               __longlong_as_double(assumed)));
    // The comparison is done on the integer bit patterns, so a NaN stored in
    // the cell cannot keep the loop spinning (NaN != NaN as doubles).
    } while (assumed != old);
    return __longlong_as_double(old);
}
#endif

// Naive 2D convolution kernel used to demonstrate accumulation error.
// The output cell is chosen by blockIdx alone, so all threads of a block
// atomically accumulate into the same element of target; each thread
// contributes one input*kernel product, added LOOPS1 times per launch.
//   input  - source matrix, row width = blockDim.y + gridDim.y - 1
//   kernel - filter coefficients, row width = blockDim.y
//   target - output matrix, row width = gridDim.y
__global__ void conv(mytype *input, mytype *kernel, mytype *target)
{
    mytype *outCell = target + gridDim.y * blockIdx.x + blockIdx.y;

    const unsigned int srcRow   = blockIdx.x + threadIdx.x;
    const unsigned int srcCol   = blockIdx.y + threadIdx.y;
    const unsigned int srcWidth = blockDim.y + gridDim.y - 1;
    const unsigned int tap      = threadIdx.x * blockDim.y + threadIdx.y;

    for (long remaining = LOOPS1; remaining > 0; --remaining)
    {
        atomicAdd(outCell, input[srcRow * srcWidth + srcCol] * kernel[tap]);
    }
}

// Report a failed CUDA runtime call on stderr and terminate the process.
//   status   - value returned by the CUDA runtime call
//   what     - short description printed in front of the runtime's error text
//   exitCode - process exit status used when status is not cudaSuccess
static void checkCuda(cudaError_t status, const char *what, int exitCode)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s (%s)\n", what, cudaGetErrorString(status));
        exit(exitCode);
    }
}

// Demo driver: convolves a 32x32 ramp image with a 5x5 all-ones filter,
// launching conv() LOOPS2 times, then checks that every output cell is an
// exact multiple of LOOPS1*LOOPS2.
// Returns 0 when all cells pass, 1 when a non-multiple is found; exits with
// -1 / -10 when a CUDA call fails.
int main(){

    // Problem geometry; the output is the "valid" convolution region.
    const int IMG_DIM = 32;
    const int KER_DIM = 5;
    const int OUT_DIM = IMG_DIM - KER_DIM + 1;   // 28

    mytype image[IMG_DIM * IMG_DIM] = {0.0};
    mytype kernel[KER_DIM * KER_DIM];
    mytype res[OUT_DIM * OUT_DIM] = {0};

    // image[i] = i, filter = all ones.
    for (int i = 0; i < IMG_DIM * IMG_DIM; i++)
        image[i] = (mytype)i;
    for (int i = 0; i < KER_DIM * KER_DIM; i++)
        kernel[i] = 1.0f;

    checkCuda(cudaSetDevice(0),
              "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?", -1);

    mytype *dev_image = 0;
    mytype *dev_kernel = 0;
    mytype *dev_res = 0;

    // Allocate GPU buffers for three vectors (two input, one output).
    checkCuda(cudaMalloc((void**)&dev_image, sizeof(image)), "cudaMalloc failed!", -10);
    checkCuda(cudaMalloc((void**)&dev_kernel, sizeof(kernel)), "cudaMalloc failed!", -10);
    checkCuda(cudaMalloc((void**)&dev_res, sizeof(res)), "cudaMalloc failed!", -10);

    checkCuda(cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice),
              "cudaMemcpy (image) failed!", -10);
    checkCuda(cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice),
              "cudaMemcpy (kernel) failed!", -10);

    // conv() only accumulates, so the output must start at zero.
    checkCuda(cudaMemset(dev_res, 0, sizeof(res)), "cudaMemset failed!", -10);

    // Convolving 32x32 matrix with 5x5 kernel, getting 28x28 matrix as a result:
    // one block per output cell, one thread per filter coefficient.
    dim3 blocks(OUT_DIM, OUT_DIM, 1);
    dim3 threads(KER_DIM, KER_DIM, 1);

    for (int itr = 0; itr < LOOPS2; itr++)
    {
        conv<<<blocks, threads>>>(dev_image, dev_kernel, dev_res);
        // A launch does not return a status; launch errors are read back here.
        checkCuda(cudaGetLastError(), "conv launch failed!", -10);
    }
    // Surface errors raised while the kernels were executing.
    checkCuda(cudaDeviceSynchronize(), "conv execution failed!", -10);

    checkCuda(cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost),
              "cudaMemcpy (result) failed!", -10);

    const int totalLoops = LOOPS1 * LOOPS2;
    int rc = 0;

    printf("results:\n");
    for (int i = 0; i < OUT_DIM * OUT_DIM; i++) {
        // 64-bit conversion so larger accumulated values do not overflow int.
        if ((((long long)res[i]) % totalLoops) != 0) {
            printf("first error index: %d, value: %f\n", i, res[i]);
            rc = 1;
            break;   // fall through to the cleanup instead of returning early
        }
    }
    if (rc == 0)
        printf("no errors: all %d values are multiples of %d\n",
               OUT_DIM * OUT_DIM, totalLoops);

    checkCuda(cudaFree(dev_kernel), "cudaFree failed!", -10);
    checkCuda(cudaFree(dev_image), "cudaFree failed!", -10);
    checkCuda(cudaFree(dev_res), "cudaFree failed!", -10);

    return rc;
}

Note that even if you use double, the problem will eventually show up again if you accumulate to large enough values.

Also note that this isn't really a CUDA/GPU issue. float in host code has similar restrictions.

Thanks for such a detailed answer. I got to the very same conclusion later on today. — Sergei Gofman, Jun 02 '13 at 17:44

CUDA atomicAdd() produces wrong result

1 Answer

Linked

Related