How do I use atomicMax on floating-point values in CUDA?

Question

I have used atomicMax() to find the maximum value in the CUDA kernel:

__global__ void global_max(float* values, float* gl_max)
{
    int i=threadIdx.x + blockDim.x * blockIdx.x;
    float val=values[i];

    atomicMax(gl_max, val);
}

It is throwing the following error:

error: no instance of overloaded function "atomicMax" matches the argument list

The argument types are: (float *, float).

It's not supported, but you can create your own – Konstantin Burlachenko Nov 18 '18 at 22:06 — Konstantin Burlachenko, Nov 18 '18 at 22:06

score 32 · Answer 1 · answered Jul 01 '13 at 09:25

32

atomicMax is not available for float types. But you can implement it via atomicCAS:

__device__ static float atomicMax(float* address, float val)
{
    int* address_as_i = (int*) address;
    int old = *address_as_i, assumed;
    do {
        assumed = old;
        old = ::atomicCAS(address_as_i, assumed,
            __float_as_int(::fmaxf(val, __int_as_float(assumed))));
    } while (assumed != old);
    return __int_as_float(old);
}

answered Jul 01 '13 at 09:25

vinograd47

6,320
28
30

6

To have an implementation for a float atomicMin version, just replace fmaxf by fminf. – Jul 02 '15 at 14:52
I'm not sure this is a good solution: the arguments of `atomicCAS` are not processed in an "atomic" way: thus you may have a race condition when evaluating `::fmax(val, ...)`. I tried to use this implementation and it resulted with erroneous outputs. I suspect due to this "non-atomic" `::fmax`. [Xiaojing An's solution](https://stackoverflow.com/a/51549250/1714410) seems to work better. – Shai Apr 16 '19 at 06:13
@Shai why would there be a race condition for `fmax`? Both are local variables and the while loop will only exit when the value on the address is the maximum from both what is stored there and `val`. That being said, the other solution might be a better choice as it is using a single atomic instruction and is most likely faster – user1612250 Sep 03 '20 at 09:37

score 13 · Answer 2 · edited Jul 27 '18 at 04:53

13

Based on the CUDA Toolkit Documentation v9.2.148, there are no atomic operations for float. But we can implement it by mixing atomicMax and atomicMin with signed and unsigned integer casts!

This is a float atomic min:

__device__ __forceinline__ float atomicMinFloat (float * addr, float value) {
        float old;
        old = (value >= 0) ? __int_as_float(atomicMin((int *)addr, __float_as_int(value))) :
             __uint_as_float(atomicMax((unsigned int *)addr, __float_as_uint(value)));

        return old;
}

This is a float atomic max:

__device__ __forceinline__ float atomicMaxFloat (float * addr, float value) {
    float old;
    old = (value >= 0) ? __int_as_float(atomicMax((int *)addr, __float_as_int(value))) :
         __uint_as_float(atomicMin((unsigned int *)addr, __float_as_uint(value)));

    return old;
}

edited Jul 27 '18 at 04:53

Kasimir Gabert

13
3

answered Jul 27 '18 at 00:41

Xiaojing An

131
1
4

should there also be a test on `*addr` if it negative or not? what happens if sign of `value` and `*addr` are not the same? – Shai May 02 '19 at 06:08
1

Only know the sign of value is enough. For example, when in atomicMinFloat, when value >= 0, we use atomicMin for signed int: if *addr < 0, then *addr return; if *addr >=0, then compare and have the min between *addr and value. – Xiaojing An May 02 '19 at 22:17
1

I don't believe this handles the `float` [negative zero](https://stackoverflow.com/questions/5095968/does-float-have-a-negative-zero-0f) case correctly. You might be able to trivially fix that by adding zero to `value` before using it. – Robert Crovella Dec 21 '20 at 16:33
1

It seems to me that, for `atomicMinFloat`: if `-0` and `0` are compared, it will set `*addr` as `-0`; Otherwise, correct. Is the first case here that you think is incorrect? If not, I would really appreciate clarification with a counter example. :) – Xiaojing An Jan 06 '21 at 17:48

score 9 · Answer 3 · answered Jun 23 '15 at 18:11

9

You need to map float to orderedIntFloat to use atomicMax!

__device__ __forceinline__ int floatToOrderedInt( float floatVal ) {
 int intVal = __float_as_int( floatVal );
 return (intVal >= 0 ) ? intVal : intVal ^ 0x7FFFFFFF;
}
__device__ __forceinline__ float orderedIntToFloat( int intVal ) {
 return __int_as_float( (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF);
}

answered Jun 23 '15 at 18:11

Informate.it

149
1
2

2

how can you map `*address` this way - the mapping is NOT atomic? – Shai May 02 '19 at 06:07

talonmies · Accepted Answer · 2021-05-07T00:38:09.510

4

The short answer is that you can't. As you can see from the atomic function documentation, only integer arguments are supported for atomicMax and 64 bit integer arguments are only supported on compute capability 3.5 devices.

edited May 07 '21 at 00:38

answered Jul 01 '13 at 07:29

talonmies

70,661
34
192
269

score 2 · Answer 5 · answered Jun 01 '22 at 11:55

I believe the answer given by Xiaojing An is a good solution but there is a minor issue with the negative zero which is mentioned by Robert Crovella in a comment. For example, if *addr = -1.0f and val = -0.0f then after running the atomicMaxFloat function addr will be set to -1.0f but it should be -0.0f, and the atomicMinFloat function will also be wrong in this case. This happens because the >= 0 check returns true for negative 0 but we need it to be false in this case. This case can be fixed by using the signbit function instead:

__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
    float old;
    old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));

    return old;
}

__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
    float old;
    old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));

    return old;
}

Note - i would have posted this as a comment to the answer from Xiaojing An but don't have enough reputation.

Of course, it's unclear what will happen with nans or infs in this function but i think it can be used without worrying about that assuming you don't need to handle those cases - the negative 0 is probably the only really worrying case. It also depends on your willingness to accept this kind of hackery where we are making assumptions about the way the floating point values are represented in binary and many people may prefer never to go down this kind of route.

Here's a small test program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <math.h>

/*
//these versions fail some of the tests involving negative 0
__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
    float old;
    old = value >= 0 ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));

    return old;
}

__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
    float old;
    old = value >= 0 ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));

    return old;
}
*/


__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
    float old;
    old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));

    return old;
}

__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
    float old;
    old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));

    return old;
}

__global__ void testKernel(float* testMaxData, 
                           float* testMinData,
                           const float* testValues, 
                           int numTests)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= numTests)
    {
        return;
    }
    float val = testValues[index];
    atomicMaxFloat(testMaxData + index, val);
    atomicMinFloat(testMinData + index, val);
}

void checkCudaErr(cudaError_t cudaStatus)
{
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime error: %s\n", cudaGetErrorString(cudaStatus));
    }
}

int main()
{
    const int numValues = 6;
    const int numTests = numValues * numValues;
    float testData[numValues] = { 0.0f, -0.0f, 1.0f, -1.0f, 200.0f, -200.0f };
    float testValuesMinMaxHost[numTests];
    float testValuesHost[numTests];

    for (int i = 0; i < numValues; ++i)
    {
        for (int j = 0; j < numValues; ++j)
        {
            /*
            We will test the values of min(a,b) and max(a,b) for
            all values of a and b in the testData array.
            */
            testValuesMinMaxHost[numValues * i + j] = testData[i];
            testValuesHost[numValues * i + j] = testData[j];
        }
    }
  
    float* devTestMax = 0;
    float* devTestMin = 0;
    float* devTestValues = 0;

    checkCudaErr(cudaSetDevice(0));
    checkCudaErr(cudaMalloc((void**)&devTestMax, numTests * sizeof(float)));
    checkCudaErr(cudaMalloc((void**)&devTestMin, numTests * sizeof(float)));
    checkCudaErr(cudaMalloc((void**)&devTestValues, numTests * sizeof(float)));

    checkCudaErr(cudaMemcpy(devTestMax, testValuesMinMaxHost, numTests * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErr(cudaMemcpy(devTestMin, testValuesMinMaxHost, numTests * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErr(cudaMemcpy(devTestValues, testValuesHost, numTests * sizeof(float), cudaMemcpyHostToDevice));

    int blockSize = 128;
    testKernel << < (numTests+(blockSize-1))/ blockSize, blockSize >> > (devTestMax, devTestMin, devTestValues, numTests);
    checkCudaErr(cudaGetLastError());
    
    float resultsMin[numTests];
    float resultsMax[numTests];

    checkCudaErr(cudaMemcpy(resultsMin, devTestMin, numTests * sizeof(float), cudaMemcpyDeviceToHost));
    checkCudaErr(cudaMemcpy(resultsMax, devTestMax, numTests * sizeof(float), cudaMemcpyDeviceToHost));

    checkCudaErr(cudaFree(devTestMax));
    checkCudaErr(cudaFree(devTestMin));
    checkCudaErr(cudaFree(devTestValues));

    int fail = 0;
    for (int i = 0; i < numTests; ++i)
    {
        float expectedMax = fmax(testValuesMinMaxHost[i], testValuesHost[i]);
        if (resultsMax[i] != expectedMax)
        {
            printf("fail, expected %f, got %f from max(%f, %f)\n",
                   expectedMax,
                   resultsMax[i],
                   testValuesMinMaxHost[i],
                   testValuesHost[i]);
            fail = 1;
        }

        float expectedMin = fmin(testValuesMinMaxHost[i], testValuesHost[i]);
        if (resultsMin[i] != expectedMin)
        {
            printf("fail, expected %f, got %f from min(%f, %f)\n",
                   expectedMin,
                   resultsMin[i],
                   testValuesMinMaxHost[i],
                   testValuesHost[i]);
            fail = 1;
        }
    }

    if (fail == 0)
    {
        printf("all tests passed\n");
    }

    return 0;
}

score -3 · Answer 6 · answered Jul 01 '13 at 09:10

-3

This is the syntax for Atomic MAX

int atomicMax(int* address,int val);

But there are exception like atomicAdd which support floats.

answered Jul 01 '13 at 09:10

Sagar Masuti

1,271
2
11
30

How do I use atomicMax on floating-point values in CUDA?

6 Answers6

Linked