how to use fma on gpu v100?

Question

I've tried the example of FMA on https://docs.nvidia.com/cuda/floating-point/index.html

union  {
    float  f;
    unsigned  int  i
} a,  b;
float  r;

a.i = 0x3F800001;
b.i = 0xBF800002;
r = a.f  * a.f  + b.f;

printf("a %.8g\n", a.f); 
printf("b %.8g\n", b.f); 
printf("r %.8g\n",   r);

However, I got 0 on gpu. My test.cu is shown as below:

#include <stdio.h>
#include <iostream>
using namespace std;
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    return EXIT_FAILURE;}} while(0)

#define BLOCKS 1 //useless
#define TPB 3    //useless
#define TIMES 5  //useless

__global__ void test() {
    union  {
        float  f;
        unsigned  int  i;
    } a,  b;
    float  r;

    a.i = 0x3F800001;
    b.i = 0xBF800002;
    r = a.f  * a.f  + b.f;
    printf("a %.30g\n", a.f);
    printf("b %.30g\n", b.f);
    printf("r %.30g\n",   r);
}

int main() {
    float *devResults; //useless

    CUDA_CALL(cudaMalloc((void **)&devResults, BLOCKS * TPB * TIMES *
              sizeof(float)));

    CUDA_CALL(cudaMemset(devResults, 0, BLOCKS * TPB * TIMES *
              sizeof(float)));
    test<<<1, 1>>>();
    CUDA_CALL(cudaFree(devResults));
    return 0;
}

I compiled test.cu using: nvcc test.cu --fmad=true

When I call 'fma' function, it works. However, it is supposed to work without 'fma'.

"it is supposed to work" --> Not quite: `b.i = 0xBF800003; r = a.f * a.f + b.f;` is [undefined](https://stackoverflow.com/q/2310483/2410359) in general. Unsure about CUDA — chux - Reinstate Monica, Nov 22 '19 at 07:16
The two pieces of code in your question are not the same -- the hex constants differ. And I am fairly sure that the compiler will pre-compile the result in your code and replace it with a constant, so if the code doesn't do what you expect it might well be a compiler bug in how floating point constants are handled and nothing related to FMAD instructions — talonmies, Nov 22 '19 at 07:49

talonmies · Accepted Answer · 2019-11-22T14:24:53.170

Your interpretation of what is happening is incorrect. Before looking at your code in details, look at my version of what you are trying to do (let's leave aside the issue of the union which is technically undefined behaviour):

#include <stdio.h>

typedef union  {
        float  f;
        unsigned  int  i;
       } bodge;

__global__ void test(unsigned int x, unsigned int y, float* out, bool dowrite) {

    bodge a, b;
    a.i = x;
    b.i = y;

    float r = a.f  * a.f  + b.f;
    printf("a %.30g\n", a.f);
    printf("b %.30g\n", b.f);
    printf("r %.30g\n",   r);
    if (dowrite) *out = r;
}

int main() {
    test<<<1, 1>>>(0x3F800001, 0xBF800002, (float*)0, false);
    cudaDeviceSynchronize();
    cudaDeviceReset();
    return 0;
}

which compiles and runs like so on a Maxwell GPU:

$ nvcc -arch=sm_52 --fmad=true -o fmad fmad.cu
$ ./fmad
a 1.00000011920928955078125
b -1.0000002384185791015625
r 1.42108547152020037174224853516e-14

If we look at the assembler output for this, we clearly see an FMAD instruction at /*0058*/:

$ cuobjdump -sass fmad

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

    code for sm_52

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit

    code for sm_52
        Function : _Z4testjjPfb
    .headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
                                                                                /* 0x001c4400fe0007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];                        /* 0x4c98078000870001 */
        /*0010*/         {         IADD32I R1, R1, -0x8;                        /* 0x1c0fffffff870101 */
        /*0018*/                   F2F.F64.F32 R10, c[0x0][0x140];        }     /* 0x4ca8000005070b0a */
                                                                                /* 0x001fc400fe2007f5 */
        /*0028*/                   MOV R17, c[0x0][0x140];                      /* 0x4c98078005070011 */
        /*0030*/                   LOP.OR R2, R1, c[0x0][0x4];                  /* 0x4c47020000170102 */
        /*0038*/                   MOV32I R4, 0x0;                              /* 0x010000000007f004 */
                                                                                /* 0x001fc800fc2007f1 */
        /*0048*/                   MOV32I R5, 0x0;                              /* 0x010000000007f005 */
        /*0050*/                   MOV R7, RZ;                                  /* 0x5c9807800ff70007 */
        /*0058*/                   FFMA R17, R17, R17, c[0x0][0x144];           /* 0x5180088005171111 */
                                                                                /* 0x0023c800ffe007f1 */
        /*0068*/                   LOP32I.AND R16, R2, 0xffffff;                /* 0x04000ffffff70210 */
        /*0070*/                   MOV R6, R2;                                  /* 0x5c98078000270006 */
        /*0078*/                   STL.64 [R16], R10;                           /* 0xef5500000007100a */
                                                                                /* 0x001c4400fe000ffd */
        /*0088*/                   JCAL 0x0;                                    /* 0xe220000000000040 */
        /*0090*/         {         MOV32I R4, 0x0;                              /* 0x010000000007f004 */
        /*0098*/                   F2F.F64.F32 R10, c[0x0][0x144];        }     /* 0x4ca8000005170b0a */
                                                                                /* 0x001ffc00fe2007f1 */
        /*00a8*/                   MOV32I R5, 0x0;                              /* 0x010000000007f005 */
        /*00b0*/                   MOV R6, R2;                                  /* 0x5c98078000270006 */
        /*00b8*/                   MOV R7, RZ;                                  /* 0x5c9807800ff70007 */
                                                                                /* 0x001fc001ffa008f2 */
        /*00c8*/                   STL.64 [R16], R10;                           /* 0xef5500000007100a */
        /*00d0*/                   JCAL 0x0;                                    /* 0xe220000000000040 */
        /*00d8*/         {         MOV R6, R2;                                  /* 0x5c98078000270006 */
        /*00e8*/                   F2F.F64.F32 R10, R17;        }               /* 0x001fc400fe200711 */
                                                                                /* 0x5ca8000001170b0a */
        /*00f0*/                   MOV32I R4, 0x0;                              /* 0x010000000007f004 */
        /*00f8*/                   MOV32I R5, 0x0;                              /* 0x010000000007f005 */
                                                                                /* 0x003ff4011e4007ff */
        /*0108*/                   MOV R7, RZ;                                  /* 0x5c9807800ff70007 */
        /*0110*/                   STL.64 [R16], R10;                           /* 0xef5500000007100a */
        /*0118*/                   JCAL 0x0;                                    /* 0xe220000000000040 */
                                                                                /* 0x003fb401e3a0071f */
        /*0128*/                   LDC.U8 R0, c[0x0][0x150];                    /* 0xef9000001507ff00 */
        /*0130*/                   I2I.S16.S8 R0, R0;                           /* 0x5ce0000000073100 */
        /*0138*/                   LOP.AND.NZ P0, RZ, R0, 0xff;                 /* 0x384030000ff700ff */
                                                                                /* 0x001fc800fe2007fd */
        /*0148*/              @!P0 EXIT;                                        /* 0xe30000000008000f */
        /*0150*/                   MOV R2, c[0x0][0x148];                       /* 0x4c98078005270002 */
        /*0158*/                   MOV R3, c[0x0][0x14c];                       /* 0x4c98078005370003 */
                                                                                /* 0x001ffc00ffe000f1 */
        /*0168*/                   STG.E [R2], R17;                             /* 0xeedc200000070211 */
        /*0170*/                   EXIT;                                        /* 0xe30000000007000f */
        /*0178*/                   BRA 0x178;                                   /* 0xe2400fffff87000f */
        .......................

So now let's look at your kernel code:

$ nvcc -arch=sm_52 --fmad=true -o fmad fmad.cu
$ ./fmad
a 1.00000011920928955078125
b -1.0000002384185791015625
r 0

Different result. Dissassembly shows why:

$ cuobjdump -sass fmad

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

    code for sm_52

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit

    code for sm_52
        Function : _Z5test0v
    .headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
                                                                           /* 0x001fc400fec007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];                   /* 0x4c98078000870001 */
        /*0010*/                   IADD32I R1, R1, -0x8;                   /* 0x1c0fffffff870101 */
        /*0018*/                   LOP.OR R2, R1, c[0x0][0x4];             /* 0x4c47020000170102 */
                                                                           /* 0x001fc400fe2007f1 */
        /*0028*/                   MOV32I R10, 0x20000000;                 /* 0x010200000007f00a */
        /*0030*/                   MOV32I R11, 0x3ff00000;                 /* 0x0103ff000007f00b */
        /*0038*/                   MOV32I R4, 0x0;                         /* 0x010000000007f004 */
                                                                           /* 0x001fc000fe4007e2 */
        /*0048*/                   MOV32I R5, 0x0;                         /* 0x010000000007f005 */
        /*0050*/                   LOP32I.AND R16, R2, 0xffffff;           /* 0x04000ffffff70210 */
        /*0058*/         {         MOV R7, RZ;                             /* 0x5c9807800ff70007 */
        /*0068*/                   STL.64 [R16], R10;        }             /* 0x003ff400fec000f1 */
                                                                           /* 0xef5500000007100a */
        /*0070*/                   MOV R6, R2;                             /* 0x5c98078000270006 */
        /*0078*/                   JCAL 0x0;                               /* 0xe220000000000040 */
                                                                           /* 0x001fc000fe4007f1 */
        /*0088*/                   MOV32I R10, 0x40000000;                 /* 0x010400000007f00a */
        /*0090*/                   MOV32I R11, 0xbff00000;                 /* 0x010bff000007f00b */
        /*0098*/         {         MOV32I R4, 0x0;                         /* 0x010000000007f004 */
        /*00a8*/                   STL.64 [R16], R10;        }             /* 0x001fc400fe2000f1 */
                                                                           /* 0xef5500000007100a */
        /*00b0*/                   MOV32I R5, 0x0;                         /* 0x010000000007f005 */
        /*00b8*/                   MOV R6, R2;                             /* 0x5c98078000270006 */
                                                                           /* 0x001fc001ffa007e6 */
        /*00c8*/                   MOV R7, RZ;                             /* 0x5c9807800ff70007 */
        /*00d0*/                   JCAL 0x0;                               /* 0xe220000000000040 */
        /*00d8*/         {         MOV R6, R2;                             /* 0x5c98078000270006 */
        /*00e8*/                   STL.64 [R16], RZ;        }              /* 0x001fc400fe2000f1 */
                                                                           /* 0xef550000000710ff */
        /*00f0*/                   MOV32I R4, 0x0;                         /* 0x010000000007f004 */
        /*00f8*/                   MOV32I R5, 0x0;                         /* 0x010000000007f005 */
                                                                           /* 0x001ffc01ffa007f6 */
        /*0108*/                   MOV R7, RZ;                             /* 0x5c9807800ff70007 */
        /*0110*/                   JCAL 0x0;                               /* 0xe220000000000040 */
        /*0118*/                   EXIT;                                   /* 0xe30000000007000f */
                                                                           /* 0x001f8000fc0007ff */
        /*0128*/                   BRA 0x120;                              /* 0xe2400fffff07000f */
        /*0130*/                   NOP;                                    /* 0x50b0000000070f00 */
        /*0138*/                   NOP;                                    /* 0x50b0000000070f00 */
        ....................

You can see that there are no floating point instructions at all. Why? Because the compiler has determined that everything within the kernel is constant and it is safe to pre-calculate the result and substitute that into the emitted code. And I would hypothesize that the computations were done in a higher precision, which is why the result is 0 (or there is an internal compiler bug).

Thanks for your answer! However, when I use tensorflow-gpu or pytorch doing the same thing, ie. 'a*a+b', I get zero, too. Could you please help me to explain this? — Di Huang, Nov 22 '19 at 12:42
@hd232508: That is a different question which I cannot help you with — talonmies, Nov 22 '19 at 15:43

how to use fma on gpu v100?

1 Answers1