Your interpretation of what is happening is incorrect. Before looking at your code in details, look at my version of what you are trying to do (let's leave aside the issue of the union which is technically undefined behaviour):
#include <stdio.h>
typedef union {
float f;
unsigned int i;
} bodge;
__global__ void test(unsigned int x, unsigned int y, float* out, bool dowrite) {
bodge a, b;
a.i = x;
b.i = y;
float r = a.f * a.f + b.f;
printf("a %.30g\n", a.f);
printf("b %.30g\n", b.f);
printf("r %.30g\n", r);
if (dowrite) *out = r;
}
int main() {
test<<<1, 1>>>(0x3F800001, 0xBF800002, (float*)0, false);
cudaDeviceSynchronize();
cudaDeviceReset();
return 0;
}
which compiles and runs like so on a Maxwell GPU:
$ nvcc -arch=sm_52 --fmad=true -o fmad fmad.cu
$ ./fmad
a 1.00000011920928955078125
b -1.0000002384185791015625
r 1.42108547152020037174224853516e-14
If we look at the assembler output for this, we clearly see an FMAD instruction at /*0058*/
:
$ cuobjdump -sass fmad
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_52
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_52
Function : _Z4testjjPfb
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001c4400fe0007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ { IADD32I R1, R1, -0x8; /* 0x1c0fffffff870101 */
/*0018*/ F2F.F64.F32 R10, c[0x0][0x140]; } /* 0x4ca8000005070b0a */
/* 0x001fc400fe2007f5 */
/*0028*/ MOV R17, c[0x0][0x140]; /* 0x4c98078005070011 */
/*0030*/ LOP.OR R2, R1, c[0x0][0x4]; /* 0x4c47020000170102 */
/*0038*/ MOV32I R4, 0x0; /* 0x010000000007f004 */
/* 0x001fc800fc2007f1 */
/*0048*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/*0050*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/*0058*/ FFMA R17, R17, R17, c[0x0][0x144]; /* 0x5180088005171111 */
/* 0x0023c800ffe007f1 */
/*0068*/ LOP32I.AND R16, R2, 0xffffff; /* 0x04000ffffff70210 */
/*0070*/ MOV R6, R2; /* 0x5c98078000270006 */
/*0078*/ STL.64 [R16], R10; /* 0xef5500000007100a */
/* 0x001c4400fe000ffd */
/*0088*/ JCAL 0x0; /* 0xe220000000000040 */
/*0090*/ { MOV32I R4, 0x0; /* 0x010000000007f004 */
/*0098*/ F2F.F64.F32 R10, c[0x0][0x144]; } /* 0x4ca8000005170b0a */
/* 0x001ffc00fe2007f1 */
/*00a8*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/*00b0*/ MOV R6, R2; /* 0x5c98078000270006 */
/*00b8*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/* 0x001fc001ffa008f2 */
/*00c8*/ STL.64 [R16], R10; /* 0xef5500000007100a */
/*00d0*/ JCAL 0x0; /* 0xe220000000000040 */
/*00d8*/ { MOV R6, R2; /* 0x5c98078000270006 */
/*00e8*/ F2F.F64.F32 R10, R17; } /* 0x001fc400fe200711 */
/* 0x5ca8000001170b0a */
/*00f0*/ MOV32I R4, 0x0; /* 0x010000000007f004 */
/*00f8*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/* 0x003ff4011e4007ff */
/*0108*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/*0110*/ STL.64 [R16], R10; /* 0xef5500000007100a */
/*0118*/ JCAL 0x0; /* 0xe220000000000040 */
/* 0x003fb401e3a0071f */
/*0128*/ LDC.U8 R0, c[0x0][0x150]; /* 0xef9000001507ff00 */
/*0130*/ I2I.S16.S8 R0, R0; /* 0x5ce0000000073100 */
/*0138*/ LOP.AND.NZ P0, RZ, R0, 0xff; /* 0x384030000ff700ff */
/* 0x001fc800fe2007fd */
/*0148*/ @!P0 EXIT; /* 0xe30000000008000f */
/*0150*/ MOV R2, c[0x0][0x148]; /* 0x4c98078005270002 */
/*0158*/ MOV R3, c[0x0][0x14c]; /* 0x4c98078005370003 */
/* 0x001ffc00ffe000f1 */
/*0168*/ STG.E [R2], R17; /* 0xeedc200000070211 */
/*0170*/ EXIT; /* 0xe30000000007000f */
/*0178*/ BRA 0x178; /* 0xe2400fffff87000f */
.......................
So now let's look at your kernel code:
$ nvcc -arch=sm_52 --fmad=true -o fmad fmad.cu
$ ./fmad
a 1.00000011920928955078125
b -1.0000002384185791015625
r 0
Different result. Dissassembly shows why:
$ cuobjdump -sass fmad
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_52
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_52
Function : _Z5test0v
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001fc400fec007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ IADD32I R1, R1, -0x8; /* 0x1c0fffffff870101 */
/*0018*/ LOP.OR R2, R1, c[0x0][0x4]; /* 0x4c47020000170102 */
/* 0x001fc400fe2007f1 */
/*0028*/ MOV32I R10, 0x20000000; /* 0x010200000007f00a */
/*0030*/ MOV32I R11, 0x3ff00000; /* 0x0103ff000007f00b */
/*0038*/ MOV32I R4, 0x0; /* 0x010000000007f004 */
/* 0x001fc000fe4007e2 */
/*0048*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/*0050*/ LOP32I.AND R16, R2, 0xffffff; /* 0x04000ffffff70210 */
/*0058*/ { MOV R7, RZ; /* 0x5c9807800ff70007 */
/*0068*/ STL.64 [R16], R10; } /* 0x003ff400fec000f1 */
/* 0xef5500000007100a */
/*0070*/ MOV R6, R2; /* 0x5c98078000270006 */
/*0078*/ JCAL 0x0; /* 0xe220000000000040 */
/* 0x001fc000fe4007f1 */
/*0088*/ MOV32I R10, 0x40000000; /* 0x010400000007f00a */
/*0090*/ MOV32I R11, 0xbff00000; /* 0x010bff000007f00b */
/*0098*/ { MOV32I R4, 0x0; /* 0x010000000007f004 */
/*00a8*/ STL.64 [R16], R10; } /* 0x001fc400fe2000f1 */
/* 0xef5500000007100a */
/*00b0*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/*00b8*/ MOV R6, R2; /* 0x5c98078000270006 */
/* 0x001fc001ffa007e6 */
/*00c8*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/*00d0*/ JCAL 0x0; /* 0xe220000000000040 */
/*00d8*/ { MOV R6, R2; /* 0x5c98078000270006 */
/*00e8*/ STL.64 [R16], RZ; } /* 0x001fc400fe2000f1 */
/* 0xef550000000710ff */
/*00f0*/ MOV32I R4, 0x0; /* 0x010000000007f004 */
/*00f8*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/* 0x001ffc01ffa007f6 */
/*0108*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/*0110*/ JCAL 0x0; /* 0xe220000000000040 */
/*0118*/ EXIT; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*0128*/ BRA 0x120; /* 0xe2400fffff07000f */
/*0130*/ NOP; /* 0x50b0000000070f00 */
/*0138*/ NOP; /* 0x50b0000000070f00 */
....................
You can see that there are no floating point instructions at all. Why? Because the compiler has determined that everything within the kernel is constant and it is safe to pre-calculate the result and substitute that into the emitted code. And I would hypothesize that the computations were done in a higher precision, which is why the result is 0 (or there is an internal compiler bug).