As others have pointed out, CUDA devices do not have instructions for floating point division in hardware. Instead they start from an initial approximation to the reciprocal of the denominator, provided by a single precision special function unit. It's product with the numerator is then iteratively refined until it matches the fraction to within machine precision.
Even the __ddiv_rn()
intrinsic is compiled to this instruction sequence by ptxas, so it's use makes no difference.
You can gain closer insight by inspecting the code yourself using cuobjdump -sass
, although this is made difficult by no official documentation for shader assembly being available other than the bare list of instructions.
I'll use the following bare-bones division kernel as an example:
__global__ void div(double x, double y, double *z) {
*z = x / y;
}
This is compiled to the following shader assembly for a compute capability 3.5 device:
Function : _Z3divddPd
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08a0109c10801000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ MOV R0, c[0x0][0x14c]; /* 0x64c03c00299c0002 */
/*0018*/ MOV32I R2, 0x1; /* 0x74000000009fc00a */
/*0020*/ MOV R8, c[0x0][0x148]; /* 0x64c03c00291c0022 */
/*0028*/ MOV R9, c[0x0][0x14c]; /* 0x64c03c00299c0026 */
/*0030*/ MUFU.RCP64H R3, R0; /* 0x84000000031c000e */
/*0038*/ MOV32I R0, 0x35b7333; /* 0x7401adb9999fc002 */
/* 0x08a080a080a4a4a4 */
/*0048*/ DFMA R4, -R8, R2, c[0x2][0x0]; /* 0x9b880840001c2012 */
/*0050*/ DFMA R4, R4, R4, R4; /* 0xdb801000021c1012 */
/*0058*/ DFMA R4, R4, R2, R2; /* 0xdb800800011c1012 */
/*0060*/ DMUL R6, R4, c[0x0][0x140]; /* 0x64000000281c101a */
/*0068*/ FSETP.GE.AND P0, PT, R0, |c[0x0][0x144]|, PT; /* 0x5db09c00289c001e */
/*0070*/ DFMA R8, -R8, R6, c[0x0][0x140]; /* 0x9b881800281c2022 */
/*0078*/ MOV R2, c[0x0][0x150]; /* 0x64c03c002a1c000a */
/* 0x0880acb0a0ac8010 */
/*0088*/ MOV R3, c[0x0][0x154]; /* 0x64c03c002a9c000e */
/*0090*/ DFMA R4, R8, R4, R6; /* 0xdb801800021c2012 */
/*0098*/ @P0 BRA 0xb8; /* 0x120000000c00003c */
/*00a0*/ FFMA R0, RZ, c[0x0][0x14c], R5; /* 0x4c001400299ffc02 */
/*00a8*/ FSETP.GT.AND P0, PT, |R0|, c[0x2][0x8], PT; /* 0x5da01c40011c021e */
/*00b0*/ @P0 BRA 0xe8; /* 0x120000001800003c */
/*00b8*/ MOV R4, c[0x0][0x140]; /* 0x64c03c00281c0012 */
/* 0x08a1b810b8008010 */
/*00c8*/ MOV R5, c[0x0][0x144]; /* 0x64c03c00289c0016 */
/*00d0*/ MOV R7, c[0x0][0x14c]; /* 0x64c03c00299c001e */
/*00d8*/ MOV R6, c[0x0][0x148]; /* 0x64c03c00291c001a */
/*00e0*/ CAL 0xf8; /* 0x1300000008000100 */
/*00e8*/ ST.E.64 [R2], R4; /* 0xe5800000001c0810 */
/*00f0*/ EXIT; /* 0x18000000001c003c */
/*00f8*/ LOP32I.AND R0, R7, 0x40000000; /* 0x20200000001c1c00 */
/* 0x08a08010a010b010 */
/*0108*/ MOV32I R15, 0x1ff00000; /* 0x740ff800001fc03e */
/*0110*/ ISETP.LT.U32.AND P0, PT, R0, c[0x2][0xc], PT; /* 0x5b101c40019c001e */
/*0118*/ MOV R8, RZ; /* 0xe4c03c007f9c0022 */
/*0120*/ SEL R9, R15, c[0x2][0x10], !P0; /* 0x65002040021c3c26 */
/*0128*/ MOV32I R12, 0x1; /* 0x74000000009fc032 */
/*0130*/ DMUL R10, R8, R6; /* 0xe4000000031c202a */
/*0138*/ LOP32I.AND R0, R5, 0x7f800000; /* 0x203fc000001c1400 */
/* 0x08a0108ca01080a0 */
/*0148*/ MUFU.RCP64H R13, R11; /* 0x84000000031c2c36 */
/*0150*/ DFMA R16, -R10, R12, c[0x2][0x0]; /* 0x9b883040001c2842 */
/*0158*/ ISETP.LT.U32.AND P0, PT, R0, c[0x2][0x14], PT; /* 0x5b101c40029c001e */
/*0160*/ MOV R14, RZ; /* 0xe4c03c007f9c003a */
/*0168*/ DFMA R16, R16, R16, R16; /* 0xdb804000081c4042 */
/*0170*/ SEL R15, R15, c[0x2][0x10], !P0; /* 0x65002040021c3c3e */
/*0178*/ SSY 0x3a0; /* 0x1480000110000000 */
/* 0x08acb4a4a4a4a480 */
/*0188*/ DMUL R14, R14, R4; /* 0xe4000000021c383a */
/*0190*/ DFMA R12, R16, R12, R12; /* 0xdb803000061c4032 */
/*0198*/ DMUL R16, R14, R12; /* 0xe4000000061c3842 */
/*01a0*/ DFMA R10, -R10, R16, R14; /* 0xdb883800081c282a */
/*01a8*/ DFMA R10, R10, R12, R16; /* 0xdb804000061c282a */
/*01b0*/ DSETP.LEU.AND P0, PT, |R10|, RZ, PT; /* 0xdc581c007f9c2a1e */
/*01b8*/ @!P0 BRA 0x1e0; /* 0x120000001020003c */
/* 0x088010b010b8acb4 */
/*01c8*/ DSETP.EQ.AND P0, PT, R10, RZ, PT; /* 0xdc101c007f9c281e */
/*01d0*/ @!P0 BRA 0x358; /* 0x12000000c020003c */
/*01d8*/ DMUL.S R8, R4, R6; /* 0xe4000000035c1022 */
/*01e0*/ ISETP.GT.U32.AND P0, PT, R0, c[0x2][0x18], PT; /* 0x5b401c40031c001e */
/*01e8*/ MOV32I R0, 0x1ff00000; /* 0x740ff800001fc002 */
/*01f0*/ MOV R14, RZ; /* 0xe4c03c007f9c003a */
/*01f8*/ SEL R15, R0, c[0x2][0x10], !P0; /* 0x65002040021c003e */
/* 0x08b4a49c849c849c */
/*0208*/ DMUL R12, R10, R8; /* 0xe4000000041c2832 */
/*0210*/ DMUL R18, R10, R14; /* 0xe4000000071c284a */
/*0218*/ DMUL R10, R12, R14; /* 0xe4000000071c302a */
/*0220*/ DMUL R16, R8, R18; /* 0xe4000000091c2042 */
/*0228*/ DFMA R8, R10, R6, -R4; /* 0xdb901000031c2822 */
/*0230*/ DFMA R12, R16, R6, -R4; /* 0xdb901000031c4032 */
/*0238*/ DSETP.GT.AND P0, PT, |R8|, |R12|, PT; /* 0xdc209c00061c221e */
/* 0x08b010ac10b010a0 */
/*0248*/ SEL R9, R17, R11, P0; /* 0xe5000000059c4426 */
/*0250*/ FSETP.GTU.AND P1, PT, |R9|, 1.469367938527859385e-39, PT; /* 0xb5e01c00801c263d */
/*0258*/ MOV R11, R9; /* 0xe4c03c00049c002e */
/*0260*/ SEL R8, R16, R10, P0; /* 0xe5000000051c4022 */
/*0268*/ @P1 NOP.S; /* 0x8580000000443c02 */
/*0270*/ FSETP.LT.AND P0, PT, |R5|, 1.5046327690525280102e-36, PT; /* 0xb5881c20001c161d */
/*0278*/ MOV32I R0, 0x3ff00000; /* 0x741ff800001fc002 */
/* 0x0880a48090108c10 */
/*0288*/ MOV R16, RZ; /* 0xe4c03c007f9c0042 */
/*0290*/ SEL R17, R0, c[0x2][0x1c], !P0; /* 0x65002040039c0046 */
/*0298*/ LOP.OR R10, R8, 0x1; /* 0xc2001000009c2029 */
/*02a0*/ LOP.AND R8, R8, -0x2; /* 0xca0003ffff1c2021 */
/*02a8*/ DMUL R4, R16, R4; /* 0xe4000000021c4012 */
/*02b0*/ DMUL R6, R16, R6; /* 0xe4000000031c401a */
/*02b8*/ DFMA R14, R10, R6, -R4; /* 0xdb901000031c283a */
/* 0x08b010b010a0b4a4 */
/*02c8*/ DFMA R12, R8, R6, -R4; /* 0xdb901000031c2032 */
/*02d0*/ DSETP.GT.AND P0, PT, |R12|, |R14|, PT; /* 0xdc209c00071c321e */
/*02d8*/ SEL R8, R10, R8, P0; /* 0xe5000000041c2822 */
/*02e0*/ LOP.AND R0, R8, 0x1; /* 0xc2000000009c2001 */
/*02e8*/ IADD R11.CC, R8, -0x1; /* 0xc88403ffff9c202d */
/*02f0*/ ISETP.EQ.U32.AND P0, PT, R0, 0x1, PT; /* 0xb3201c00009c001d */
/*02f8*/ IADD.X R0, R9, -0x1; /* 0xc88043ffff9c2401 */
/* 0x08b4a480a010b010 */
/*0308*/ SEL R10, R11, R8, !P0; /* 0xe5002000041c2c2a */
/*0310*/ @P0 IADD R8.CC, R8, 0x1; /* 0xc084000000802021 */
/*0318*/ SEL R11, R0, R9, !P0; /* 0xe5002000049c002e */
/*0320*/ @P0 IADD.X R9, R9, RZ; /* 0xe08040007f802426 */
/*0328*/ DFMA R14, R10, R6, -R4; /* 0xdb901000031c283a */
/*0330*/ DFMA R4, R8, R6, -R4; /* 0xdb901000031c2012 */
/*0338*/ DSETP.GT.AND P0, PT, |R4|, |R14|, PT; /* 0xdc209c00071c121e */
/* 0x08b4acb4a010b810 */
/*0348*/ SEL R8, R10, R8, P0; /* 0xe5000000041c2822 */
/*0350*/ SEL.S R9, R11, R9, P0; /* 0xe500000004dc2c26 */
/*0358*/ MOV R8, RZ; /* 0xe4c03c007f9c0022 */
/*0360*/ MUFU.RCP64H R9, R7; /* 0x84000000031c1c26 */
/*0368*/ DSETP.GT.AND P0, PT, |R8|, RZ, PT; /* 0xdc201c007f9c221e */
/*0370*/ @P0 BRA.U 0x398; /* 0x120000001000023c */
/*0378*/ @!P0 DSETP.NEU.AND P1, PT, |R6|, +INF , PT; /* 0xb4681fff80201a3d */
/* 0x0800b8a010ac0010 */
/*0388*/ @!P0 SEL R9, R7, R9, P1; /* 0xe500040004a01c26 */
/*0390*/ @!P0 SEL R8, R6, RZ, P1; /* 0xe50004007fa01822 */
/*0398*/ DMUL.S R8, R8, R4; /* 0xe4000000025c2022 */
/*03a0*/ MOV R4, R8; /* 0xe4c03c00041c0012 */
/*03a8*/ MOV R5, R9; /* 0xe4c03c00049c0016 */
/*03b0*/ RET; /* 0x19000000001c003c */
/*03b8*/ BRA 0x3b8; /* 0x12007ffffc1c003c */
The MUFU.RCP64H
instruction provides the initial approximation of the reciprocal. It operates on the high 32 bits of the denominator (y
) and provides the high 32 bits of the double precision approximation, and therefor is counted as a Floating Point Operations (Single Precision Special) by the profiler.
There is another single precision FFMA
instruction further down apparently used as a high-throughput version of testing a conditional where full precision isn't required.