#include <immintrin.h>
__m256 mult(__m256 num) {
return 278*num/(num+1400);
}
# Compiler-emitted listing (Intel operand order, '#' comments) that evaluates
# n / d per lane, with n = 278 * x and d = x + 1400, without using vdivps.
# It multiplies n by a reciprocal estimate of d and then applies one
# correction step built from two FMAs:
#     r  = vrcpps(d)              (approximate 1/d)
#     q0 = n * r                  (first quotient estimate)
#     q1 = q0 - r * (q0 * d - n)  (estimate minus its scaled residual)
#
# Scalar constants; each is splatted to all eight lanes by a vbroadcastss below.
.LCPI0_0:
.long 0x438b0000 # float 278
.LCPI0_1:
.long 0x44af0000 # float 1400
# ymm0 is read before it is written, so it carries the input x on entry; the
# last instruction leaves the result q1 in ymm0. ymm1-ymm3 are overwritten.
mult(float __vector(8)): # @mult(float __vector(8))
vbroadcastss ymm1, dword ptr [rip + .LCPI0_0] # ymm1 = [2.78E+2,2.78E+2,2.78E+2,2.78E+2,2.78E+2,2.78E+2,2.78E+2,2.78E+2]
# ymm1 = n = x * 278
vmulps ymm1, ymm0, ymm1
vbroadcastss ymm2, dword ptr [rip + .LCPI0_1] # ymm2 = [1.4E+3,1.4E+3,1.4E+3,1.4E+3,1.4E+3,1.4E+3,1.4E+3,1.4E+3]
# ymm0 = d = x + 1400 (x itself is no longer needed after this)
vaddps ymm0, ymm0, ymm2
# ymm2 = r, an approximate 1/d. Intel documents vrcpps with a relative error
# of at most 1.5 * 2^-12, so r is not a correctly rounded reciprocal.
vrcpps ymm2, ymm0
# ymm3 = q0 = n * r, a quotient that inherits the error of r
vmulps ymm3, ymm1, ymm2
# ymm0 = q0 * d - n: the residual, i.e. how far q0 * d is from the numerator
vfmsub213ps ymm0, ymm3, ymm1 # ymm0 = (ymm3 * ymm0) - ymm1
# ymm0 = q1 = q0 - r * residual: the corrected quotient that is returned
vfnmadd213ps ymm0, ymm2, ymm3 # ymm0 = -(ymm2 * ymm0) + ymm3
ret
Why does Clang add the two extra FMA instructions to the code? The result should already have been computed by vmulps ymm3, ymm1, ymm2. Don't the extra instructions increase the latency beyond that of simply using vdivps, as is generated with -O3?