I ended up using both. They have different advantages.
/* p4f_t: alias for the SSE __m128 vector type, used by all p4f_* helpers. */
typedef __m128 p4f_t;
/*
 * Bitwise select, and/andn/or form.
 *
 * For every bit position the result takes the bit from b where the
 * corresponding bit of a is set, and the bit from c where it is clear:
 *     (a & b) | (~a & c)
 * The two masked halves are computed independently of each other and
 * only meet in the final OR.
 */
static inline p4f_t p4f_if(p4f_t a, p4f_t b, p4f_t c) {
    const p4f_t picked_from_b = p4f_and(a, b);
    const p4f_t picked_from_c = p4f_andn(a, c);
    return p4f_or(picked_from_b, picked_from_c);
}
/*
 * Bitwise select, xor/and/xor form.
 *
 * Computes ((b ^ c) & a) ^ c. Where a bit of a is set, the bits of b and c
 * that differ are toggled in c, yielding b; where it is clear, c passes
 * through unchanged. Each step feeds the next one.
 */
static inline p4f_t p4f_if2(p4f_t a, p4f_t b, p4f_t c) {
    const p4f_t differing_bits = p4f_xor(b, c);
    const p4f_t selected_diff = p4f_and(a, differing_bits);
    return p4f_xor(selected_diff, c);
}
I posted the wrapper code at the end. `negif` and `negif2` below produce different object code while doing the same job.
/*
 * Toggle bit 31 of every 32-bit lane of a.
 *
 * The bit pattern 0x80000000 is broadcast to all lanes and XORed in;
 * for IEEE-754 single-precision values that bit is the sign bit, so the
 * effect is a per-lane negation.
 */
static inline p4f_t p4f_neg(p4f_t a) {
    const p4f_t sign_bits = p4f_fill_bitCopy(0x80000000);
    return p4f_xor(a, sign_bits);
}
/*
 * Conditional negate built on p4f_if: where the bits of b are set the
 * result comes from p4f_neg(a), elsewhere from a itself.
 */
p4f_t negif(p4f_t a, p4f_t b) {
    const p4f_t negated = p4f_neg(a);
    return p4f_if(b, negated, a);
}
/*
 * Conditional negate built on p4f_if2: where the bits of b are set the
 * result comes from p4f_neg(a), elsewhere from a itself.
 */
p4f_t negif2(p4f_t a, p4f_t b) {
    const p4f_t negated = p4f_neg(a);
    return p4f_if2(b, negated, a);
}
This is the disassembly.
<negif>:
movaps xmm3,XMMWORD PTR .LC0[rip]
movaps xmm2,xmm1
andnps xmm2,xmm0
xorps xmm3,xmm0
andps xmm1,xmm3
orps xmm1,xmm2
movaps xmm0,xmm1
ret
<negif2>:
andps xmm1,XMMWORD PTR .LC0[rip]
xorps xmm0,xmm1
ret
Why is `negif2` optimized better?
neg(a) = xor(a, mask)
if2(a, b, c) = xor(and(a, xor(b, c)), c)
negif2(a, b) = if2(b, neg(a), a)
= xor(and(b, xor(neg(a), a)), a)
= xor(and(b, xor(xor(a, mask), a)), a)
= xor(and(b, mask), a)
`negif2` can be simplified by trivial substitution (the two `xor`s with `a` cancel, leaving only `mask`), while `negif` cannot, so at least the current version of gcc fails to optimize `negif` properly.
However, as @RaymondChen and @chtz mentioned in the comments, the `and` and `andn` in the first `if` (`p4f_if`) don't depend on each other; they can run in parallel, so in other cases `p4f_if` is the better choice over `p4f_if2`.
This is the wrapper code.
static inline p4f_t p4f_fill(float a) {
return _mm_set1_ps(a);
}
/*
 * Broadcast a raw 32-bit pattern into every lane.
 *
 * The bytes of a are copied into a float with memcpy (no numeric
 * conversion takes place), and that float is then handed to p4f_fill.
 */
static inline p4f_t p4f_fill_bitCopy(uint32_t a) {
    float reinterpreted;

    /* Byte-wise copy: keeps the bit pattern and avoids pointer-cast punning. */
    memcpy(&reinterpreted, &a, sizeof a);

    return p4f_fill(reinterpreted);
}
/* Bitwise AND of a and b (wraps _mm_and_ps). */
static inline p4f_t p4f_and(p4f_t a, p4f_t b) {
    const p4f_t both = _mm_and_ps(a, b);
    return both;
}
/*
 * Bitwise AND-NOT (wraps _mm_andnot_ps): the FIRST operand is the one
 * that gets complemented, i.e. the result is (~a) & b.
 */
static inline p4f_t p4f_andn(p4f_t a, p4f_t b) {
    const p4f_t b_where_a_clear = _mm_andnot_ps(a, b);
    return b_where_a_clear;
}
/* Bitwise OR of a and b (wraps _mm_or_ps). */
static inline p4f_t p4f_or(p4f_t a, p4f_t b) {
    const p4f_t either = _mm_or_ps(a, b);
    return either;
}
/* Bitwise XOR of a and b (wraps _mm_xor_ps). */
static inline p4f_t p4f_xor(p4f_t a, p4f_t b) {
    const p4f_t toggled = _mm_xor_ps(a, b);
    return toggled;
}