I've put four variants of your code on Godbolt, with certain caveats:
- no side effects
- trivial types
#include <stdint.h>
#include <xmmintrin.h>
// Checks four strict "less than" pairs using the short-circuiting && operator.
// Returns true only when x < y, g < h, m < n and q < r all hold.
bool Test1(int x, int y, int g, int h, int m, int n, int q, int r) {
return x < y && g < h && m < n && q < r;
}
// Checks the same four "less than" pairs, but groups them into two named
// intermediate booleans before combining them.
// Returns true only when x < y, g < h, m < n and q < r all hold.
bool Test2(int x, int y, int g, int h, int m, int n, int q, int r) {
const bool a = x < y && g < h; // first two pairs
const bool b = m < n && q < r; // last two pairs
return a && b; // true only if every comparison held
}
// Returns true when every signed 32-bit lane of v1 is less than the matching
// lane of v2, using one packed compare instead of four scalar ones.
// NOTE(review): _mm_cmplt_epi32 and _mm_movemask_epi8 are SSE2 intrinsics that
// are normally declared in <emmintrin.h>; confirm that the headers included by
// this file provide them on your toolchain.
bool TestSIMD(__m128i v1, __m128i v2) {
__m128i vcmp = _mm_cmplt_epi32(v1, v2); // per lane: all ones if v1 < v2, else all zeros
uint16_t mask = _mm_movemask_epi8(vcmp); // gathers the top bit of each of the 16 bytes
return (mask == 0xffff); // all 16 bits set <=> all four lanes compared true
}
// Checks the four "less than" pairs using bitwise & instead of &&, so the
// source expresses no short-circuiting: every comparison is an operand of &.
// Relational < binds tighter than &, so this parses as
// (x < y) & (g < h) & (m < n) & (q < r).
// Returns true only when all four comparisons hold.
bool Test4(int x, int y, int g, int h, int m, int n, int q, int r) {
return x < y & g < h & m < n & q < r;
}
This compiles to
# Compiler output for Test1 (&& chain): two comparisons, one conditional
# branch, then the remaining two comparisons.
# Assumes the System V AMD64 argument order - TODO confirm the target ABI:
#   x=edi, y=esi, g=edx, h=ecx, m=r8d, n=r9d, q=[rsp+8], r=[rsp+16]
Test1(int, int, int, int, int, int, int, int):
cmp edi, esi
setl al # al = (x < y), signed compare
cmp edx, ecx
setl dl # dl = (g < h)
and al, dl # al = (x < y) & (g < h); ZF is set when the result is 0
je .L1 # early out: al is already 0, i.e. false
cmp r8d, r9d
mov ecx, DWORD PTR [rsp+16] # load r; mov does not modify the flags from the cmp above
setl al # al = (m < n)
cmp DWORD PTR [rsp+8], ecx
setl dl # dl = (q < r)
and eax, edx # low byte: (m < n) & (q < r); the first two pairs are known true here
.L1:
ret
# Compiler output for Test2: branch-free; all four comparisons are always
# evaluated and their results ANDed together in eax.
# Assumes the System V AMD64 argument order - TODO confirm the target ABI:
#   x=edi, y=esi, g=edx, h=ecx, m=r8d, n=r9d, q=[rsp+8], r=[rsp+16]
Test2(int, int, int, int, int, int, int, int):
cmp r8d, r9d
mov r10d, DWORD PTR [rsp+16] # load r; mov does not modify the flags from the cmp above
setl al # al = (m < n), signed compare
cmp DWORD PTR [rsp+8], r10d
setl r8b # r8b = (q < r)
and eax, r8d # low byte: (m < n) & (q < r)
cmp edx, ecx
setl dl # dl = (g < h)
and eax, edx # low byte now also includes (g < h)
cmp edi, esi
setl dl # dl = (x < y)
and eax, edx # low byte: AND of all four comparisons
ret
# Compiler output for TestSIMD: one packed compare, one mask extraction and one
# scalar compare, with no branches. The v-prefixed mnemonics are the VEX (AVX)
# encodings.
# Assumes v1 arrives in xmm0 and v2 in xmm1 (System V AMD64) - TODO confirm the target ABI.
TestSIMD(long long __vector(2), long long __vector(2)):
vpcmpgtd xmm1, xmm1, xmm0 # per 32-bit lane: xmm1 = (xmm1 > xmm0), i.e. v1 < v2 with operands swapped
vpmovmskb eax, xmm1 # eax = top bit of each of the 16 bytes
cmp ax, -1 # 16-bit compare: are all 16 mask bits set (0xffff)?
sete al # al = 1 only when every lane compared true
ret
# Compiler output for Test4 (bitwise & chain): branch-free, and instruction for
# instruction the same sequence as the Test2 listing.
# Assumes the System V AMD64 argument order - TODO confirm the target ABI:
#   x=edi, y=esi, g=edx, h=ecx, m=r8d, n=r9d, q=[rsp+8], r=[rsp+16]
Test4(int, int, int, int, int, int, int, int):
cmp r8d, r9d
mov r10d, DWORD PTR [rsp+16] # load r; mov does not modify the flags from the cmp above
setl al # al = (m < n), signed compare
cmp DWORD PTR [rsp+8], r10d
setl r8b # r8b = (q < r)
and eax, r8d # low byte: (m < n) & (q < r)
cmp edx, ecx
setl dl # dl = (g < h)
and eax, edx # low byte now also includes (g < h)
cmp edi, esi
setl dl # dl = (x < y)
and eax, edx # low byte: AND of all four comparisons
ret
Cycle times are approximate, as I haven't analysed every instruction.
- The first case has a conditional branch, which can be bad if the branch is not correctly predicted every time. It takes 2 cycles (correctly predicted early out), 3 or 4 cycles, or 2+12 cycles (branch mispredict). The compiler is taking liberties with short-circuit evaluation because the operands are trivial and have no side effects.
- The second case doesn't have branches but, like the first, takes 3 or 4 cycles. Its execution time should, however, be the same every time.
- The SIMD solution takes 4 cycles, due to data dependencies, and has no branches. It uses much less of the pipeline, however, and can therefore overlap with more instructions. Note that the data must either already be in vector registers or be loadable into them, which costs at least one additional cycle.
- The & solution also takes 4 cycles and likewise uses 13 instructions; in the listing above its generated code is identical to Test2's.
So if this is close to your problem, use SIMD if you can; otherwise use whichever of Test2 and Test4 is fastest on your platform.