If your compiler and target architecture support vector extensions (as gcc and clang do), you can use something like this:
#ifdef __SSE2__
#include <immintrin.h>
#elif defined __ARM_NEON
#include <arm_neon.h>
#elif defined __ALTIVEC__
#include <altivec.h>
//#elif ... TODO more architectures
#endif
//returns nonzero if any byte of the 16-byte vector pointed to by x is nonzero;
//the exact nonzero value differs per architecture, so treat it as a boolean
static int hastrue128(void *x){
#ifdef __SSE2__
return _mm_movemask_epi8(*(__m128i*)x);
#elif defined __ARM_NEON
return vaddlvq_u8(*(uint8x16_t*)x); //AArch64-only intrinsic; see the 32-bit-safe sketch after this function
#elif defined __ALTIVEC__
typedef __UINT32_TYPE__ v4si __attribute__ ((__vector_size__ (16), aligned(4), __may_alias__));
return vec_any_ne(*(v4si*)x,(v4si){0});
#else
int *y = x;
return y[0]|y[1]|y[2]|y[3];
#endif
}
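One caveat: vaddlvq_u8 is an AArch64 intrinsic, while __ARM_NEON is also defined on 32-bit ARM. If you need to build there too, a reduction through the 64-bit lanes works on both. This is a sketch of the idea (the hastrue128_neon name is mine, and it is not exercised in the listings below):
#include <arm_neon.h>
//same zero/nonzero contract as hastrue128, but avoids the AArch64-only
//across-vector add so it also builds for 32-bit ARM NEON
static int hastrue128_neon(void *x){
    uint64x2_t v = vreinterpretq_u64_u8(*(uint8x16_t*)x);
    return (vgetq_lane_u64(v, 0) | vgetq_lane_u64(v, 1)) != 0;
}
Swapping this into the __ARM_NEON branch keeps the zero/nonzero contract unchanged.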
//if the inputs will always be 16-byte aligned, use aligned(16) in the typedef below
//otherwise make sure they are at least 4-byte aligned
int cmp3( int* a , int* b ){
typedef __INT32_TYPE__ i32x4 __attribute__ ((__vector_size__ (16), aligned(4), __may_alias__));
i32x4 x = *(i32x4*)a, cmp, tmp, y0 = y0^y0, y1 = y0, y2 = y0;
//start the vectors at zero and add the scalar, which sets every element to it;
//since the vector starts at zero, a good compiler (though not ICC at -O3)
//skips the xor and the add and emits a single broadcast instead
y0 += b[0];
y1 += b[1];
y2 += b[2];
cmp = x == y0;
tmp = x == y1; //ppc complains if we don't use temps here
cmp |= tmp;
tmp = x == y2;
cmp |= tmp;
//now mask off the last element since we only need the first 3
cmp &= (i32x4){0xffffffff,0xffffffff,0xffffffff,0};
return hastrue128(&cmp);
}
int cmp4( int* a , int* b ){
typedef __INT32_TYPE__ i32x4 __attribute__ ((__vector_size__ (16), aligned(4), __may_alias__));
i32x4 x = *(i32x4*)a, cmp, tmp, y0 = y0^y0, y1 = y0, y2 = y0, y3 = y0;
y0 += b[0];
y1 += b[1];
y2 += b[2];
y3 += b[3];
cmp = x == y0;
tmp = x == y1; //ppc complains if we don't use temps here
cmp |= tmp;
tmp = x == y2;
cmp |= tmp;
tmp = x == y3;
cmp |= tmp;
return hastrue128(&cmp);
}
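For clarity, here is a scalar sketch of what cmp3 computes as I read the vector code (cmp4 is the same with 4 in place of 3): nonzero exactly when some element of a's first three matches some element of b's first three. The cmp3_scalar helper is mine and only illustrates the contract; it is not part of the vectorized version:
//scalar illustration of the cmp3 contract
static int cmp3_scalar(const int *a, const int *b){
    int r = 0;
    for(int i = 0; i < 3; i++)
        for(int j = 0; j < 3; j++)
            r |= (a[i] == b[j]);
    return r;
}
The vector versions simply evaluate all of these comparisons at once and reduce the result with hastrue128.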
On arm64 this compiles to the following branchless code:
cmp3:
ldr q2, [x0]
adrp x2, .LC0
ld1r {v1.4s}, [x1]
ldp w0, w1, [x1, 4]
dup v0.4s, w0
cmeq v1.4s, v2.4s, v1.4s
dup v3.4s, w1
ldr q4, [x2, #:lo12:.LC0]
cmeq v0.4s, v2.4s, v0.4s
cmeq v2.4s, v2.4s, v3.4s
orr v0.16b, v1.16b, v0.16b
orr v0.16b, v0.16b, v2.16b
and v0.16b, v0.16b, v4.16b
uaddlv h0,v0.16b
umov w0, v0.h[0]
uxth w0, w0
ret
cmp4:
ldr q2, [x0]
ldp w2, w0, [x1, 4]
dup v0.4s, w2
ld1r {v1.4s}, [x1]
dup v3.4s, w0
ldr w1, [x1, 12]
dup v4.4s, w1
cmeq v1.4s, v2.4s, v1.4s
cmeq v0.4s, v2.4s, v0.4s
cmeq v3.4s, v2.4s, v3.4s
cmeq v2.4s, v2.4s, v4.4s
orr v0.16b, v1.16b, v0.16b
orr v0.16b, v0.16b, v3.16b
orr v0.16b, v0.16b, v2.16b
uaddlv h0,v0.16b
umov w0, v0.h[0]
uxth w0, w0
ret
On x86_64, ICC with -march=skylake produces the following branchless code:
cmp3:
vmovdqu xmm2, XMMWORD PTR [rdi] #27.24
vpbroadcastd xmm0, DWORD PTR [rsi] #34.17
vpbroadcastd xmm1, DWORD PTR [4+rsi] #35.17
vpcmpeqd xmm5, xmm2, xmm0 #34.17
vpbroadcastd xmm3, DWORD PTR [8+rsi] #37.16
vpcmpeqd xmm4, xmm2, xmm1 #35.17
vpcmpeqd xmm6, xmm2, xmm3 #37.16
vpor xmm7, xmm4, xmm5 #36.5
vpor xmm8, xmm6, xmm7 #38.5
vpand xmm9, xmm8, XMMWORD PTR __$U0.0.0.2[rip] #40.5
vpmovmskb eax, xmm9 #11.12
ret #41.12
cmp4:
vmovdqu xmm3, XMMWORD PTR [rdi] #46.24
vpbroadcastd xmm0, DWORD PTR [rsi] #51.17
vpbroadcastd xmm1, DWORD PTR [4+rsi] #52.17
vpcmpeqd xmm6, xmm3, xmm0 #51.17
vpbroadcastd xmm2, DWORD PTR [8+rsi] #54.16
vpcmpeqd xmm5, xmm3, xmm1 #52.17
vpbroadcastd xmm4, DWORD PTR [12+rsi] #56.16
vpcmpeqd xmm7, xmm3, xmm2 #54.16
vpor xmm8, xmm5, xmm6 #53.5
vpcmpeqd xmm9, xmm3, xmm4 #56.16
vpor xmm10, xmm7, xmm8 #55.5
vpor xmm11, xmm9, xmm10 #57.5
vpmovmskb eax, xmm11 #11.12
ret
It even works on ppc64 with AltiVec, though the generated code is definitely suboptimal:
cmp3:
lwa 10,4(4)
lxvd2x 33,0,3
vspltisw 11,-1
lwa 9,8(4)
vspltisw 12,0
xxpermdi 33,33,33,2
lwa 8,0(4)
stw 10,-32(1)
addi 10,1,-80
stw 9,-16(1)
li 9,32
stw 8,-48(1)
lvewx 0,10,9
li 9,48
xxspltw 32,32,3
lvewx 13,10,9
li 9,64
vcmpequw 0,1,0
lvewx 10,10,9
xxsel 32,44,43,32
xxspltw 42,42,3
xxspltw 45,45,3
vcmpequw 13,1,13
vcmpequw 1,1,10
xxsel 45,44,43,45
xxsel 33,44,43,33
xxlor 32,32,45
xxlor 32,32,33
vsldoi 1,12,11,12
xxland 32,32,33
vcmpequw. 0,0,12
mfcr 3,2
rlwinm 3,3,25,1
cntlzw 3,3
srwi 3,3,5
blr
cmp4:
lwa 10,8(4)
lxvd2x 33,0,3
vspltisw 10,-1
lwa 9,12(4)
vspltisw 11,0
xxpermdi 33,33,33,2
lwa 7,0(4)
lwa 8,4(4)
stw 10,-32(1)
addi 10,1,-96
stw 9,-16(1)
li 9,32
stw 7,-64(1)
stw 8,-48(1)
lvewx 0,10,9
li 9,48
xxspltw 32,32,3
lvewx 13,10,9
li 9,64
xxspltw 45,45,3
vcmpequw 13,1,13
xxsel 44,43,42,45
lvewx 13,10,9
li 9,80
vcmpequw 0,1,0
xxspltw 45,45,3
xxsel 32,43,42,32
vcmpequw 13,1,13
xxlor 32,32,44
xxsel 45,43,42,45
lvewx 12,10,9
xxlor 32,32,45
xxspltw 44,44,3
vcmpequw 1,1,12
xxsel 33,43,42,33
xxlor 32,32,33
vcmpequw. 0,0,11
mfcr 3,2
rlwinm 3,3,25,1
cntlzw 3,3
srwi 3,3,5
blr
As you can see from the generated asm there is still a little room for improvement, but the approach compiles on RISC-V, MIPS, PPC and other architecture and compiler combinations that support vector extensions.
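For completeness, a minimal caller (my own example, not part of the answer) should treat the return value purely as a boolean, since the exact nonzero value differs between the SSE2, NEON and AltiVec paths:
#include <stdio.h>
int cmp3( int* a , int* b );
int cmp4( int* a , int* b );
int main(void){
    int a[4] = {1, 2, 3, 4}; //cmp3 still loads 16 bytes, so keep 4 ints
    int b[4] = {9, 3, 7, 1};
    printf("cmp3: %d\n", cmp3(a, b) != 0); //prints 1: a[2] == b[1]
    printf("cmp4: %d\n", cmp4(a, b) != 0); //prints 1: a[0] == b[3], a[2] == b[1]
    return 0;
}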