When using the GCC vector extensions for C, how can I check that all the values on a vector are zero?
For instance:
#include <stdint.h>
typedef uint32_t v8ui __attribute__ ((vector_size (32)));
v8ui*
foo(v8ui *mem) {
v8ui v;
for ( v = (v8ui){ 1, 1, 1, 1, 1, 1, 1, 1 };
v[0] || v[1] || v[2] || v[3] || v[4] || v[5] || v[6] || v[7];
mem++)
v &= *(mem);
return mem;
}
SSE4.2 has the PTEST
instruction which allows to run a test like the one used as the for
condition but the code generated by GCC just unpacks the vector and checks the single elements one by one:
.L2:
vandps (%rax), %ymm1, %ymm1
vmovdqa %xmm1, %xmm0
addq $32, %rax
vmovd %xmm0, %edx
testl %edx, %edx
jne .L2
vpextrd $1, %xmm0, %edx
testl %edx, %edx
jne .L2
vpextrd $2, %xmm0, %edx
testl %edx, %edx
jne .L2
vpextrd $3, %xmm0, %edx
testl %edx, %edx
jne .L2
vextractf128 $0x1, %ymm1, %xmm0
vmovd %xmm0, %edx
testl %edx, %edx
jne .L2
vpextrd $1, %xmm0, %edx
testl %edx, %edx
jne .L2
vpextrd $2, %xmm0, %edx
testl %edx, %edx
jne .L2
vpextrd $3, %xmm0, %edx
testl %edx, %edx
jne .L2
vzeroupper
ret
Is there any way to get GCC to generate an efficient test for that without reverting to using intrinsics?
Update: For reference, code using an unportable GCC builtin for (V)PTEST
:
typedef uint32_t v8ui __attribute__ ((vector_size (32)));
typedef long long int v4si __attribute__ ((vector_size (32)));
const v8ui ones = { 1, 1, 1, 1, 1, 1, 1, 1 };
v8ui*
foo(v8ui *mem) {
v8ui v;
for ( v = ones;
!__builtin_ia32_ptestz256((v4si)v,
(v4si)ones);
mem++)
v &= *(mem);
return mem;
}