It seems that _mm_shuffle_epi8 is indeed the key to a solution. The idea is to set individual bits according to the values of the input vector a.
These bits are then merged and broadcast to all bytes of the 128-bit-wide
register by means of a horizontal OR.
#include <stdio.h>
#include <immintrin.h>
/* gcc -O3 -Wall -mavx test4.c */
/* gcc -O3 -Wall -msse2 -mssse3 -msse4.1 test4.c */
/*
 * Print the 16 bytes of a 128-bit vector as unsigned decimal numbers,
 * lowest byte first, in four groups of four separated by '|'.
 *
 * x: pointer to the vector to print; it is only read, never written.
 *    No alignment is required of the temporary buffer because an
 *    unaligned store is used.
 *
 * Returns 0 unconditionally.
 */
int print_char128(const __m128i *x);
int print_char128(const __m128i *x)
{
    unsigned char bytes[16];

    /* Spill the vector to memory so the individual bytes can be read. */
    _mm_storeu_si128((__m128i *)bytes, *x);

    printf("%4u %4u %4u %4u | %4u %4u %4u %4u | %4u %4u %4u %4u | %4u %4u %4u %4u \n",
           bytes[0],  bytes[1],  bytes[2],  bytes[3],
           bytes[4],  bytes[5],  bytes[6],  bytes[7],
           bytes[8],  bytes[9],  bytes[10], bytes[11],
           bytes[12], bytes[13], bytes[14], bytes[15]);
    return 0;
}
/*
 * Turn 16 byte-sized indices into a 16-byte presence vector.
 *
 * idx: 16 unsigned byte indices.
 *
 * Returns a vector whose byte j is 1 if the value j (0 <= j < 16) occurs
 * anywhere in idx, and 0 otherwise. Indices outside 0..15 are ignored.
 *
 * Requires SSSE3 (_mm_shuffle_epi8, _mm_alignr_epi8) and SSE4.1
 * (_mm_blend_epi16).
 */
static __m128i index_presence_epu8(__m128i idx)
{
    /* Byte j holds the single bit 1<<j for j in 0..7; bytes 8..15 are zero. */
    static const unsigned char bit_table[16] =
        {1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0};

    const __m128i zero     = _mm_setzero_si128();
    const __m128i one      = _mm_set1_epi8(1);
    const __m128i bit_lo   = _mm_loadu_si128((const __m128i *)bit_table);
    /* Same table with the two 64-bit halves swapped: bits live in bytes 8..15. */
    const __m128i bit_hi   = _mm_shuffle_epi32(bit_lo, _MM_SHUFFLE(1, 0, 3, 2));
    /* Byte j holds 1<<(j mod 8); the two tables never overlap, so OR merges them. */
    const __m128i bit_both = _mm_or_si128(bit_lo, bit_hi);

    /*
     * Range guard. _mm_shuffle_epi8 yields zero only when bit 7 of the
     * control byte is set; otherwise it looks at the low 4 bits alone, so
     * an index such as 30 would silently act like 14. Force bit 7 on for
     * every index in 16..127 (the signed compare leaves 0..15 and the
     * already-negative 128..255 untouched). This costs two instructions
     * on top of the 15-instruction core sequence below.
     */
    idx = _mm_or_si128(idx, _mm_cmpgt_epi8(idx, _mm_set1_epi8(15)));

    /* Index v sets bit v in a byte of lo if 0 <= v < 8,
       or bit (v-8) in a byte of hi if 8 <= v < 16. */
    const __m128i lo = _mm_shuffle_epi8(bit_lo, idx);
    const __m128i hi = _mm_shuffle_epi8(bit_hi, idx);

    /*
     * Horizontal OR.
     * Step 1: fold 2x16 bytes into 2x8: the low 64 bits collect all of
     * lo, the high 64 bits collect all of hi.
     */
    __m128i acc = _mm_or_si128(_mm_blend_epi16(lo, hi, 0xF0),
                               _mm_alignr_epi8(hi, lo, 8));
    /* Step 2: OR each 32-bit lane with its neighbour inside the same half. */
    acc = _mm_or_si128(acc, _mm_shuffle_epi32(acc, _MM_SHUFFLE(2, 3, 0, 1)));
    /*
     * Step 3: byte shifts by 2 and then 1 finish the reduction. The shifts
     * act on the whole register, so lanes 0 and 2 end up incomplete or
     * mixed with the other half; lanes 1 and 3 hold the complete OR of
     * their own half in every byte.
     */
    acc = _mm_or_si128(acc, _mm_slli_si128(acc, 2));
    acc = _mm_or_si128(acc, _mm_slli_si128(acc, 1));
    /* Broadcast lane 1 over the low half and lane 3 over the high half. */
    acc = _mm_shuffle_epi32(acc, _MM_SHUFFLE(3, 3, 1, 1));

    /* Keep only bit (j mod 8) in byte j, then map non-zero -> 1, zero -> 0. */
    acc = _mm_and_si128(acc, bit_both);
    acc = _mm_cmpeq_epi8(acc, zero);
    return _mm_andnot_si128(acc, one);
}

/*
 * Demo driver: builds the presence vector for a fixed set of indices and
 * prints it. For the active input the values 0, 1, 2, 3 and 10 occur, so
 * bytes 0..3 and byte 10 of the result are 1 and all others are 0.
 */
int main(void)
{
    static const unsigned char a_v[16] =
        {0, 0, 0, 10, 0, 0, 0, 2, 0, 0, 0, 0, 3, 1, 0, 0};
    /* Alternative input; it contains the out-of-range value 30, which is ignored:
       {13, 30, 0, 10, 0, 6, 0, 2, 0, 0, 7, 0, 3, 11, 0, 0} */

    /* load indices */
    const __m128i a = _mm_loadu_si128((const __m128i *)a_v);

    __m128i r = index_presence_epu8(a); /* the result is in r */

    print_char128(&r);
    return 0;
}
This should work quite fast: Aside from the instructions for setting the constants and loading the data it is only 15 SSEx instructions. On today's processors these instructions all have a latency of only 1 cycle.
The (reciprocal) throughput is even smaller: 1/2 or 1/3 cycle.
The intrinsic _mm_blend_epi16 requires SSE4.1; _mm_shuffle_epi8 and _mm_alignr_epi8 require SSSE3.