Not the nicest code going, and I can't really say whether it's better or worse than just processing each element individually as a uint16. You could save a few ops if you can guarantee the shift amount is always < 16, but it's still not great.
#include <emmintrin.h> /* SSE2 */

__m128i sllv_epi16(__m128i v, __m128i s) {
    // one constant per bit of the shift amount
    const __m128i _1 = _mm_set1_epi16(1);
    const __m128i _2 = _mm_set1_epi16(2);
    const __m128i _4 = _mm_set1_epi16(4);
    const __m128i _8 = _mm_set1_epi16(8);
    // clamp to 16 so shifts of 16 or greater can be detected and zeroed at the end
    const __m128i _16 = _mm_set1_epi16(16);
    s = _mm_min_epi16(s, _16);
    // mask out each bit in the shift amount
    __m128i cmp1 = _mm_and_si128(s, _1);
    __m128i cmp2 = _mm_and_si128(s, _2);
    __m128i cmp4 = _mm_and_si128(s, _4);
    __m128i cmp8 = _mm_and_si128(s, _8);
    __m128i cmp16 = _mm_cmpeq_epi16(_16, s);
    // convert each bit into an all-ones/all-zeros lane mask
    cmp1 = _mm_cmpeq_epi16(_1, cmp1);
    cmp2 = _mm_cmpeq_epi16(_2, cmp2);
    cmp4 = _mm_cmpeq_epi16(_4, cmp4);
    cmp8 = _mm_cmpeq_epi16(_8, cmp8);
    // shift by 1 bit, select shifted or unshifted per lane
    __m128i shift1 = _mm_slli_epi16(v, 1);
    v = _mm_or_si128(_mm_andnot_si128(cmp1, v),
                     _mm_and_si128(cmp1, shift1));
    // shift by 2 bits, select result
    __m128i shift2 = _mm_slli_epi16(v, 2);
    v = _mm_or_si128(_mm_andnot_si128(cmp2, v),
                     _mm_and_si128(cmp2, shift2));
    // shift by 4 bits, select result
    __m128i shift4 = _mm_slli_epi16(v, 4);
    v = _mm_or_si128(_mm_andnot_si128(cmp4, v),
                     _mm_and_si128(cmp4, shift4));
    // shift by 8 bits, select result
    __m128i shift8 = _mm_slli_epi16(v, 8);
    v = _mm_or_si128(_mm_andnot_si128(cmp8, v),
                     _mm_and_si128(cmp8, shift8));
    // zero out lanes whose shift amount was >= 16
    return _mm_andnot_si128(cmp16, v);
}
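To make the "always < 16" saving concrete: if the caller guarantees shift amounts in [0, 15], the clamp and the final compare/andnot drop out, which is where the few ops go. A sketch of that variant (the name sllv_epi16_lt16 is mine, not from the original):

__m128i sllv_epi16_lt16(__m128i v, __m128i s) {
    const __m128i _1 = _mm_set1_epi16(1);
    const __m128i _2 = _mm_set1_epi16(2);
    const __m128i _4 = _mm_set1_epi16(4);
    const __m128i _8 = _mm_set1_epi16(8);
    // same bit-test / select ladder, minus the clamp and the >= 16 mask
    __m128i cmp1 = _mm_cmpeq_epi16(_1, _mm_and_si128(s, _1));
    __m128i cmp2 = _mm_cmpeq_epi16(_2, _mm_and_si128(s, _2));
    __m128i cmp4 = _mm_cmpeq_epi16(_4, _mm_and_si128(s, _4));
    __m128i cmp8 = _mm_cmpeq_epi16(_8, _mm_and_si128(s, _8));
    v = _mm_or_si128(_mm_andnot_si128(cmp1, v),
                     _mm_and_si128(cmp1, _mm_slli_epi16(v, 1)));
    v = _mm_or_si128(_mm_andnot_si128(cmp2, v),
                     _mm_and_si128(cmp2, _mm_slli_epi16(v, 2)));
    v = _mm_or_si128(_mm_andnot_si128(cmp4, v),
                     _mm_and_si128(cmp4, _mm_slli_epi16(v, 4)));
    v = _mm_or_si128(_mm_andnot_si128(cmp8, v),
                     _mm_and_si128(cmp8, _mm_slli_epi16(v, 8)));
    return v;
}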
And the same trick for 8-bit elements:
__m128i sllv_epi8(__m128i v, __m128i s) {
    // one constant per bit of the shift amount
    const __m128i _1 = _mm_set1_epi8(1);
    const __m128i _2 = _mm_set1_epi8(2);
    const __m128i _4 = _mm_set1_epi8(4);
    const __m128i _8 = _mm_set1_epi8(8);
    // clamp to 8 so shifts of 8 or greater can be detected and zeroed at the end
    s = _mm_min_epu8(s, _8);
    // mask out each bit in the shift amount
    __m128i cmp1 = _mm_and_si128(s, _1);
    __m128i cmp2 = _mm_and_si128(s, _2);
    __m128i cmp4 = _mm_and_si128(s, _4);
    __m128i cmp8 = _mm_cmpeq_epi8(_8, s);
    // convert each bit into an all-ones/all-zeros lane mask
    cmp1 = _mm_cmpeq_epi8(_1, cmp1);
    cmp2 = _mm_cmpeq_epi8(_2, cmp2);
    cmp4 = _mm_cmpeq_epi8(_4, cmp4);
    // there is no 8-bit SIMD shift, so use the 16-bit shift and first mask
    // off the high bits that would otherwise spill into the neighbouring byte
    __m128i shift1 = _mm_slli_epi16(_mm_and_si128(v, _mm_set1_epi8(0x7F)), 1);
    v = _mm_or_si128(_mm_andnot_si128(cmp1, v),
                     _mm_and_si128(cmp1, shift1));
    __m128i shift2 = _mm_slli_epi16(_mm_and_si128(v, _mm_set1_epi8(0x3F)), 2);
    v = _mm_or_si128(_mm_andnot_si128(cmp2, v),
                     _mm_and_si128(cmp2, shift2));
    __m128i shift4 = _mm_slli_epi16(_mm_and_si128(v, _mm_set1_epi8(0x0F)), 4);
    v = _mm_or_si128(_mm_andnot_si128(cmp4, v),
                     _mm_and_si128(cmp4, shift4));
    // zero out lanes whose shift amount was >= 8
    return _mm_andnot_si128(cmp8, v);
}
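For what it's worth, here's a quick scalar cross-check (my own harness, not part of the original; it assumes both functions above are in the same file). Lanes whose shift amount reaches the element width should come back as zero:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint16_t v16[8] = {1, 0x8000, 0x1234, 3, 0xFFFF, 2, 5, 1};
    uint16_t s16[8] = {0, 1, 4, 15, 16, 3, 8, 17}; /* includes amounts >= 16 */
    uint16_t r16[8];
    _mm_storeu_si128((__m128i *)r16,
                     sllv_epi16(_mm_loadu_si128((const __m128i *)v16),
                                _mm_loadu_si128((const __m128i *)s16)));
    for (int i = 0; i < 8; i++) {
        uint16_t want = s16[i] >= 16 ? 0 : (uint16_t)(v16[i] << s16[i]);
        if (r16[i] != want)
            printf("epi16 lane %d: got %04x, want %04x\n",
                   i, (unsigned)r16[i], (unsigned)want);
    }

    uint8_t v8[16], s8[16], r8[16];
    for (int i = 0; i < 16; i++) {
        v8[i] = (uint8_t)(37 * i + 1);
        s8[i] = (uint8_t)(i % 10); /* includes amounts >= 8 */
    }
    _mm_storeu_si128((__m128i *)r8,
                     sllv_epi8(_mm_loadu_si128((const __m128i *)v8),
                               _mm_loadu_si128((const __m128i *)s8)));
    for (int i = 0; i < 16; i++) {
        uint8_t want = s8[i] >= 8 ? 0 : (uint8_t)(v8[i] << s8[i]);
        if (r8[i] != want)
            printf("epi8 lane %d: got %02x, want %02x\n",
                   i, (unsigned)r8[i], (unsigned)want);
    }
    puts("done");
    return 0;
}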