If you have BMI2, use the following version.
__m128i compressZeroIndices_bmi2( __m128i v )
{
const __m128i zero = _mm_setzero_si128();
// Replace zeros with 0xFF
v = _mm_cmpeq_epi8( v, zero );
// Extract low/high pieces into scalar registers for PEXT instruction
uint64_t low = (uint64_t)_mm_cvtsi128_si64( v );
uint64_t high = (uint64_t)_mm_extract_epi64( v, 1 );
// Count payload bytes in the complete vector
v = _mm_sub_epi8( zero, v );
v = _mm_sad_epu8( v, zero );
v = _mm_add_epi64( v, _mm_srli_si128( v, 8 ) );
v = _mm_shuffle_epi8( v, zero );
// Make a mask vector filled with 0 for payload bytes, 0xFF for padding
const __m128i identity = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
v = _mm_max_epu8( v, identity );
__m128i mask = _mm_cmpeq_epi8( v, identity );
// The following line requires C++/20
// If you don't have it, use #ifdef _MSC_VER to switch between __popcnt64() and _popcnt64() intrinsics.
uint64_t lowBits = std::popcount( low );
// Use BMI2 to gather these indices
low = _pext_u64( 0x0706050403020100ull, low );
high = _pext_u64( 0x0F0E0D0C0B0A0908ull, high );
// Merge payload into a vector
v = _mm_cvtsi64_si128( low | ( high << lowBits ) );
v = _mm_insert_epi64( v, high >> ( 64 - lowBits ), 1 );
// Apply the mask to set unused elements to -1, enables pmovmskb + tzcnt to find the length
return _mm_or_si128( v, mask );
}
Here’s another version without BMI2. Probably slower on most CPUs, but the code is way simpler, and doesn’t use any scalar instructions.
inline __m128i sortStep( __m128i a, __m128i perm, __m128i blend )
{
// The min/max are independent and their throughput is 0.33-0.5 cycles,
// so this whole function only takes 3 (AMD) or 4 (Intel) cycles to complete
__m128i b = _mm_shuffle_epi8( a, perm );
__m128i i = _mm_min_epu8( a, b );
__m128i ax = _mm_max_epu8( a, b );
return _mm_blendv_epi8( i, ax, blend );
}
__m128i compressZeroIndices( __m128i v )
{
// Replace zeros with 0-based indices, ones with 0xFF
v = _mm_cmpgt_epi8( v, _mm_setzero_si128() );
const __m128i identity = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
v = _mm_or_si128( v, identity );
// Sort bytes in the vector with a network
// https://demonstrations.wolfram.com/SortingNetworks/
// Click the "transposition" algorithm on that demo
const __m128i perm1 = _mm_setr_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );
const __m128i blend1 = _mm_set1_epi16( (short)0xFF00 );
const __m128i perm2 = _mm_setr_epi8( 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 15 );
const __m128i blend2 = _mm_setr_epi8( 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0 );
for( size_t i = 0; i < 8; i++ )
{
v = sortStep( v, perm1, blend1 );
v = sortStep( v, perm2, blend2 );
}
return v;
}
P.S. If you want the length of the output vector, use this function:
uint32_t vectorLength( __m128i v )
{
uint32_t mask = (uint32_t)_mm_movemask_epi8( v );
mask |= 0x10000;
return _tzcnt_u32( mask );
}