With AVX512 (AVX512VL + AVX512CD), you would use VPCONFLICTQ
, which is designed for this purpose.
For AVX2:
Shaved off a couple of operations by doing fewer redundant comparisons:
int test1(__m256i x)
{
__m256i x0 = _mm256_permute4x64_epi64(x, 0x4B);
// 1 0 2 3
// 3 2 1 0
__m256i e0 = _mm256_cmpeq_epi64(x0, x);
__m256i x1 = _mm256_shuffle_epi32(x, 0x4E);
// 2 3 0 1
// 3 2 1 0
__m256i e1 = _mm256_cmpeq_epi64(x1, x);
__m256i t = _mm256_or_si256(e0, e1);
return !_mm256_testz_si256(t, _mm256_set1_epi32(-1));
}
Previously:
A simple "compare everything with everything" approach can be used with some shuffles, something like this (not tested):
int hasDupe(__m256i x)
{
__m256i x1 = _mm256_shuffle_epi32(x, 0x4E);
__m256i x2 = _mm256_permute4x64_epi64(x, 0x4E);
__m256i x3 = _mm256_shuffle_epi32(x2, 0x4E);
// 2 3 0 1
// 3 2 1 0
__m256i e0 = _mm256_cmpeq_epi64(x1, x);
// 1 0 3 2
// 3 2 1 0
__m256i e1 = _mm256_cmpeq_epi64(x2, x);
// 0 1 2 3
// 3 2 1 0
__m256i e2 = _mm256_cmpeq_epi64(x3, x);
__m256i t0 = _mm256_or_si256(_mm256_or_si256(e0, e1), e2);
return !_mm256_testz_si256(t0, _mm256_set1_epi32(-1));
}
GCC 7 compiles this to reasonable code, but Clang does really strange things. It seems to think that vpor
has no 256 bit version (which it totally does). Changing the ORs to additions does roughly the same thing in this case (adding a couple of -1's together will not be zero) and doesn't cause trouble with Clang (also not tested):
int hasDupe(__m256i x)
{
__m256i x1 = _mm256_shuffle_epi32(x, 0x4E);
__m256i x2 = _mm256_permute4x64_epi64(x, 0x4E);
__m256i x3 = _mm256_shuffle_epi32(x2, 0x4E);
// 2 3 0 1
// 3 2 1 0
__m256i e0 = _mm256_cmpeq_epi64(x1, x);
// 1 0 3 2
// 3 2 1 0
__m256i e1 = _mm256_cmpeq_epi64(x2, x);
// 0 1 2 3
// 3 2 1 0
__m256i e2 = _mm256_cmpeq_epi64(x3, x);
// "OR" results, workaround for Clang being weird
__m256i t0 = _mm256_add_epi64(_mm256_add_epi64(e0, e1), e2);
return !_mm256_testz_si256(t0, _mm256_set1_epi32(-1));
}