Constant time cannot work if you have a variable number of inputs. You have to iterate over the values at least once, right?
In any case, you can use intrinsics to minimize the number of operations. You have not specified your target architecture or the integer size, so I assume AVX2 and 64 bit integers as output. For convenience, I also assume that the inputs are 64 bit.
If your inputs are smaller integer types than the output, you have to add some zero-extension.
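For reference, here is a plain scalar sketch of the operation that the vectorized code below computes (the helper name accum_scalar is mine, and I assume zero-based bit positions below 64):

#include <cstddef>
#include <cstdint>

std::uint64_t accum_scalar(const std::uint64_t* bitpositions, std::size_t n)
{
    std::uint64_t mask = 0;
    for(std::size_t i = 0; i < n; ++i)
        mask |= std::uint64_t(1) << bitpositions[i];
    return mask;
}

The AVX2 version computes the same mask, just 4 positions per iteration: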
#include <immintrin.h>
#include <array>
#include <cstdint>
#include <cstdio>
#include <cstring>
std::uint64_t accum(const std::uint64_t* bitpositions, std::size_t n)
{
    // 2 x 64 bit integers set to 1
    const __m128i ones2 = _mm_set1_epi64x(1);
    // 4 x 64 bit integers set to 1
    const __m256i ones4 = _mm256_broadcastsi128_si256(ones2);
    // 4 x 64 bit integers serving as partial bit masks
    __m256i accum4 = _mm256_setzero_si256();
    std::size_t i;
    for(i = 0; i + 4 <= n; i += 4) {
        // may be replaced with aligned load
        __m256i positions = _mm256_loadu_si256((const __m256i*)(bitpositions + i));
        // vectorized (1 << position) bit shift
        __m256i shifted = _mm256_sllv_epi64(ones4, positions);
        // add new bits to accumulator
        accum4 = _mm256_or_si256(accum4, shifted);
    }
    // reduce 4 to 2 64 bit integers
    __m128i accum2 = _mm256_castsi256_si128(accum4);
    __m128i high2 = _mm256_extracti128_si256(accum4, 1);
    if(i + 2 <= n) {
        // zero or one iteration with 2 64 bit integers
        __m128i positions = _mm_loadu_si128((const __m128i*)(bitpositions + i));
        __m128i shifted = _mm_sllv_epi64(ones2, positions);
        accum2 = _mm_or_si128(accum2, shifted);
        i += 2;
    }
    // high2 folded in with delay to account for extract latency
    accum2 = _mm_or_si128(accum2, high2);
    // reduce to 1 64 bit integer
    __m128i high1 = _mm_unpackhi_epi64(accum2, accum2);
    accum2 = _mm_or_si128(accum2, high1);
    std::uint64_t accum1 = static_cast<std::uint64_t>(_mm_cvtsi128_si64(accum2));
    // note the 64 bit constant; a plain int 1 would be undefined for positions >= 32
    if(i < n)
        accum1 |= std::uint64_t(1) << bitpositions[i];
    return accum1;
}
EDIT
I have just seen that your example inputs use 1-based indexing. So bit 1 would be set for input value 1, and input value 0 would probably be undefined behavior. I suggest switching to zero-based indexing. But if you are stuck with that notation, just add a _mm256_sub_epi64(positions, ones4) or _mm_sub_epi64(positions, ones2) before the corresponding shift, as sketched below.
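For the 4-wide loop in the first version, that change would look roughly like this (an untested sketch, using the variable names from the code above):

for(i = 0; i + 4 <= n; i += 4) {
    __m256i positions = _mm256_loadu_si256((const __m256i*)(bitpositions + i));
    // convert 1-based positions to 0-based before shifting
    __m256i zero_based = _mm256_sub_epi64(positions, ones4);
    __m256i shifted = _mm256_sllv_epi64(ones4, zero_based);
    accum4 = _mm256_or_si256(accum4, shifted);
}

The 2-wide block and the scalar tail need the same adjustment, i.e. _mm_sub_epi64(positions, ones2) and bitpositions[i] - 1 respectively.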
With smaller input sizes
And here is a version for byte-sized input integers.
std::uint64_t accum(const std::uint8_t* bitpositions, std::size_t n)
{
    const __m128i ones2 = _mm_set1_epi64x(1);
    const __m256i ones4 = _mm256_broadcastsi128_si256(ones2);
    __m256i accum4 = _mm256_setzero_si256();
    std::size_t i;
    for(i = 0; i + 4 <= n; i += 4) {
        /*
         * As far as I can see, there is no point in loading a full 128 or 256 bit
         * vector. To zero-extend more values, we would need to use many shuffle
         * instructions and those have a lower throughput than repeated
         * 32 bit loads
         */
        // memcpy instead of a pointer cast to avoid strict-aliasing issues;
        // it still compiles to a single 32 bit load
        std::uint32_t next4;
        std::memcpy(&next4, bitpositions + i, sizeof(next4));
        __m128i positions = _mm_cvtsi32_si128(static_cast<int>(next4));
        __m256i extended = _mm256_cvtepu8_epi64(positions);
        __m256i shifted = _mm256_sllv_epi64(ones4, extended);
        accum4 = _mm256_or_si256(accum4, shifted);
    }
    __m128i accum2 = _mm256_castsi256_si128(accum4);
    __m128i high2 = _mm256_extracti128_si256(accum4, 1);
    accum2 = _mm_or_si128(accum2, high2);
    /*
     * Until AVX512, there is no single instruction to load 2 bytes into a vector
     * register. So we don't bother. Instead, the scalar code below will run up
     * to 3 times
     */
    __m128i high1 = _mm_unpackhi_epi64(accum2, accum2);
    accum2 = _mm_or_si128(accum2, high1);
    std::uint64_t accum1 = static_cast<std::uint64_t>(_mm_cvtsi128_si64(accum2));
    /*
     * We use a separate accumulator to avoid the long dependency chain through
     * the reduction above
     */
    std::uint64_t tail = 0;
    /*
     * Compilers create a ton of code if we give them a simple loop because they
     * think they can vectorize. So we unroll the loop, even if it is stupid
     */
    if(i + 2 <= n) {
        tail = std::uint64_t(1) << bitpositions[i++];
        tail |= std::uint64_t(1) << bitpositions[i++];
    }
    if(i < n)
        tail |= std::uint64_t(1) << bitpositions[i];
    return accum1 | tail;
}
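For completeness, a small test driver of my own (values chosen arbitrarily, reusing the includes from the first snippet) that exercises both overloads:

int main()
{
    const std::array<std::uint64_t, 5> wide = {0, 3, 17, 42, 63};
    const std::array<std::uint8_t, 5> narrow = {0, 3, 17, 42, 63};
    // both calls should print the same mask
    std::printf("%llx\n", static_cast<unsigned long long>(accum(wide.data(), wide.size())));
    std::printf("%llx\n", static_cast<unsigned long long>(accum(narrow.data(), narrow.size())));
}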