I'm currently trying to implement an XOR_SHIFT Random Number Generator using AVX2, it's actually quite easy and very fast. However I need to be able to specify a range. This usually requires modulo.
This is a major problem for me for 2 reasons:
- Adding the _mm256_rem_epu32() / _mm256_rem_epi32() SVML function to my code takes the run time of my loop from around 270ms to 1.8 seconds. Ouch!
- SVML is only available on MSVC and Intel Compilers
Are the any significantly faster ways to do modulo using AVX2?
Non Vector Code:
std::srand(std::time(nullptr));
std::mt19937_64 e(std::rand());
uint32_t seed = static_cast<uint32_t>(e());
for (; i != end; ++i)
{
seed ^= (seed << 13u);
seed ^= (seed >> 7u);
seed ^= (seed << 17u);
arr[i] = static_cast<T>(low + (seed % ((up + 1u) - low)));
}//End for
Vectorized:
constexpr uint32_t thirteen = 13u;
constexpr uint32_t seven = 7u;
constexpr uint32_t seventeen = 17u;
const __m256i _one = _mm256_set1_epi32(1);
const __m256i _lower = _mm256_set1_epi32(static_cast<uint32_t>(low));
const __m256i _upper = _mm256_set1_epi32(static_cast<uint32_t>(up));
__m256i _temp = _mm256_setzero_si256();
__m256i _res = _mm256_setzero_si256();
__m256i _seed = _mm256_set_epi32(
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e())
);
for (; (i + 8uz) < end; ++i)
{
//Generate Random Numbers
_temp = _mm256_slli_epi32(_seed, thirteen);
_seed = _mm256_xor_si256(_seed, _temp);
_temp = _mm256_srai_epi32(_seed, seven);
_seed = _mm256_xor_si256(_seed, _temp);
_temp = _mm256_slli_epi32(_seed, seventeen);
_seed = _mm256_xor_si256(_seed, _temp);
//Narrow
_temp = _mm256_add_epi32(_upper, _one);
_temp = _mm256_sub_epi32(_temp, _lower);
_temp = _mm256_rem_epu32(_seed, _temp); //Comment this line out for a massive speed up but incorrect results
_res = _mm256_add_epi32(_lower, _temp);
_mm256_store_si256((__m256i*) &arr[i], _res);
}//End for