I'm trying to determine if the _mm512_mullox_epi64 (AVX-512 foundation) sequence intrinsic is substantially slower than the _mm512_mullo_epi64 (AVX-512 Double-Word and Quad-Word ISA) hardware-implemented intrinsic.
The _mm512_mullo_epi64 intrinsic will raise an "Illegal Instruction" exception on hardware that supports AVX-512F but lacks the AVX-512DQ (Doubleword and Quadword) instruction set extension.
I don't have an AVX-512-capable CPU, and benchmarking on Godbolt gives very inconsistent results. My code also does not compile on Quick Bench, because you currently can't pass in compiler options such as -mavx512dq.
I'm also interested in knowing whether or not there is a good option for using AVX2 as there is no intrinsic for multiplying 64-Bit integers with AVX2.
Using _mm256_mul_pd with a cast often produces incorrect results when the product fits in an int64_t but exceeds the range of integers that a 64-bit double can represent exactly (53 bits of precision).
Here's my test code if you're interested:
#include "immintrin.h"
#include <cstdint>
#include <array>
#include <algorithm>
#include <numeric>
#include <iostream>
#include <chrono>
// Benchmark operands (arr1, arr2) and result buffer (arr3).
// 64-byte alignment makes each array safe for aligned 512-bit and 256-bit
// vector loads/stores and keeps every 8-element group inside one cache line;
// std::array<int64_t, N> on its own only guarantees the alignment of int64_t.
alignas(64) std::array<int64_t, 1000000> arr1;
alignas(64) std::array<int64_t, 1000000> arr2;
alignas(64) std::array<int64_t, 1000000> arr3;
/// Scope timer: captures the time at construction and, on destruction, prints
/// the elapsed time to std::cout as "<nanoseconds>ns\n".
///
/// The object must be bound to a name (`Timer t;`). A bare `Timer();`
/// statement creates a temporary that is destroyed immediately and therefore
/// measures nothing.
///
/// Uses std::chrono::steady_clock because it is guaranteed monotonic;
/// high_resolution_clock may be an alias of system_clock, which can jump when
/// the wall clock is adjusted.
class Timer
{
public:
    Timer() : start(std::chrono::steady_clock::now())
    {
    }//End of constructor

    // Non-copyable and non-movable: exactly one report per timed scope.
    Timer(Timer const&) = delete;
    Timer& operator=(Timer const&) = delete;
    Timer(Timer&&) = delete;
    Timer& operator=(Timer&&) = delete;

    ~Timer()
    {
        const std::chrono::steady_clock::duration elapsed = std::chrono::steady_clock::now() - start;
        std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count() << "ns\n";
    }//End of destructor

private:
    std::chrono::steady_clock::time_point start;  // moment of construction
};//End of class Timer
template<uint64_t SIZE1, uint64_t SIZE2, uint64_t SIZE3>
void mul_f(const std::array<int64_t, SIZE1>& src, const std::array<int64_t, SIZE2>& src2, std::array<int64_t, SIZE3>& dest)
{
__m512i _src1;
__m512i _src2;
__m512i _dest;
for(uint64_t i = 0; i < SIZE3; i+=8)
{
if((i + 8) > SIZE3)
{
break;
}
_src1 = _mm512_load_epi64(&src[i]);
_src2 = _mm512_load_epi64(&src2[i]);
_dest = _mm512_mullox_epi64(_src1, _src2);
_mm512_store_epi64(&dest[i], _dest);
}
}
template<uint64_t SIZE1, uint64_t SIZE2, uint64_t SIZE3>
void mul_dq(const std::array<int64_t, SIZE1>& src, const std::array<int64_t, SIZE2>& src2, std::array<int64_t, SIZE3>& dest)
{
__m512i _src1;
__m512i _src2;
__m512i _dest;
for(uint64_t i = 0; i < SIZE3; i+=8)
{
if((i + 8) > SIZE3)
{
break;
}
_src1 = _mm512_load_epi64(&src[i]);
_src2 = _mm512_load_epi64(&src2[i]);
_dest = _mm512_mullo_epi64(_src1, _src2);
_mm512_store_epi64(&dest[i], _dest);
}
}
template<uint64_t SIZE1, uint64_t SIZE2, uint64_t SIZE3>
void mul_avx2(const std::array<int64_t, SIZE1>& src, const std::array<int64_t, SIZE2>& src2, std::array<int64_t, SIZE3>& dest)
{
__m256i _src1;
__m256i _src2;
__m256i _dest;
for(uint64_t i = 0; i < SIZE3; i+=4)
{
if((i + 4) > SIZE3)
{
break;
}
_src1 = _mm256_load_si256((__m256i*)&src[i]);
_src2 = _mm256_load_si256((__m256i*)&src2[i]);
int64_t d[4] = {};
for (size_t x = 0; x != 4; ++x)
{
#ifdef _WIN32
d[x] = _src1.m256i_i64[x] * _src2.m256i_i64[x];
#else
d[x] = _src1[x] * _src2[x];
#endif
}//End for
_dest = _mm256_load_si256((__m256i*) &d);
_mm256_store_si256((__m256i*)&dest[i], _dest);
}
}
// Benchmark driver: fills the two operand arrays, then times each multiply
// kernel once. Each kernel's elapsed time is printed by the Timer destructor
// when its enclosing scope ends, in the order mul_f, mul_dq, mul_avx2.
int main()
{
    std::iota(arr1.begin(), arr1.end(), 5);
    std::iota(arr2.begin(), arr2.end(), 2);
    {
        // The timer must be a named object: a bare `Timer();` temporary is
        // destroyed at the end of that statement, before the kernel runs.
        Timer timer;
        mul_f(arr1, arr2, arr3);
    }
    {
        Timer timer;
        mul_dq(arr1, arr2, arr3);
    }
    {
        Timer timer;
        mul_avx2(arr1, arr2, arr3);
    }
    // Returning a value derived from the output keeps the result observable.
    return static_cast<int>(arr3[0]);
}
Thanks in advance for your assistance.