I have the following AVX and naive (scalar) dot-product implementations:
__forceinline double dotProduct_2(const double* u, const double* v)
{
    _mm256_zeroupper();
    // xy = { u0*v0, u1*v1, u2*v2, u3*v3 }
    __m256d xy = _mm256_mul_pd(_mm256_load_pd(u), _mm256_load_pd(v));
    // horizontal add within each 128-bit lane: { x0+x1, x0+x1, x2+x3, x2+x3 }
    __m256d temp = _mm256_hadd_pd(xy, xy);
    // add the low and high lanes; element 0 then holds the full dot product
    __m128d dotproduct = _mm_add_pd(_mm256_extractf128_pd(temp, 0), _mm256_extractf128_pd(temp, 1));
    return dotproduct.m128d_f64[0];
}
__forceinline double dotProduct_1(const D3& a, const D3& b)
{
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}
And the corresponding timing code:
std::cout << res_1 << " " << res_2 << " " << res_3 << '\n';
{
    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < (1 << 30); ++i)
    {
        zx_1 += dotProduct_1(aVx[i % 10000], aVx[(i + 1) % 10000]);
    }
    std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
    std::cout << "NAIVE : " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << '\n';
}
{
    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < (1 << 30); ++i)
    {
        zx_2 += dotProduct_2(&aVx[i % 10000][0], &aVx[(i + 1) % 10000][0]);
    }
    std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
    std::cout << "AVX : " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << '\n';
}
std::cout << math::min2(zx_1, zx_2) << " " << zx_1 << " " << zx_2;
All of the data is aligned to 32 bytes (D3 with __declspec... and the aVx array with _mm_malloc()..). As far as I can see, the naive variant is equal to or faster than the AVX variant. Is this normal behaviour? I expected AVX to be much faster. If it isn't, how can I optimize it? I compile with MSVC 2015 (x64) with /arch:AVX, and my hardware is an Intel i7-4750HQ (Haswell).
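In case it matters, the data is declared and allocated roughly like this (just a sketch; the member name and exact definition of D3 here are illustrative, but the 32-byte alignment matches what I actually use):

#include <immintrin.h>   // AVX intrinsics and _mm_malloc

struct __declspec(align(32)) D3      // 32-byte aligned so _mm256_load_pd is safe
{
    double c[4];                     // x, y, z plus one padding element
    double&       operator[](int i)       { return c[i]; }
    const double& operator[](int i) const { return c[i]; }
};

// 10000 vectors in 32-byte aligned storage
D3* aVx = static_cast<D3*>(_mm_malloc(10000 * sizeof(D3), 32));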