I am trying to see the performance speedup from AVX instructions. Below is the example code I am running:
#include <iostream>
#include <stdio.h>
#include <string.h>
#include <cstdlib>
#include <algorithm>
#include <immintrin.h>
#include <chrono>
#include <complex>
//using Type = std::complex<double>;
using Type = double;
int main()
{
    size_t b_size = 1;
    b_size = (1ul << 30) * b_size;   // 2^30 doubles = 8 GiB
    Type *d_ptr = (Type*)malloc(sizeof(Type)*b_size);
    // touch every element so the pages are faulted in before timing
    for (size_t i = 0; i < b_size; i++)
    {
        d_ptr[i] = 0;
    }
    std::cout << "malloc finishes!" << std::endl;
#ifndef AVX512
    auto a = std::chrono::high_resolution_clock::now();
    for (size_t i = 0; i < b_size; i++)
    {
        d_ptr[i] = i*0.1;
    }
    auto b = std::chrono::high_resolution_clock::now();
    // read one element so the compiler cannot discard the loop;
    // printed after the second timestamp so the I/O is not timed
    std::cout << d_ptr[b_size-1] << std::endl;
    long long diff = std::chrono::duration_cast<std::chrono::microseconds>(b-a).count();
    std::cout << "No avx takes " << diff << std::endl;
#else
    auto a = std::chrono::high_resolution_clock::now();
    for (size_t i = 0; i < b_size; i += 4)
    {
        /* SSE (128-bit) variant I also tried:
        __m128d tmp1 = _mm_load_pd(reinterpret_cast<double*>(&d_ptr[i]));
        __m128d tmp2 = _mm_set_pd((i+1)*0.1, 0.1*i);
        __m128d tmp3 = _mm_add_pd(tmp1, tmp2);
        _mm_store_pd(reinterpret_cast<double*>(&d_ptr[i]), tmp3); */
        __m256d tmp1 = _mm256_loadu_pd(reinterpret_cast<double*>(&d_ptr[i]));
        __m256d tmp2 = _mm256_set_pd(0.1*(i+3), 0.1*(i+2), 0.1*(i+1), 0.1*i);
        __m256d tmp3 = _mm256_add_pd(tmp1, tmp2);
        _mm256_storeu_pd(reinterpret_cast<double*>(&d_ptr[i]), tmp3);
    }
    auto b = std::chrono::high_resolution_clock::now();
    std::cout << d_ptr[b_size-1] << std::endl;
    long long diff = std::chrono::duration_cast<std::chrono::microseconds>(b-a).count();
    std::cout << "avx takes " << diff << std::endl;
#endif
    free(d_ptr);
}
I have tested this code on both Haswell and Cascade Lake machines; the cases without and with AVX produce quite similar execution times.
--- Edit --- Here are the compiler commands I used:
Without AVX: g++ test_avx512_performance.cpp -march=native -o test_avx512_performance_noavx
With AVX: g++ test_avx512_performance.cpp -march=native -DAVX512 -o test_avx512_performance
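(The O3 binaries in the results below were built the same way, just with -O3 appended, e.g. g++ test_avx512_performance.cpp -march=native -DAVX512 -O3 -o test_avx512_auto_o3.)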
--- Edit Again --- I have run the above code on the Haswell machine again. The results are surprising:
Without AVX and compiled with O3:
~$ ./test_avx512_auto_noavx
malloc finishes!
1.07374e+08
No avx takes 3824740
With AVX and compiled without any optimization flags:
~$ ./test_avx512_auto
malloc finishes!
1.07374e+08
avx takes 2121917
With AVX and compiled with O3:
~$ ./test_avx512_auto_o3
malloc finishes!
1.07374e+08
avx takes 6307190
This is the opposite of what we expected.
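I have not checked the generated assembly yet; something along these lines should reveal whether -O3 auto-vectorizes the scalar loop (GCC options; the grep just counts 256-bit move/add instructions in the binary):
g++ test_avx512_performance.cpp -march=native -O3 -fopt-info-vec-optimized -o test_avx512_auto_noavx
objdump -d test_avx512_auto_noavx | grep -c 'vmovupd\|vaddpd'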
Also, I have implemented a manually vectorized version (similar to Add+Mul become slower with Intrinsics - where am I wrong?), which hoists the per-iteration _mm256_set_pd out of the loop by keeping a running index vector; see the code below:
#else
    auto a = std::chrono::high_resolution_clock::now();
    __m256d tmp2 = _mm256_set1_pd(0.1);
    // running index vector: after the add below it holds {i, i+1, i+2, i+3}
    __m256d base = _mm256_set_pd(-1.0, -2.0, -3.0, -4.0);
    __m256d tmp3 = _mm256_set1_pd(4.0);
    for (size_t i = 0; i < b_size; i += 4)
    {
        __m256d tmp1 = _mm256_loadu_pd(reinterpret_cast<double*>(&d_ptr[i]));
        base = _mm256_add_pd(base, tmp3);          // advance the indices by 4
        __m256d tmp5 = _mm256_mul_pd(base, tmp2);  // {0.1*i, ..., 0.1*(i+3)}
        tmp1 = _mm256_add_pd(tmp1, tmp5);
        _mm256_storeu_pd(reinterpret_cast<double*>(&d_ptr[i]), tmp1);
    }
    auto b = std::chrono::high_resolution_clock::now();
    std::cout << d_ptr[b_size-1] << std::endl;
    long long diff = std::chrono::duration_cast<std::chrono::microseconds>(b-a).count();
    std::cout << "avx takes " << diff << std::endl;
#endif
On the same machine, this gives me:
With AVX and without any optimization flags:
~$ ./test_avx512_manual
malloc finishes!
1.07374e+08
avx takes 2151390
With AVX and with O3:
~$ ./test_avx512_manual_o3
malloc finishes!
1.07374e+08
avx takes 5965288
Not sure where the problem is. Why does O3 give worse performance?
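As an aside, Haswell also supports FMA (enabled here by -march=native), so the separate multiply and add in the manual loop body could in principle be fused. A sketch I have not benchmarked, reusing the same variables as in the manual version above:
__m256d tmp1 = _mm256_loadu_pd(reinterpret_cast<double*>(&d_ptr[i]));
base = _mm256_add_pd(base, tmp3);
tmp1 = _mm256_fmadd_pd(base, tmp2, tmp1);  // base*0.1 + tmp1 in one instruction
_mm256_storeu_pd(reinterpret_cast<double*>(&d_ptr[i]), tmp1);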