I have some simple SIMD code which multiplies two double arrays using Intel intrinsics (compiled with /arch:AVX2), and I compare it against a standard loop without SIMD:
#include <chrono>
#include <iostream>
#include <immintrin.h>
using namespace std;

int const N = 67108864;
__declspec(align(32)) double* ar1 = new double[N];
__declspec(align(32)) double* ar2 = new double[N];
__declspec(align(32)) double* ar3 = new double[N];
for (size_t i = 0; i < N; i++)
{
    ar1[i] = 3.0;
    ar2[i] = 2.0;
}
for (int s = 0; s < 20; s++)
{
    auto begin = chrono::steady_clock::now();
    for (size_t i = 0; i < N; i++)
    {
        ar3[i] = ar1[i] * ar2[i];
    }
    cout << "n: " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << endl;

    begin = chrono::steady_clock::now();
    for (size_t i = 0; i < N; i += 4)
    {
        __m256d in1 = _mm256_load_pd(&ar1[i]);
        __m256d in2 = _mm256_load_pd(&ar2[i]);
        _mm256_store_pd(&ar3[i], _mm256_mul_pd(in1, in2));
    }
    cout << "s: " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << endl;
}
But I can't get any performance improvement out of the SIMD version. Looking at the disassembly, my guess is that it is because of the vmovupd instruction where I would expect vmovapd. Why does the compiler emit an unaligned packed move when I am using __declspec(align(32))?
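To rule out that new simply isn't handing back 32-byte-aligned memory, I figure I could check the addresses at runtime. A minimal sketch of what I mean (not code from my actual program):

#include <cstdint>

// True if p sits on a 32-byte boundary (address divisible by 32);
// the aligned intrinsics _mm256_load_pd / _mm256_store_pd require this.
bool is_aligned_32(const void* p)
{
    return reinterpret_cast<uintptr_t>(p) % 32 == 0;
}

// e.g. right after the allocations:
// cout << is_aligned_32(ar1) << " " << is_aligned_32(ar2) << " " << is_aligned_32(ar3) << endl;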
The entire SIMD loop in the disassembly:
61: for (size_t i = 0; i < N; i+=4)
62: {
63: __m256d in1 = _mm256_load_pd(&ar1[i]);
64: __m256d in2 = _mm256_load_pd(&ar2[i]);
00007FF62ED612A0 vmovupd ymm1,ymmword ptr [rax]
65:
66: _mm256_store_pd(&ar3[i], _mm256_mul_pd(in1, in2));
00007FF62ED612A4 vmulpd ymm1,ymm1,ymmword ptr [rax+r13]
00007FF62ED612AA vmovupd ymmword ptr [rdx+rax],ymm1
00007FF62ED612AF lea rax,[rax+20h]
00007FF62ED612B3 sub rcx,1
00007FF62ED612B7 vzeroupper
00007FF62ED612BA jne main+2A0h (07FF62ED612A0h)
67: }
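From what I've read, __declspec(align(32)) on a pointer declaration may only align the pointer variable itself, not the buffer that new returns, so as a fallback I sketched an aligned allocation instead (assuming MSVC's _aligned_malloc/_aligned_free from <malloc.h>; this is a sketch I have not benchmarked):

#include <malloc.h>

int const N = 67108864;

// _aligned_malloc guarantees the returned block starts on a
// 32-byte boundary, which the aligned AVX load/store expects.
double* ar1 = static_cast<double*>(_aligned_malloc(N * sizeof(double), 32));
double* ar2 = static_cast<double*>(_aligned_malloc(N * sizeof(double), 32));
double* ar3 = static_cast<double*>(_aligned_malloc(N * sizeof(double), 32));

// ... fill and multiply the arrays as before ...

// Blocks from _aligned_malloc must be released with _aligned_free,
// not delete[] or free.
_aligned_free(ar1);
_aligned_free(ar2);
_aligned_free(ar3);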
I am new to code vectorisation, so I would be grateful for pointers to any common mistakes I am making.