I have some simple SIMD code which multiplies two double arrays using Intel intrinsics (compiled with the /arch:AVX2 flag), and I compare it to a standard loop without SIMD:

#include <immintrin.h>
#include <chrono>
#include <iostream>
using namespace std;

int const N = 67108864;
__declspec(align(32)) double* ar1 = new double[N];
__declspec(align(32)) double* ar2 = new double[N];
__declspec(align(32)) double* ar3 = new double[N];

for (size_t i = 0; i < N; i++)
{
    ar1[i] = 3.0;
    ar2[i] = 2.0;
}

for (int s = 0; s < 20; s++)
{
    auto begin = chrono::steady_clock::now();
    for (size_t i = 0; i < N; i++)
    {
        ar3[i] = ar1[i] * ar2[i];
    }
    cout << "n: " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << endl;


    begin = chrono::steady_clock::now();
    for (size_t i = 0; i < N; i+=4)
    {
        __m256d in1 = _mm256_load_pd(&ar1[i]);
        __m256d in2 = _mm256_load_pd(&ar2[i]);

        _mm256_store_pd(&ar3[i], _mm256_mul_pd(in1, in2));
    }
    cout << "s: " << chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - begin).count() << endl;
}

However, I can't get any performance improvement from the SIMD version. I looked at the assembly, and my guess is that it is because of the vmovupd instruction where vmovapd should be used. Why does it emit the move for unaligned packed data when I am using __declspec(align(32))?

The entire SIMD loop:

61:         for (size_t i = 0; i < N; i+=4)
62:         {
63:             __m256d in1 = _mm256_load_pd(&ar1[i]);
64:             __m256d in2 = _mm256_load_pd(&ar2[i]);
00007FF62ED612A0  vmovupd     ymm1,ymmword ptr [rax]  
65: 
66:             _mm256_store_pd(&ar3[i], _mm256_mul_pd(in1, in2));
00007FF62ED612A4  vmulpd      ymm1,ymm1,ymmword ptr [rax+r13]  
00007FF62ED612AA  vmovupd     ymmword ptr [rdx+rax],ymm1  
00007FF62ED612AF  lea         rax,[rax+20h]  
00007FF62ED612B3  sub         rcx,1  
00007FF62ED612B7  vzeroupper  
00007FF62ED612BA  jne         main+2A0h (07FF62ED612A0h)  
67:         }

I am new to code vectorisation, so I would be happy for pointers to any common mistakes I am making.

Martin877
  • Are you sure that `__declspec(align(32)) double* ar1 = new double[N];` aligns the memory pointed to by `ar1`? – Pixelchemist Mar 29 '17 at 11:23
  • There is a duplicate: [Why _mm256_load_pd compiled to MOVUPD instead of MOVAPD?](http://stackoverflow.com/questions/36709096/why-mm256-load-pd-compiled-to-movupd-instead-of-movapd), but I cannot close this one because the other question also has no answer. The comments should give you an idea, however. – Pixelchemist Mar 29 '17 at 11:45
  • Use `_mm_malloc`, `posix_memalign`, or some other aligned memory allocator (e.g. http://en.cppreference.com/w/cpp/memory/align) instead of `new`. – Jeff Hammond May 19 '17 at 17:21
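
For reference, the aligned-allocation approach suggested in the comments looks roughly like this (a minimal sketch using `_mm_malloc`/`_mm_free`; MSVC's `_aligned_malloc` or C++17's over-aligned `new` would work as well):

#include <immintrin.h> // _mm_malloc / _mm_free

int const N = 67108864;

// The pointed-to memory is genuinely 32-byte aligned, so _mm256_load_pd and
// _mm256_store_pd are safe on it. __declspec(align(32)) on a pointer variable
// only aligns the pointer object itself, not the memory returned by new.
double* ar1 = (double*)_mm_malloc(N * sizeof(double), 32);
double* ar2 = (double*)_mm_malloc(N * sizeof(double), 32);
double* ar3 = (double*)_mm_malloc(N * sizeof(double), 32);

// ... use the arrays ...

_mm_free(ar3);
_mm_free(ar2);
_mm_free(ar1);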

1 Answer


I have already faced this problem, and I found the following workaround:

inline __m256d Load(const double * p)
{
#ifdef _MSC_VER
    // On MSVC, go through the aligned integer-load intrinsic and cast back:
    // the compiler emits an aligned move (VMOVDQA) for this form.
    return _mm256_castsi256_pd(_mm256_load_si256((__m256i*)p));
#else
    return _mm256_load_pd(p);
#endif
}

An analogous solution for the float type:

inline __m256 Load(const float * p)
{
#ifdef _MSC_VER
    return _mm256_castsi256_ps(_mm256_load_si256((__m256i*)p));
#else
    return _mm256_load_ps(p);
#endif
}

I have just checked, and it works. But in order to trick the Visual Studio compiler you have to use dynamically allocated pointers; otherwise the compiler doesn't use the VMOVDQA instruction.

#include <immintrin.h>

int main()
{
    // 40 bytes each, 32-byte aligned: enough for one 256-bit load.
    float * ps = (float*)_mm_malloc(40, 32);
    double * pd = (double*)_mm_malloc(40, 32);

    __m256 s = Load(ps);
//00007FF79FF81325  vmovdqa     ymm1,ymmword ptr [rdi]  
    __m256d d = Load(pd);
//00007FF79FF8132F  vmovdqa     ymm0,ymmword ptr [rax]

    _mm256_storeu_ps(ps, s);
    _mm256_storeu_pd(pd, d);

    _mm_free(ps);
    _mm_free(pd);
}
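
When chasing this kind of issue, it also helps to assert at runtime that a pointer really has the alignment you think it has; a small generic helper for illustration (not part of the workaround above):

#include <cassert>
#include <cstddef>
#include <cstdint>

// True if p is aligned to `alignment` bytes (alignment must be a power of two).
inline bool IsAligned(const void * p, std::size_t alignment)
{
    return (reinterpret_cast<std::uintptr_t>(p) & (alignment - 1)) == 0;
}

// Example: assert(IsAligned(pd, 32)); holds for _mm_malloc(40, 32), but is not
// guaranteed for a plain new double[N].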
ErmIg
  • Sadly, it does not work :(. Now it just generates a vmovdqu (still unaligned) instruction in place of the first vmovupd. – Martin877 Mar 29 '17 at 13:35
  • @Martin877 I have just checked, and it works. But in order to trick the Visual Studio compiler you have to use dynamically allocated pointers; otherwise the compiler doesn't use the VMOVDQA instruction. – ErmIg Mar 29 '17 at 14:29
  • Still not working... I even compiled your minimal example, but still vmovdqu. I have read that aligned instructions are becoming obsolete... that on newer CPUs (I have Broadwell) unaligned and aligned instructions have similar performance on aligned data... but I still can't get any performance gain. I will try to compile it with gcc and icc, thx anyway. – Martin877 Mar 30 '17 at 11:51
  • I'm using icc (ICC) 19.0.1.144 under Arch Linux and am suffering from the same problem. `_mm256_castsi256_ps(_mm256_load_si256((__m256i*)p))` doesn't work. My pointer is aligned to 64 bytes. – recolic Feb 10 '19 at 11:34