
I have rewritten the logarithm function from http://gruntthepeon.free.fr/ssemath/ to work with doubles and AVX2.

However, the entire function is 14x slower (15 s) than the regular C/C++ version (1.1 s). When I comment out all lines that use `_mm256_sub_pd`, the AVX2 version runs 3x faster (300 ms) than the regular C/C++ version. What is wrong? When I rewrote sincos and exp to AVX, there was no problem and the speedup was in the range of 3x to 5x over the regular C/C++ version.

#include <immintrin.h>     //AVX2
#include <cstdint>         //uint32_t, uint64_t


#define PD_CONST(Name, Val) \
  static const alignas(32) double _pd_##Name[4] = { Val, Val, Val, Val }

#define PI_CONST_64(Name, Val) \
  static const alignas(32) uint64_t _i64_##Name[4] = { Val, Val, Val, Val }

#define PI_CONST_32(Name,  Val) \
  static const alignas(32) uint32_t _i32_##Name[4] = { Val, Val, Val, Val }

//64-bit constants (doubles)
PD_CONST(1, 1.0);
PD_CONST(0p5, 0.5);

PD_CONST(cephes_SQRTHF, 0.707106781186547524);
PD_CONST(cephes_log_p0, 7.0376836292E-2);
PD_CONST(cephes_log_p1, -1.1514610310E-1);
PD_CONST(cephes_log_p2, 1.1676998740E-1);
PD_CONST(cephes_log_p3, -1.2420140846E-1);
PD_CONST(cephes_log_p4, +1.4249322787E-1);
PD_CONST(cephes_log_p5, -1.6668057665E-1);
PD_CONST(cephes_log_p6, +2.0000714765E-1);
PD_CONST(cephes_log_p7, -2.4999993993E-1);
PD_CONST(cephes_log_p8, +3.3333331174E-1);
PD_CONST(cephes_log_q1, -2.12194440e-4);
PD_CONST(cephes_log_q2, 0.693359375);


/* the smallest normalized (non-denormal) double number; note that the
   float-based exponent extraction below still loses inputs below FLT_MIN */
PI_CONST_64(min_norm_pos ,  0x0010'0000'0000'0000);
PI_CONST_64(mant_mask    ,  0x7ff0'0000'0000'0000);
PI_CONST_64(inv_mant_mask, ~0x7ff0'0000'0000'0000);

PI_CONST_64(inv_sign_mask, 0x7FFF'FFFF'FFFF'FFFF);
PI_CONST_64(sign_mask   , ~0x7FFF'FFFF'FFFF'FFFF);

//32-bit constants
PI_CONST_32(1, 0x1);
PI_CONST_32(0x7f, 0x7f);
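
(Side note: the function below reads these constant arrays through `*(__m256d*)` casts, the same way the original ssemath code does. That formally violates strict aliasing, although it happens to work with MSVC. A sketch of the well-defined alternative using explicit load/cast intrinsics, plain AVX, would be:

    //strict-aliasing-safe way to materialize the constants as vectors
    __m256d one  = _mm256_load_pd(_pd_1);                         //aligned double load
    __m256d mask = _mm256_castsi256_pd(                           //reinterpret integer bits as double
        _mm256_load_si256((const __m256i*)_i64_inv_mant_mask));

I kept the casts below unchanged to stay close to the original code.)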

__m256d _my_mm256_log_pd(__m256d x) {
    __m256d one = *(__m256d*)_pd_1;

    __m256d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LE_OQ);

    x = _mm256_max_pd(x, *(__m256d*)_i64_min_norm_pos);

    //"native" computation in double requires _mm256_cvtepi64_pd
    //which is AVX512   
    //convert double to float -> perform calculation in float -> convert back
    const int MANTISA_SIZE = 23;
    __m128i emm0 = _mm_srli_epi32(_mm_castps_si128(_mm256_cvtpd_ps(x)), MANTISA_SIZE);

    x = _mm256_and_pd(x, *(__m256d*)_i64_inv_mant_mask);
    x = _mm256_or_pd(x, *(__m256d*)_pd_0p5);

    emm0 = _mm_sub_epi32(emm0, *(__m128i*)_i32_0x7f);   
    __m256d e = _mm256_cvtps_pd(_mm_cvtepi32_ps(emm0));
    //=======================================================

    e = _mm256_add_pd(e, one);

    __m256d mask = _mm256_cmp_pd(x, *(__m256d*)_pd_cephes_SQRTHF, _CMP_LT_OQ);
    __m256d tmp = _mm256_and_pd(x, mask);
    x = _mm256_sub_pd(x, one);
    e = _mm256_sub_pd(e, _mm256_and_pd(one, mask));
    x = _mm256_add_pd(x, tmp);

    __m256d y = *(__m256d*)_pd_cephes_log_p0;
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p1);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p2);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p3);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p4);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p5);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p6);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p7);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p8);
    y = _mm256_mul_pd(y, x);

    __m256d z = _mm256_mul_pd(x, x);
    y = _mm256_mul_pd(y, z);


    tmp = _mm256_mul_pd(e, *(__m256d*)_pd_cephes_log_q1);
    y = _mm256_add_pd(y, tmp);


    tmp = _mm256_mul_pd(z, *(__m256d*)_pd_0p5);
    y = _mm256_sub_pd(y, tmp);

    tmp = _mm256_mul_pd(e, *(__m256d*)_pd_cephes_log_q2);
    x = _mm256_add_pd(x, y);
    x = _mm256_add_pd(x, tmp);

    return _mm256_or_pd(x, invalid_mask); // arg <= 0 will produce NaN
}
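
For reference, here is a sketch of how the float round-trip above could be avoided with plain AVX2 by working on the double bit pattern directly. This is my own variant, untested against the rest of the code, and it assumes `x` has already been clamped to positive normalized values:

    __m256i bits   = _mm256_castpd_si256(x);                      //raw IEEE-754 bits
    __m256i expi64 = _mm256_srli_epi64(bits, 52);                 //biased 11-bit exponent per 64-bit lane
    //gather the low 32 bits of each 64-bit lane into the low 128-bit half
    __m256i idx    = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
    __m128i expi32 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(expi64, idx));
    expi32         = _mm_sub_epi32(expi32, _mm_set1_epi32(1023)); //remove the double bias
    __m256d e      = _mm256_cvtepi32_pd(expi32);                  //four exponents as doubles

This would replace only the `emm0`/`e` computation; the mantissa masking and the later `e = e + 1` adjustment stay as they are.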

And my test code:

#include <chrono>
#include <algorithm>
#include <cmath>      //std::log
#include <random>     //std::uniform_real_distribution
#include <malloc.h>   //_aligned_malloc (MSVC)
#include <stdio.h>


void findMax(double * data, size_t dataLen){
    double dataMax = std::log(data[0]);

    for (size_t i = 1; i < dataLen; i++) {
        dataMax = std::max(dataMax, std::log(data[i]));
    }

    printf("%f\n", dataMax);
}

void findMaxSimd(double * data, size_t dataLen) {   
    __m256d dataMax = _mm256_load_pd(data);
    dataMax = _my_mm256_log_pd(dataMax);

    size_t dataLen4 = dataLen - dataLen % 4;

    for (size_t i = 4; i < dataLen4; i += 4) {
        __m256d tmp = _mm256_load_pd(data + i);
        __m256d s = _my_mm256_log_pd(tmp);
        dataMax = _mm256_max_pd(dataMax, s);        
    }

    alignas(32)  double res[4];
    _mm256_store_pd(res, dataMax);

    double m = std::max(res[0], std::max(res[1], std::max(res[2], res[3])));

    //process the tail of the array whose length is not divisible by 4
    for (size_t i = dataLen4; i < dataLen; i++) {
        m = std::max(m, std::log(data[i]));
    }

    printf("%f\n", m);
}


int main(int argc, char ** argv) {  
    std::uniform_real_distribution<float> uniform_distf(0, 20);
    std::default_random_engine genf;

    size_t dataLen = 4 * 50'000'000 + 2;

    double * alignedData = (double *)_aligned_malloc(dataLen * sizeof(double), 32);

    for (size_t i = 0; i < dataLen; i++) {
        alignedData[i] = uniform_distf(genf);
    }

    auto t00 = std::chrono::high_resolution_clock::now();   
    findMax(alignedData, dataLen);  
    auto tc = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t00).count();
    printf("%lld ms\n", tc);


    t00 = std::chrono::high_resolution_clock::now();    
    findMaxSimd(alignedData, dataLen);
    tc = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t00).count();
    printf("%lld ms\n", tc);

    _aligned_free(alignedData);

    return 0;
} 
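
Given how easy it is to get a silently wrong constant in a port like this, a quick sanity check of the vector log against `std::log` may be worth a few lines (this helper is my addition, not part of the original test code):

    //optional sanity check: compare the four lanes against std::log
    void checkLogAccuracy() {
        alignas(32) double in[4]  = { 0.5, 1.0, 4.0, 19.7 };
        alignas(32) double out[4];
        _mm256_store_pd(out, _my_mm256_log_pd(_mm256_load_pd(in)));
        for (int i = 0; i < 4; i++) {
            printf("%f vs %f\n", out[i], std::log(in[i]));
        }
    }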
  • No repro. I'm getting 1176 ms for `findMax` and 546 ms for `findMaxSimd` when running this code. – user7860670 Jun 29 '19 at 11:14
  • It is unlikely that this is caused by subnormal number calculations, but nevertheless I would try `_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);` and `_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);` and check the timing results again, only to make sure that no subnormal numbers are involved. – wim Jun 29 '19 at 13:51
  • @wim No, the result is still the same – Martin Perry Jun 29 '19 at 13:53
  • Which toolset (compiler) are you using, and which version? I've seen massive improvements in AVX code when a compiler gets a major version upgrade. – 1201ProgramAlarm Jun 29 '19 at 15:31
  • I am using MSVC 2017 (15.9.13), x64 – Martin Perry Jun 29 '19 at 15:48
  • @OznOg: This is C++, not Java. There's no runtime JIT to warm up, only CPU cache and branch prediction. 1 second is normally fine for a microbenchmark, especially if it doesn't touch a huge amount of memory, and on a modern Skylake CPU with hardware P-states that ramps up to max turbo in the first millisecond. Of course, if you don't understand the details about what / how you're measuring, a longer run can be a good idea. Especially to check that runtime scales linearly with iteration count, otherwise you're probably not measuring what you think you are! – Peter Cordes Jun 29 '19 at 19:12
  • @MartinPerry: Are you sure you compiled with `-O2 -arch:AVX2`? (Or `-Ox -arch:AVX2`?) Debug builds often make AVX intrinsics slower than scalar code, because there are more statements per loop iteration. – Peter Cordes Jun 29 '19 at 19:15
  • I cannot reproduce this either (I tried gcc and clang). I think that you should add some disassembly here. – geza Jun 29 '19 at 20:39
  • @OznOg: usually doing a couple 1-second runs and throwing out outliers is totally fine. i.e. 2 separate runs that are both 0.75 seconds and one that's 0.85 seconds tells you something weird happened on the slower run, and the faster ones are the ones you want. Averaging away the noise over a very long run works too, but 1 second is already ~4 billion core clock cycles. vs. the disruption from an interrupt or cache eviction being maybe a million. What's more likely to get you is another process running on the other logical core of the same physical core, causing sustained competition. – Peter Cordes Jun 30 '19 at 04:20
  • @OznOg: But you won't really detect that with one long run. Multiple runs that are long enough to hide measurement overhead (like 1 second) is better because you can notice whether it settles down to a consistent result or not. And it's still short enough for interactive experimentation. – Peter Cordes Jun 30 '19 at 04:22
  • @PeterCordes Stupid mistake, I had forgotten to add `-arch:AVX2`. With this flag, the code runs as expected. Still, it is interesting that it affected only this piece of code while other AVX code was running fine. – Martin Perry Jun 30 '19 at 05:41
  • Without `-arch:AVX2`, I think MSVC will use the non-VEX encoding for instructions that don't *require* AVX, e.g. for scalar math. So depending on which Intel CPU you have (Haswell/Broadwell vs. Skylake and later), you either get mode-transition penalties or false dependencies. – Peter Cordes Jun 30 '19 at 05:44
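
For completeness, the resolution from the comments is just a compiler switch. With MSVC the build should look something like this (file name illustrative):

    cl /O2 /arch:AVX2 /EHsc main.cpp

Without `/arch:AVX2`, MSVC still encodes the AVX intrinsics as VEX instructions but emits legacy (non-VEX) SSE encodings for the surrounding scalar math, and mixing the two causes the SSE/AVX transition penalties or false dependencies described above.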
