I thought unaligned loads and stores had become cheaper on recent x86_64 CPUs compared to older ones. However, I recently found out that doing a series of unaligned loads and stores can be a huge bottleneck.

e7_avx_a and e7_avx_u effectively do the same job. The difference is that one does a lot of unaligned accesses while the other does only aligned accesses.

The results, in rdtscp ticks, are:

   873050898 e7_avx_a
  2356276111 e7_avx_u

so the unaligned version is almost 3 times slower. The rdtscp tick rate is guaranteed to be constant (invariant TSC) on my CPU.
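
(To check that on your own machine, here is a minimal sketch using GCC's <cpuid.h>; CPUID leaf 0x80000007 reports the invariant-TSC flag in EDX bit 8.)

#include <cpuid.h>
#include <stdio.h>

int main() {
    unsigned eax, ebx, ecx, edx;
    //CPUID.80000007H:EDX[8] set means the TSC ticks at a constant rate,
    //independent of the current core frequency (invariant TSC)
    if (__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx) && (edx & (1u << 8)))
        puts("invariant TSC");
    else
        puts("TSC rate may vary with frequency");
    return 0;
}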

I then wrote a similar pair of functions in which the only difference is that DWORD loads and stores are done instead of YMM loads and stores.

  3305091854 e_dword_a
 28410838104 e_dword_u

The difference got bigger! The unaligned version is about 9 times slower than the aligned version.

In this Stack Overflow question, _mm_loadu_ps was only 5 percent slower when actually doing unaligned accesses, although one difference is that I'm also doing unaligned stores after the loads.

I'd like to know what causes this huge slowdown.

My CPU is an 11th Gen Intel Core i5-1135G7 (Tiger Lake), and this is the code. If you're compiling with GCC, turn off auto-vectorization (e.g. -fno-tree-vectorize) to get a clear result.
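For example, assuming the source file is saved as unalign.c:

gcc -O3 -march=native -fno-tree-vectorize unalign.c
./a.out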

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <immintrin.h>

#define NI __attribute__((noinline))

NI void e_dword_a(uint8_t *f, uint8_t *end) {
    
    //for simple testing, doesn't do the same job as the AVX ones
    //but the memory access pattern is similar
    
    uint32_t em0 = 0xfbfefdfb;
    uint32_t em1 = 0xfdfbfefd;
    uint32_t em2 = 0xfefdfbfe;
    for (uint32_t *df = (uint32_t *)f; df < (uint32_t *)end; df += 3) {
        uint32_t df0 = df[0];
        uint32_t df1 = df[1];
        uint32_t df2 = df[2];
        df0 &= em0;
        df1 &= em1;
        df2 &= em2;
        df[0] = df0;
        df[1] = df1;
        df[2] = df2;
    }
}

NI void e_dword_u(uint8_t *f, uint8_t *end) {
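    
    //same job as e_dword_a, but with DWORD accesses at a 3-byte stride,
    //so most of them are unaligned; the top mask byte is 0xff, so the byte
    //that overlaps the next access is left for that access to handle
    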
    uint32_t em = 0xfffefdfb;
    for (uint8_t *_f = f; _f < end; _f += 3 * 4) {
        uint32_t df0; memcpy(&df0, _f + 3 * 0, 4);
        uint32_t df1; memcpy(&df1, _f + 3 * 1, 4);
        uint32_t df2; memcpy(&df2, _f + 3 * 2, 4);
        uint32_t df3; memcpy(&df3, _f + 3 * 3, 4);
        df0 &= em;
        df1 &= em;
        df2 &= em;
        df3 &= em;
        memcpy(_f + 3 * 0, &df0, 4);
        memcpy(_f + 3 * 1, &df1, 4);
        memcpy(_f + 3 * 2, &df2, 4);
        memcpy(_f + 3 * 3, &df3, 4);
    }
}

NI void e7_avx_a(uint8_t *f, uint8_t *end) {
    
    //the 7-byte mask pattern realigns with the 32-byte vectors after 7 vectors
    //(224 bytes), after which the same sequence of masks repeats
    
    __m256i em0 = _mm256_set_epi8(126, -17, -33, -3,
    -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3,
    -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3);
    __m256i em1 = _mm256_set_epi8(-3, -65, -5, -9,
    126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9,
    126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9);
    __m256i em2 = _mm256_set_epi8(-9, 126, -17, -33,
    -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33,
    -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33);
    __m256i em3 = _mm256_set_epi8(-33, -3, -65, -5,
    -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5,
    -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5);
    __m256i em4 = _mm256_set_epi8(-5, -9, 126, -17,
    -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17,
    -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17);
    __m256i em5 = _mm256_set_epi8(-17, -33, -3, -65,
    -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65,
    -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65);
    __m256i em6 = _mm256_set_epi8(-65, -5, -9, 126,
    -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126,
    -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126);
    for (__m256i *yf = (__m256i *)f; yf < (__m256i *)end; yf += 7) {
        __m256i yf0 = _mm256_load_si256(yf + 0);
        __m256i yf1 = _mm256_load_si256(yf + 1);
        __m256i yf2 = _mm256_load_si256(yf + 2);
        __m256i yf3 = _mm256_load_si256(yf + 3);
        __m256i yf4 = _mm256_load_si256(yf + 4);
        __m256i yf5 = _mm256_load_si256(yf + 5);
        __m256i yf6 = _mm256_load_si256(yf + 6);
        yf0 = _mm256_and_si256(yf0, em0);
        yf1 = _mm256_and_si256(yf1, em1);
        yf2 = _mm256_and_si256(yf2, em2);
        yf3 = _mm256_and_si256(yf3, em3);
        yf4 = _mm256_and_si256(yf4, em4);
        yf5 = _mm256_and_si256(yf5, em5);
        yf6 = _mm256_and_si256(yf6, em6);
        _mm256_store_si256(yf + 0, yf0);
        _mm256_store_si256(yf + 1, yf1);
        _mm256_store_si256(yf + 2, yf2);
        _mm256_store_si256(yf + 3, yf3);
        _mm256_store_si256(yf + 4, yf4);
        _mm256_store_si256(yf + 5, yf5);
        _mm256_store_si256(yf + 6, yf6);
    }
}

NI void e7_avx_u(uint8_t *f, uint8_t *end) {
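    
    //same job as e7_avx_a, but with 32-byte accesses at a 28-byte stride,
    //so most of them are unaligned; the top 4 mask bytes are -1, so the 4
    //bytes that overlap the next access are left for that access to handle
    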
    __m256i em = _mm256_set_epi8(-1, -1, -1, -1,
    -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3,
    -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3);
    for (uint8_t *_f = f; _f < end; _f += 28 * 8) {
        __m256i yf0 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 0));
        __m256i yf1 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 1));
        __m256i yf2 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 2));
        __m256i yf3 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 3));
        __m256i yf4 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 4));
        __m256i yf5 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 5));
        __m256i yf6 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 6));
        __m256i yf7 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 7));
        yf0 = _mm256_and_si256(yf0, em);
        yf1 = _mm256_and_si256(yf1, em);
        yf2 = _mm256_and_si256(yf2, em);
        yf3 = _mm256_and_si256(yf3, em);
        yf4 = _mm256_and_si256(yf4, em);
        yf5 = _mm256_and_si256(yf5, em);
        yf6 = _mm256_and_si256(yf6, em);
        yf7 = _mm256_and_si256(yf7, em);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 0), yf0);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 1), yf1);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 2), yf2);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 3), yf3);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 4), yf4);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 5), yf5);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 6), yf6);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 7), yf7);
    }
}

#define C (256 * 1024)

static unsigned long long rdtscp() {
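    //read the time-stamp counter with rdtscp; the processor-ID output is discarded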
    unsigned _;
    return __builtin_ia32_rdtscp(&_);
}

NI void time(void (*e)(uint8_t *, uint8_t *), uint8_t *f) {
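    //run the function 100000 times over the C-byte buffer and print the elapsed rdtscp ticks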
    unsigned long long c = rdtscp();
    for (int i = 0; i < 100000; ++i) {
        e(f, f + C);
    }
    c = rdtscp() - c;
    printf("%12llu\n", c);
}

static void test(void (*ea)(uint8_t *, uint8_t *),
                 void (*eu)(uint8_t *, uint8_t *), uint8_t (*f)[C + 1024]) {
    
    //see if both are doing the same job, then measure time
    
    memset(f[0], -1, C);
    memset(f[1], -1, C);
    printf("%d\n", memcmp(f[0], f[1], C));
    ea(f[0], f[0] + C);
    eu(f[1], f[1] + C);
    printf("%d\n", memcmp(f[0], f[1], C));
    time(ea, f[0]);
    time(eu, f[1]);
}

int main() {
    _Alignas(64) uint8_t f[2][C + 1024];
    test(e7_avx_a, e7_avx_u, f);
    test(e_dword_a, e_dword_u, f);
    return 0;
}
xiver77
  • Are your loads overlapping your previous stores here, creating a store-forwarding stall? That's a separate penalty from misalignment. e.g. storing a byte in the middle of an aligned `uint64_t` and then reloading it will hurt just as much in asm. (In C the compiler might optimize it into ALU stuff on a register.) See [Bubble sort slower with -O3 than -O2 with GCC](https://stackoverflow.com/q/69503317) for an example and details about store-forwarding stalls. – Peter Cordes Jan 23 '22 at 02:03
  • The perf event `ld_blocks.store_forward` counts number of loads that suffer a store-forwarding stall. – Peter Cordes Jan 23 '22 at 02:05
  • Yup, tested on my own Skylake desktop, quite a few counts for that event (like 2.3G out of 50.8G total uops, and that's including the non-overlapping versions). And only 6 counts total when I removed the calls to `e_dword_u` and `e7_avx_u`. I compiled with `gcc -O3 -march=native -fno-tree-vectorize unalign.c`, profiled with `taskset -c 3 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,instructions,uops_issued.any,uops_executed.thread,idq.mite_uops,ld_blocks.store_forward -r1 ./a.out` – Peter Cordes Jan 23 '22 at 02:18
  • It would be even worse if you hadn't unrolled your loop; when you load a block of 8 vectors, only the first one partially overlaps with the last 32-byte store. – Peter Cordes Jan 23 '22 at 02:19
  • @PeterCordes Something is wrong with this site that every `x86` related question by me was answered by just you recently, many thanks. – xiver77 Jan 23 '22 at 15:46

0 Answers