I thought unaligned loads and stores had become cheaper on recent x86_64 CPUs compared to older ones. However, I recently found out that a series of unaligned loads and stores can be a huge bottleneck.
e7_avx_a and e7_avx_u effectively do the same job. The difference is that one does a lot of unaligned accesses while the other does only aligned accesses.
The result in rdtscp cycles is:
873050898 e7_avx_a
2356276111 e7_avx_u
so the unaligned version is almost 3 times slower. The rdtscp ticks are guaranteed to be constant on my CPU.
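(Side note: the constant-rate claim corresponds to the invariant TSC bit, CPUID leaf 0x80000007, EDX bit 8. A minimal sketch to check it with GCC's cpuid.h, separate from the benchmark code below:

#include <cpuid.h>
#include <stdio.h>
int main() {
    unsigned a, b, c, d;
    //CPUID.80000007H:EDX bit 8 = invariant TSC (constant rate across P/C/T states)
    if (__get_cpuid(0x80000007, &a, &b, &c, &d))
        printf("invariant TSC: %s\n", (d & (1u << 8)) ? "yes" : "no");
    return 0;
}
)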
I then wrote similar functions where the only difference is that DWORD loads and stores are done instead of YMM loads and stores.
3305091854 e_dword_a
28410838104 e_dword_u
The difference got bigger! The unaligned version is about 9 times slower than the aligned version.
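For scale: each timed run processes 100000 * 256 KiB ≈ 26.2e9 bytes, so the numbers above work out to roughly 0.03 vs 0.09 cycles per byte for the AVX pair and roughly 0.13 vs 1.08 cycles per byte for the DWORD pair.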
In this Stack Overflow question, _mm_loadu_ps was only 5 percent slower when actually doing unaligned accesses, though there is a difference in that I'm also doing unaligned stores after the loads.
I'd like to know what causes this huge slowdown.
My CPU is an 11th Gen Intel i5-1135G7 (Tiger Lake), and the code is below. If you're compiling with gcc, turn off auto-vectorization to get a clear result.
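For example, something like gcc -O3 -mavx2 -fno-tree-vectorize -fno-tree-slp-vectorize should do (the exact flag set is only a suggestion).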
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <immintrin.h>
#define NI __attribute__((noinline))
NI void e_dword_a(uint8_t *f, uint8_t *end) {
    //for simple testing, doesn't do the same job as the AVX ones
    //but the memory access pattern is similar
    uint32_t em0 = 0xfbfefdfb;
    uint32_t em1 = 0xfdfbfefd;
    uint32_t em2 = 0xfefdfbfe;
    for (uint32_t *df = (uint32_t *)f; df < (uint32_t *)end; df += 3) {
        uint32_t df0 = df[0];
        uint32_t df1 = df[1];
        uint32_t df2 = df[2];
        df0 &= em0;
        df1 &= em1;
        df2 &= em2;
        df[0] = df0;
        df[1] = df1;
        df[2] = df2;
    }
}
NI void e_dword_u(uint8_t *f, uint8_t *end) {
    uint32_t em = 0xfffefdfb;
    //3-byte steps between 4-byte accesses, so most of them are misaligned
    for (uint8_t *_f = f; _f < end; _f += 3 * 4) {
        uint32_t df0; memcpy(&df0, _f + 3 * 0, 4);
        uint32_t df1; memcpy(&df1, _f + 3 * 1, 4);
        uint32_t df2; memcpy(&df2, _f + 3 * 2, 4);
        uint32_t df3; memcpy(&df3, _f + 3 * 3, 4);
        df0 &= em;
        df1 &= em;
        df2 &= em;
        df3 &= em;
        memcpy(_f + 3 * 0, &df0, 4);
        memcpy(_f + 3 * 1, &df1, 4);
        memcpy(_f + 3 * 2, &df2, 4);
        memcpy(_f + 3 * 3, &df3, 4);
    }
}
NI void e7_avx_a(uint8_t *f, uint8_t *end) {
    //the 7-byte mask pattern lines up with the 32-byte vectors again
    //after 7 vectors (7 * 32 = 224 bytes), after which the same masks apply
    __m256i em0 = _mm256_set_epi8(126, -17, -33, -3,
        -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3,
        -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3);
    __m256i em1 = _mm256_set_epi8(-3, -65, -5, -9,
        126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9,
        126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9);
    __m256i em2 = _mm256_set_epi8(-9, 126, -17, -33,
        -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33,
        -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33);
    __m256i em3 = _mm256_set_epi8(-33, -3, -65, -5,
        -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5,
        -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5);
    __m256i em4 = _mm256_set_epi8(-5, -9, 126, -17,
        -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17,
        -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17);
    __m256i em5 = _mm256_set_epi8(-17, -33, -3, -65,
        -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65,
        -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65);
    __m256i em6 = _mm256_set_epi8(-65, -5, -9, 126,
        -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126,
        -17, -33, -3, -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126);
    for (__m256i *yf = (__m256i *)f; yf < (__m256i *)end; yf += 7) {
        __m256i yf0 = _mm256_load_si256(yf + 0);
        __m256i yf1 = _mm256_load_si256(yf + 1);
        __m256i yf2 = _mm256_load_si256(yf + 2);
        __m256i yf3 = _mm256_load_si256(yf + 3);
        __m256i yf4 = _mm256_load_si256(yf + 4);
        __m256i yf5 = _mm256_load_si256(yf + 5);
        __m256i yf6 = _mm256_load_si256(yf + 6);
        yf0 = _mm256_and_si256(yf0, em0);
        yf1 = _mm256_and_si256(yf1, em1);
        yf2 = _mm256_and_si256(yf2, em2);
        yf3 = _mm256_and_si256(yf3, em3);
        yf4 = _mm256_and_si256(yf4, em4);
        yf5 = _mm256_and_si256(yf5, em5);
        yf6 = _mm256_and_si256(yf6, em6);
        _mm256_store_si256(yf + 0, yf0);
        _mm256_store_si256(yf + 1, yf1);
        _mm256_store_si256(yf + 2, yf2);
        _mm256_store_si256(yf + 3, yf3);
        _mm256_store_si256(yf + 4, yf4);
        _mm256_store_si256(yf + 5, yf5);
        _mm256_store_si256(yf + 6, yf6);
    }
}
NI void e7_avx_u(uint8_t *f, uint8_t *end) {
    //the top 4 mask bytes are all-ones, so the 4 bytes that overlap the
    //next 28-byte group are left untouched and handled by the next vector
    __m256i em = _mm256_set_epi8(-1, -1, -1, -1,
        -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3,
        -65, -5, -9, 126, -17, -33, -3, -65, -5, -9, 126, -17, -33, -3);
    //28-byte steps between 32-byte accesses, so most of them are misaligned
    for (uint8_t *_f = f; _f < end; _f += 28 * 8) {
        __m256i yf0 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 0));
        __m256i yf1 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 1));
        __m256i yf2 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 2));
        __m256i yf3 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 3));
        __m256i yf4 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 4));
        __m256i yf5 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 5));
        __m256i yf6 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 6));
        __m256i yf7 = _mm256_lddqu_si256((__m256i *)(_f + 28 * 7));
        yf0 = _mm256_and_si256(yf0, em);
        yf1 = _mm256_and_si256(yf1, em);
        yf2 = _mm256_and_si256(yf2, em);
        yf3 = _mm256_and_si256(yf3, em);
        yf4 = _mm256_and_si256(yf4, em);
        yf5 = _mm256_and_si256(yf5, em);
        yf6 = _mm256_and_si256(yf6, em);
        yf7 = _mm256_and_si256(yf7, em);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 0), yf0);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 1), yf1);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 2), yf2);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 3), yf3);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 4), yf4);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 5), yf5);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 6), yf6);
        _mm256_storeu_si256((__m256i *)(_f + 28 * 7), yf7);
    }
}
#define C (256 * 1024)
static unsigned long long rdtscp() {
    unsigned _;
    return __builtin_ia32_rdtscp(&_);
}
NI void time(void (*e)(uint8_t *, uint8_t *), uint8_t *f) {
    unsigned long long c = rdtscp();
    for (int i = 0; i < 100000; ++i) {
        e(f, f + C);
    }
    c = rdtscp() - c;
    printf("%12llu\n", c);
}
static void test(void (*ea)(uint8_t *, uint8_t *),
        void (*eu)(uint8_t *, uint8_t *), uint8_t (*f)[C + 1024]) {
    //see if both are doing the same job, then measure time
    memset(f[0], -1, C);
    memset(f[1], -1, C);
    printf("%d\n", memcmp(f[0], f[1], C));
    ea(f[0], f[0] + C);
    eu(f[1], f[1] + C);
    printf("%d\n", memcmp(f[0], f[1], C));
    time(ea, f[0]);
    time(eu, f[1]);
}
int main() {
    _Alignas(64) uint8_t f[2][C + 1024];
    test(e7_avx_a, e7_avx_u, f);
    test(e_dword_a, e_dword_u, f);
    return 0;
}