I am studying AVX by writing AVX code with inline assembly. In this case, I tried to apply AVX to a simple function, which I named lower_all_chars_base.
Its behavior is: apply a bitwise OR with 0x20 to every single char in a std::string.
- Let
c
be every single char in the input. - Assuming the input only contains chars in this set
'A' <= c && c <= 'Z'
.
So the function will make the characters be lower case.
First I made an AVX version of the function with unaligned stores, and there was no speed-up at all.
Then I thought: if the memory access is aligned, then it must be faster. After that, I made an AVX version with aligned stores, but gcc's
plain -O3 build of the scalar function
still beats my hand-vectorized code. What am I doing wrong here?
Functions Summary
lower_all_chars_base
— the simple scalar function.
lower_all_chars_avx_aligned
— AVX2 aligned-move version:
- Process the first unaligned bytes at the head with the scalar operation.
- Then process the aligned middle part with AVX2 and aligned moves.
- Then process the rest with the scalar operation again.
lower_all_chars_avx_unaligned
— AVX2 unaligned-move version:
- Process the data with AVX2 and unaligned moves.
- Then process the rest with the scalar operation.
Questions
- Why does gcc's base -O3 optimization beat my hand-written optimization?
- What am I doing wrong here?
- What is the proper AVX operation to do this?
Benchmark Result
- CPU: Intel(R) Xeon(R) CPU E5-2650 v4 (2.20GHz)
- Microarchitecture: Broadwell
root@esteh:/tmp# g++ --version
g++ (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
root@esteh:/tmp# g++ -Wall -Wextra -std=c++2a -O3 test.cpp -o test
root@esteh:/tmp# nice -n -20 ./test
lower_all_chars_base
Min = 0.00662300
Max = 0.00793100
Avg = 0.00717280
Total = 0.07172800
lower_all_chars_avx_aligned
Min = 0.00650200
Max = 0.00785100
Avg = 0.00726220
Total = 0.07262200
lower_all_chars_avx_unaligned
Min = 0.00623600
Max = 0.00835000
Avg = 0.00701360
Total = 0.07013600
Code
Edit: changed the memset to use N - 1, keeping the final byte as the NUL terminator.
Godbolt link: https://godbolt.org/z/a16cGK
#include <ctime>
#include <cstdio>
#include <cassert>
#include <cstring>
#include <iostream>
using std::string;
/* Scalar baseline: ORs every byte of the string with 0x20. */
void lower_all_chars_base(string &str);
/* AVX2 version using aligned stores (vmovdqa) for the middle part. */
void lower_all_chars_avx_aligned(string &str);
/* AVX2 version using unaligned stores (vmovdqu). */
void lower_all_chars_avx_unaligned(string &str);
/* Runs `fx` 10 times on `x` and prints min/max/avg/total CPU time (clock()). */
void do_benchmark(std::string &x, void (*fx)(string &));
/* Evicts [p, p+allocation_size) from the caches (clflush + sfence). */
void mem_flush(const void *p, unsigned int allocation_size);
/* Size of the test string: 40 MiB. */
#define N (size_t)(1024u * 1024 * 40)
/* Prints the function's name, then benchmarks it. */
#define BENCHMARK(STR, FX) do { \
puts(#FX); \
do_benchmark(STR, FX); \
} while(0)
int main() {
static char x[N];
memset(x, 'A', N - 1);
string a(x), b(x), c(x);
BENCHMARK(a, lower_all_chars_base);
BENCHMARK(b, lower_all_chars_avx_aligned);
BENCHMARK(c, lower_all_chars_avx_unaligned);
assert(a == b);
assert(b == c);
memset(x, 'a', N - 1);
assert(memcmp(c.c_str(), x, N - 1) == 0);
}
/*
 * Benchmark harness: runs `fx(x)` n = 10 times, timing each call with
 * clock(), and prints the min/max/avg/total CPU seconds.
 *
 * After every iteration the string's buffer is flushed from the caches so
 * each run starts from (approximately) cold memory.
 *
 * Fix: `min` and `max` were declared uninitialized; reading them would be
 * undefined behavior if the iteration count were ever 0, and gcc flags it
 * with -Wmaybe-uninitialized. They are now zero-initialized (the i == 0
 * branch still overwrites them with the first sample, so output is
 * unchanged). `c` is also scoped to the loop where it is used.
 */
void do_benchmark(std::string &x, void (*fx)(string &)) {
    const size_t n = 10;
    double min = 0, max = 0, avg, total = 0;
    for (size_t i = 0; i < n; i++) {
        clock_t time0 = clock();
        fx(x);
        clock_t time1 = clock();
        double c = (double)(time1 - time0) / CLOCKS_PER_SEC;
        total += c;
        if (i == 0) {
            /* First sample seeds both extrema. */
            min = max = c;
        } else {
            if (c > max) max = c;
            if (c < min) min = c;
        }
        /* Evict the data so the next iteration is not measuring cache hits. */
        mem_flush(x.c_str(), x.size());
    }
    avg = total / (double)n;
    printf("Min = %.8f\n", min);
    printf("Max = %.8f\n", max);
    printf("Avg = %.8f\n", avg);
    printf("Total = %.8f\n\n", total);
}
/*
 * Scalar baseline: OR every byte of `str` with 0x20 in place.
 * For input restricted to 'A'..'Z' this lowercases the whole string.
 * noinline keeps the call boundary intact for benchmarking.
 */
__attribute__((noinline))
void lower_all_chars_base(string &str) {
    char *p = (char *)str.c_str();
    const char *end = p + str.size();
    for (; p != end; ++p)
        *p |= 0x20;
}
/* 32-byte vector constant with 0x20 in every byte; loaded into ymm0 as the
 * OR mask. aligned(32) so it can be fetched with the aligned vmovdqa. */
static const uint64_t mask[] __attribute__((aligned(32))) = {
0x2020202020202020ull, 0x2020202020202020ull,
0x2020202020202020ull, 0x2020202020202020ull
};
/*
 * AVX2 version with aligned stores.
 *
 * Strategy:
 *   1. Scalar-process head bytes until `cs` reaches a 32-byte boundary.
 *   2. Process 288-byte groups (9 x 32-byte ymm vectors) with vpor/vmovdqa.
 *   3. Scalar-process whatever the vector loop did not cover.
 * Inputs of 4 KiB or less take the pure scalar path.
 *
 * BUG FIX: the head-alignment count was `(uintptr_t)cs & 31`, which is the
 * current misalignment, NOT the distance to the next 32-byte boundary.
 * Advancing by that amount only lands on an aligned address when cs % 32 is
 * 0 or 16; for any other offset the vmovdqa stores below would fault on a
 * misaligned address. The correct count is (32 - (cs & 31)) & 31, i.e.
 * -(uintptr_t)cs & 31.
 */
__attribute__((noinline))
void lower_all_chars_avx_aligned(string &str) {
    char *cs = (char *)str.c_str();
    size_t len = str.size();
    /* Only use AVX for data bigger than 4K. */
    if (len > 4096) {
        /* Handle unaligned data from the head (see BUG FIX above). */
        uint8_t n = (uint8_t)(-(uintptr_t)cs & 0b11111u);
        for (uint8_t i = 0; i < n; i++) {
            *cs++ |= 0x20;
        }
        len -= n;
        /* Each iteration touches [j, j+288), so stop before len - 288 to
         * prevent AVX from processing data beyond the array. */
        size_t vlen = len - 288;
        size_t j;
        /* Load the 0x20 broadcast mask into ymm0 once, outside the loop. */
        asm volatile("vmovdqa %[mask], %%ymm0"::[mask]"m"(mask):"ymm0");
        /* NOTE(review): the "p" constraint is documented for load-address
         * operands; "r" is the conventional constraint for a pointer used
         * as a base register — confirm before relying on it. */
        for (j = 0; j < vlen; j += 288) {
            asm volatile(
                "vpor\t(%[cs],%[j]), %%ymm0, %%ymm1\n\t"
                "vpor\t32(%[cs],%[j]), %%ymm0, %%ymm2\n\t"
                "vpor\t64(%[cs],%[j]), %%ymm0, %%ymm3\n\t"
                "vpor\t96(%[cs],%[j]), %%ymm0, %%ymm4\n\t"
                "vpor\t128(%[cs],%[j]), %%ymm0, %%ymm5\n\t"
                "vpor\t160(%[cs],%[j]), %%ymm0, %%ymm6\n\t"
                "vpor\t192(%[cs],%[j]), %%ymm0, %%ymm7\n\t"
                "vpor\t224(%[cs],%[j]), %%ymm0, %%ymm8\n\t"
                "vpor\t256(%[cs],%[j]), %%ymm0, %%ymm9\n\t"
                "vmovdqa\t%%ymm1, (%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm2, 32(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm3, 64(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm4, 96(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm5, 128(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm6, 160(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm7, 192(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm8, 224(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm9, 256(%[cs],%[j])"
                :
                : [cs]"p"(cs), [j]"r"(j)
                : "memory", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
                  "ymm6", "ymm7", "ymm8", "ymm9"
            );
        }
        /* Clear upper ymm state to avoid AVX->SSE transition penalties. */
        asm volatile("vzeroupper":::
            "ymm0", "ymm1", "ymm2", "ymm3",
            "ymm4", "ymm5", "ymm6", "ymm7",
            "ymm8", "ymm9", "ymm10", "ymm11",
            "ymm12","ymm13","ymm14","ymm15"
        );
        cs += j;
        len -= j;
    }
    /* Scalar tail: bytes the AVX loop did not cover. */
    for (size_t i = 0; i < len; i++) {
        *cs++ |= 0x20;
    }
}
/*
 * AVX2 version with unaligned stores (vmovdqu).
 * Processes 288-byte groups (9 x 32-byte ymm vectors) with vpor/vmovdqu,
 * then finishes the remainder with scalar ORs. Inputs of 4 KiB or less
 * take the pure scalar path.
 *
 * NOTE(review): a kernel this trivial (one OR per byte over 40 MiB) is
 * presumably memory-bandwidth bound, which would explain why it shows no
 * speed-up over the compiler's -O3 scalar/auto-vectorized code — verify
 * with a profiler before blaming the instruction selection.
 * NOTE(review): the "p" constraint below is documented for load-address
 * operands; "r" is the conventional constraint for a pointer used as a
 * base register — confirm before relying on it.
 */
__attribute__((noinline))
void lower_all_chars_avx_unaligned(string &str) {
char *cs = (char *)str.c_str();
size_t len = str.size();
/* Only use AVX for data bigger than 4K. */
if (len > 4096) {
size_t j;
/* Each iteration touches [j, j+288); stopping before len - 288 keeps every
 * vector access inside the buffer. The (up to 575-byte) rest is scalar. */
size_t vlen = len - 288;
/* Load the 0x20 broadcast mask into ymm0 once, outside the loop. */
asm volatile("vmovdqa %[mask], %%ymm0"::[mask]"m"(mask):"ymm0");
for (j = 0; j < vlen; j += 288) {
asm volatile(
"vpor\t(%[cs],%[j]), %%ymm0, %%ymm1\n\t"
"vpor\t32(%[cs],%[j]), %%ymm0, %%ymm2\n\t"
"vpor\t64(%[cs],%[j]), %%ymm0, %%ymm3\n\t"
"vpor\t96(%[cs],%[j]), %%ymm0, %%ymm4\n\t"
"vpor\t128(%[cs],%[j]), %%ymm0, %%ymm5\n\t"
"vpor\t160(%[cs],%[j]), %%ymm0, %%ymm6\n\t"
"vpor\t192(%[cs],%[j]), %%ymm0, %%ymm7\n\t"
"vpor\t224(%[cs],%[j]), %%ymm0, %%ymm8\n\t"
"vpor\t256(%[cs],%[j]), %%ymm0, %%ymm9\n\t"
"vmovdqu\t%%ymm1, (%[cs],%[j])\n\t"
"vmovdqu\t%%ymm2, 32(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm3, 64(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm4, 96(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm5, 128(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm6, 160(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm7, 192(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm8, 224(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm9, 256(%[cs],%[j])"
:
: [cs]"p"(cs), [j]"r"(j)
: "memory", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
"ymm6", "ymm7", "ymm8", "ymm9"
);
}
/* Clear upper ymm state to avoid AVX->SSE transition penalties. */
asm volatile("vzeroupper":::
"ymm0", "ymm1", "ymm2", "ymm3",
"ymm4", "ymm5", "ymm6", "ymm7",
"ymm8", "ymm9", "ymm10", "ymm11",
"ymm12","ymm13","ymm14","ymm15"
);
cs += j;
len -= j;
}
/* Scalar tail: bytes the AVX loop did not cover. */
for (size_t i = 0; i < len; i++) {
*cs++ |= 0x20;
}
}
/*
 * Evicts the buffer [p, p+allocation_size) from every cache level by
 * issuing clflush on each 64-byte line, then fences with sfence so the
 * flushes complete before returning. No-op for NULL or zero-sized input.
 */
void mem_flush(const void *p, unsigned int allocation_size) {
    /* https://stackoverflow.com/a/43694725/7275114 */
    const size_t line_size = 64;
    const char *cp = (const char *)p;
    if (cp == NULL || allocation_size == 0)
        return;
    size_t off = 0;
    while (off < allocation_size) {
        asm volatile("clflush (%0)" :: "r"(cp + off) : "memory");
        off += line_size;
    }
    asm volatile("sfence" ::: "memory");
}