I'm trying to translate some scalar code (`calc_offsets` below) into its AVX2 equivalent. It takes a series of counts and generates a table of offset positions, starting from a provided base value.
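For example, with `base = 100` and counts `{1, 2, 3, 4, 5, 6, 7, 8}`, the table comes out as `{100, 101, 103, 106, 110, 115, 121, 128}`: each offset is the base plus the running total of all preceding counts (an exclusive prefix sum, so the last count never contributes to the output).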
My attempt at converting this to AVX2 (`avx2_calc_offsets` below), which I believe is correct, runs at about half the speed of the simple array approach. This is part of an effort to translate a larger, bottlenecked hot section of code into AVX2, and since I want to process the offsets further as vectors I'd like to avoid jumping between AVX2 and scalar code for operations like this.
Example code with simple benchmarking is provided below. I'm getting around 2.15 seconds runtime for the array version and 4.41 seconds for the AVX2 version (on a first-generation Ryzen, i.e. Zen 1).
Is there a better approach using AVX2 to make this operation faster? I need to support older AVX2 CPUs such as Haswell and the original Ryzen series.
#include <immintrin.h>
#include <inttypes.h>
#include <stdio.h>
typedef uint32_t u32;
typedef uint64_t u64;
void calc_offsets (const u32 base, const u32 *counts, u32 *offsets)
{
offsets[0] = base;
offsets[1] = offsets[0] + counts[0];
offsets[2] = offsets[1] + counts[1];
offsets[3] = offsets[2] + counts[2];
offsets[4] = offsets[3] + counts[3];
offsets[5] = offsets[4] + counts[4];
offsets[6] = offsets[5] + counts[5];
offsets[7] = offsets[6] + counts[6];
}
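// For clarity: the unrolled function above is just an exclusive prefix sum
// over 8 counts. A rolled equivalent (illustrative only, not what I benchmark):
void calc_offsets_loop (const u32 base, const u32 *counts, u32 *offsets)
{
    u32 sum = base;
    for (int i = 0; i < 8; i++) {
        offsets[i] = sum;  // offsets[i] = base + counts[0] + ... + counts[i-1]
        sum += counts[i];  // counts[7] is accumulated but never stored
    }
}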
__m256i avx2_calc_offsets (const u32 base, const __m256i counts)
{
const __m256i shuff = _mm256_set_epi32 (6, 5, 4, 3, 2, 1, 0, 7);
__m256i v, t;
// rotate `counts` up one 32-bit element (4 bytes) and insert `base` at position 0
v = _mm256_permutevar8x32_epi32 (counts, shuff);
v = _mm256_insert_epi32 (v, base, 0);
// accumulate running total within 128-bit sub-lanes
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 4));
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 8));
// carry the running total from the top of the low 128-bit lane into the high lane
t = _mm256_set1_epi32 (_mm256_extract_epi32 (v, 3));
v = _mm256_blend_epi32 (_mm256_add_epi32 (v, t), v, 0x0F);
return v;
}
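// Hand trace for base = 100, counts = {1,2,3,4,5,6,7,8}
// (elements listed low -> high, '|' marks the 128-bit lane boundary):
//   after permute + insert: { 100,   1,   2,   3 |   4,   5,   6,   7 }
//   after add of slli(4):   { 100, 101,   3,   5 |   4,   9,  11,  13 }
//   after add of slli(8):   { 100, 101, 103, 106 |   4,   9,  15,  22 }
//   t = broadcast of v[3]:  { 106, 106, 106, 106 | 106, 106, 106, 106 }
//   after add + blend:      { 100, 101, 103, 106 | 110, 115, 121, 128 }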
int main (void)
{
u32 base = 900000000;
u32 counts[8] = { 5, 50, 500, 5000, 50000, 500000, 5000000, 50000000 };
u32 offsets[8];
calc_offsets (base, &counts[0], &offsets[0]);
printf ("calc_offsets: ");
for (int i = 0; i < 8; i++) printf (" %u", offsets[i]);
printf ("\n-----\n");
__m256i v, t;
v = _mm256_loadu_si256 ((__m256i *) &counts[0]);
t = avx2_calc_offsets (base, v);
_mm256_storeu_si256 ((__m256i *) &offsets[0], t);
printf ("avx2_calc_offsets: ");
for (int i = 0; i < 8; i++) printf (" %u", offsets[i]);
printf ("\n-----\n");
// --- benchmarking ---
#define ITERS 1000000000
// uncomment to benchmark AVX2 version
// #define AVX2_BENCH
#ifdef AVX2_BENCH
// benchmark AVX2 version
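// feed the result back as the next input (like the pointer swap below)
// so the compiler can't hoist the call out of the loop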
for (u64 i = 0; i < ITERS; i++) {
v = avx2_calc_offsets (base, v);
}
_mm256_storeu_si256 ((__m256i *) &offsets[0], v);
#else
// benchmark array version
u32 *c = &counts[0];
u32 *o = &offsets[0];
for (u64 i = 0; i < ITERS; i++) {
calc_offsets (base, c, o);
// feedback results to prevent optimizer 'cleverness'
u32 *tmp = c;
c = o;
o = tmp;
}
#endif
printf ("offsets after benchmark: ");
for (int i = 0; i < 8; i++) printf (" %u", offsets[i]);
printf ("\n-----\n");
return 0;
}
I'm using `gcc -O2 -mavx2 ...` to build. Godbolt link.