I have written some Rust functions that use AVX2 and AVX512 instructions to speed up image compositing. I am using an AMD 7950x CPU.
When I run RUSTFLAGS="-C target-cpu=native" cargo bench
I get:
test overlay_using_avx2 ... bench: 483,596 ns/iter (+/- 10,006)
test overlay_using_avx512 ... bench: 317,818 ns/iter (+/- 729)
However, I would like to build the executable on one machine and the run it on another machine. Therefore, I am explicitly enabling the features that my code needs and checking at runtime if they are present. However, when I do this, the AVX512 benchmark runs slower and I don't understand why. I am running:
RUSTFLAGS="-C target-feature=+avx2,+avx,+sse2,+avx512f,+avx512bw" cargo bench
:
test overlay_using_avx2 ... bench: 490,664 ns/iter (+/- 13,172)
test overlay_using_avx512 ... bench: 1,519,720 ns/iter (+/- 38,608)
Do I need to enable some other feature(s) from the rustc --print target-features
list? Is it possible to see which features were enabled by setting target-cpu=native
?
My benchmark code is below and runs on nightly:
#![feature(stdsimd)]
#![feature(test)]
use std::arch::x86_64::*;
unsafe fn overlay_chunk_avx2(this_chunk: &mut [u8], image_chunk: &[u8], c1: __m256i, c2: __m256i) {
let this_ptr = this_chunk.as_mut_ptr() as *mut __m128i;
let image_ptr = image_chunk.as_ptr() as *const __m128i;
let this_argb = _mm_loadu_si128(this_ptr);
let image_argb = _mm_loadu_si128(image_ptr);
let this_u16 = _mm256_cvtepu8_epi16(this_argb);
let image_u16 = _mm256_cvtepu8_epi16(image_argb);
let image_alpha = _mm256_shuffle_epi8(image_u16, c1);
let image_inv_alpha = _mm256_sub_epi8(c2, image_alpha);
let this_blended = _mm256_mullo_epi16(this_u16, image_inv_alpha);
let image_blended = _mm256_mullo_epi16(image_u16, image_alpha);
let blended = _mm256_add_epi16(this_blended, image_blended);
let divided = _mm256_srli_epi16(blended, 8);
let lo_lane = _mm256_castsi256_si128(divided);
let hi_lane = _mm256_extracti128_si256(divided, 1);
let divided_u8 = _mm_packus_epi16(lo_lane, hi_lane);
_mm_storeu_si128(this_ptr, divided_u8);
}
unsafe fn overlay_chunk_avx512(this_chunk: &mut [u8], image_chunk: &[u8], c1: __m512i, c2: __m512i) {
let this_ptr = this_chunk.as_mut_ptr() as *mut i8;
let image_ptr = image_chunk.as_ptr() as *const i8;
let this_argb = _mm256_loadu_epi8(this_ptr);
let image_argb = _mm256_loadu_epi8(image_ptr);
let this_u16 = _mm512_cvtepu8_epi16(this_argb);
let image_u16 = _mm512_cvtepu8_epi16(image_argb);
let image_alpha = _mm512_shuffle_epi8(image_u16, c1);
let image_inv_alpha = _mm512_sub_epi8(c2, image_alpha);
let this_blended = _mm512_mullo_epi16(this_u16, image_inv_alpha);
let image_blended = _mm512_mullo_epi16(image_u16, image_alpha);
let blended = _mm512_add_epi16(this_blended, image_blended);
let divided = _mm512_srli_epi16(blended, 8);
let divided_u8 = _mm512_cvtepi16_epi8(divided);
_mm256_storeu_epi8(this_ptr, divided_u8);
}
extern crate test;
#[bench]
fn overlay_using_avx2(bencher: &mut test::Bencher) {
let mut frame = vec![0; 1920 * 1080 * 4];
let image = vec![0; 1920 * 1080 * 4];
let constant1 = unsafe { _mm256_set_epi8(-1, 24, -1, 24, -1, 24, -1, -1, -1, 16, -1, 16, -1, 16, -1, -1, -1, 8, -1, 8, -1, 8, -1, -1, -1, 0, -1, 0, -1, 0, -1, -1) };
let constant2 = unsafe { _mm256_set_epi8(0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0) };
bencher.iter(|| {
let frame_chunks = frame.chunks_exact_mut(128 / 8);
let image_chunks = image.chunks_exact(128 / 8);
for (frame_chunk, image_chunk) in frame_chunks.zip(image_chunks) {
unsafe { overlay_chunk_avx2(frame_chunk, image_chunk, constant1, constant2); }
}
});
}
#[bench]
fn overlay_using_avx512(bencher: &mut test::Bencher) {
let mut frame = vec![0; 1920 * 1080 * 4];
let image = vec![0; 1920 * 1080 * 4];
let constant1 = unsafe { _mm512_set_epi8(-1, 56, -1, 56, -1, 56, -1, -1, -1, 48, -1, 48, -1, 48, -1, -1, -1, 40, -1, 40, -1, 40, -1, -1, -1, 32, -1, 32, -1, 32, -1, -1, -1, 24, -1, 24, -1, 24, -1, -1, -1, 16, -1, 16, -1, 16, -1, -1, -1, 8, -1, 8, -1, 8, -1, -1, -1, 0, -1, 0, -1, 0, -1, -1) };
let constant2 = unsafe { _mm512_set_epi8(0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0) };
bencher.iter(|| {
let frame_chunks = frame.chunks_exact_mut(256 / 8);
let image_chunks = image.chunks_exact(256 / 8);
for (frame_chunk, image_chunk) in frame_chunks.zip(image_chunks) {
unsafe { overlay_chunk_avx512(frame_chunk, image_chunk, constant1, constant2); }
}
});
}