
I have two images A and B that are stored as byte arrays of ARGB data:

Image A: [a0, r0, g0, b0, a1, r1, g1, b1, ...]
Image B: [a0, r0, g0, b0, a1, r1, g1, b1, ...]

I would like to overlay image B on top of A using the alpha blending formula.
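
By that I mean the standard per-channel "over" operation, using only B's alpha (the channel names here are just for illustration):

result = (a_chan * (255 - b_alpha) + b_chan * b_alpha) / 255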

How can I achieve this with AVX512 instructions that operate on multiple pixels at a time?

I don't mind using 256 instead of 255 in the calculations if that makes things simpler.
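
(For example, blending a fully opaque overlay channel value of 255 with the /256 version gives (255 * 255) >> 8 = 254 instead of 255; an off-by-one like that is fine for my use case.)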

Edit:

I tried implementing this based on another Stack Overflow answer. However, it seems to be slower than my non-AVX512 code that processes one pixel at a time. What am I doing wrong?

I tried not using lazy_static! (because I think it uses a locking data structure) and passing the constants into the function instead, but it was still slower. Is this just not a good problem to solve with AVX512? It seems like it should be.

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn overlay_row_avx512(this_chunk: &mut [u8], image_chunk: &[u8]) {
    use std::arch::x86_64::*;

    let this_ptr = this_chunk.as_mut_ptr() as *mut i8;
    let image_ptr = image_chunk.as_ptr() as *const i8;

    let this_argb = _mm512_loadu_epi8(this_ptr);
    let image_argb = _mm512_loadu_epi8(image_ptr);

    // Pick out the upper 256-bits and calculate inv_alpha.
    let this_upper = _mm512_shuffle_epi8(this_argb, *UPPER_TO_U16);
    let image_upper = _mm512_shuffle_epi8(image_argb, *UPPER_TO_U16);
    let alpha_upper = _mm512_shuffle_epi8(image_argb, *UPPER_ALPHA_TO_U16);
    let inv_alpha_upper = _mm512_subs_epu8(*U8_MAX_VALUE, alpha_upper);

    // Apply the blend function and store the result in blended_upper_u8.
    let this_blended_upper = _mm512_mullo_epi16(this_upper, inv_alpha_upper);
    let image_blended_upper = _mm512_mullo_epi16(image_upper, alpha_upper); // TODO: premultiply alpha
    let blended_upper = _mm512_add_epi16(this_blended_upper, image_blended_upper);
    let blended_upper_u8 = _mm512_shuffle_epi8(blended_upper, *UPPER_U16_TO_U8);

    // Repeat for the lower 256-bits.
    let this_lower = _mm512_shuffle_epi8(this_argb, *LOWER_TO_U16);
    let image_lower = _mm512_shuffle_epi8(image_argb, *LOWER_TO_U16);
    let alpha_lower = _mm512_shuffle_epi8(image_argb, *LOWER_ALPHA_TO_U16);
    let inv_alpha_lower = _mm512_subs_epu8(*U8_MAX_VALUE, alpha_lower);

    let this_blended_lower = _mm512_mullo_epi16(this_lower, inv_alpha_lower);
    let image_blended_lower = _mm512_mullo_epi16(image_lower, alpha_lower); // TODO: premultiply alpha
    let blended_lower = _mm512_add_epi16(this_blended_lower, image_blended_lower);
    let blended_lower_u8 = _mm512_add_epi16(blended_lower, *LOWER_U16_TO_U8);

    // OR together the upper and lower 256-bits.
    let blended = _mm512_or_si512(blended_upper_u8, blended_lower_u8);

    _mm512_storeu_epi8(this_ptr, blended);
}

// A shuffle index of -1 tells vpshufb to zero that output byte.
const X: i8 = -1;

lazy_static! {
    static ref U8_MAX_VALUE: __m512i = unsafe { _mm512_set1_epi8(-1) };

    static ref UPPER_TO_U16: __m512i = unsafe {
        _mm512_set_epi8(
            X, 63, X, 62, X, 61, X, 60, X, 59, X, 58, X, 57, X, 56,
            X, 55, X, 54, X, 53, X, 52, X, 51, X, 50, X, 49, X, 48,
            X, 47, X, 46, X, 45, X, 44, X, 43, X, 42, X, 41, X, 40,
            X, 39, X, 38, X, 37, X, 36, X, 35, X, 34, X, 33, X, 32,
        )
    };

    static ref LOWER_TO_U16: __m512i = unsafe {
        _mm512_set_epi8(
            X, 31, X, 30, X, 29, X, 28, X, 27, X, 26, X, 25, X, 24,
            X, 23, X, 22, X, 21, X, 20, X, 19, X, 18, X, 17, X, 16,
            X, 15, X, 14, X, 13, X, 12, X, 11, X, 10, X,  9, X,  8,
            X,  7, X,  6, X,  5, X,  4, X,  3, X,  2, X,  1, X,  0,
        )
    };

    static ref UPPER_ALPHA_TO_U16: __m512i = unsafe {
        _mm512_set_epi8(
            X, 63, X, 63, X, 63, X, 63, X, 59, X, 59, X, 59, X, 59,
            X, 55, X, 55, X, 55, X, 55, X, 51, X, 51, X, 51, X, 51,
            X, 47, X, 47, X, 47, X, 47, X, 43, X, 43, X, 43, X, 43,
            X, 39, X, 39, X, 39, X, 39, X, 35, X, 35, X, 35, X, 35,
        )
    };

    static ref LOWER_ALPHA_TO_U16: __m512i = unsafe {
        _mm512_set_epi8(
            X, 31, X, 31, X, 31, X, 31, X, 27, X, 27, X, 27, X, 27,
            X, 23, X, 23, X, 23, X, 23, X, 19, X, 19, X, 19, X, 19,
            X, 15, X, 15, X, 15, X, 15, X, 11, X, 11, X, 11, X, 11,
            X,  7, X,  7, X,  7, X,  7, X,  3, X,  3, X,  3, X,  3,
        )
    };

    // Pick out the upper 8-bits of each 16-bit u16.
    // This effectively divides by 256.
    static ref UPPER_U16_TO_U8: __m512i = unsafe {
        _mm512_set_epi8(
            63, X, 62, X, 61, X, 60, X, 59, X, 58, X, 57, X, 56, X,
            55, X, 54, X, 53, X, 52, X, 51, X, 50, X, 49, X, 48, X,
            47, X, 46, X, 45, X, 44, X, 43, X, 42, X, 41, X, 40, X,
            39, X, 38, X, 37, X, 36, X, 35, X, 34, X, 33, X, 32, X,
        )
    };

    static ref LOWER_U16_TO_U8: __m512i = unsafe {
        _mm512_set_epi8(
            31, X, 30, X, 29, X, 28, X, 27, X, 26, X, 25, X, 24, X,
            23, X, 22, X, 21, X, 20, X, 19, X, 18, X, 17, X, 16, X,
            15, X, 14, X, 13, X, 12, X, 11, X, 10, X,  9, X,  8, X,
             7, X,  6, X,  5, X,  4, X,  3, X,  2, X,  1, X,  0, X,
        )
    };
}

For comparison, here's my code that runs one pixel at a time:

// A chunk is just 4 bytes in this case rather than 64 bytes.
fn overlay_row_without_simd(this_chunk: &mut [u8], image_chunk: &[u8]) {
    let alpha = image_chunk[0] as u32;
    let inv_alpha = 255 - alpha;

    this_chunk[1] = ((this_chunk[1] as u32 * inv_alpha + image_chunk[1] as u32 * alpha) / 255) as u8;
    this_chunk[2] = ((this_chunk[2] as u32 * inv_alpha + image_chunk[2] as u32 * alpha) / 255) as u8;
    this_chunk[3] = ((this_chunk[3] as u32 * inv_alpha + image_chunk[3] as u32 * alpha) / 255) as u8;
}
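
For context, both versions are driven the same way: walk the two rows in lockstep chunks and hand each pair of chunks to the per-chunk function. A minimal sketch of that driver loop (this is an illustration, not my actual program):

fn overlay_row(this_row: &mut [u8], image_row: &[u8]) {
    // 4-byte chunks for the scalar version; the AVX512 version uses
    // the same pattern with 64-byte chunks (16 ARGB pixels).
    for (this_chunk, image_chunk) in this_row
        .chunks_exact_mut(4)
        .zip(image_row.chunks_exact(4))
    {
        overlay_row_without_simd(this_chunk, image_chunk);
    }
}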
  • Both your images have their own alpha. The formula you linked has a single alpha that's used as `alpha` and `1-alpha` so you automatically have something normalized to 1.0. I'm sure there are alpha-blending formulas for two images with their own alpha, so have a look for that; there must be something more efficient than the obvious way of doing integer division to normalize. Like maybe a weighted sum, so if both images have low alpha, the colours are less intense, like there was a black background behind them? – Peter Cordes Aug 31 '23 at 17:58
  • Also, I'd recommend looking for an AVX2 implementation of blending two images with per-pixel alphas (not pre-multiplied assuming that's what you have). Some of the same instructions are probably useful, although AVX512 might have other ways to do some steps. – Peter Cordes Aug 31 '23 at 18:01
  • I don't have an AVX512 compatible CPU, so I can't really develop for that one as I can't test my code. – Finomnis Aug 31 '23 at 18:02
  • @Chris in case you are wondering what Peter is talking about, I assume he's talking about the 'over' operation shown in https://en.wikipedia.org/wiki/Alpha_compositing#Description, especially the two lines of formula below the quads and circles figure. – Finomnis Aug 31 '23 at 18:08
  • Potentially relevant discussion: https://github.com/rust-lang/portable-simd/issues/28 – Finomnis Aug 31 '23 at 18:12
  • @Finomnis Intel has a convenient SDE emulator that can transparently run one binary: [How to test AVX-512 instructions w/o supported hardware?](https://stackoverflow.com/q/51805127) . Obviously you can't test performance that way, but if you did want to play around with something you can test correctness. (And estimate performance with https://uica.uops.info/ for branchless loop bodies, at least for Intel CPUs but not Zen 4.) And yes, thanks for digging up a wiki link about handling two inputs with separate alphas. – Peter Cordes Aug 31 '23 at 19:35
  • @Chris: your scalar code only uses the `alpha` from one of the images! (From the source, ignoring the `this_chunk[0]` alpha value.) It looks like your SIMD code does the same thing. Is this what you want, to ignore the alpha in the destination, or assume it's 255 or something? Also, what CPU are you testing on, and what speed ratio are you seeing? (And how many clock cycles per pixel, or vice versa?) LLVM might be auto-vectorizing the scalar code with 256-bit vectors, which might be faster than 512 if your data isn't aligned. (Especially on a CPU with only one 512-bit SIMD multiplier) – Peter Cordes Aug 31 '23 at 20:18
  • Did you make sure that the final executable actually includes AVX-512 instructions? Rust's LLVM config by default does not generate those. – Finomnis Aug 31 '23 at 20:23
  • @tadman: I'm curious why you untagged [tag:alphablending]? Seemed like the obvious tag, and there are a few existing questions about implementing the operation (e.g. [Optimizing alpha blending for two colors with alpha](https://stackoverflow.com/q/77006591)). Oh, but it looks like most of the questions are about *using* the operation in graphics languages, not implementing / optimizing it. There's also a [tag:alpha-transparency] – Peter Cordes Aug 31 '23 at 20:43
  • @PeterCordes With only ~500 questions tagged with that, it's hardly something people will be looking for. If you want to put it back, I'm not going to object. I just think it makes sense to focus tags on things people have likely pinned, or are illuminating in terms of tool(s) used. – tadman Aug 31 '23 at 20:44
  • @Finomnis I'm running `RUSTFLAGS='-C target-feature=+avx512f,+avx512bw' cargo run --release` on an AMD 7950x CPU. Is there anything else I need to do to ensure AVX512 runs? How can I check if the executable has those instructions? – Chris Aug 31 '23 at 20:45
  • @PeterCordes: yes, that's the image blend function I want in this case. The alpha of `this_image` will always be 255 because it's for the background frame of a video. I'm using an AMD 7950x CPU which has good AVX512 support. I'm going to write some benchmarking code now to get a better idea of the speed ratio. It seems to be about 2x slower but that's just an estimate. I'm pretty sure that this_ptr is **not** aligned to 512-bits and image_ptr **is** aligned. – Chris Aug 31 '23 at 20:52
  • @tadman: If there were 5 other important tags that strongly applied, we could do without alpha-blending. But if someone else actually is looking for how to implement with AVX-512, a search on [avx512][alphablending] will take them right to this. I hope you're not arguing that rarer or more-specific tags shouldn't be tagged even on questions where they *do* apply, because that sounds insane and would defeat their purpose. – Peter Cordes Aug 31 '23 at 21:33
  • @Chris: Those details (CPU model, and especially that you're intentionally ignoring alpha from one of the images) should be in the question. You could disassemble the code and look for usage of ZMM registers, or x/ymm16-31 (which also require EVEX prefixes). I'm surprised you only enabled AVX-512BW, not letting the compiler use newer AVX-512 features Zen 4 has. `-C target-cpu=native` should set tuning and ISA extensions appropriately. ([Does rustc / cargo have a -march=native equivalent?](https://stackoverflow.com/q/46780074)) – Peter Cordes Aug 31 '23 at 21:41
  • @PeterCordes I've always interpreted tags as "things people looking to answer questions have pinned in their Watched Tags" and/or "things that add important context to the question" as in `[python-2.x]` vs. `[python]` etc. I see tags for `[loops]` and `[if]` which is ridiculously specific. Maybe `[alphablending]` is right on the line here, and likely superfluous. I'd argue `[image-processing]` has that covered, it's routine in that space. In any case, no strong opinions here, was just trying to keep tag clutter to a minimum. The keywords are in the question and searchable anyways. – tadman Aug 31 '23 at 22:35
  • @tadman: I don't think I'd have created a tag for alphablending if it didn't exist, at least not for questions like this. (Maybe for using it in OpenGL). But since it does exist, no harm in tagging it since it pretty clearly applies. I agree the primary purpose of tags is to let potential answerers find them, but if there aren't 5 such tags, other tags aren't crowding out higher value tags, and can be useful for people searching for other reasons in the future. (At least hypothetically; I know SO's built-in search is bad, but filtering by tag is one of the few things it can do well.) – Peter Cordes Aug 31 '23 at 23:47
  • I couldn't get this to compile so I couldn't check, but using `lazy_static` for the vector constants is probably a mistake. It's a mistake in C++, and it seems like a similar mistake in Rust. These constants will be simply loaded from memory, there is no non-trivial initialization to avoid with a laziness mechanism, the laziness is entirely wasted. – harold Sep 01 '23 at 02:07
  • *"Is this just not a good problem to solve with AVX512?"* - to answer that, you should try to find out what your bottleneck is. There's a good chance that the bottleneck is the throughput of your RAM, in which case you can optimize the computation as much as you want, you won't ever get faster than the RAM can deliver data. – Finomnis Sep 01 '23 at 05:44
  • Depending on what else your program does, it might be faster to switch to a planar format (i.e. `[a0, a1, … r0, r1, … g0, g1, … b0, b1, …]`) which will be much more efficient for most image processing algorithm (including alpha blending) with or without AVX. – Jmb Sep 01 '23 at 06:59
  • If your image color components are 8-bit, you should try to utilize `pmulhrsw` instruction. This will require dropping precision of alpha components by one bit, since one of `pmulhrsw` arguments is a vector of *signed* 8-bit integers, but that's usually an acceptable cost for the gained performance. If the color components are larger than 8 bits, you can similarly use `pmaddwd`. Both instructions perform multiplication and addition, which is exactly what you need for an alpha blending algorithm. The only complication is to rearrange color components before and after the operation. – Andrey Semashev Sep 01 '23 at 11:04
  • @Finomnis there's a good chance that it *isn't* bottlenecked by RAM throughput. With a RAM throughput of 50GB/s and a processor running at 5GHz you get around 10 bytes per cycle on a single thread (that's a ballpark estimate of a typical situation, could be a bit lower but also even a bit higher), that's a lot if you're manipulating individual bytes with scalar code. With 3 multiplications (assuming the division is compiled into a multiplication) per two byte reads, it shouldn't be possible to even consume 1 byte per cycle, off by about an order of magnitude. – harold Sep 01 '23 at 15:36

1 Answer

I managed to figure out what was going on!

Basically, the implementation in my question is wrong and ends up generating a video full of random-looking pixels. In my case, I'm piping each frame to ffmpeg, which was having a really hard time compressing all that random noise. That's why my program ran twice as slowly: it had nothing to do with the AVX512 code itself.

I spent a long time figuring out how to do this properly in AVX512 and fixing my code. I then benchmarked the function directly and found that it runs 5.38x faster than the one-pixel-at-a-time code. I intentionally wrote it to rely only on the avx512f and avx512bw features, for better CPU compatibility. It might be possible to save a couple of instructions by using _mm512_permutexvar_epi8, but that requires avx512vbmi.
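
As an aside: if the binary has to run on CPUs that might lack AVX512, std's runtime feature detection can gate the call. This snippet is just an illustration on my part, not something from the benchmark:

if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512bw") {
    // safe to take the AVX512 path
} else {
    // fall back to the scalar one-pixel-at-a-time version
}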

Here is my working implementation, which processes 8 pixels (32 bytes of ARGB data) at a time:

use std::arch::x86_64::*;

unsafe fn overlay_chunk(this_chunk: &mut [u8], image_chunk: &[u8], c: &AVX512Constants) {
    let this_ptr = this_chunk.as_mut_ptr() as *mut i8;
    let image_ptr = image_chunk.as_ptr() as *const i8;

    let this_argb = _mm256_loadu_epi8(this_ptr);
    let image_argb = _mm256_loadu_epi8(image_ptr);

    // Extend each 8-bit integer into a 16-bit integer (zero filled).
    let this_u16 = _mm512_cvtepu8_epi16(this_argb);
    let image_u16 = _mm512_cvtepu8_epi16(image_argb);

    // Copy the alpha channel over each rgb channel.
    let image_alpha = _mm512_shuffle_epi8(image_u16, c.copy_alpha_to_rgb);

    // Calculate (255 - alpha) and set each u16 alpha value to 256.
    // We shift right by 8 bits later and 256 >> 8 equals 1.
    let image_inv_alpha = _mm512_sub_epi8(c.inv_alpha_minuend, image_alpha);

    // Apply the alpha blending formula (https://graphics.fandom.com/wiki/Alpha_blending).
    let this_blended = _mm512_mullo_epi16(this_u16, image_inv_alpha);
    let image_blended = _mm512_mullo_epi16(image_u16, image_alpha); // TODO: premultiply alpha

    let blended = _mm512_add_epi16(this_blended, image_blended);

    // Shift the u16 values right by 8 bits which divides by 256. We should
    // divide by 255 but this is faster and is close enough. The alpha value
    // of this_argb is preserved because of the 1 bits in the minuend.
    let divided = _mm512_srli_epi16(blended, 8);

    // Convert back to 8-bit integers, discarding the high bits that are zero.
    let divided_u8 = _mm512_cvtepi16_epi8(divided);

    _mm256_storeu_epi8(this_ptr, divided_u8);
}

struct AVX512Constants {
    copy_alpha_to_rgb: __m512i,
    inv_alpha_minuend: __m512i,
}

const X: i8 = -1;

impl AVX512Constants {
    fn new() -> Self {
        unsafe {
            Self {
                copy_alpha_to_rgb: _mm512_set_epi8(
                  X, 56, X, 56, X, 56, X, X, X, 48, X, 48, X, 48, X, X,
                  X, 40, X, 40, X, 40, X, X, X, 32, X, 32, X, 32, X, X,
                  X, 24, X, 24, X, 24, X, X, X, 16, X, 16, X, 16, X, X,
                  X, 8,  X, 8,  X, 8,  X, X, X, 0,  X, 0,  X, 0,  X, X,
                  // Note: _mm512_set_epi8 takes arguments from the highest
                  // byte down to the lowest, so the rightmost values above
                  // land in the lowest bytes (pixel 0).
                ),
                inv_alpha_minuend: _mm512_set_epi8(
                    0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0,
                    0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0,
                    0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0,
                    0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0,
                    // Listed from highest byte to lowest: each rgb u16 lane
                    // holds 0x00FF (255) and each alpha u16 lane holds
                    // 0x0100 (256).
                ),
            }
        }
    }
}
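
Driving it looks roughly like this: build the constants once, then walk both buffers in 32-byte chunks. (A simplified sketch; a real caller would also have to handle any remainder that isn't a multiple of 32 bytes.)

fn overlay_image(this: &mut [u8], image: &[u8]) {
    let c = AVX512Constants::new();

    for (this_chunk, image_chunk) in this
        .chunks_exact_mut(32)
        .zip(image.chunks_exact(32))
    {
        unsafe { overlay_chunk(this_chunk, image_chunk, &c) };
    }
}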