
I'm trying to find an optimized method for RGB8 (actually grayscale) to RGB32 image conversion.

The source is an 8-bit grey image; the destination should be a 32-bit grey image (BGRA) with the 4th channel (alpha) ignored. The source address is not guaranteed to be 16-byte aligned, Count is a multiple of 16, and the destination address is 16-byte aligned.

  • INPUT: 8-bit single-channel grey image
  • OUTPUT: 32-bit BGRA (alpha channel ignored)
  • COUNT: image size (in pixels, i.e. source bytes) is a multiple of 16
  • CPU: x86-32 (SSE2/SSSE3 allowed; the pshufb used below is SSSE3)

Here is my optimized assembly code. Is there an even faster way of conversion?

void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
    // Each dword is a pshufb control for one output pixel: it replicates source
    // byte N into B, G and R, and the 0x80 index writes 0 to the alpha byte.
    static unsigned int __declspec(align(64)) Masks[] = {
        0x80000000, 0x80010101, 0x80020202, 0x80030303,
        0x80040404, 0x80050505, 0x80060606, 0x80070707,
        0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
        0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
    };
    
    __asm {
        mov esi, Source
        mov edi, Destination
        mov edx, Count
        xor ecx, ecx
        movdqa xmm4, xmmword ptr [Masks + 0]
        movdqa xmm5, xmmword ptr [Masks + 16]
        movdqa xmm6, xmmword ptr [Masks + 32]
        movdqa xmm7, xmmword ptr [Masks + 48]
l1:
        movdqu xmm0, xmmword ptr [esi + ecx]
        movdqa xmm1, xmm0
        movdqa xmm2, xmm0
        movdqa xmm3, xmm0
        pshufb xmm0, xmm4
        pshufb xmm1, xmm5
        pshufb xmm2, xmm6
        pshufb xmm3, xmm7
        movntdq [edi + 0], xmm0      // non-temporal stores; Destination must stay 16-byte aligned
        movntdq [edi + 16], xmm1
        movntdq [edi + 32], xmm2
        movntdq [edi + 48], xmm3
        add edi, 64
        add ecx, 16
        cmp ecx, edx
        jb l1
    }
}

There is another approach using several PUNPCKLBW and PUNPCKHBW instructions, but that seems to be slightly slower (see Update 5 below).

Update: This is the basic, non-optimized algorithm:

BGRA* Destination = ...;
unsigned char* Source = ...;
for (unsigned int i = 0; i < Size; i++) {
    Destination[i].Blue = Source[i];
    Destination[i].Green = Source[i];
    Destination[i].Red = Source[i]; 
}
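
The BGRA struct itself is not shown; a minimal sketch of the layout this loop assumes (one byte per channel, memory order B, G, R, A) is:

struct BGRA {
    unsigned char Blue;
    unsigned char Green;
    unsigned char Red;
    unsigned char Alpha;   // left untouched by the conversion (ignored)
};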

PS: I also tried C code using the MS VS2008 SSE compiler intrinsics. It turned out that the compiler generated a lot of unnecessary memory moves, which made the code 10-20% slower than pure assembly.

Update 2: This is the same code using intrinsics only.

void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
    static const unsigned int __declspec(align(64)) Masks[] = {
        0x80000000, 0x80010101, 0x80020202, 0x80030303,
        0x80040404, 0x80050505, 0x80060606, 0x80070707,
        0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
        0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
    };

    register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0));
    register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 4));
    register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 8));
    register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 12));

    for (unsigned int i = 0; i < Count / 16; i++) {   // Count = number of grey pixels (source bytes)
        // Source is not guaranteed to be 16-byte aligned, so use an unaligned load (movdqu).
        __m128i r0 = _mm_loadu_si128(Source + i);

        _mm_stream_si128(Destination + (i * 4) + 0, _mm_shuffle_epi8(r0, m0));
        _mm_stream_si128(Destination + (i * 4) + 1, _mm_shuffle_epi8(r0, m1));
        _mm_stream_si128(Destination + (i * 4) + 2, _mm_shuffle_epi8(r0, m2));
        _mm_stream_si128(Destination + (i * 4) + 3, _mm_shuffle_epi8(r0, m3));
    }
}
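
For context, a hypothetical caller might look like the sketch below; Width, Height, and the buffer names are mine, not from the original code. The destination must be 16-byte aligned for the movntdq / _mm_stream_si128 stores, and because streaming stores are weakly ordered, an sfence is advisable before another thread consumes the output.

#include <malloc.h>      // _aligned_malloc / _aligned_free (MSVC)
#include <emmintrin.h>   // __m128i; pulls in xmmintrin.h for _mm_sfence

static unsigned char* ExpandGreyImage(unsigned char* Grey, unsigned int Width, unsigned int Height) {
    unsigned int Count = Width * Height;   // one byte per grey pixel, assumed to be a multiple of 16
    unsigned char* Bgra = (unsigned char*) _aligned_malloc((size_t) Count * 4, 16);

    ConvertGreyToRgb32Assembler((__m128i*) Grey, (__m128i*) Bgra, Count);
    _mm_sfence();   // drain the non-temporal stores before Bgra is handed to another consumer

    return Bgra;    // caller releases with _aligned_free
}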

Update 3: This is the compiler-generated code (beautified) (Visual Studio 2012, all optimizations on):

    push ebp 
    mov ebp, esp 
    mov edx, dword ptr [ebp+8]
    movdqa xmm1, xmmword ptr ds:[Masks + 0]
    movdqa xmm2, xmmword ptr ds:[Masks + 16]
    movdqa xmm3, xmmword ptr ds:[Masks + 32]
    movdqa xmm4, xmmword ptr ds:[Masks + 48]
    push esi 
    test ecx, ecx 
    je l2
    lea esi, [ecx-1] 
    shr esi, 4 
    inc esi
l1:
    mov ecx, edx 
    movdqu xmm0, xmmword ptr [ecx] 
    mov ecx, eax 
    movdqa xmm5, xmm0 
    pshufb xmm5, xmm1 
    movdqa xmmword ptr [ecx], xmm5 
    movdqa xmm5, xmm0 
    pshufb xmm5, xmm2 
    movdqa xmmword ptr [eax+10h], xmm5 
    movdqa xmm5, xmm0
    pshufb xmm5, xmm3
    movdqa xmmword ptr [eax+20h], xmm5 
    lea ecx, [eax+30h]
    add edx, 10h 
    add eax, 40h 
    dec esi 
    pshufb xmm0, xmm4 
    movdqa xmmword ptr [ecx], xmm0 
    jne l1
l2:
    pop esi
    pop ebp
    ret

It seems that interleaving movdqa with pshufb is somewhat faster.

Update 4: This seems to be the optimal hand-optimized code:

   __asm {
        mov esi, Source
        mov edi, Destination
        mov ecx, Count
        shr ecx, 4                       // Count is in bytes (one per pixel); loop once per 16-pixel block
        movdqu xmm0, xmmword ptr [esi]
        movdqa xmm4, xmmword ptr [Masks + 0]
        movdqa xmm5, xmmword ptr [Masks + 16]
        movdqa xmm6, xmmword ptr [Masks + 32]
        movdqa xmm7, xmmword ptr [Masks + 48]
l1:
        dec ecx
        lea edi, [ edi + 64 ]
        lea esi, [ esi + 16 ]
        movdqa xmm1, xmm0
        movdqa xmm2, xmm0
        movdqa xmm3, xmm0
        pshufb xmm0, xmm4
        movdqa [edi - 64], xmm0
        pshufb xmm1, xmm5
        movdqa [edi - 48], xmm1
        pshufb xmm2, xmm6
        movdqa [edi - 32], xmm2
        pshufb xmm3, xmm7
        movdqa [edi - 16], xmm3
        movdqu xmm0, xmmword ptr [esi]   // note: reads 16 bytes past the end of Source on the final iteration
        jnz l1                           // dec sets ZF but not CF, so jnz (not ja) is the correct loop branch
    }

Update 5: This conversion algorithm uses punpck instructions. However, this routine is a bit slower than using masks and pshufb.

for (unsigned int i = 0; i < Count; i += 16) {
    // Unaligned load (Source may not be 16-byte aligned), then interleave each
    // register with itself twice: the first unpack doubles every byte, the
    // second quadruples it, giving B = G = R = A = grey for each pixel.
    register __m128i r0 = _mm_loadu_si128(Source++);
    register __m128i r1 = _mm_unpackhi_epi8(r0, r0);
    register __m128i r2 = _mm_unpacklo_epi8(r0, r0);
    register __m128i r3 = _mm_unpackhi_epi8(r1, r1);
    register __m128i r4 = _mm_unpacklo_epi8(r1, r1);
    register __m128i r5 = _mm_unpackhi_epi8(r2, r2);
    register __m128i r6 = _mm_unpacklo_epi8(r2, r2);

    _mm_store_si128(Destination++, r6);
    _mm_store_si128(Destination++, r5);
    _mm_store_si128(Destination++, r4);
    _mm_store_si128(Destination++, r3);
}

Update 6: For the sake of completeness, this is the inverse method to convert a 32-bit image back to an 8-bit grey image.

static void ConvertRgb32ToGrey(const __m128i* Source, __m128i* Destination, unsigned int Count) {
    // Each mask picks the blue byte (byte 0; B = G = R for a grey image) of the
    // four pixels in one source vector and places the result in a different
    // dword of the output, so the three ORs below merge all 16 grey bytes.
    static const unsigned char __declspec(align(64)) Masks[] = {
        0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c,
    };
    
    register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0));
    register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 16));
    register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 32));
    register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 48));
    
    for (unsigned int i = 0; i < Count / 64; i++) {   // Count = byte count of the 32-bit source image
        __m128i a = _mm_load_si128(Source + (i * 4) + 0);
        __m128i b = _mm_load_si128(Source + (i * 4) + 1);
        __m128i c = _mm_load_si128(Source + (i * 4) + 2);
        __m128i d = _mm_load_si128(Source + (i * 4) + 3);

        a = _mm_shuffle_epi8(a, m0);
        b = _mm_shuffle_epi8(b, m1);
        c = _mm_shuffle_epi8(c, m2);
        d = _mm_shuffle_epi8(d, m3);

        __m128i e = _mm_or_si128(a, b);
        __m128i f = _mm_or_si128(c, d);
        __m128i g = _mm_or_si128(e, f);

        _mm_stream_si128(Destination + i, g);

    }
}
  • Ok, I forgot to mention. My C/C++ code was by far slower, even with unrolled loops (4). In my C implementation I loaded 4 bytes at once from my grey image and stored a sequence of 4 * 4 bytes. – bkausbk Aug 17 '12 at 12:33
  • @Hans Passant: while I agree that this isn't necessarily a prime use case for assembly, I'd advocate _against_ (too much) "simplify". Write it in C, yes, but use SSE intrinsics. This code (even the assembly) already _is_ simple. – FrankH. Aug 17 '12 at 13:02
  • RE your SSE intrinsics postscript: Are you compiling with all optimisations turned on? – huon Aug 17 '12 at 13:16
  • Yes, I explicitly tested with all optimizations on, including the /arch:SSE2 switch, on the Microsoft Visual Studio 2008 compiler. But maybe the intrinsic calls were badly ordered. – bkausbk Aug 17 '12 at 13:20
  • VS2008 is notoriously bad at generating code for intrinsics. Dumb moves all over the place. Try VS2010, it should be a bit better. – harold Aug 17 '12 at 13:44
  • OK, I tested: the VS2012-generated code is really better, almost perfect. – bkausbk Aug 17 '12 at 14:24
  • You could use `movntdq` for storing the image, depending on the size. If it is bigger than the L2 or L3 cache this will speed up the routine. Otherwise there might be a slight slowdown of the conversion but a slight speedup of the rest of the program, which may or may not be worth it. – Gunther Piez Aug 17 '12 at 15:50
  • The Visual Studio 2012 generated intrinsic code is even 0.5% faster than the hand-optimized one (both using `movdqu`), at least on my system with an Intel i7 2600K CPU. – bkausbk Aug 17 '12 at 18:18
  • Could you post what VS2012 generated? – harold Aug 17 '12 at 18:19
  • @harold: I updated my question; it seems that interleaving `movdqa` with `pshufb` is somewhat faster. – bkausbk Aug 17 '12 at 18:38
  • This kind of thing is pretty much a perfect task for the GPU. If you *just* need to perform this one conversion, it probably wouldn't buy you much, but if you need to do more processing before/after the conversion, uploading the image to the GPU and doing the conversion there would likely be *a lot* faster than anything you can do with SSE – jalf Aug 17 '12 at 18:38

1 Answer


Would try:

    __asm {
        mov esi, Source
        mov edi, Destination
        mov ecx, Count
        shr ecx, 4     // Count is in bytes (one per pixel); loop once per 16-pixel block
        movdqu xmm0, xmmword ptr [esi]
        movdqa xmm4, xmmword ptr [Masks + 0]
        movdqa xmm5, xmmword ptr [Masks + 16]
        movdqa xmm6, xmmword ptr [Masks + 32]
        movdqa xmm7, xmmword ptr [Masks + 48]
l1:
        dec ecx        // modern Intel can macro-fuse this with jnz if adjacent
        lea edi, [ edi + 64 ]
        lea esi, [ esi + 16 ]
        movdqa xmm1, xmm0
        movdqa xmm2, xmm0
        movdqa xmm3, xmm0
        pshufb xmm0, xmm4
        pshufb xmm1, xmm5
        pshufb xmm2, xmm6
        pshufb xmm3, xmm7
        movntdq [edi - 64], xmm0
        movntdq [edi - 48], xmm1
        movntdq [edi - 32], xmm2
        movntdq [edi - 16], xmm3
        movdqu xmm0, xmmword ptr [esi]   // note: reads 16 bytes past the end of Source on the final iteration
        jnz l1
    }

Haven't benchmarked it though; assumptions behind these changes:

  1. the movdqu xmm0,... load latency can be hidden a little better within the loop (your code has the load of xmm0 followed directly by an instruction that uses the value in that register)
  2. the add ops on two regs as well as the cmp aren't really necessary; address generation (lea) and the implicit zero test from dec/jnz can be used instead. That way, there are no EFLAGS dependencies from the pointer updates on esi/edi, as the only flag-setting ALU op in the loop is the decrement of the loop counter.

In the end, this is likely load/store bound in any case, so the arithmetic is "free game"; I therefore expect little difference, even with the arguments as given.

If the input is large, it'd make sense to strip the "unaligned head/tail" off, i.e. handle the first/last [0..15] bytes separately (Duff's-device style) and run the main loop with movdqa; a sketch follows.
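
Roughly like this, as an untested sketch (intrinsics rather than inline asm, plain scalar loops instead of an actual Duff's device; the function and variable names are mine). Note that once the scalar head has been peeled off, the 16-byte destination stores are in general only 4-byte aligned, so this version uses ordinary unaligned stores instead of movntdq:

#include <stddef.h>      // size_t
#include <emmintrin.h>   // SSE2
#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8

static void ConvertGreyToRgb32Peeled(const unsigned char* Source, unsigned char* Destination, unsigned int Count) {
    static const unsigned int __declspec(align(16)) Masks[16] = {
        0x80000000, 0x80010101, 0x80020202, 0x80030303,
        0x80040404, 0x80050505, 0x80060606, 0x80070707,
        0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
        0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
    };

    unsigned int i = 0;

    // Scalar head: advance until Source is 16-byte aligned (at most 15 pixels).
    unsigned int head = (unsigned int) ((16 - ((size_t) Source & 15)) & 15);
    if (head > Count) head = Count;
    for (; i < head; i++) {
        unsigned char g = Source[i];
        Destination[4 * i + 0] = g;
        Destination[4 * i + 1] = g;
        Destination[4 * i + 2] = g;
        Destination[4 * i + 3] = 0;
    }

    __m128i m0 = _mm_load_si128((const __m128i*) (Masks + 0));
    __m128i m1 = _mm_load_si128((const __m128i*) (Masks + 4));
    __m128i m2 = _mm_load_si128((const __m128i*) (Masks + 8));
    __m128i m3 = _mm_load_si128((const __m128i*) (Masks + 12));

    // Main loop: aligned loads (movdqa), 16 pixels in and 64 bytes out per iteration.
    for (; i + 16 <= Count; i += 16) {
        __m128i r0 = _mm_load_si128((const __m128i*) (Source + i));
        _mm_storeu_si128((__m128i*) (Destination + 4 * i + 0),  _mm_shuffle_epi8(r0, m0));
        _mm_storeu_si128((__m128i*) (Destination + 4 * i + 16), _mm_shuffle_epi8(r0, m1));
        _mm_storeu_si128((__m128i*) (Destination + 4 * i + 32), _mm_shuffle_epi8(r0, m2));
        _mm_storeu_si128((__m128i*) (Destination + 4 * i + 48), _mm_shuffle_epi8(r0, m3));
    }

    // Scalar tail: the remaining 0..15 pixels.
    for (; i < Count; i++) {
        unsigned char g = Source[i];
        Destination[4 * i + 0] = g;
        Destination[4 * i + 1] = g;
        Destination[4 * i + 2] = g;
        Destination[4 * i + 3] = 0;
    }
}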

Edit:

Running your intrinsics sources through gcc -msse4.2 -O8 -c (GCC 4.7.1) gives the following assembly:

Disassembly of section .text:

0000000000000000 <ConvertGreyToRgb32Assembler>:
   0:   85 d2                   test   edx,edx
   2:   74 76                   je     7a <ConvertGreyToRgb32Assembler+0x7a>
   4:   66 0f 6f 2d 00 00 00 00         movdqa xmm5,XMMWORD PTR [rip+0x0]
        # c <ConvertGreyToRgb32Assembler+0xc>
   c:   48 89 f8                mov    rax,rdi
   f:   66 0f 6f 25 00 00 00 00         movdqa xmm4,XMMWORD PTR [rip+0x0]
        # 17 <ConvertGreyToRgb32Assembler+0x17>
  17:   66 0f 6f 1d 00 00 00 00         movdqa xmm3,XMMWORD PTR [rip+0x0]
        # 1f <ConvertGreyToRgb32Assembler+0x1f>
  1f:   66 0f 6f 15 00 00 00 00         movdqa xmm2,XMMWORD PTR [rip+0x0]
        # 27 <ConvertGreyToRgb32Assembler+0x27>
  27:   66 0f 1f 84 00 00 00 00 00      nop    WORD PTR [rax+rax*1+0x0]
  30:   f3 0f 6f 00             movdqu xmm0,XMMWORD PTR [rax]
  34:   48 89 f1                mov    rcx,rsi
  37:   48 83 c0 10             add    rax,0x10
  3b:   66 0f 6f c8             movdqa xmm1,xmm0
  3f:   66 0f 38 00 cd          pshufb xmm1,xmm5
  44:   66 0f e7 0e             movntdq XMMWORD PTR [rsi],xmm1
  48:   66 0f 6f c8             movdqa xmm1,xmm0
  4c:   66 0f 38 00 cc          pshufb xmm1,xmm4
  51:   66 0f e7 4e 10          movntdq XMMWORD PTR [rsi+0x10],xmm1
  56:   66 0f 6f c8             movdqa xmm1,xmm0
  5a:   66 0f 38 00 c2          pshufb xmm0,xmm2
  5f:   66 0f 38 00 cb          pshufb xmm1,xmm3
  64:   66 0f e7 4e 20          movntdq XMMWORD PTR [rsi+0x20],xmm1
  69:   66 0f e7 41 30          movntdq XMMWORD PTR [rcx+0x30],xmm0
  6e:   89 c1                   mov    ecx,eax
  70:   29 f9                   sub    ecx,edi
  72:   48 83 c6 40             add    rsi,0x40
  76:   39 ca                   cmp    edx,ecx
  78:   77 b6                   ja     30 <ConvertGreyToRgb32Assembler+0x30>
  7a:   f3 c3                   repz ret

This reminds me extremely strongly of your initial assembly code. If MSVC creates something significantly worse than that, I'd say it's a bug/limitation in the compiler (version) you used.

  • Benchmark results were equal, at least on my system with a Pentium Dual-Core E6700. I tested with a stream of 120 images/s. But you are right, with 1 and 2 it should be slightly faster. Would it make sense to use a prefetch instruction somewhere? – bkausbk Aug 17 '12 at 12:54
  • you could try a prefetch right at entry to the loop, but I don't think it'll make a difference; on most post-P4 CPUs, prefetch is pretty much a NOP / automatic anyway, http://stackoverflow.com/questions/10323420/prefetching-data-to-cache-for-x86-64 – FrankH. Aug 17 '12 at 14:39
  • I've tested the difference between `lddqu` and `movdqu` on my home system (Intel i7 2600K). The first one was about 1.2% slower (66 million cycles vs. 65 million cycles, averaged over 300 * 100 iterations). Both used the `movdqa` instruction to store data, which was faster than `movntdq` in this case, which took about 93 million cycles. – bkausbk Aug 17 '12 at 18:10
  • Intel since Nehalem does macro-fusion in 64-bit mode for `dec/jnz` if they're adjacent. (DEC doesn't modify CF, so `dec / ja` either exits right away or loops forever.) You also don't need LEA. Even on older CPUs, out-of-order exec can run the loop overhead faster than the SIMD work, so dec/jnz wouldn't be a bottleneck. Before Sandybridge, it might have been a win to schedule smaller instructions between some of the longer SIMD instructions, to help the decoders. – Peter Cordes May 17 '22 at 14:22
  • @PeterCordes I guess it's fair to say the answer is a decade old. I'd trust the compiler even more today :-) but also, today, for the problem asked (RGB8 to RGB32), I'd probably try broadcast instructions. Not that CPUs have stood still. – FrankH. May 18 '22 at 15:52
  • Correction to my previous comment; Nehalem only fuses `cmp/jcc` or `test/jcc`. It was Sandybridge that fuses `dec/jcc`. Nehalem was out in late 2008, SnB in Jan 2011. My point was that even in 2012, your code wasn't optimal on Intel CPUs that were then-recent. And for pre-sandybridge, a `cmp/jcc` on one of the pointers would save a uop in the loop vs. a `dec/jcc`. – Peter Cordes May 18 '22 at 16:09
  • And at the very least you should fix the correctness bug: `ja` depends on CF, which `dec` doesn't set. An answer being old is no reason to leave it wrong or sub-optimal; the pixel-format conversion problem is still something potentially relevant. – Peter Cordes May 18 '22 at 16:10
  • A separate answer with AVX2 intrinsics might make sense at this point. (With 16-byte vectors, `pshufb` is probably still the best strategy; with 32-byte vectors yeah maybe cheap `vpbroadcastq` loads to set up for in-lane `vpshufb`. `vpmovzxbd` would still need another shuffle inside each dword.) – Peter Cordes May 18 '22 at 16:15
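
For illustration, a rough, untested sketch of the vpbroadcastq + in-lane vpshufb idea from the comment above (AVX2, so outside the question's original x86-32 SSE2/SSSE3 constraint; the function name and layout are placeholders). Each iteration broadcasts 8 grey pixels into both 128-bit lanes and expands them to 32 bytes of BGRA:

#include <immintrin.h>   // AVX2

static void ConvertGreyToRgb32Avx2(const unsigned char* Source, unsigned char* Destination, unsigned int Count) {
    // In-lane shuffle control: lane 0 expands source bytes 0..3, lane 1 expands bytes 4..7;
    // -128 (0x80) zeroes the alpha byte.
    const __m256i mask = _mm256_setr_epi8(
        0, 0, 0, -128,  1, 1, 1, -128,  2, 2, 2, -128,  3, 3, 3, -128,
        4, 4, 4, -128,  5, 5, 5, -128,  6, 6, 6, -128,  7, 7, 7, -128);

    for (unsigned int i = 0; i < Count; i += 8) {
        __m128i q = _mm_loadl_epi64((const __m128i*) (Source + i));        // 8 grey pixels
        __m256i v = _mm256_shuffle_epi8(_mm256_broadcastq_epi64(q), mask); // broadcast to both lanes, expand in-lane
        _mm256_storeu_si256((__m256i*) (Destination + 4 * i), v);          // 32 bytes of BGRA
    }
}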