packing 10 bit values into a byte stream with SIMD

Question

I'm trying to packing 10 bit pixels in to a continuous byte stream, using SIMD instructions. The code below does it "in principle" but the SIMD version is slower than the scalar version.

The problem seem to be that I can't find good gather/scatter operations that load the register efficiently.

Any suggestions for improvement?

// SIMD_test.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"

#include "Windows.h"
#include <tmmintrin.h>
#include <stdint.h>
#include <string.h>

// reference non-SIMD implementation that "works"
// 4 uint16 at a time as input, and 5 uint8 as output per loop iteration

void packSlow(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    for(uint32_t j=0;j<NCOL;j+=4)
    {
        streamBuffer[0] = (uint8_t)(ptr[0]);
        streamBuffer[1] = (uint8_t)(((ptr[0]&0x3FF)>>8) | ((ptr[1]&0x3F) <<2));
        streamBuffer[2] = (uint8_t)(((ptr[1]&0x3FF)>>6) | ((ptr[2]&0x0F) <<4));
        streamBuffer[3] = (uint8_t)(((ptr[2]&0x3FF)>>4) | ((ptr[3]&0x03) <<6));
        streamBuffer[4] = (uint8_t)((ptr[3]&0x3FF)>>2) ;
        streamBuffer += 5;
        ptr += 4;
    }
}


// poorly written SIMD implementation. Attempts to do the same
// as the packSlow, but 8 iterations at a time

void packFast(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const __m128i maska = _mm_set_epi16(0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF);
    const __m128i maskb = _mm_set_epi16(0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F);
    const __m128i maskc = _mm_set_epi16(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F);
    const __m128i maskd = _mm_set_epi16(0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03);

    for(uint32_t j=0;j<NCOL;j+=4*8)
    {
        _mm_prefetch((const char*)(ptr+j),_MM_HINT_T0);
    }

    for(uint32_t j=0;j<NCOL;j+=4*8)
    {
        // this "fetch" stage is costly. Each term takes 2 cycles
        __m128i ptr0 = _mm_set_epi16(ptr[0],ptr[4],ptr[8],ptr[12],ptr[16],ptr[20],ptr[24],ptr[28]);
        __m128i ptr1 = _mm_set_epi16(ptr[1],ptr[5],ptr[9],ptr[13],ptr[17],ptr[21],ptr[25],ptr[29]);
        __m128i ptr2 = _mm_set_epi16(ptr[2],ptr[6],ptr[10],ptr[14],ptr[18],ptr[22],ptr[26],ptr[30]);
        __m128i ptr3 = _mm_set_epi16(ptr[3],ptr[7],ptr[11],ptr[15],ptr[19],ptr[23],ptr[27],ptr[31]);

           // I think this part is fairly well optimized
        __m128i streamBuffer0 =  ptr0;
        __m128i streamBuffer1 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr0 , maska), _mm_set_epi32(0, 0, 0,8)) , _mm_sll_epi16 (_mm_and_si128 (ptr1 , maskb) , _mm_set_epi32(0, 0, 0,2)));
        __m128i streamBuffer2 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr1 , maska), _mm_set_epi32(0, 0, 0,6)) , _mm_sll_epi16 (_mm_and_si128 (ptr2 , maskc) , _mm_set_epi32(0, 0, 0,4)));
        __m128i streamBuffer3 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr2 , maska), _mm_set_epi32(0, 0, 0,4)) , _mm_sll_epi16 (_mm_and_si128 (ptr3 , maskd) , _mm_set_epi32(0, 0, 0,6)));
        __m128i streamBuffer4 = _mm_srl_epi16 (_mm_and_si128 (ptr3 , maska), _mm_set_epi32(0, 0, 0,2)) ;

        // this again is terribly slow. ~2 cycles per byte output
        for(int j=15;j>=0;j-=2)
        {
            streamBuffer[0] = streamBuffer0.m128i_u8[j];
            streamBuffer[1] = streamBuffer1.m128i_u8[j];
            streamBuffer[2] = streamBuffer2.m128i_u8[j];
            streamBuffer[3] = streamBuffer3.m128i_u8[j];
            streamBuffer[4] = streamBuffer4.m128i_u8[j];
            streamBuffer += 5;
        }
        ptr += 32;
    }

}

int _tmain(int argc, _TCHAR* argv[])
{

    uint16_t pixels[512];
    uint8_t packed1[512*10/8];
    uint8_t packed2[512*10/8];

    for(int i=0;i<512;i++)
    {
        pixels[i] = i;
    }

    LARGE_INTEGER t0,t1,t2;

    QueryPerformanceCounter(&t0);
    for(int k=0;k<1000;k++) packSlow(pixels,packed1,512);
    QueryPerformanceCounter(&t1);
    for(int k=0;k<1000;k++) packFast(pixels,packed2,512);
    QueryPerformanceCounter(&t2);

    printf("%d %d\n",t1.QuadPart-t0.QuadPart,t2.QuadPart-t1.QuadPart);

    if (memcmp(packed1,packed2,sizeof(packed1)))
    {
        printf("failed\n");
    }


    return 0;
}

This seems like more of a code review question http://codereview.stackexchange.com/ — Dave, May 14 '14 at 20:04
Yep, this is going to perform really poorly. You need to keep the entire loop SIMD, not just a tiny portion of it. — Cory Nelson, May 14 '14 at 21:01
@CoryNelson - yes it is slow. Do you have a suggestion on how to rewrite the non SIMD portions as SIMD? Is there an easy way to do gather/scatter loads efficiently using the SSE instructions? — Mark Lakata, May 14 '14 at 21:32
If nobody has answered by the time I get home from work I may take a crack at it. This has a lot of room for improvement. General advice: avoid loading in with `_mm_set_epi16`, and don't access the `__m128i` members. — Cory Nelson, May 14 '14 at 22:14

Jeff · Answer 1 · 2014-05-15T01:04:08.000

On re-reading your code, it looks like you are almost definitely murdering your load/store unit, which wouldn't even get complete relief with the new AVX2 VGATHER[D/Q]P[D/S] instruction family. Even Haswell's architecture still requires a uop per load element, each hitting the L1D TLB and cache, regardless of locality, with efficiency improvements showing in Skylake ca. 2016 at earliest.

Your best recourse at present is probably to do 16B register reads and manually construct your streamBuffer values with register copies, _mm_shuffle_epi8(), and _mm_or_si128() calls, and the inverse for the finishing stores.

In the near future, AVX2 will provide (and does for newer desktops already) VPS[LL/RL/RA]V[D/Q] instructions that allow variable element shifting that, combined with a horizontal add, could do this packing pretty quickly. In this case, you could use simple MOVDQU instructions for loading your values, since you could process contiguous uint16_t input values in a single xmm register.

Also, consider reworking your prefetching. Your j in NCOL loop is processing 64B/1 cache line at a time, so you should probably do a single prefetch for ptr + 32 at the beginning of your second loop's body. You might even consider omitting it, since it's a simple forward scan that the hardware prefetcher will detect and automate for you after a very small number of iterations anyway.

alexander · Answer 2 · 2014-05-18T10:40:22.923

I have no experience specifically in SSE. But I would have tried to optimize the code as follows.

// warning. This routine requires streamBuffer to have at least 3 extra spare bytes
// at the end to be used as scratch space. It will write 0's to those bytes.
// for example, streamBuffer needs to be 640+3 bytes of allocated memory if
// 512 10-bit samples are output.

void packSlow1(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    for(uint32_t j=0;j<NCOL;j+=4*4)
    {
        uint64_t *dst;
        uint64_t src[4][4];

        // __m128i s01 = _mm_set_epi64(ptr[0], ptr[1]);
        // __m128i s23 = _mm_set_epi64(ptr[2], ptr[3]);
        // ---- or ----
        // __m128i s0123 = _mm_load_si128(ptr[0])
        // __m128i s01   = _?????_(s0123) // some instruction to extract s01 from s0123
        // __m128i s23   = _?????_(s0123) // some instruction to extract s23

        src[0][0] = ptr[0] & 0x3ff;
        src[0][1] = ptr[1] & 0x3ff;
        src[0][2] = ptr[2] & 0x3ff;
        src[0][3] = ptr[3] & 0x3ff;

        src[1][0] = ptr[4] & 0x3ff;
        src[1][1] = ptr[5] & 0x3ff;
        src[1][2] = ptr[6] & 0x3ff;
        src[1][3] = ptr[7] & 0x3ff;

        src[2][0] = ptr[8] & 0x3ff;
        src[2][1] = ptr[9] & 0x3ff;
        src[2][2] = ptr[10] & 0x3ff;
        src[2][3] = ptr[11] & 0x3ff;

        src[3][0] = ptr[12] & 0x3ff;
        src[3][1] = ptr[13] & 0x3ff;
        src[3][2] = ptr[14] & 0x3ff;
        src[3][3] = ptr[15] & 0x3ff;

        // looks like _mm_maskmoveu_si128 can store result efficiently
        dst = (uint64_t*)streamBuffer;
        dst[0] = src[0][0] | (src[0][1] << 10) | (src[0][2] << 20) | (src[0][3] << 30);

        dst = (uint64_t*)(streamBuffer + 5);
        dst[0] = src[1][0] | (src[1][1] << 10) | (src[1][2] << 20) | (src[1][3] << 30);

        dst = (uint64_t*)(streamBuffer + 10);
        dst[0] = src[2][0] | (src[2][1] << 10) | (src[2][2] << 20) | (src[2][3] << 30);

        dst = (uint64_t*)(streamBuffer + 15);
        dst[0] = src[3][0] | (src[3][1] << 10) | (src[3][2] << 20) | (src[3][3] << 30);

        streamBuffer += 5 * 4;
        ptr += 4 * 4;
    }
}

UPDATE:

Benchmarks:

Ubuntu 12.04, x86_64 GNU/Linux, gcc v4.6.3 (Virtual Box)
Intel Core i7 (Macbook pro)
compiled with -O3

5717633386 (1X):   packSlow
3868744491 (1.4X): packSlow1 (version from the post)
4471858853 (1.2X): packFast2 (from Mark Lakata's post)
1820784764 (3.1X): packFast3 (version from the post)

Windows 8.1, x64, VS2012 Express
Intel Core i5 (Asus)
compiled with standard 'Release' options and SSE2 enabled

00413185 (1X)   packSlow
00782005 (0.5X) packSlow1
00236639 (1.7X) packFast2
00148906 (2.8X) packFast3

I see completely different results on Asus notebook with Windows 8.1 and VS Express 2012 (code compiled with -O2). packSlow1 is 2x slower than original packSlow, while packFast2 is 1.7X (not 2.9X) faster than packSlow. After researching this problem, I understood the reason. VC compiler was unable to save all the constants into XMMS registers for packFast2 , so it inserted additional memory accesses into the loop (see generated assembly). Slow memory access explains performance degradation.

In order to get more stable results I increased pixels buffer to 256x512 and increased loop counter from 1000 to 10000000/256.

Here is my version of SSE optimized function.

// warning. This routine requires streamBuffer to have at least 3 extra spare bytes
// at the end to be used as scratch space. It will write 0's to those bytes.
// for example, streamBuffer needs to be 640+3 bytes of allocated memory if
// 512 10-bit samples are output.

void packFast3(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const __m128i m0 = _mm_set_epi16(0, 0x3FF, 0, 0x3FF, 0, 0x3FF, 0, 0x3FF);
    const __m128i m1 = _mm_set_epi16(0x3FF, 0, 0x3FF, 0, 0x3FF, 0, 0x3FF, 0);
    const __m128i m2 = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF);
    const __m128i m3 = _mm_set_epi32(0xFFFFFFFF, 0, 0xFFFFFFFF, 0);
    const __m128i m4 = _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
    const __m128i m5 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
    __m128i s0, t0, r0, x0, x1;

    // unrolled and normal loop gives the same result
    for(uint32_t j=0;j<NCOL;j+=8)
    {
        // load 8 samples into s0
        s0 = _mm_loadu_si128((__m128i*)ptr);            // s0=00070006_00050004_00030002_00010000

        // join 16-bit samples into 32-bit words
        x0 = _mm_and_si128(s0, m0);                     // x0=00000006_00000004_00000002_00000000
        x1 = _mm_and_si128(s0, m1);                     // x1=00070000_00050000_00030000_00010000
        t0 = _mm_or_si128(x0, _mm_srli_epi32(x1, 6));   // t0=00001c06_00001404_00000c02_00000400

        // join 32-bit words into 64-bit dwords
        x0 = _mm_and_si128(t0, m2);                     // x0=00000000_00001404_00000000_00000400
        x1 = _mm_and_si128(t0, m3);                     // x1=00001c06_00000000_00000c02_00000000
        t0 = _mm_or_si128(x0, _mm_srli_epi64(x1, 12));  // t0=00000001_c0601404_00000000_c0200400

        // join 64-bit dwords
        x0 = _mm_and_si128(t0, m4);                     // x0=00000000_00000000_00000000_c0200400
        x1 = _mm_and_si128(t0, m5);                     // x1=00000001_c0601404_00000000_00000000
        r0 = _mm_or_si128(x0, _mm_srli_si128(x1, 3));   // r0=00000000_000001c0_60140400_c0200400

        // and store result
        _mm_storeu_si128((__m128i*)streamBuffer, r0);

        streamBuffer += 10;
        ptr += 8;
    }
}

Thanks. This runs a lot faster than my naive code, 275 ticks compared to 751 (2.7x faster). — Mark Lakata, May 14 '14 at 23:46
Except that there is a "bug" or unintended side effect. There is a buffer overflow when you write the last `dst[0]` at `streamBuffer+15`. You want to only write 5 bytes (40 bits) but you write 8 bytes, so the last 3 bytes over write part of the stack, and in my case, clobbered a different variable. I suppose the work around is to make sure that there are at least 3 bytes of "padding" at the end of streamBuffer. — Mark Lakata, May 14 '14 at 23:59
@Mark Lakata, i think that the code can run even 2x faster, if it is rewritten using SIMD. I'll try to play with it. — alexander, May 15 '14 at 08:51
I'm not sure what changed, but for some reason, today the speed improvement is different. 368 ticks vs 213 ticks (1.7X faster). I do have an improvement with SIMDs, although not as much as I would have liked. — Mark Lakata, May 16 '14 at 18:46
in your comments, you mention that `_mm_maskmoveu_si128 ` can be used efficiently, but I found that `_mm_storeu_si128` was significantly faster. In this case, I am doing the same trick as you, using a 64 bit register to write 40 bits, except I'm using 128 bit register to write 80 bits, so the last 58 bits constantly get overwritten, which is ok except that the output buffer must have an extra 8 bytes to avoid overflow. — Mark Lakata, May 16 '14 at 18:48
@Mark Lakata, it needs to increase data buffer size during checks in order to not measure function prologs and epilogs. Also unrolling loop doesn't give benefits. Looks like SSE unrolled loop doesn't run in parallel in hardware in contrast to DSP unrolled loop. — alexander, May 17 '14 at 22:30
The new packFast3 is fastest, about 1.6X faster than packFast2. Note that it over writes 48 (e.g. 128 - 80) bits to the end of the buffer, so the output buffer needs some post padding to avoid data corruption. — Mark Lakata, May 20 '14 at 00:12

Mark Lakata · Answer 3 · 2014-05-16T20:54:37.453

I came up with a "better" solution using SIMD, but it doesn't not leverage parallelization, just more efficient loads and stores (I think).

I'm posting it here for reference, not necessarily the best answer.

The benchmarks are (in arbitrary ticks)

 gcc4.8.1 -O3    VS2012 /O2      Implementation
 ----------------------------------------- 
 369 (1X)        3394 (1X)       packSlow (original code)
 212 (1.7X)      2010 (1.7X)     packSlow (from @alexander)
 147 (2.5X)      1178 (2.9X)     packFast2 (below)

Here's the code. Essentially @alexander's code except using 128 bit registers instead of 64 bit registers, and unrolled 2x instead of 4x.

void packFast2(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const __m128i maska = _mm_set_epi16(0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF);
    const __m128i mask0 = _mm_set_epi16(0,0,0,0,0,0,0,0x3FF);
    const __m128i mask1 = _mm_set_epi16(0,0,0,0,0,0,0x3FF,0);
    const __m128i mask2 = _mm_set_epi16(0,0,0,0,0,0x3FF,0,0);
    const __m128i mask3 = _mm_set_epi16(0,0,0,0,0x3FF,0,0,0);
    const __m128i mask4 = _mm_set_epi16(0,0,0,0x3FF,0,0,0,0);
    const __m128i mask5 = _mm_set_epi16(0,0,0x3FF,0,0,0,0,0);
    const __m128i mask6 = _mm_set_epi16(0,0x3FF,0,0,0,0,0,0);
    const __m128i mask7 = _mm_set_epi16(0x3FF,0,0,0,0,0,0,0);

    for(uint32_t j=0;j<NCOL;j+=16)
    {
        __m128i s = _mm_load_si128((__m128i*)ptr); // load 8 16 bit values
        __m128i s2 = _mm_load_si128((__m128i*)(ptr+8)); // load 8 16 bit values

        __m128i a = _mm_and_si128(s,mask0);
        a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask1),6));
        a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask2),12));
        a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask3),18));
        a = _mm_or_si128( a, _mm_srli_si128 (_mm_and_si128(s, mask4),24/8)); // special shift 24 bits to the right, staddling the middle. luckily use just on 128 byte shift (24/8)
        a = _mm_or_si128( a, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s, mask5),6),24/8)); // special. shift net 30 bits. first shift 6 bits, then 3 bytes.
        a = _mm_or_si128( a, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s, mask6),4),32/8)); // special. shift net 36 bits. first shift 4 bits, then 4 bytes (32 bits).
        a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask7),42));

        _mm_storeu_si128((__m128i*)streamBuffer, a);

        __m128i a2 = _mm_and_si128(s2,mask0);
        a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask1),6));
        a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask2),12));
        a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask3),18));
        a2 = _mm_or_si128( a2, _mm_srli_si128 (_mm_and_si128(s2, mask4),24/8)); // special shift 24 bits to the right, staddling the middle. luckily use just on 128 byte shift (24/8)
        a2 = _mm_or_si128( a2, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s2, mask5),6),24/8)); // special. shift net 30 bits. first shift 6 bits, then 3 bytes.
        a2 = _mm_or_si128( a2, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s2, mask6),4),32/8)); // special. shift net 36 bits. first shift 4 bits, then 4 bytes (32 bits).
        a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask7),42));

        _mm_storeu_si128((__m128i*)(streamBuffer+10), a2);

        streamBuffer += 20 ;
        ptr += 16 ;
    }
}

btw, it is annoying that `_mm_srli_si128` shifts in *bytes* instead of *bits*. That explains the silliness I have with mask5,mask6 and mask 7 — Mark Lakata, May 16 '14 at 19:04

packing 10 bit values into a byte stream with SIMD

3 Answers3

Linked