
I'm working on an RGBA32 buffer (8 bits per component), and I need to multiply each component by a constant, then add the results of the multiplications together, like so:

Result = r*x + g*y + b*z + a*w (the dot product of the two vectors rgba and xyzw)

I'm trying to use Intel SSE intrinsics to speed up the process, but I don't know how to do such a thing without shuffling the input.

Is there any way to do this? Like building a register containing {x,y,z,w,x,y,z,w,x,y,z,w,x,y,z,w} and performing an 8-bit multiply with saturation?

The final objective is to multiply the RGBA vector by the corresponding color conversion matrix:

[ 66 129  25 0]   [R]
[-38 -74 112 0] * [G]
[112 -94 -18 0]   [B]
[0     0   0 0]   [A]
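For reference, here is a plain scalar version of the per-pixel computation (a minimal sketch: the +128 rounding, >>8 scale, and clipping come from the usual integer RGB-to-YUV formula discussed in the comments below, and dot_row is a hypothetical helper name):

/* Scalar reference for one matrix row.
   k = one row of the matrix above; offset = 16 for Y, 128 for U/V
   (per the usual formula). */
static unsigned char dot_row(const short k[4], unsigned char r, unsigned char g,
                             unsigned char b, unsigned char a, int offset)
{
    int v = ((k[0]*r + k[1]*g + k[2]*b + k[3]*a + 128) >> 8) + offset;
    return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}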

Thanks

Edit 1: here's the final function, using floating-point arithmetic for better color precision, which converts an RGBA image to YUV444 using SSE. The function takes between 1.9 and 3.5 ms to convert a full-HD image on an Intel i5-3570K, using only one thread (it's really easy to thread this function, and doing so could yield significant performance improvements):

void SSE_rgba2YUV444_FP(char* a, char* y, char* u, char* v)
{
    __m128i mask = _mm_setr_epi8(0x00,0x04,0x08,0x0c, 0x01,0x05,0x09,0x0d, 0x02,0x06,0x0a,0x0e, 0x03,0x07,0x0b,0x0f); // Shuffle mask: each uint8 gives the byte offset of the source uint8 to pick within the __m128i being shuffled
    float m[9] = {0.299, 0.587, 0.114, -0.1687, -0.3313, 0.5, 0.5, -0.4187, -0.0813};

    __m128i row[4];
    for(int i=0; i<4; i++) {
        row[i] = _mm_loadu_si128((__m128i*)&a[16*i]);
        row[i] = _mm_shuffle_epi8(row[i],mask);
    }
    // row[i] = {rrrrggggbbbbaaaa}, all as uint8_t
    __m128i t0 = _mm_unpacklo_epi32(row[0], row[1]); //t0 = {rrrrrrrrgggggggg}
    __m128i t1 = _mm_unpacklo_epi32(row[2], row[3]); //t1 = {rrrrrrrrgggggggg}
    __m128i t2 = _mm_unpackhi_epi32(row[0], row[1]); //t2 = {bbbbbbbbaaaaaaaa}
    __m128i t3 = _mm_unpackhi_epi32(row[2], row[3]); //t3 = {bbbbbbbbaaaaaaaa}
    row[0] = _mm_unpacklo_epi64(t0, t1); // row[0] = {rrrrrrrrrrrrrrrr}
    row[1] = _mm_unpackhi_epi64(t0, t1); // etc
    row[2] = _mm_unpacklo_epi64(t2, t3);

    __m128i v_lo[3], v_hi[3];
    for(int i=0; i<3; i++) {
        v_lo[i] = _mm_unpacklo_epi8(row[i],_mm_setzero_si128()); // Interleave each row with zeros, widening the values
        v_hi[i] = _mm_unpackhi_epi8(row[i],_mm_setzero_si128()); // from 8 bits to 16 bits so we can work on them
    }

    __m128 v32_lo1[3], v32_hi1[3], v32_lo2[3], v32_hi2[3];
    for(int i=0; i<3; i++) {
        v32_lo1[i] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_lo[i],_mm_setzero_si128()));
        v32_lo2[i] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_lo[i],_mm_setzero_si128()));
        v32_hi1[i] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_hi[i],_mm_setzero_si128()));
        v32_hi2[i] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_hi[i],_mm_setzero_si128()));
    } // Now we have our RGB values as 32-bit floats

    __m128i yuv[3]; // {Y, U, V} 
    __m128 ylo1 = _mm_add_ps(_mm_mul_ps(v32_lo1[0], _mm_set1_ps(m[0])), _mm_add_ps(_mm_mul_ps(v32_lo1[1], _mm_set1_ps(m[1])), _mm_mul_ps(v32_lo1[2], _mm_set1_ps(m[2]))));
    __m128 ylo2 = _mm_add_ps(_mm_mul_ps(v32_lo2[0], _mm_set1_ps(m[0])), _mm_add_ps(_mm_mul_ps(v32_lo2[1], _mm_set1_ps(m[1])), _mm_mul_ps(v32_lo2[2], _mm_set1_ps(m[2]))));
    __m128 yhi1 = _mm_add_ps(_mm_mul_ps(v32_hi1[0], _mm_set1_ps(m[0])), _mm_add_ps(_mm_mul_ps(v32_hi1[1], _mm_set1_ps(m[1])), _mm_mul_ps(v32_hi1[2], _mm_set1_ps(m[2]))));
    __m128 yhi2 = _mm_add_ps(_mm_mul_ps(v32_hi2[0], _mm_set1_ps(m[0])), _mm_add_ps(_mm_mul_ps(v32_hi2[1], _mm_set1_ps(m[1])), _mm_mul_ps(v32_hi2[2], _mm_set1_ps(m[2]))));

    __m128i ylo1i = _mm_cvtps_epi32(ylo1);
    __m128i ylo2i = _mm_cvtps_epi32(ylo2);
    __m128i yhi1i = _mm_cvtps_epi32(yhi1);
    __m128i yhi2i = _mm_cvtps_epi32(yhi2);

    __m128i ylo = _mm_packus_epi32(ylo1i, ylo2i);
    __m128i yhi = _mm_packus_epi32(yhi1i, yhi2i);

    yuv[0] = _mm_packus_epi16(ylo, yhi);

    ylo1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_lo1[0], _mm_set1_ps(m[3])), _mm_add_ps(_mm_mul_ps(v32_lo1[1], _mm_set1_ps(m[4])), _mm_mul_ps(v32_lo1[2], _mm_set1_ps(m[5])))), _mm_set1_ps(128.0f));
    ylo2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_lo2[0], _mm_set1_ps(m[3])), _mm_add_ps(_mm_mul_ps(v32_lo2[1], _mm_set1_ps(m[4])), _mm_mul_ps(v32_lo2[2], _mm_set1_ps(m[5])))), _mm_set1_ps(128.0f));
    yhi1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_hi1[0], _mm_set1_ps(m[3])), _mm_add_ps(_mm_mul_ps(v32_hi1[1], _mm_set1_ps(m[4])), _mm_mul_ps(v32_hi1[2], _mm_set1_ps(m[5])))), _mm_set1_ps(128.0f));
    yhi2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_hi2[0], _mm_set1_ps(m[3])), _mm_add_ps(_mm_mul_ps(v32_hi2[1], _mm_set1_ps(m[4])), _mm_mul_ps(v32_hi2[2], _mm_set1_ps(m[5])))), _mm_set1_ps(128.0f));

    ylo1i = _mm_cvtps_epi32(ylo1);
    ylo2i = _mm_cvtps_epi32(ylo2);
    yhi1i = _mm_cvtps_epi32(yhi1);
    yhi2i = _mm_cvtps_epi32(yhi2);

    ylo = _mm_packus_epi32(ylo1i, ylo2i);
    yhi = _mm_packus_epi32(yhi1i, yhi2i);

    yuv[1] = _mm_packus_epi16(ylo, yhi);

    ylo1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_lo1[0], _mm_set1_ps(m[6])), _mm_add_ps(_mm_mul_ps(v32_lo1[1], _mm_set1_ps(m[7])), _mm_mul_ps(v32_lo1[2], _mm_set1_ps(m[8])))), _mm_set1_ps(128.0f));
    ylo2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_lo2[0], _mm_set1_ps(m[6])), _mm_add_ps(_mm_mul_ps(v32_lo2[1], _mm_set1_ps(m[7])), _mm_mul_ps(v32_lo2[2], _mm_set1_ps(m[8])))), _mm_set1_ps(128.0f));
    yhi1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_hi1[0], _mm_set1_ps(m[6])), _mm_add_ps(_mm_mul_ps(v32_hi1[1], _mm_set1_ps(m[7])), _mm_mul_ps(v32_hi1[2], _mm_set1_ps(m[8])))), _mm_set1_ps(128.0f));
    yhi2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_hi2[0], _mm_set1_ps(m[6])), _mm_add_ps(_mm_mul_ps(v32_hi2[1], _mm_set1_ps(m[7])), _mm_mul_ps(v32_hi2[2], _mm_set1_ps(m[8])))), _mm_set1_ps(128.0f));

    ylo1i = _mm_cvtps_epi32(ylo1);
    ylo2i = _mm_cvtps_epi32(ylo2);
    yhi1i = _mm_cvtps_epi32(yhi1);
    yhi2i = _mm_cvtps_epi32(yhi2);

    ylo = _mm_packus_epi32(ylo1i, ylo2i);
    yhi = _mm_packus_epi32(yhi1i, yhi2i);

    yuv[2] = _mm_packus_epi16(ylo, yhi);

    _mm_storeu_si128((__m128i*)y,yuv[0]);
    _mm_storeu_si128((__m128i*)u,yuv[1]);
    _mm_storeu_si128((__m128i*)v,yuv[2]);
}
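For scale, a hypothetical outer loop over a full frame might look like this (assuming a tightly packed 1920x1080 RGBA buffer, planar Y/U/V outputs, and a pixel count divisible by 16; convert_frame is not part of the original code):

/* Hypothetical driver: converts a full-HD frame 16 pixels per call. */
void convert_frame(char* rgba, char* y, char* u, char* v)
{
    const int npix = 1920 * 1080;
    for (int i = 0; i < npix; i += 16)
        SSE_rgba2YUV444_FP(rgba + 4*i, y + i, u + i, v + i);
}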
  • How many bits does `Result` occupy? I would guess 18 bits, but I am not sure. Also, are x, y, z, w all 8-bit unsigned? – anatolyg Sep 29 '14 at 09:14
  • result is 8 bits too, it's a basic RGB to YUV calculation, but in integers. All components are 8bit unsigned – Kustom666 Sep 29 '14 at 09:17
  • It's going to take several instructions to do this, but you can probably do most of it with `_mm_maddubs_epi16`, assuming you have SSSE3 as a minimum. – Paul R Sep 29 '14 at 09:38
  • I have every instruction set up to SSE4.2, but sadly no AVX. The problem with `_mm_maddubs_epi16` is that it uses signed multiplication and stores the results in 16 bits. – Kustom666 Sep 29 '14 at 09:44
  • @Kustom666: actually it does a signed-unsigned multiplication - put the pixel values in the unsigned input vector and the coefficients in the signed input vector. Afterwards you just need to sum and pack your output values. – Paul R Sep 29 '14 at 10:01
  • @PaulR So using `_mm_maddubs_epi16` I get a __m128i result containing 8 16-bit results that I have to add 2 by 2 to get my 4 final Y coefficients? That would indeed be useful. – Kustom666 Sep 29 '14 at 10:10
  • @Kustom666: yes, also note that a quick way to combine the 16 bit values is to use `_mm_madd_epi16` with a multiplier of 1. – Paul R Sep 29 '14 at 10:12
  • Ok, and in the end to yield my 4 8-bit results I'll just have to get the lower 8 bits of each 32-bit value that I retrieved from `_mm_madd_epi16`, right? – Kustom666 Sep 29 '14 at 10:19
  • @Kustom666: yes, you'll need to use a couple of `_mm_pack_xxx` instructions to reduce the 32 bit values to 8 bits. Ideally you would process 4 x 16 input points so that at this stage you have 4 x 4 x 32 bit output vectors and then you can pack these down into a single 16 x 8 bit vector. – Paul R Sep 29 '14 at 10:31
  • @PaulR Thanks! I've got most of the algorithm figured out, but my calculations need to be done on 32 bits or I'll lose data. Is there any way to get there from the 8-bit data and perform calculations on 32 bits? – Kustom666 Sep 29 '14 at 12:43
  • Oh - OK - that wasn't clear from the question, but anyway yes, if you want to work at a higher accuracy then just unpack the data to 16 bits first and use e.g. `_mm_madd_epi16` to generate 32 bit terms. 16 bit coefficients should be more than adequate given that you only have 8 bit pixel data. – Paul R Sep 29 '14 at 14:10
  • You said this is a "basic RGB to YUV calculation". Does that mean you want U and V as well? The most efficient solution depends on whether you want only one component or all three. – Z boson Sep 29 '14 at 20:16
  • Yes I do need U and V (subsampled 2x2 and interleaved, with Y in its own plane, also known as NV12). Thanks @PaulR I'll see what I can do. – Kustom666 Sep 29 '14 at 22:01
  • @Kustom666, could you update your question with the code to determine Y, U, and V? As I said the answer is different for if you only want Y vs. if you want Y, U, and V. If it's the latter case then you probably don't want to use horizontal operations such as `_mm_madd_epi16`. – Z boson Sep 30 '14 at 15:10
  • @Zboson I've updated the question with the full matrix. You'll notice that the Alpha is completely negated by the matrix, this is done on purpose as I do not need the alpha, but it's part of the data I get to work with. – Kustom666 Sep 30 '14 at 16:25
  • Ah, the old AOS vs SOA dilemma. – Cory Nelson Sep 30 '14 at 16:45
  • That's much better. I think I can come up with something which does not need horizontal operations. It might take me a few days to find time to look into this... – Z boson Sep 30 '14 at 18:48
  • One more thing. What are you doing with YUV once you calculate them? Are you writing them to a buffer? If so, how is it encoded: 32-bit (RGBA) to 24-bit (YUV)? Is it okay to process multiple pixels at once (e.g. four or eight) or do you only want YUV one at a time? – Z boson Sep 30 '14 at 19:02
  • @Zboson I'm trying to get some NV12 out of it. Which means all the Y gets written to a buffer, and Us and Vs are written to the same buffer and interleaved. Processing multiple pixels at once is actually the idea :D – Kustom666 Sep 30 '14 at 21:17
  • What exactly do you mean by "Y gets written to a buffer, and Us and Vs are written to the same buffer and interleaved"? I have an idea right now that processes eight pixels and produces three independent sets of eight bytes each: yyyyyyyy, uuuuuuuu, vvvvvvvv. Would this be useful or do you need something like yuvyuvyuv...? – Z boson Sep 30 '14 at 22:41
  • It just means that I'll have to store all Ys contiguously in memory whilst the Us and Vs will be interleaved. In the end I'll have two arrays: y[y1,y2,...,yn] and uv[u1, v1, u2, v2,...,un, vn] – Kustom666 Sep 30 '14 at 22:58
  • To be more precise. Are you using the formula for (RGB ->YUV) at http://stackoverflow.com/questions/1737726/how-to-perform-rgb-yuv-conversion-in-c-c/14697130#14697130 ? – Z boson Oct 01 '14 at 18:18
  • E.g. `Y = CLIP(66 * (R) + 129 * (G) + 25 * (B) + 128) >> 8) + 16` – Z boson Oct 01 '14 at 18:20
  • Exactly, this allows me to work on more pixels at once – Kustom666 Oct 01 '14 at 19:05
  • I changed my answer to a more efficient version which uses `_mm_maddubs_epi16`. – Z boson Oct 02 '14 at 10:14
  • @PaulR, I answered the question using many of your comments. If you have time do you think you could look it over and let me know if you have any suggestions? I don't have a lot of experience with 8-bit integers with SSE/AVX. – Z boson Oct 02 '14 at 10:15
  • @Zboson: looks good - it would be interesting to benchmark it against scalar code to see what sort of factor improvement you're getting. – Paul R Oct 02 '14 at 11:01
  • @PaulR, I originally wanted to do this without horizontal operators by doing an AOS to SOA transpose and calculating `u` and `v` as well. But I think it's hard to beat `_mm_maddubs_epi16`. You get multiplication, one of the two additions, and 8-bit to 16-bit conversion for the price of one. I have not checked the instruction tables. I know that `_mm_hadd_epi32` should be avoided but in this case I think the horizontal operators are the best solution. – Z boson Oct 02 '14 at 11:05
  • @Zboson: yes, I think it would be hard to beat - it might be a different story if you're doing Y, U and V simultaneously though. – Paul R Oct 02 '14 at 11:08
  • @PaulR, I managed to get a version working for Y, U, and V without using the horizontal operators. But then I found a bug in all the versions, which get the wrong result for RGBA>127. I don't know what the problem is but the OP does not seem to care anyway. – Z boson Oct 02 '14 at 22:00
  • @Zboson: oh well if you can get it debugged you could always polish it up and put it on sourceforge - I'm sure plenty of people would find it useful. – Paul R Oct 03 '14 at 05:48
  • @PaulR, I fixed the problems. One was in the formula I compared to from another link. Another was using `packs` instead of `packus`, and the third was even more subtle. Subtracting over 128 was causing overflow beyond -32768. Using 64 instead fixed this (and I proved it always gets it right). If you want to know more see my edits. – Z boson Oct 03 '14 at 12:23
  • @PaulR, I added another answer which calculates Y, U, and V using only the vertical operations. – Z boson Oct 03 '14 at 12:24
  • @Zboson: Wow - you've been busy! Have you done any benchmarking with these implementations? – Paul R Oct 03 '14 at 12:37
  • @PaulR, not yet. But it's all working now so it could be done. – Z boson Oct 03 '14 at 15:06

2 Answers


Here is a solution based on the comments by the OP and Paul R. The intrinsic _mm_maddubs_epi16 requires that the second parameter be signed, which is a problem for the 129 coefficient of g. However, we can get around this by offsetting every coefficient by -64 and adding the correction back:

y = ((66-64)*r + (129-64)*g + (25-64)*b + (0-64)*a) + 64*(r + g + b + a)
  = (2*r + 65*g - 39*b - 64*a) + 64*(r + g + b + a)

Using this we only need 16-bit integers and we can calculate 16 y bytes at a time like this:

Note that I originally used an offset of 128, but this caused overflow, since 255*((25-128) + (0-128)) < -32768.

__m128i yk = _mm_set1_epi32(0xc0d94102); // bytes {2, 65, -39, -64}: the offset coefficients for r, g, b, a
__m128i y4[4];
for(int i=0; i<4; i++) {
    __m128i a4 = _mm_loadu_si128((__m128i*)&a[16*i]);
    __m128i t1 = _mm_maddubs_epi16(a4, yk);
    __m128i t2 = _mm_maddubs_epi16(a4, _mm_set1_epi8(1));
    t2 = _mm_slli_epi16(t2, 6);  //multiply by 64
    y4[i] = _mm_add_epi16(t1,t2);
}
__m128i y8_lo = _mm_hadd_epi16(y4[0], y4[1]);
__m128i y8_hi = _mm_hadd_epi16(y4[2], y4[3]);

y8_lo = _mm_add_epi16(y8_lo, _mm_set1_epi16(128));
y8_lo = _mm_srli_epi16(y8_lo, 8);
y8_lo = _mm_add_epi16(y8_lo, _mm_set1_epi16(16));

y8_hi = _mm_add_epi16(y8_hi, _mm_set1_epi16(128));
y8_hi = _mm_srli_epi16(y8_hi, 8);
y8_hi = _mm_add_epi16(y8_hi, _mm_set1_epi16(16));

__m128i y16 = _mm_packus_epi16(y8_lo,y8_hi);
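As a quick standalone check (a hypothetical snippet, not part of the answer's code), the offset form agrees with the direct dot product for all byte inputs:

#include <assert.h>
#include <stdlib.h>

/* Verify (2r + 65g - 39b - 64a) + 64(r+g+b+a) == 66r + 129g + 25b + 0a */
int main(void) {
    for (int t = 0; t < 100000; t++) {
        int r = rand() & 0xff, g = rand() & 0xff;
        int b = rand() & 0xff, a = rand() & 0xff;
        assert((2*r + 65*g - 39*b - 64*a) + 64*(r + g + b + a)
               == 66*r + 129*g + 25*b + 0*a);
    }
    return 0;
}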

Here is code showing this works. I compared the result with the formula (with modifications) from how to perform RGB->YUV conversion in C/C++, which is:

#define CLIP(X) ( (X) > 255 ? 255 : (X) < 0 ? 0 : X)
#define RGB2Y(R, G, B) CLIP(( (  66 * (0xff & R) + 129 * (0xff & G) +  25 * (0xff & B) + 128) >> 8) +  16)

The code:

#include <stdio.h>
#include <x86intrin.h>
#include <stdlib.h>

#define CLIP(X) ( (X) > 255 ? 255 : (X) < 0 ? 0 : X)
#define RGB2Y(R, G, B) CLIP(( (  66 * (0xff & R) + 129 * (0xff & G) +  25 * (0xff & B) + 128) >> 8) +  16)

void rgba2y_SSE_v1(char *a, char *b) {
    __m128i yk = _mm_setr_epi16(66,129,25,0, 66,129,25,0);
    __m128i out[4];
    for(int i=0; i<4; i++) {        
        __m128i a4, lo, hi;
        a4 = _mm_loadu_si128((__m128i*)&a[16*i]);
        lo = _mm_unpacklo_epi8(a4,_mm_setzero_si128());
        hi = _mm_unpackhi_epi8(a4,_mm_setzero_si128());

        lo = _mm_madd_epi16(lo,yk);
        lo = _mm_hadd_epi32(lo,lo);

        hi  = _mm_madd_epi16(hi,yk);
        hi  = _mm_hadd_epi32(hi,hi);
        out[i] = _mm_unpackhi_epi64(lo,hi);
    }
    __m128i out_lo = _mm_packus_epi32(out[0], out[1]);
    __m128i out_hi = _mm_packus_epi32(out[2], out[3]);

    out_lo = _mm_add_epi16(out_lo, _mm_set1_epi16(128));
    out_lo = _mm_srli_epi16(out_lo, 8);
    out_lo = _mm_add_epi16(out_lo, _mm_set1_epi16(16)); 

    out_hi = _mm_add_epi16(out_hi, _mm_set1_epi16(128));
    out_hi = _mm_srli_epi16(out_hi, 8);
    out_hi = _mm_add_epi16(out_hi, _mm_set1_epi16(16)); 

    __m128i y16 = _mm_packus_epi16(out_lo,out_hi);
    _mm_storeu_si128((__m128i*)b,y16);
}

void rgba2y_SSE_v2(char *a, char *b) {
    __m128i yk = _mm_set1_epi32(0xc0d94102);
    __m128i y4[4];
    for(int i=0; i<4; i++) {
        __m128i a4 = _mm_loadu_si128((__m128i*)&a[16*i]);
        __m128i t1 = _mm_maddubs_epi16(a4, yk);
        __m128i t2 = _mm_maddubs_epi16(a4, _mm_set1_epi8(1));
        t2 = _mm_slli_epi16(t2, 6);
        y4[i] = _mm_add_epi16(t1,t2); 
    } 
    __m128i y8_lo = _mm_hadd_epi16(y4[0], y4[1]);
    __m128i y8_hi = _mm_hadd_epi16(y4[2], y4[3]);

    y8_lo = _mm_add_epi16(y8_lo, _mm_set1_epi16(128));
    y8_lo = _mm_srli_epi16(y8_lo, 8);
    y8_lo = _mm_add_epi16(y8_lo, _mm_set1_epi16(16)); 

    y8_hi = _mm_add_epi16(y8_hi, _mm_set1_epi16(128));
    y8_hi = _mm_srli_epi16(y8_hi, 8);
    y8_hi = _mm_add_epi16(y8_hi, _mm_set1_epi16(16)); 

    __m128i y16 = _mm_packus_epi16(y8_lo,y8_hi);
    _mm_storeu_si128((__m128i*)b,y16);
}

void rgba2yuv_SSE(char *a, char *b) {
    __m128i mask = _mm_setr_epi8(0x00,0x04,0x08,0x0c, 0x01,0x05,0x09,0x0d, 0x02,0x06,0x0a,0x0e, 0x03,0x07,0x0b,0x0f);
    short m[9] = {66, 129, 25, -38, -74, 112, 112, -94, -18};

    __m128i row[4];
    for(int i=0; i<4; i++) {
        row[i] = _mm_loadu_si128((__m128i*)&a[16*i]);
        row[i] = _mm_shuffle_epi8(row[i],mask);
    }

    __m128i t0 = _mm_unpacklo_epi32(row[0], row[1]);
    __m128i t1 = _mm_unpacklo_epi32(row[2], row[3]);
    __m128i t2 = _mm_unpackhi_epi32(row[0], row[1]);
    __m128i t3 = _mm_unpackhi_epi32(row[2], row[3]);
    row[0] = _mm_unpacklo_epi64(t0, t1);
    row[1] = _mm_unpackhi_epi64(t0, t1);
    row[2] = _mm_unpacklo_epi64(t2, t3);

    __m128i v_lo[3], v_hi[3];
    for(int i=0; i<3; i++) {
        v_lo[i] = _mm_unpacklo_epi8(row[i],_mm_setzero_si128());
        v_hi[i] = _mm_unpackhi_epi8(row[i],_mm_setzero_si128());
    }

    __m128i yuv[3];
    for(int i=0; i<3; i++) {
        __m128i yuv_lo, yuv_hi;
        yuv_lo = _mm_add_epi16(_mm_add_epi16(
                       _mm_mullo_epi16(v_lo[0], _mm_set1_epi16(m[3*i+0])),
                       _mm_mullo_epi16(v_lo[1], _mm_set1_epi16(m[3*i+1]))),
                       _mm_mullo_epi16(v_lo[2], _mm_set1_epi16(m[3*i+2])));
        yuv_lo = _mm_add_epi16(yuv_lo, _mm_set1_epi16(128));
        yuv_lo = _mm_srli_epi16(yuv_lo, 8);
        yuv_lo = _mm_add_epi16(yuv_lo, _mm_set1_epi16(16)); 

        yuv_hi = _mm_add_epi16(_mm_add_epi16(
                       _mm_mullo_epi16(v_hi[0], _mm_set1_epi16(m[3*i+0])),
                       _mm_mullo_epi16(v_hi[1], _mm_set1_epi16(m[3*i+1]))),
                       _mm_mullo_epi16(v_hi[2], _mm_set1_epi16(m[3*i+2])));
        yuv_hi = _mm_add_epi16(yuv_hi, _mm_set1_epi16(128));
        yuv_hi = _mm_srli_epi16(yuv_hi, 8);
        yuv_hi = _mm_add_epi16(yuv_hi, _mm_set1_epi16(16)); 

        yuv[i] = _mm_packus_epi16(yuv_lo,yuv_hi);
    }   
    _mm_storeu_si128((__m128i*)b,yuv[0]); // only Y is stored here; u and v are computed but left for the OP to write out
}



int main(void) {
    char rgba[64];
    char y1[16], y2[16], yuv[48];
    for(int i=0; i<64; i++) rgba[i] = rand()%256;
    rgba2y_SSE_v1(rgba,y1);
    rgba2y_SSE_v2(rgba,y2);
    rgba2yuv_SSE(rgba,yuv);

    printf("RGB2Y: "); for(int i=0; i<16; i++) printf("%x ", 0xff & RGB2Y(rgba[4*i+0], rgba[4*i+1], rgba[4*i+2])); printf("\n");
    printf("SSE_v1 "); for(int i=0; i<16; i++) printf("%x ", 0xff & y1[i]); printf("\n");
    printf("SSE_v2 "); for(int i=0; i<16; i++) printf("%x ", 0xff & y2[i]); printf("\n");
    printf("SSE_v3 "); for(int i=0; i<16; i++) printf("%x ", 0xff & yuv[i]); printf("\n");

}

The output:

RGB2Y: 99 ad 94 e3 9a a2 60 81 45 59 49 a5 aa 9b 60 4d 
SSE_v1 99 ad 94 e3 9a a2 60 81 45 59 49 a5 aa 9b 60 4d 
SSE_v2 99 ad 94 e3 9a a2 60 81 45 59 49 a5 aa 9b 60 4d 
SSE_v3 99 ad 94 e3 9a a2 60 81 45 59 49 a5 aa 9b 60 4d 
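Note that the test program uses SSSE3 intrinsics (_mm_shuffle_epi8, _mm_maddubs_epi16, _mm_hadd_epi16) plus SSE4.1's _mm_packus_epi32, so with GCC it should be compiled with something like -O3 -msse4.1 (or -march=native).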
  • First code segment, where you explain the split-by-64, wouldn't the last parenthesis in line 2 have to be `64(r + g + b + a)`? – grandchild May 16 '21 at 13:26

Here is a solution which finds Y, U, and V all at once, using only vertical operators.

To do this I first transpose four pixels like this

rgbargbargbargba -> rrrrggggbbbbaaaa

using the intrinsic _mm_shuffle_epi8 with a mask.
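Concretely, the per-pixel shuffle looks like this (excerpted from rgba2yuv_SSE in my first answer):

__m128i mask = _mm_setr_epi8(0x00,0x04,0x08,0x0c, 0x01,0x05,0x09,0x0d, 0x02,0x06,0x0a,0x0e, 0x03,0x07,0x0b,0x0f);
row[i] = _mm_shuffle_epi8(row[i], mask); // rgbargbargbargba -> rrrrggggbbbbaaaa

I do this for 16 pixels and then transpose them again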

from

row[0] : rrrrggggbbbbaaaa
row[1] : rrrrggggbbbbaaaa
row[2] : rrrrggggbbbbaaaa
row[3] : rrrrggggbbbbaaaa

to

row[0] : rrrrrrrrrrrrrrrr    
row[1] : gggggggggggggggg    
row[2] : bbbbbbbbbbbbbbbb

This is done the same way as transposing a 4x4 integer matrix like this:

__m128i t0 = _mm_unpacklo_epi32(row[0], row[1]);
__m128i t1 = _mm_unpacklo_epi32(row[2], row[3]);
__m128i t2 = _mm_unpackhi_epi32(row[0], row[1]);
__m128i t3 = _mm_unpackhi_epi32(row[2], row[3]);
row[0] = _mm_unpacklo_epi64(t0, t1);
row[1] = _mm_unpackhi_epi64(t0, t1);
row[2] = _mm_unpacklo_epi64(t2, t3);

Now I split each row into high and low halves and expand them to 16 bits like this

__m128i v_lo[3], v_hi[3];
for(int i=0; i<3; i++) {
    v_lo[i] = _mm_unpacklo_epi8(row[i],_mm_setzero_si128());
    v_hi[i] = _mm_unpackhi_epi8(row[i],_mm_setzero_si128());
}

Finally, I calculate Y, U, and V like this:

short m[9] = {66, 129, 25, -38, -74, 112, 112, -94, -18};
__m128i yuv[3];
for(int i=0; i<3; i++) {
    __m128i yuv_lo, yuv_hi;
    yuv_lo = _mm_add_epi16(_mm_add_epi16(
                   _mm_mullo_epi16(v_lo[0], _mm_set1_epi16(m[3*i+0])),
                   _mm_mullo_epi16(v_lo[1], _mm_set1_epi16(m[3*i+1]))),
                   _mm_mullo_epi16(v_lo[2], _mm_set1_epi16(m[3*i+2])));
    yuv_lo = _mm_add_epi16(yuv_lo, _mm_set1_epi16(128));
    yuv_lo = _mm_srli_epi16(yuv_lo, 8);
    yuv_lo = _mm_add_epi16(yuv_lo, _mm_set1_epi16(16));

    yuv_hi = _mm_add_epi16(_mm_add_epi16(
                   _mm_mullo_epi16(v_hi[0], _mm_set1_epi16(m[3*i+0])),
                   _mm_mullo_epi16(v_hi[1], _mm_set1_epi16(m[3*i+1]))),
                   _mm_mullo_epi16(v_hi[2], _mm_set1_epi16(m[3*i+2])));
    yuv_hi = _mm_add_epi16(yuv_hi, _mm_set1_epi16(128));
    yuv_hi = _mm_srli_epi16(yuv_hi, 8);
    yuv_hi = _mm_add_epi16(yuv_hi, _mm_set1_epi16(16));

    yuv[i] = _mm_packus_epi16(yuv_lo,yuv_hi);
}

For a working example of this code see my first answer and the function rgba2yuv_SSE.
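For the OP's NV12 target, u and v still need to be interleaved before storing, as discussed in the comments. A minimal sketch of that step (assuming yuv[1] holds 16 u bytes and yuv[2] holds 16 v bytes, with uv a hypothetical output buffer, and ignoring the 2x2 chroma subsampling NV12 also requires):

__m128i uv_lo = _mm_unpacklo_epi8(yuv[1], yuv[2]); // u0 v0 u1 v1 ... u7 v7
__m128i uv_hi = _mm_unpackhi_epi8(yuv[1], yuv[2]); // u8 v8 ... u15 v15
_mm_storeu_si128((__m128i*)&uv[0],  uv_lo);
_mm_storeu_si128((__m128i*)&uv[16], uv_hi);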

  • Yep, perfect. My original code was based on _mm_set_epi8 to unpack all the rgba data, which was horrendously slow. – Kustom666 Oct 04 '14 at 20:52
  • I have corrected your function, as it contained a little error in the u and v calculation, and fixed it to convert to NV12 (which was my original goal). Will post the full code soon. – Kustom666 Oct 04 '14 at 23:45
  • @Kustom666, yeah, I had to leave you something to do :-) I did not even bother writing out `u` and `v`. I also added `16` to `u` and `v` like for `y` instead of 128 as in the link. With a bit more effort you can interleave `u` and `v` before you store them. I'll try and update my answer at some point in the next few days to show this. – Z boson Oct 05 '14 at 08:59
  • @Kustom666, BTW, I would not write off the horizontal method right away. I only calculate `y` with it but it should be clear how to calculate `u` and `v` as well. Since it does not need to shuffle the data going in it could be faster than the vertical method. – Z boson Oct 05 '14 at 09:00