4

I have two __m256i vectors, filled with 32 8-bit integers. Something like this:

    // NOTE: `{2}` / `{3}` set only element 0; the other 31 elements are zero-initialized.
    __int8 *a0 = new __int8[32] {2};
    __int8 *a1 = new __int8[32] {3};

    // Unaligned 256-bit loads of the 32 bytes behind each pointer.
    __m256i v0 = _mm256_loadu_si256((__m256i*)a0);
    __m256i v1 = _mm256_loadu_si256((__m256i*)a1);

How can I multiply these vectors, using something like _mm256_mul_epi8(v0, v1) (which does not exist), or in any other way?

I want 2 vectors of results, because the output element width is twice the input element width. Or something that works similarly to _mm_mul_epu32 would be ok, using only the even input elements (0, 2, 4, etc.)

Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
KaraUL
  • 41
  • 4
  • 2
    you may see this https://stackoverflow.com/questions/8193601/sse-multiplication-16-x-uint8-t – user338371 Sep 17 '18 at 15:25
  • 2
    [`_mm256_mul_epi32` *does* exist.](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epi32&expand=3651,3651,3651) Do you mean `_mm256_mul_epi8` ? Please also specify what sort of result you want: 16 bit ? Low order 8 bits ? High order 8 bits ? Saturated 8 bit result ? – Paul R Sep 17 '18 at 15:26
  • @PaulR, sorry i really mean '_mm256_mul_epi8', which does not exist. I want to get 16 bit result, if possible. – KaraUL Sep 18 '18 at 07:19
  • I edited your question to be specific about signedness of inputs/outputs and include the detail from your comment. Please edit again if anything I added isn't actually what you want. – Peter Cordes Sep 18 '18 at 07:27
  • 1
    The obvious way is to unpack to 16-bit elements (inconvenient for signed because of lane-crossing, but you could always unpack and use `vpmovsx`), and use `_mm256_mullo_epi16`. You might also be able to use [`_mm256_maddubs_epi16`](http://felixcloutier.com/x86/PMADDUBSW.html) with some masking to get odd/even elements, if you can deal with one input being unsigned and the other signed. (`0*anything + a*b` = `a*b`, so you have a widening multiply that just takes one AND instruction to set up for.) – Peter Cordes Sep 18 '18 at 07:31
  • @PeterCordes thank you, `_mm256_maddubs_epi16` is definitely what i need! (In my case all input is unsigned, now i see how can this detail be important) – KaraUL Sep 18 '18 at 08:10
  • 1
    @KaraUL: Read the docs *carefully*: `maddubs` takes one signed input, and one unsigned input. If one of your unsigned inputs is limited range, like always 0..127, then you can use it as the signed input. – Peter Cordes Sep 18 '18 at 08:15

1 Answers1

3

You want the result split across two vectors, so here is my suggestion. I've tried to keep it clear, simple and practical:

#include <stdio.h>
#include <x86intrin.h>
 // Prints the 32 8-bit elements of a 256-bit vector as decimal integers.
 void _mm256_print_epi8(__m256i );
 // Prints the 16 16-bit elements of a 256-bit vector as decimal integers.
 void _mm256_print_epi16(__m256i );
 // Multiplies the 8-bit elements of the two input vectors pairwise and stores
 // the 16-bit products through the two output pointers (16 products each).
 void _mm256_mul_epi8(__m256i , __m256i , __m256i* , __m256i* );


// Demo driver: multiplies two 32-element byte vectors element-wise and prints
// both inputs followed by the two vectors of 16-bit products.
int main()
{
    char lhs_bytes[32] = {1, 2, 3, -4, 5, 6, 7, 8, 9, -10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, -24, 25, 26, 27, 28, 29, 30, 31, 32};
    char rhs_bytes[32] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -13, 14, 15, 16, 17, 18, 19, -20, 21, 22, 23, 24, -25, 26, 27, 28, 29, 30, 31, 32, 33};

    // Unaligned loads: the stack arrays carry no 32-byte alignment guarantee.
    __m256i lhs = _mm256_loadu_si256((__m256i*) lhs_bytes);
    __m256i rhs = _mm256_loadu_si256((__m256i*) rhs_bytes);

    // 32 products at 16 bits each need two 256-bit vectors.
    __m256i prod_first;
    __m256i prod_second;
    _mm256_mul_epi8(lhs, rhs, &prod_first, &prod_second);

    printf("\nv0 = ");
    _mm256_print_epi8(lhs);
    printf("\nv1 = ");
    _mm256_print_epi8(rhs);
    printf("\nr0 = ");
    _mm256_print_epi16(prod_first);
    printf("\nr1 = ");
    _mm256_print_epi16(prod_second);
    printf("\nfinished\n");

    return 0;
}
// Widening signed multiply of 32 byte pairs.
//
// v0, v1: input vectors, each holding 32 signed 8-bit elements.
// r0:     receives the 16-bit products of elements 0..15  (low 128-bit lane).
// r1:     receives the 16-bit products of elements 16..31 (high 128-bit lane).
//
// Each input byte is sign-extended to 16 bits before multiplying. The product
// of two signed 8-bit values always fits in 16 bits (the largest magnitude is
// -128 * -128 = 16384), so keeping only the low 16 bits loses nothing.
//
// NOTE(review): names starting with an underscore followed by `mm` live in the
// intrinsics' naming space; the name is kept because existing callers use it.
void _mm256_mul_epi8(__m256i v0, __m256i v1, __m256i* r0, __m256i* r1)
{
    // Low lane: a cast reinterprets the lower 128 bits and costs no instruction.
    const __m256i lo0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(v0));
    const __m256i lo1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(v1));
    *r0 = _mm256_mullo_epi16(lo0, lo1);

    // High lane: use the AVX2 integer extract rather than the float-domain
    // _mm256_extractf128_si256, since everything else here is integer AVX2.
    const __m256i hi0 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(v0, 1));
    const __m256i hi1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(v1, 1));
    *r1 = _mm256_mullo_epi16(hi0, hi1);
}
 // Prints the 32 8-bit elements of vec to stdout as signed decimal integers,
 // each formatted as " %3i," with no trailing newline.
 void _mm256_print_epi8(__m256i vec)
{
    // signed char rather than plain char: whether plain char is signed is
    // implementation-defined, and with an unsigned char the promotion to int
    // for %i would print negative elements as 128..255.
    signed char temp[32];
    _mm256_storeu_si256((__m256i*)temp, vec);
    for (int i = 0; i < 32; i++) {
        printf(" %3i,", temp[i]);
    }
}

 // Writes the sixteen 16-bit elements of vec to stdout as signed decimal
 // integers, each formatted as " %3i," with no trailing newline.
 void _mm256_print_epi16(__m256i vec)
{
    short lanes[16];

    // Spill the vector to memory so the elements can be read one by one.
    _mm256_storeu_si256((__m256i*)lanes, vec);

    for (int k = 0; k != 16; ++k) {
        printf(" %3i,", lanes[k]);
    }
}

The output is:

[martin@mrt Stack over flow]$ gcc -O2 -march=native mul_epi8.c -o out
[martin@mrt Stack over flow]$ ./out

v0 =    1,   2,   3,  -4,   5,   6,   7,   8,   9, -10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23, -24,  25,  26,  27,  28,  29,  30,  31,  32,
v1 =    2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12, -13,  14,  15,  16,  17,  18,  19, -20,  21,  22,  23,  24, -25,  26,  27,  28,  29,  30,  31,  32,  33,
r0 =    2,   6,  12, -20,  30,  42,  56,  72,  90, -110, 132, -156, 182, 210, 240, 272,
r1 =  306, 342, -380, 420, 462, 506, 552, 600, 650, 702, 756, 812, 870, 930, 992, 1056,
finished
[martin@mrt Stack over flow]$ 

NOTE: I've commented out the printing of the intermediate results tmp0 and tmp1 in the code above. In addition, as Peter suggested in the comments (with a Godbolt link), if your program loads its inputs from memory and does not already have the elements in `__m256i` vectors, you can use this code, which takes 128-bit inputs and returns a single result vector:

#include <immintrin.h>

// Widening signed multiply of 16 byte pairs.
// v0 and v1 each hold 16 signed 8-bit elements; the returned vector holds the
// 16 element-wise products as signed 16-bit values, in the same element order.
__m256i mul_epi8_to_16(__m128i v0, __m128i v1)
{
    // Sign-extend both operands to 16 bits, then keep the low 16 bits of each
    // product (which is the whole product for 8-bit inputs).
    return _mm256_mullo_epi16(_mm256_cvtepi8_epi16(v0),
                              _mm256_cvtepi8_epi16(v1));
}

// Memory-source variant of the widening multiply.
// a, b: each must point to at least 16 readable bytes; no alignment is
//       required because unaligned loads are used. Neither buffer is written,
//       hence the const qualification (callers passing char* still compile).
// Returns the 16 products of the first 16 bytes of a and b as 16-bit values;
// the bytes are treated as signed 8-bit values by the helper.
__m256i mul_epi8_to_16_memsrc(const char *__restrict a, const char *__restrict b){

    const __m128i v0 = _mm_loadu_si128((const __m128i*) a);
    const __m128i v1 = _mm_loadu_si128((const __m128i*) b);
    return mul_epi8_to_16(v0, v1);
}


// Minimal usage example for the memory-source multiply.
int main()
{
    char lhs_bytes[32] = {1, 2, 3, -4, 5, 6, 7, 8, 9, -10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, -24, 25, 26, 27, 28, 29, 30, 31, 32};
    char rhs_bytes[32] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -13, 14, 15, 16, 17, 18, 19, -20, 21, 22, 23, 24, -25, 26, 27, 28, 29, 30, 31, 32, 33};

    // Only the first 16 bytes of each array are read by the callee.
    __m256i products = mul_epi8_to_16_memsrc(lhs_bytes, rhs_bytes);
    (void)products; // result is not inspected in this example

    return 0;
}
Amiri
  • 2,417
  • 1
  • 15
  • 42
  • 1
    You probably want to use 128-bit input vectors, so this can compile to `vpmovsxbw ymm0, [mem]` loads, and not bottleneck on the shuffle port from also unpacking with `_mm256_extractf128_si256`. Also, that would let your helper function just return a single `__m256i`. – Peter Cordes Sep 23 '18 at 00:01
  • Some negative inputs would be good for your test-cases, to show that `_mm256_cvtepi8_epi16` is properly sign-extending, unlike unpacking with zero or using `_mm256_cvtepu8_epi16` – Peter Cordes Sep 23 '18 at 00:03
  • @PeterCordes, Hey Peter, actually I think using `vpmovsxbw` which the intrinsic is `__m256i _mm256_cvtepi8_epi16 (__m128i a)` on the elements that are presented in a vector might not be better than extraction. Because I have to write to cache and read from it again. Do I miss something? – Amiri Sep 23 '18 at 00:19
  • 2
    Compilers can fold a `_mm_loadu_si128` into a memory source for `_mm256_cvtepi8_epi16`. There's no store/reload. https://godbolt.org/z/GyX-V7 is a version of your function with 128-bit inputs, and a wrapper that inlines it on the result of 2 loads. The OP shows that they're loading from memory, so they don't already have `__m256i` vectors as a result of some calculation. – Peter Cordes Sep 23 '18 at 00:25