4

I have a __m256i vector containing 16x16-bit elements. I want to apply a three-adjacent horizontal addition on it. In scalar mode I use the following code:

unsigned short int temp[16];
__m256i sum_v;//has some values. 16 elements of 16-bit vector.   | 0 | x15 | x14 | x13 | ... | x3 | x2 | x1 |
_mm256_store_si256((__m256i *)&temp[0], sum_v);
output1 = (temp[0] + temp[1] + temp[2]);
output2 = (temp[3] + temp[4] + temp[5]);
output3 = (temp[6] + temp[7] + temp[8]);
output4 = (temp[9] + temp[10] + temp[11]);
output5 = (temp[12] + temp[13] + temp[14]); 
// Dont want the 15th element

Because this part is placed in the bottleneck section of my program, I decided to vectorize it using AVX2. Ideally I could add them like the following pseudo-code:

sum_v                                     //|  0  | x15 | x14 | x13 |...| x10 |...| x7 |...| x4 |...| x1 | 
sum_v1 = sum_v >> 1*16                    //|  0  |  0  | x15 | x14 |...| x11 |...| x8 |...| x5 |...| x2 |  
sum_v2 = sumv >> 2*16                     //|  0  |  0  |  0  | x15 |...| x12 |...| x9 |...| x6 |...| x3 |
result_vec = add_epi16 (sum_v,sum_v1,sum_v2)

//then I should extract the result_vec to outputs 

Adding them vertically will provide the answer. But unfortunately, AVX2 does not have a full 256-bit element shift, because the 256-bit register is viewed as two 128-bit lanes. I should use a permutation for this case. But I could not find an appropriate permute, shuffle, etc. to do this. Is there any suggestion for this implementation that should be as fast as possible?

Using gcc, linux mint, intrinsics, skylake.

phuclv
  • 37,963
  • 15
  • 156
  • 475
Amiri
  • 2,417
  • 1
  • 15
  • 42
  • 1
    Do you care about potential overflow, i.e. would it be better to unpack to 32 bits prior to the addition, or is this not a problem ? – Paul R Feb 08 '17 at 08:54
  • 1
    @PaulR, Each 16-bit element contains a number between 0 and 255 as a pixel. So overflows might be saturated, but I don't care for this program. You mentioned a good point there are some extra and unused bits that I can use to increase the accuracy. – Amiri Feb 10 '17 at 10:26

3 Answers

6

You can try to use something like this:

#include <immintrin.h>
#include <iostream>

// Store the vector to memory and print every element, interpreted as type T
// (e.g. Print<short> prints the 16 x 16-bit lanes, lowest element first).
template<class T> inline void Print(const __m256i & v)
{
    T b[sizeof(v) / sizeof(T)];
    _mm256_storeu_si256((__m256i*)b, v);
    // size_t matches the unsigned result of sizeof and avoids -Wsign-compare.
    for (size_t i = 0; i < sizeof(v) / sizeof(T); i++)
        std::cout << int(b[i]) << " ";
    std::cout << std::endl;
}

// Whole-register right shift of 'a' by 'shift' 16-bit elements, shifting in
// zeros from the top — works around the two-lane split of 256-bit shifts.
template<int shift> inline __m256i Shift(const __m256i & a)
{
    // Upper 128-bit lane of 'a' moved into the low half, zeros in the high half.
    const __m256i upper = _mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31);
    // Concatenate [upper : a] per lane and extract at the byte offset that
    // realizes the requested element shift (2 bytes per 16-bit element).
    return _mm256_alignr_epi8(upper, a, shift * 2);
}

int main()
{
    __m256i v0 = _mm256_setr_epi16(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, 0);
    __m256i v1 = Shift<1>(v0);
    __m256i v2 = Shift<2>(v0);
    __m256i r = _mm256_add_epi16(v0, _mm256_add_epi16(v1, v2));

    Print<short>(v0);
    Print<short>(v1);
    Print<short>(v2);
    Print<short>(r);
}

Output:

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0
2 3 4 5 6 7 8 9 10 11 12 13 14 15 0 0
3 4 5 6 7 8 9 10 11 12 13 14 15 0 0 0
6 9 12 15 18 21 24 27 30 33 36 39 42 29 15 0
ErmIg
  • 3,980
  • 1
  • 27
  • 40
  • I like your solution better than mine – Z boson Feb 08 '17 at 12:49
  • Counting instructions, your solution uses 4 before the add and mine uses 5. I'm not so familiar with `vpalignr`. – Z boson Feb 08 '17 at 12:51
  • 1
    @Zboson I found this instruction recently - it is very useful in order to calculate convolution. – ErmIg Feb 08 '17 at 13:00
  • @Zboson A [solution](http://stackoverflow.com/a/42132108/2439725) with only 2 instructions before the adds is also possible, because we don't need an exact 256 bit shift. Some 'holes' are allowed. – wim Feb 09 '17 at 10:02
  • It turns out my original answer was correct so now my answer uses the same number of instructions as yours. Of course counting instructions is not the definitive metric for performance. It would be interesting to see what method wins. In any case it appears @wim has the best solution now. – Z boson Feb 09 '17 at 10:45
  • @ErmIg, could you take a look at [this question](http://stackoverflow.com/questions/43023141/how-to-add-an-avx2-vector-5-by-5) – Amiri Mar 26 '17 at 02:52
5

You can do this with two adds and only 2 "shuffles": _mm256_bsrli_epi128 shifts in zeros at positions that are not of interest to the answer. For _mm256_permutevar8x32_epi32 we choose a permutation that duplicates the upper 32 bits, but these bits are also not relevant for the answer.

#include <stdio.h>
#include <x86intrin.h>
/*  gcc -O3 -Wall -m64 -march=haswell hor_sum3x3.c   */
int print_vec_short(__m256i x);
int print_12_9_6_3_0_short(__m256i x);

int main() {
   short x[16];

   /* Fill with 1..15 and zero the top element ((15+1) % 16 == 0). */
   for(int i=0; i<16; i++) x[i] = (i + 1) % 16;

   __m256i in   = _mm256_loadu_si256((__m256i*)x);


   /* Per-lane byte shift right by 2 (one element); the zeros shifted in land
      only in positions the final answer does not read.                       */
   __m256i sh1  = _mm256_bsrli_epi128(in, 2);
   /* Cross-lane dword permute acting as a 4-byte (two element) right shift;
      the duplicated top dword also only touches positions we don't need.     */
   __m256i sh2  = _mm256_permutevar8x32_epi32(in, _mm256_set_epi32(7,7,6,5,4,3,2,1));
   __m256i acc  = _mm256_add_epi16(_mm256_add_epi16(in, sh1), sh2);

   printf("t0  = "); print_vec_short(in);
   printf("t1  = "); print_vec_short(sh1);
   printf("t2  = "); print_vec_short(sh2);
   printf("sum = "); print_vec_short(acc);

   printf("\nvector elements of interest: columns 12, 9, 6, 3, 0:\n");
   printf("t0[12, 9, 6, 3, 0]  = "); print_12_9_6_3_0_short(in);
   printf("t1[12, 9, 6, 3, 0]  = "); print_12_9_6_3_0_short(sh1);
   printf("t2[12, 9, 6, 3, 0]  = "); print_12_9_6_3_0_short(sh2);
   printf("sum[12, 9, 6, 3, 0] = "); print_12_9_6_3_0_short(acc);
   return 0;
}


/* Print all 16 short lanes, highest element first, one '|' per 64-bit group. */
int print_vec_short(__m256i x){
   short int e[16];
   _mm256_storeu_si256((__m256i *)e, x);
   printf("%4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi  | %4hi %4hi %4hi %4hi \n",
          e[15], e[14], e[13], e[12],
          e[11], e[10], e[9],  e[8],
          e[7],  e[6],  e[5],  e[4],
          e[3],  e[2],  e[1],  e[0]);
   return 0;
}

/* Print only the five result columns (every third element: 12, 9, 6, 3, 0). */
int print_12_9_6_3_0_short(__m256i x){
   short int e[16];
   _mm256_storeu_si256((__m256i *)e, x);
   printf("%4hi %4hi %4hi %4hi %4hi  \n", e[12], e[9], e[6], e[3], e[0]);
   return 0;
}

The output is:

$ ./a.out
t0  =    0   15   14   13 |   12   11   10    9 |    8    7    6    5  |    4    3    2    1 
t1  =    0    0   15   14 |   13   12   11   10 |    0    8    7    6  |    5    4    3    2 
t2  =    0   15    0   15 |   14   13   12   11 |   10    9    8    7  |    6    5    4    3 
sum =    0   30   29   42 |   39   36   33   30 |   18   24   21   18  |   15   12    9    6 

vector elements of interest: columns 12, 9, 6, 3, 0:
t0[12, 9, 6, 3, 0]  =   13   10    7    4    1  
t1[12, 9, 6, 3, 0]  =   14   11    8    5    2  
t2[12, 9, 6, 3, 0]  =   15   12    9    6    3  
sum[12, 9, 6, 3, 0] =   42   33   24   15    6  
wim
  • 3,702
  • 19
  • 23
  • Awesome! But the second column is wrong. It should be 15 not 30. I original had 4 instructions and but then had to add one to fix that. One other point worth considering, I'm not sure the OP means we can assume the last element is zero. Maybe the OP meant the last element should not be used in the addition. I don't know. I mean what happens if you set the last element of t0 to 16 does this propagate to the sum? If so that's not what the OP wants I think. I may have made a mistake by doing `x[15] = 0`. – Z boson Feb 09 '17 at 10:12
  • Yes I just removed the `x[15] = 0` and the last three elements of the sum become `32 46 45` but they should be `0 15 29`. – Z boson Feb 09 '17 at 10:17
  • @Zboson Do you mean that the `30` in the row `sum = 0 30 29 42 | 39 ...` is wrong? As far as I understood the question this `30` was not of interest to the OP. output1...output5 is column 0,3 6,9,12, counting from the right? – wim Feb 09 '17 at 10:25
  • 1
    @Zboson I think we don't have to worry about the value of `x[15]` because `sum[12, 9, 6, 3, 0] = 42 33 24 15 6 ` is independent of the value of `x[15]` . – wim Feb 09 '17 at 10:35
  • Thanks guys for great answers. I think this answer might be the best. because of these points (not tested yet) First, this answer calculate the sum with 3 instructions but the accepted one use 2 set of `valignr` and `vperm*` to reorder them and then adding them totally 5 instructions which `valignr` , `vperm*`, and `vpsrldq` executed in shuffling port that might be better to use this method. Second I asked the question with intrinsics in a C program but The accepted answer is written in C++ and template. I want to change the accepted answer to this after I tested the performance and penalties – Amiri Feb 10 '17 at 10:50
The speedup of this answer over the accepted answer is 1 for small matrices, 1.0613810742, 1.0574408069, so there are not many differences. – Amiri Feb 11 '17 at 06:13
  • @FackedDoctor You write that this piece of code is placed in the bottleneck section of your program. It is very likely that this piece of code is only a very small part of this bottleneck, otherwise I would have expected a better speed up. – wim Feb 13 '17 at 12:38
  • @FackedDoctor Have you vectorized the code that computes `v_sum` (the code before the horizontal sum)? What do you do with the results of `sum = _mm256_add_epi16(_mm256_add_epi16(t0,t1),t2);`? You should certainly not process output1, output2,... further in scalar mode! Moreover, do not store `sum` to memory and read it back with `_mm256_store_si256((__m256i *)&temp[0], sum); output1=sum[0];`. You have to vectorize (almost) everything which is within the main loop of your program to get good results. – wim Feb 13 '17 at 12:39
  • @wim, Thank you.yes, I vectorized all the program and this piece of code is just for addition. and yes, the whole speed up over scalar is 15x but not because of this horizontal addition the last speed up was 13x when I changed the addition method the speedup changed to 15x. yes you are right. and yes I applied these points. I write this speed ups to compare your answer with the accepted answer. – Amiri Feb 14 '17 at 18:48
  • yes, I use avx2 instructions to extract the result and I have to store the result to a matrix. assume that each output X is a matrix place but in a row of the matrix and the largest distance between them is 12 element. – Amiri Feb 14 '17 at 19:02
  • @FackedDoctor: Good work! Thanks for your reply. It's a pity that you cannot process outputX in a more SIMD way. Nevertheless, a speed op of 15x is great! – wim Feb 14 '17 at 21:36
  • 1
    @wim, Actually I can process the outputs in a more SIMD way. I think it will yield another great speedup, I have to mask the results in a vector. The program has 3 steps. These five results are for step 1. Step 2 another five results and step 3 will compute another 5 results. it will generate 15 results that can be stored in an aligned address but I should play with it. these results are for these places:step 1: {12,9,6,3,0}, step 2: {13,10,7,4,1} step 3: {14,11,8,5,2} finally a vector containing {14,13,12,11,10,9,8,7,6,5,4,3,2,1,0} by unaligned store will finish the job. – Amiri Feb 15 '17 at 12:44
  • @FackedDoctor : I think you are on the right track. – wim Feb 15 '17 at 15:23
  • @wim, could you take a look at [this question](http://stackoverflow.com/questions/43023141/how-to-add-an-avx2-vector-5-by-5) – Amiri Mar 26 '17 at 02:52
3

You could try something like this

// Per-lane byte rotation pattern: moves each 16-bit element down one slot,
// wrapping the lane's lowest element into the lane's top slot (patched below).
__m256i idx1 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1);
// Dword rotation pattern: shifts the whole register right by one 32-bit
// element (i.e. by two 16-bit elements), rotating the top dword to the bottom.
__m256i idx2 = _mm256_setr_epi32(1,2,3,4,5,6,7,0);

__m256i t1 = _mm256_shuffle_epi8 (t0, idx1);       // shift by 1 element within each lane
__m256i t2 = _mm256_permute2x128_si256(t1, t1, 1); // swap the two 128-bit halves of t1
__m256i t3 = _mm256_blend_epi16(t1,t2,0x80);       // take element 7 of each lane from t2, fixing the wrapped element
__m256i t4 = _mm256_permutevar8x32_epi32(t0, idx2); // t0 shifted right by 2 elements (rotating)
__m256i s = _mm256_add_epi16(t0, _mm256_add_epi16(t3,t4));

I based this example off this question.

Here is a working example

#include <stdio.h>
#include <x86intrin.h>

/* Store a vector and print its 16 short elements as 2-digit hex, lowest
   element first — factored out of main() to replace four identical
   store+loop+puts sequences. */
static void print_shorts_hex(__m256i v) {
  short y[16];
  _mm256_storeu_si256((__m256i*)y, v);
  for(int i=0; i<16; i++) printf("%2x ", y[i]); puts("");
}

int main(void) {
  short x[16];

  for(int i=0; i<16; i++) x[i] = i;
  /* Per-lane element rotation: each 16-bit element moves down one slot and
     the lane's lowest element wraps into the lane's top slot. */
  __m256i idx1 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1,
                  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1);
  /* Dword rotation: shifts the whole register right by one 32-bit element. */
  __m256i idx2 = _mm256_setr_epi32(1,2,3,4,5,6,7,0);

  __m256i t0 = _mm256_loadu_si256((__m256i*)x);
  __m256i t1 = _mm256_shuffle_epi8 (t0, idx1);        /* shift by 1 within lanes   */
  __m256i t2 = _mm256_permute2x128_si256(t1, t1, 1);  /* swap 128-bit halves       */
  __m256i t3 = _mm256_blend_epi16(t1,t2,0x80);        /* patch the wrapped element */
  __m256i t4 = _mm256_permutevar8x32_epi32(t0, idx2); /* shift by 2 elements       */
  __m256i s = _mm256_add_epi16(t0, _mm256_add_epi16(t3,t4));

  print_shorts_hex(t0);
  print_shorts_hex(t3);
  print_shorts_hex(t4);
  print_shorts_hex(s);
}

Output

0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f 
1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  0 
2  3  4  5  6  7  8  9  a  b  c  d  e  f  0  1 
3  6  9  c  f 12 15 18 1b 1e 21 24 27 2a 1d 10 
Community
  • 1
  • 1
Z boson
  • 32,619
  • 11
  • 123
  • 226