
I have two pyrDown implementations, one using SSE2/SSSE3 intrinsics and one using AVX2. They differ, and the AVX2 implementation produces a wrong image result. The AVX2 implementation is also slower than the SSE2 one, which is strange. What is wrong with the AVX2 implementation, and how can I make it faster?

// SSE2 implementation
static __inline __m128i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
  // Aligned 16-byte loads: src and src + srcStep must both be 16-byte aligned.
  __m128i v0 = _mm_load_si128((const __m128i *)src);
  __m128i v1 = _mm_load_si128((const __m128i *)&src[srcStep]);
  return _mm_avg_epu8(v0, v1);  // rounded vertical average: (v0 + v1 + 1) >> 1
}

// SSSE3 version
// I used `__restrict__` to give the compiler more flexibility in unrolling
void average2Rows(const uint8_t* __restrict__ src,
                  uint8_t*__restrict__ dst,
                  size_t srcStep,
                  size_t size)
{
    const __m128i vk1 = _mm_set1_epi8(1);
    const __m128i add2 = _mm_set1_epi16(2);
    size_t dstsize = size/2;
    for (size_t i = 0; i < dstsize - 15; i += 16)
    {
        const size_t ii = i*2;
        // based on https://stackoverflow.com/a/45564565/820795
        __m128i left  = average2RowsSingle(src+ii, srcStep);
        __m128i right = average2RowsSingle(src+ii+16, srcStep);
        
        __m128i w0 = _mm_maddubs_epi16(left, vk1);        // unpack and horizontal add
        __m128i w1 = _mm_maddubs_epi16(right, vk1);
        w0 = _mm_srli_epi16(w0, 1);                     // divide by 2
        w1 = _mm_srli_epi16(w1, 1);
        w0 = _mm_packus_epi16(w0, w1);                  // pack
        
        _mm_storeu_si128((__m128i *)&dst[i], w0);
    }
}
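
For reference, here is a scalar equivalent of what this routine computes (a sketch added for illustration, not part of the original code). Each output byte averages a 2x2 block of source pixels, and the rounding mirrors the SIMD path: `_mm_avg_epu8` rounds the vertical average up, while the later right shift truncates the horizontal one.

// Scalar sketch of the 2x2 box downsample above. Rounding matches the SIMD
// path: the vertical average rounds up (like _mm_avg_epu8), the horizontal
// average truncates (like _mm_srli_epi16 after _mm_maddubs_epi16).
static void average2RowsScalar(const uint8_t* src, uint8_t* dst,
                               size_t srcStep, size_t size) {
  for (size_t i = 0; i < size / 2; ++i) {
    unsigned a = (src[2 * i]     + src[2 * i + srcStep]     + 1) >> 1;
    unsigned b = (src[2 * i + 1] + src[2 * i + 1 + srcStep] + 1) >> 1;
    dst[i] = (uint8_t)((a + b) >> 1);
  }
}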
// AVX implementation
static __m256i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
  // Aligned 32-byte loads: src and src + srcStep must both be 32-byte aligned.
  auto v0 = _mm256_load_si256((const __m256i*)src);
  auto v1 = _mm256_load_si256((const __m256i*)&src[srcStep]);
  return _mm256_avg_epu8(v0, v1);  // rounded vertical average: (v0 + v1 + 1) >> 1
}

void average2Rows(const uint8_t* __restrict__ src,
                     uint8_t*__restrict__ dst,
                     size_t srcStep,
                     size_t size) {
  size_t dstsize = size/2;
  const signed char o = -1; // make shuffle zero
  const __m256i vec_r_i16 = _mm256_set_epi8(o,30, o,28, o,26, o,24, o,22, o,20, o,18, o,16,
                                            o,14, o,12, o,10, o, 8, o, 6, o, 4, o, 2, o, 0);
  const __m256i vec_l_i16 = _mm256_set_epi8(o,31, o,29, o,27, o,25, o,23, o,21, o,19, o,17,
                                            o,15, o,13, o,11, o, 9, o, 7, o, 5, o, 3, o, 1);
  for (size_t i = 0; i < dstsize - 31; i += 32)
  {
    const size_t ii = i * 2;
    auto left = average2RowsSingle(src + ii, srcStep);
    auto right = average2RowsSingle(src + ii + 32, srcStep);

    auto w0 = _mm256_shuffle_epi8(left, vec_r_i16);
    auto w1 = _mm256_shuffle_epi8(left, vec_l_i16);
    left = _mm256_srli_epi16(_mm256_add_epi16(w0, w1), 1);

    w0 = _mm256_shuffle_epi8(right, vec_r_i16);
    w1 = _mm256_shuffle_epi8(right, vec_l_i16);
    right = _mm256_srli_epi16(_mm256_add_epi16(w0, w1), 1);

    left = _mm256_packus_epi16(left, right);

    _mm256_storeu_si256((__m256i *) &dst[i], left);
  }
}

Wrong result from the AVX implementation: [image: wrong_result_img]

  • Without looking into more details, a likely reason for the wrong results of the AVX2 version is that `_mm256_packus_epi16` works in both lanes independently. And I don't quite understand why you use two shuffles and an addition instead of `_mm256_maddubs_epi16` in the AVX2 variant. – chtz Jan 09 '23 at 22:22
  • @chtz Good point with `_mm256_maddubs_epi16`. I started with an AVX-512 version and the compiler didn't know that operation for `_mm512`, so I wrote this code using shuffles. But my computer doesn't support AVX-512, so I switched to AVX instructions. With `_mm256_maddubs_epi16` the method works faster. Can you suggest what I can do about `packus`? – Gralex Jan 10 '23 at 06:50
  • You can do one `_mm256_permute4x64_epi64` afterwards. Or you can load your inputs in halves (`in0 = {src[0], src[2]}, in1 = {src[1], src[3]}`) -- but that only makes sense if you are bounded by the shuffle-port and not by loading. – chtz Jan 10 '23 at 13:35
  • Thanks @chtz. You can post it as an answer and I will upvote it. – Gralex Jan 11 '23 at 12:18
  • Is your update still part of the question, or does it work? If it works, you should post it as an answer yourself; don't edit answers into the question. For a lane-crossing pack with `vpacksswb` and then `vpermq`, there are a few existing Q&As. – Peter Cordes Jan 11 '23 at 15:00

1 Answer


With the help of @chtz I came up with this code:

inline __m256i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
  auto v0 = _mm256_loadu_si256((const __m256i *)src);
  auto v1 = _mm256_loadu_si256((const __m256i *)&src[srcStep]);
  return _mm256_avg_epu8(v0, v1);
}

void average2Rows(const uint8_t* __restrict__ src,
                  uint8_t*__restrict__ dst,
                  size_t srcStep,
                  size_t size) {
  const auto vk1 = _mm256_set1_epi8(1);
  const size_t dstSize = size/2;
  for (size_t i = 0; i < dstSize - 31; i += 32)
  {
    const size_t ii = i * 2;
    // based on https://stackoverflow.com/a/45564565/820795
    auto left = average2RowsSingle(src + ii, srcStep);
    auto right = average2RowsSingle(src + ii + 32, srcStep);

    auto w0 = _mm256_maddubs_epi16(left, vk1);        // unpack and horizontal add
    auto w1 = _mm256_maddubs_epi16(right, vk1);
    w0 = _mm256_srli_epi16(w0, 1);                     // divide by 2
    w1 = _mm256_srli_epi16(w1, 1);
    w0 = _mm256_packus_epi16(w0, w1);                  // pack (each 128-bit lane independently)
    w0 = _mm256_permute4x64_epi64(w0, 0xd8);           // select qwords 0,2,1,3 to restore order

    _mm256_storeu_si256((__m256i *)&dst[i], w0);
  }
}
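
The fix works because `_mm256_packus_epi16` packs each 128-bit lane independently, so the qwords of the packed result hold output bytes 0-7, 16-23, 8-15, 24-31; `_mm256_permute4x64_epi64` with control 0xd8 (binary 11 01 10 00) selects qwords 0, 2, 1, 3 and restores sequential order. Note the helper also switched from `_mm256_load_si256` to `_mm256_loadu_si256`, since `src + srcStep` is generally not 32-byte aligned. A minimal driver is sketched below (an illustration, not part of the original answer; it assumes the row width is a multiple of 64, and note that `dstSize - 31` underflows for narrower images because `size_t` is unsigned):

// Hypothetical driver: halve an image by averaging each 2x2 block.
// Assumes width is a multiple of 64 so the SIMD loop covers every column;
// anything narrower needs a scalar tail and a guard against dstSize < 32.
void pyrDownHalf(const uint8_t* src, uint8_t* dst,
                 size_t width, size_t height,
                 size_t srcStep, size_t dstStep) {
  for (size_t y = 0; y + 1 < height; y += 2)
    average2Rows(src + y * srcStep, dst + (y / 2) * dstStep, srcStep, width);
}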

Result image: [image: correct_image]
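
chtz's comment also sketched an alternative that removes the fix-up permute: load the inputs in halves so the in-lane pack already emits bytes in order. Here is one possible reading of that suggestion (an untested sketch; `average2RowsHalves` and its chunk offsets are my naming, not from the original thread):

// Load two 16-byte chunks of the row pair into the low and high 128-bit
// lanes, then average vertically. Feeding chunks {0,2} as 'left' and {1,3}
// as 'right' makes the in-lane _mm256_packus_epi16 produce bytes in order,
// so the _mm256_permute4x64_epi64 after the pack can be dropped.
static inline __m256i average2RowsHalves(const uint8_t* __restrict__ src,
                                         size_t srcStep,
                                         size_t lo, size_t hi) {
  __m128i a0 = _mm_loadu_si128((const __m128i*)(src + lo));
  __m128i a1 = _mm_loadu_si128((const __m128i*)(src + hi));
  __m128i b0 = _mm_loadu_si128((const __m128i*)(src + srcStep + lo));
  __m128i b1 = _mm_loadu_si128((const __m128i*)(src + srcStep + hi));
  __m256i v0 = _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a1, 1);
  __m256i v1 = _mm256_inserti128_si256(_mm256_castsi128_si256(b0), b1, 1);
  return _mm256_avg_epu8(v0, v1);
}

// In the loop body, the two straight loads would become:
//   auto left  = average2RowsHalves(src + ii, srcStep,  0, 32);
//   auto right = average2RowsHalves(src + ii, srcStep, 16, 48);

As chtz noted, this trades the permute for extra load and insert work, so it only pays off when the loop is bound by the shuffle port rather than by loads.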
