
I often need to calculate an integral image. Here is the simple algorithm:

void integral_sum(const uint8_t * src, size_t src_stride, size_t width, size_t height, uint32_t * sum, size_t sum_stride)
{
    memset(sum, 0, (width + 1) * sizeof(uint32_t));
    sum += sum_stride + 1;
    for (size_t row = 0; row < height; row++)
    {
        uint32_t row_sum = 0;
        sum[-1] = 0;
        for (size_t col = 0; col < width; col++)
        {
            row_sum += src[col];
            sum[col] = row_sum + sum[col - sum_stride];
        }
        src += src_stride;
        sum += sum_stride;
    }
}

My question is: can I speed up this algorithm (for example, by using SSE or AVX)?
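
For reference, here is a minimal sketch of how I call it (my own example, assuming the layout implied by the code above: the sum buffer has an extra zero row and zero column, so it is (height + 1) x (width + 1) elements and sum_stride = width + 1):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* assumes the integral_sum above is defined in the same file */

int main(void)
{
    const size_t width = 640, height = 480;
    uint8_t  *src = malloc(width * height);
    uint32_t *sum = malloc((width + 1) * (height + 1) * sizeof(uint32_t));
    memset(src, 1, width * height); /* dummy image filled with ones */

    integral_sum(src, width, width, height, sum, width + 1);

    /* sum[(r + 1) * (width + 1) + (c + 1)] holds the sum of src over [0..r] x [0..c] */
    assert(sum[height * (width + 1) + width] == width * height);

    free(src);
    free(sum);
    return 0;
}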

Tristan
  • see [Is computing integral image on GPU really faster than on CPU?](https://stackoverflow.com/a/43909260/2521214) or you can even use multi-threading on CPU. – Spektre Oct 02 '17 at 06:37
  • 1
  • You could remove the `memset` because you immediately overwrite the buffer. – Galik Oct 02 '17 at 07:05
  • @Galik There is no overwriting (`sum += sum_stride + 1;`). – ErmIg Oct 02 '17 at 08:10

1 Answer


There is an awkward property of this algorithm: the integral sum at each point of the image depends on the previous value of the integral sum in the same row. This dependency obstructs vectorization of the algorithm (the use of vector instructions such as SSE or AVX). However, there is a trick based on the special instruction vpsadbw, which exists in a 256-bit form in AVX2 and in a 512-bit form in AVX-512BW.
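
To make the trick clearer before the full code, here is a small standalone sketch of my own (compile with -mavx2) showing how byte-prefix masks combined with vpsadbw (_mm256_sad_epu8) produce four prefix sums at once:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const uint8_t src[4] = { 10, 20, 30, 40 };
    uint32_t quad;
    memcpy(&quad, src, 4);

    // byte prefixes of length 1, 2, 3 and 4 in the four 64-bit lanes
    __m256i MASK = _mm256_setr_epi64x(0xFF, 0xFFFF, 0xFFFFFF, 0xFFFFFFFF);
    // broadcast the 4 bytes to every lane and keep a different prefix in each one
    __m256i bytes = _mm256_and_si256(_mm256_set1_epi32((int)quad), MASK);
    // vpsadbw adds the bytes of every 64-bit lane: four prefix sums in one instruction
    __m256i sums = _mm256_sad_epu8(bytes, _mm256_setzero_si256());

    uint64_t out[4];
    _mm256_storeu_si256((__m256i*)out, sums);
    printf("%u %u %u %u\n", (unsigned)out[0], (unsigned)out[1], (unsigned)out[2], (unsigned)out[3]);
    // prints: 10 30 60 100
    return 0;
}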

AVX2 version of the algorithm:

void integral_sum(const uint8_t * src, size_t src_stride, size_t width, size_t height, uint32_t * sum, size_t sum_stride)
{
    // byte prefixes of length 1, 2, 3 and 4 in the four 64-bit lanes
    __m256i MASK = _mm256_setr_epi64x(0x00000000000000FF, 0x000000000000FFFF, 0x0000000000FFFFFF, 0x00000000FFFFFFFF);
    __m256i PACK = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
    __m256i ZERO = _mm256_set1_epi32(0);

    memset(sum, 0, (width + 1)*sizeof(uint32_t));
    sum += sum_stride + 1;
    size_t aligned_width = width/4*4;

    for(size_t row = 0; row < height; row++)
    {
        sum[-1] = 0;
        size_t col = 0;
        __m256i row_sums = ZERO;
        for(; col < aligned_width; col += 4)
        {
            // broadcast 4 source bytes and keep a different byte prefix in each 64-bit lane
            __m256i _src = _mm256_and_si256(_mm256_set1_epi32(*(uint32_t*)(src + col)), MASK);
            // vpsadbw adds the bytes of every lane; adding the carried total gives 4 running sums
            row_sums = _mm256_add_epi32(row_sums, _mm256_sad_epu8(_src, ZERO));
            // pack the low 32 bits of the four 64-bit sums into a 128-bit register
            __m128i curr_row_sums = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(row_sums, PACK));
            __m128i prev_row_sums = _mm_loadu_si128((__m128i*)(sum + col - sum_stride));
            _mm_storeu_si128((__m128i*)(sum + col), _mm_add_epi32(curr_row_sums, prev_row_sums));
            // broadcast the last lane (sum of all 4 bytes) so the next iteration continues from it
            row_sums = _mm256_permute4x64_epi64(row_sums, 0xFF);
        }
        // recover the running row sum for the scalar tail
        uint32_t row_sum = sum[col - 1] - sum[col - sum_stride - 1];
        for (; col < width; col++)
        {
            row_sum += src[col];
            sum[col] = row_sum + sum[col - sum_stride];
        }
        src += src_stride;
        sum += sum_stride;
    }
}

This trick can improve performance by 1.8 times.
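
If you want to check the vectorized routine, here is a quick test harness of my own (not part of the original code; it assumes the AVX2 integral_sum above is compiled in the same file and that sum_stride = width + 1) that compares it against a naive double loop:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const size_t width = 101, height = 37;   /* odd width exercises the scalar tail */
    const size_t sum_stride = width + 1;
    uint8_t  *src = malloc(width * height);
    uint32_t *sum = malloc(sum_stride * (height + 1) * sizeof(uint32_t));
    for (size_t i = 0; i < width * height; i++)
        src[i] = (uint8_t)rand();

    integral_sum(src, width, width, height, sum, sum_stride);

    /* reference: sum[(r+1)][(c+1)] must equal the sum of src over [0..r] x [0..c] */
    for (size_t r = 0; r < height; r++)
        for (size_t c = 0; c < width; c++)
        {
            uint32_t ref = 0;
            for (size_t i = 0; i <= r; i++)
                for (size_t j = 0; j <= c; j++)
                    ref += src[i * width + j];
            if (sum[(r + 1) * sum_stride + (c + 1)] != ref)
            {
                printf("mismatch at (%zu, %zu)\n", r, c);
                return 1;
            }
        }
    printf("OK\n");
    free(src);
    free(sum);
    return 0;
}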

An analogous version using AVX-512BW:

void integral_sum(const uint8_t * src, size_t src_stride, size_t width, size_t height, uint32_t * sum, size_t sum_stride)
{
    // byte prefixes of length 1..8 in the eight 64-bit lanes
    __m512i MASK = _mm512_setr_epi64(
        0x00000000000000FF, 0x000000000000FFFF, 0x0000000000FFFFFF, 0x00000000FFFFFFFF,
        0x000000FFFFFFFFFF, 0x0000FFFFFFFFFFFF, 0x00FFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF);
    __m512i K_7 = _mm512_set1_epi64(7); // index of the last 64-bit lane
    __m512i ZERO = _mm512_set1_epi32(0);

    memset(sum, 0, (width + 1)*sizeof(uint32_t));
    sum += sum_stride + 1;
    size_t aligned_width = width/8*8;

    for(size_t row = 0; row < height; row++)
    {
        sum[-1] = 0;
        size_t col = 0;
        __m512i row_sums = ZERO;
        for(; col < aligned_width; col += 8)
        {
            // broadcast 8 source bytes and keep a different byte prefix in each 64-bit lane
            __m512i _src = _mm512_and_si512(_mm512_set1_epi64(*(uint64_t*)(src + col)), MASK);
            // vpsadbw adds the bytes of every lane; adding the carried total gives 8 running sums
            row_sums = _mm512_add_epi64(row_sums, _mm512_sad_epu8(_src, ZERO));
            // narrow the eight 64-bit sums to 32 bits and add the previous row of the integral image
            __m256i curr_row_sums = _mm512_cvtepi64_epi32(row_sums);
            __m256i prev_row_sums = _mm256_loadu_si256((__m256i*)(sum + col - sum_stride));
            _mm256_storeu_si256((__m256i*)(sum + col), _mm256_add_epi32(curr_row_sums, prev_row_sums));
            // broadcast the last lane (sum of all 8 bytes) so the next iteration continues from it
            row_sums = _mm512_permutexvar_epi64(K_7, row_sums);
        }
        uint32_t row_sum = sum[col - 1] - sum[col - sum_stride - 1];
        for (; col < width; col++)
        {
            row_sum += src[col];
            sum[col] = row_sum + sum[col - sum_stride];
        }
        src += src_stride;
        sum += sum_stride;
    }
}

This trick can improve performance by 3.5 times.

P.S. The original algorithms can be found here: AVX2 and AVX-512BW.

ErmIg