The attached code uses SSE optimization.
The implementation use C intrinsic - no Assembly...
For simplicity I assumed R, G and B are three different planes,
stored in memory as R matrix, G matrix and B matrix, and not in data-ordered r,g,b,r,g,b,r,g,b...
The code uses fixed point implementation for better performance.
Important notices:
- Multiply by (1/3) is much for efficient than dividing by 3.
- Adding 0.5 before integer cast can be used for rounding a positive value.
- Fixed point implementation of (1/3) scaling performed by expanding, scaling and shifting. Example: avg = (sum*scale + rounding) >> 15; [When scale = (1/3)*2^15].
- _mm_mulhrs_epi16 intrinsic is doing the above operation: (x*scl + 2^14) >> 15.
Implementation comments include more explanations:
//Calculate elements average of 3 vectors R, G and B, and store result into J.
//Implementation uses SSE intrinsics for performance optimization.
//Use fixed point computations for better performance.
//R - Plain of red pixels: RRRRRRRRRRRRRRRR
//G - Plain of green pixels: GGGGGGGGGGGGGGGG
//B - Plain of blue pixels: BBBBBBBBBBBBBBBB
//image_size: Total number of pixels (width * height).
//J - Destination Grayscale plane: JJJJJJJJJJJJJJJJ
//Limitations:
//1. image_size must be a multiple of 16.
void RgbAverage(const unsigned char R[],
const unsigned char G[],
const unsigned char B[],
int image_size,
unsigned char J[])
{
int x;
/*
//1. Plain C code:
//--------------------
for (x = 0; x < image_size; x++)
{
//Add 0.5 for rounding (round instead of floor).
//Multiply by (1.0/3.0) - much more efficient than dividing by 3.
J[x] = (unsigned char)(((double)R[x] + (double)G[x] + (double)B[x])*(1.0/3.0) + 0.5);
}
*/
/*
//2. Plain C fixed point implementation:
// Read this code first, for better understanding the SSE implementation.
const unsigned int scale = (unsigned int)((1.0/3.0)*(1 << 15) + 0.5); //scale equals 1/3 expanded by 2^15.
const unsigned int roundig_ofs = (unsigned int)(1 << 14); //Offset of 2^14 for rounding (equals 0.5 expanded by 2^15).
for (x = 0; x < image_size; x++)
{
unsigned int r0 = (unsigned int)R[x];
unsigned int g0 = (unsigned int)G[x];
unsigned int b0 = (unsigned int)B[x];
unsigned int sum = r0 + g0 + b0;
//Multiply by (1/3) with rounding:
//avg = (sum*(1/3)*2^15 + 2^14) / 2^15 = floor(sum*(1/3) + 0.5)
unsigned int avg = (sum * scale + roundig_ofs) >> 15;
J[x] = (unsigned char)avg;
}
*/
//3. SSE optimized implementation:
const unsigned int scale = (unsigned int)((1.0/3.0)*(1 << 15) + 0.5); //scale equals 1/3 expanded by 2^15.
const __m128i vscl = _mm_set1_epi16((short)scale); //8 packed int16 scale elements - scl_scl_scl_scl_scl_scl_scl_scl
//Process 16 pixels per iteration.
for (x = 0; x < image_size; x += 16)
{
__m128i r15_to_r0 = _mm_loadu_si128((__m128i*)&R[x]); //Load 16 uint8 R elements.
__m128i g15_to_g0 = _mm_loadu_si128((__m128i*)&G[x]); //Load 16 uint8 G elements.
__m128i b15_to_b0 = _mm_loadu_si128((__m128i*)&B[x]); //Load 16 uint8 B elements.
//Unpack uint8 elements to uint16 elements:
__m128i r7_to_r0 = _mm_unpacklo_epi8(r15_to_r0, _mm_setzero_si128()); //Lower 8 R elements
__m128i r15_to_r8 = _mm_unpackhi_epi8(r15_to_r0, _mm_setzero_si128()); //Upper 8 R elements
__m128i g7_to_g0 = _mm_unpacklo_epi8(g15_to_g0, _mm_setzero_si128()); //Lower 8 G elements
__m128i g15_to_g8 = _mm_unpackhi_epi8(g15_to_g0, _mm_setzero_si128()); //Upper 8 G elements
__m128i b7_to_b0 = _mm_unpacklo_epi8(b15_to_b0, _mm_setzero_si128()); //Lower 8 B elements
__m128i b15_to_b8 = _mm_unpackhi_epi8(b15_to_b0, _mm_setzero_si128()); //Upper 8 B elements
__m128i sum7_to_sum0 = _mm_add_epi16(_mm_add_epi16(r7_to_r0, g7_to_g0), b7_to_b0); //Lower 8 sum elements.
__m128i sum15_to_sum8 = _mm_add_epi16(_mm_add_epi16(r15_to_r8, g15_to_g8), b15_to_b8); //Upper 8 sum elements.
//Most important trick:
//_mm_mulhrs_epi16 intrinsic does exactly what we wanted i.e: avg = (sum*scl + (1<<14)) >> 15.
//Each SSE instruction perform the described operation on 8 elements.
__m128i avg7_to_avg0 = _mm_mulhrs_epi16(sum7_to_sum0, vscl); //Lower 8 average elements.
__m128i avg15_to_avg8 = _mm_mulhrs_epi16(sum15_to_sum8, vscl); //Upper 8 average elements.
//Pack the result to 16 uint8 elements.
__m128i j15_to_j0 = _mm_packus_epi16(avg7_to_avg0, avg15_to_avg8);
_mm_storeu_si128((__m128i*)(&J[x]), j15_to_j0); //Store 16 elements of J.
}
}