I need to apply a Gaussian filter to a large source image. I have implemented the algorithm below. I optimized it using NEON and got a significant performance gain, but it still needs improvement to run in real time. Can someone please suggest whether there is room for further improvement, especially in the NEON code? I feel my NEON code is not fully optimized and could process 16 pixels at a time. I am a beginner with NEON, so I could not write better code; it would be very helpful if someone could provide improved code if possible.
void BlurRow( src, dest, gausian )
{
process each pixel from src and calculate destination pixel value r g b a
by calling ComputeFinalPixelvalue
}
void BlurImage( src, dest )
{
for each row call BlurRow with Gaussian kernel gx
transpose matrix
for each row call BlurRow with Gaussian kernel gy
transpose matrix
}
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
/// Returns the sum of the four 32-bit lanes of `lanes` (wraps modulo 2^32).
static inline uint32_t SumLanesU32(uint32x4_t lanes)
{
    // Fold 4 lanes -> 2 lanes -> 1 lane; works on both ARMv7 and AArch64.
    const uint32x2_t pairSums = vadd_u32(vget_low_u32(lanes), vget_high_u32(lanes));
    return vget_lane_u32(vpadd_u32(pairSums, pairSums), 0);
}
#endif

/// Computes the weighted per-channel sums of a run of 4-byte pixels.
///
/// For every pixel i in [0, pixelcount) each of its four bytes is multiplied
/// by pGaussElements[i] and added to the matching channel total. Bytes are
/// taken in memory order: byte 0 -> rvalue, byte 1 -> gvalue, byte 2 -> bvalue,
/// byte 3 -> avalue.
///
/// @param sourcePixels   First of `pixelcount` consecutive 4-byte pixels.
/// @param pixelcount     Number of pixels (and weights) to process; may be 0
///                       and need not be a multiple of 8.
/// @param pGaussElements First of `pixelcount` 16-bit weights (read only).
/// @param rvalue,gvalue,bvalue,avalue
///                       Channel totals. The new sums are ADDED to whatever the
///                       caller passed in, so initialise them before the call.
///
/// All arithmetic is unsigned 32-bit and wraps on overflow.
/// NOTE(review): presumably the weights are scaled so that the total fits in
/// 32 bits - confirm against the kernel generator.
void ComputeFinalPixelvalue(const uint32_t* sourcePixels,
uint32_t pixelcount, uint16_t* pGaussElements,
uint32_t& rvalue, uint32_t& gvalue, uint32_t& bvalue, uint32_t& avalue )
{
    // Byte-wise view of the pixels; this is the same view vld4_u8 de-interleaves.
    const unsigned char* srcBytes = reinterpret_cast<const unsigned char*>(sourcePixels);
    const uint16_t* weights = pGaussElements;
    uint32_t remaining = pixelcount;

    uint32_t sumR = 0;
    uint32_t sumG = 0;
    uint32_t sumB = 0;
    uint32_t sumA = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    // One independent accumulator per channel. Mixing them up (e.g. feeding the
    // G accumulator into the B update) silently corrupts the sums.
    uint32x4_t accR = vdupq_n_u32(0);
    uint32x4_t accG = vdupq_n_u32(0);
    uint32x4_t accB = vdupq_n_u32(0);
    uint32x4_t accA = vdupq_n_u32(0);

    for (; remaining >= 8; remaining -= 8)
    {
        // De-interleave 8 pixels: val[c] holds byte c of each of the 8 pixels.
        const uint8x8x4_t pixels = vld4_u8(srcBytes);

        // Matching 8 weights, split into the halves vmlal_u16 consumes.
        const uint16x8_t weights8 = vld1q_u16(weights);
        const uint16x4_t weightsLow = vget_low_u16(weights8);
        const uint16x4_t weightsHigh = vget_high_u16(weights8);

        // Widen each channel once to 16 bits; vmlal_u16 then performs the
        // 16x16 -> 32-bit multiply and the accumulate in a single instruction,
        // so no separate widening of either operand to 32 bits is needed.
        const uint16x8_t r16 = vmovl_u8(pixels.val[0]);
        accR = vmlal_u16(accR, vget_low_u16(r16), weightsLow);
        accR = vmlal_u16(accR, vget_high_u16(r16), weightsHigh);

        const uint16x8_t g16 = vmovl_u8(pixels.val[1]);
        accG = vmlal_u16(accG, vget_low_u16(g16), weightsLow);
        accG = vmlal_u16(accG, vget_high_u16(g16), weightsHigh);

        const uint16x8_t b16 = vmovl_u8(pixels.val[2]);
        accB = vmlal_u16(accB, vget_low_u16(b16), weightsLow);
        accB = vmlal_u16(accB, vget_high_u16(b16), weightsHigh);

        const uint16x8_t a16 = vmovl_u8(pixels.val[3]);
        accA = vmlal_u16(accA, vget_low_u16(a16), weightsLow);
        accA = vmlal_u16(accA, vget_high_u16(a16), weightsHigh);

        srcBytes += 8 * 4;
        weights += 8;
    }

    sumR = SumLanesU32(accR);
    sumG = SumLanesU32(accG);
    sumB = SumLanesU32(accB);
    sumA = SumLanesU32(accA);
#endif

    // Scalar loop: handles the trailing (pixelcount % 8) pixels after the
    // vector loop, and every pixel when NEON is not available.
    for (; remaining > 0; --remaining)
    {
        const uint32_t weight = *weights++;
        sumR += srcBytes[0] * weight;
        sumG += srcBytes[1] * weight;
        sumB += srcBytes[2] * weight;
        sumA += srcBytes[3] * weight;
        srcBytes += 4;
    }

    rvalue += sumR;
    gvalue += sumG;
    bvalue += sumB;
    avalue += sumA;
}