I would like to use ARM NEON to downscale an 8-bit greyscale image by a factor of 4, from 1280x960 to 320x240.
As an example, I already have a resize by a factor of 2 from 640x480 to 320x240:
/* Declared here because the definition appears after its first use. */
void resizeline2(uint8_t * __restrict src1, uint8_t * __restrict src2, uint8_t * __restrict dest);

/*
 * Downscale an 8-bit grey image by a factor of 2 in each dimension.
 *
 * src  : 640 x 480 input image, rows stored contiguously (stride 640).
 * dest : 320 x 240 output image, rows stored contiguously (stride 320).
 *
 * Each output row is produced by handing two consecutive source rows to
 * resizeline2().
 */
void divideimageby2(uint8_t * src, uint8_t * dest) {
    int h;
    for (h = 0; h < 240; h++) {
        /* Output row h comes from source rows 2h and 2h + 1. */
        resizeline2(src + 640 * (h * 2 + 0), src + 640 * (h * 2 + 1), dest + 320 * h);
    }
}
/*
 * Combine two 640-pixel source rows into one 320-pixel destination row.
 *
 * Every output pixel is the truncated mean of a 2x2 block: the two
 * horizontally adjacent pixels of src1 plus the two pixels directly below
 * them in src2, summed and shifted right by 2.
 *
 * src1, src2 : the two input rows (640 bytes each are read).
 * dest       : the output row (320 bytes are written).
 */
void resizeline2(uint8_t * __restrict src1, uint8_t * __restrict src2, uint8_t * __restrict dest) {
    int out;
    /* Each pass consumes 16 pixels from both rows and emits 8 pixels. */
    for (out = 0; out < 320; out += 8) {
        uint8x16_t upper = vld1q_u8(src1 + 2 * out);
        uint8x16_t lower = vld1q_u8(src2 + 2 * out);
        /* Pairwise widening adds give the horizontal sums; adding the two
         * rows completes the 2x2 sum (at most 4 * 255, fits in 16 bits). */
        uint16x8_t block_sum = vaddq_u16(vpaddlq_u8(upper), vpaddlq_u8(lower));
        /* Divide by 4 and narrow back to 8 bits. */
        vst1_u8(dest + out, vshrn_n_u16(block_sum, 2));
    }
}
If I want to do the same for a factor of 4, which NEON intrinsics could I use in resizeline4 to aggregate 4 lines?
/* Declared here because the definition appears after its first use. */
void resizeline4(uint8_t * __restrict src1, uint8_t * __restrict src2, uint8_t * __restrict src3, uint8_t * __restrict src4, uint8_t * __restrict dest);

/*
 * Downscale an 8-bit grey image by a factor of 4 in each dimension.
 *
 * src  : 1280 x 960 input image, rows stored contiguously (stride 1280).
 * dest : 320 x 240 output image, rows stored contiguously (stride 320).
 *
 * Each output row is produced by handing four consecutive source rows to
 * resizeline4().
 */
void divideimageby4(uint8_t * src, uint8_t * dest) {
    int h;
    for (h = 0; h < 240; h++) {
        /* Output row h comes from source rows 4h .. 4h + 3; the source
         * stride is the full 1280-pixel row width. */
        resizeline4(src + 1280 * (h * 4 + 0),
                    src + 1280 * (h * 4 + 1),
                    src + 1280 * (h * 4 + 2),
                    src + 1280 * (h * 4 + 3),
                    dest + 320 * h);
    }
}
/*
 * Combine four 1280-pixel source rows into one 320-pixel destination row.
 *
 * Every output pixel is the truncated mean of a 4x4 block: four horizontally
 * adjacent pixels from each of the four rows, summed and shifted right by 4.
 * Truncation (rather than rounding) matches resizeline2().
 *
 * src1..src4 : the four input rows (1280 bytes each are read).
 * dest       : the output row (320 bytes are written).
 *
 * When NEON is available the row is processed 32 input pixels at a time so
 * that each pass yields exactly 8 output bytes, which can be written with a
 * single 64-bit store. Otherwise a plain scalar loop computes the same result.
 */
void resizeline4(uint8_t * __restrict src1, uint8_t * __restrict src2, uint8_t * __restrict src3, uint8_t * __restrict src4, uint8_t * __restrict dest) {
    int w;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    for (w = 0; w < 1280; w += 32) {
        /* Widening pairwise add of the first row: 8 x u16 per 16 pixels. */
        uint16x8_t lo = vpaddlq_u8(vld1q_u8(src1));
        uint16x8_t hi = vpaddlq_u8(vld1q_u8(src1 + 16));

        /* Pairwise add-and-accumulate the remaining three rows. Each lane
         * now holds a 2-wide x 4-high sum (at most 8 * 255 = 2040). */
        lo = vpadalq_u8(lo, vld1q_u8(src2));
        hi = vpadalq_u8(hi, vld1q_u8(src2 + 16));
        lo = vpadalq_u8(lo, vld1q_u8(src3));
        hi = vpadalq_u8(hi, vld1q_u8(src3 + 16));
        lo = vpadalq_u8(lo, vld1q_u8(src4));
        hi = vpadalq_u8(hi, vld1q_u8(src4 + 16));

        /* Fold adjacent lanes once more to get the 4x4 block sums
         * (at most 16 * 255 = 4080, still within 16 bits). vpadd_u16
         * keeps the results in left-to-right pixel order. */
        uint16x4_t sum_lo = vpadd_u16(vget_low_u16(lo), vget_high_u16(lo));
        uint16x4_t sum_hi = vpadd_u16(vget_low_u16(hi), vget_high_u16(hi));

        /* Divide by 16, narrow to 8 bits and store 8 output pixels. */
        vst1_u8(dest, vshrn_n_u16(vcombine_u16(sum_lo, sum_hi), 4));

        src1 += 32;
        src2 += 32;
        src3 += 32;
        src4 += 32;
        dest += 8;
    }
#else
    /* Portable fallback: one 4x4 block per iteration. */
    for (w = 0; w < 1280; w += 4) {
        unsigned sum = 0;
        int i;
        for (i = 0; i < 4; i++) {
            sum += (unsigned)src1[i] + src2[i] + src3[i] + src4[i];
        }
        *dest++ = (uint8_t)(sum >> 4);
        src1 += 4;
        src2 += 4;
        src3 += 4;
        src4 += 4;
    }
#endif
}