Assuming your input matrix is of type CV_64FC2, call the computeSumX2 C function once for each row.
Untested.
#include <arm_neon.h>
#include <stdint.h>
#include <stddef.h>
// Load 8 consecutive FP64 values, add adjacent pairs to get 4 sums, and return
// the sums as 4 unsigned 32-bit lanes.
//   rsi: pointer to at least 8 readable doubles; lane i of the result holds
//        rsi[ 2 * i ] + rsi[ 2 * i + 1 ].
// Each sum is converted to an unsigned integer by truncating toward zero with
// saturation (FCVTZU semantics: negative sums and NaN become 0), then narrowed
// with saturation, so sums too large for 32 bits become UINT32_MAX.
// Declared `static inline` rather than plain `inline`: in C a plain `inline`
// definition emits no out-of-line symbol, so any call the compiler decides not
// to inline (e.g. at -O0) would fail to link.
static inline uint32x4_t reduce4( const double* rsi )
{
    // Load 8 values as 4 vectors of 2 lanes each
    const float64x2x4_t f64 = vld1q_f64_x4( rsi );
    // Add them pairwise: { v0 + v1, v2 + v3 } and { v4 + v5, v6 + v7 }
    const float64x2_t f64_1 = vpaddq_f64( f64.val[ 0 ], f64.val[ 1 ] );
    const float64x2_t f64_2 = vpaddq_f64( f64.val[ 2 ], f64.val[ 3 ] );
    // Convert FP64 to uint64, truncating toward zero
    const uint64x2_t i64_1 = vcvtq_u64_f64( f64_1 );
    const uint64x2_t i64_2 = vcvtq_u64_f64( f64_2 );
    // Narrow uint64 to uint32 with saturation, packing both halves into one vector
    const uint32x2_t low = vqmovn_u64( i64_1 );
    return vqmovn_high_u64( low, i64_2 );
}
// Compute pairwise sums of FP64 values and store each sum as one byte.
//   rdi:    destination buffer, receives `length` bytes
//   length: number of output bytes, i.e. the number of input pairs
//   rsi:    source buffer of 2 * length doubles; output i is derived from
//           rsi[ 2 * i ] + rsi[ 2 * i + 1 ]
// Every sum is truncated toward zero and clamped to [ 0, 255 ]; negative sums
// and NaN produce 0. The vectorized main loop and the scalar tail apply the
// same rule, so the result does not depend on where the tail begins.
void computeSumX2( uint8_t* rdi, size_t length, const double* rsi )
{
    const double* const rsiEnd = rsi + length * 2;
    // The main loop handles whole groups of 16 outputs; the remainder goes to the tail loop
    const size_t lengthAligned = ( length / 16 ) * 16;
    const double* const rsiEndAligned = rsi + lengthAligned * 2;

    for( ; rsi < rsiEndAligned; rsi += 16 * 2, rdi += 16 )
    {
        // Each iteration of the loop loads 32 source values, stores 16 bytes.
        // All narrowing steps saturate: uint32 -> uint16 -> uint8.
        uint16x4_t low16 = vqmovn_u32( reduce4( rsi ) );
        uint16x8_t u16 = vqmovn_high_u32( low16, reduce4( rsi + 8 ) );
        const uint8x8_t low8 = vqmovn_u16( u16 );

        low16 = vqmovn_u32( reduce4( rsi + 8 * 2 ) );
        u16 = vqmovn_high_u32( low16, reduce4( rsi + 8 * 3 ) );
        const uint8x16_t res = vqmovn_high_u16( low8, u16 );

        vst1q_u8( rdi, res );
    }

    for( ; rsi < rsiEnd; rsi += 2, rdi++ )
    {
        // Each iteration of the loop loads 2 source values, stores a single byte
        const float64x2_t f64 = vld1q_f64( rsi );
        const double sum = vaddvq_f64( f64 );

        // Clamp explicitly: converting an out-of-range double to uint8_t is
        // undefined behavior in C, and the vector path above saturates.
        uint8_t byte;
        if( sum >= 255.0 )
            byte = 255;
        else if( sum >= 0.0 )
            byte = (uint8_t)sum;
        else
            byte = 0; // negative sum or NaN (all comparisons with NaN are false)
        *rdi = byte;
    }
}