The following code is used to calculate FIR:
void Fir(float* pIn, float* pOut, float* pCoeff, float* pStage, uint32_t N, uint32_t FilterLength)
{
int n, k;
float* pSrc;
float* pCoeffSrc = pCoeff;
float* pDst = pOut;
float s0, s1, s2, s3;
__m128 Vec, Mul;
__m128 Sum0,Sum1,Sum2,Sum3;
__m128 Zero = _mm_set_ps1(0);
memcpy(&pStage[FilterLength - 1], pIn, N * sizeof(float));
for (n = 0; n < N; n+=4)
{
//Sum0
pSrc = &pStage[n];
Sum0 = _mm_set_ps1(0);
pCoeffSrc = pCoeff;
for (k = 0; k < FilterLength >> 2; k++)
{
__m128 Coeff = _mm_load_ps(pCoeffSrc);
Vec = _mm_load_ps(pSrc);
Sum0 = _mm_fmadd_ps(Coeff, Vec, Sum0);
pCoeffSrc += 4;
pSrc += 4;
}
Sum0 = _mm_hadd_ps(Sum0, Zero);
Sum0 = _mm_hadd_ps(Sum0, Zero);
//Sum1
pSrc = &pStage[n+1];
Sum1 = _mm_set_ps1(0);
pCoeffSrc = pCoeff;
for (k = 0; k < FilterLength >> 2; k++)
{
__m128 Coeff = _mm_load_ps(pCoeffSrc);
Vec = _mm_load_ps(pSrc);
Sum1 = _mm_fmadd_ps(Coeff, Vec, Sum1);
pCoeffSrc += 4;
pSrc += 4;
}
Sum1 = _mm_hadd_ps(Sum1, Zero);
Sum1 = _mm_hadd_ps(Sum1, Zero);
//Sum2
pSrc = &pStage[n+2];
Sum2 = _mm_set_ps1(0);
pCoeffSrc = pCoeff;
for (k = 0; k < FilterLength >> 2; k++)
{
__m128 Coeff = _mm_load_ps(pCoeffSrc);
Vec = _mm_load_ps(pSrc);
Sum2 = _mm_fmadd_ps(Coeff, Vec, Sum2);
pCoeffSrc += 4;
pSrc += 4;
}
Sum2 = _mm_hadd_ps(Sum2, Zero);
Sum2 = _mm_hadd_ps(Sum2, Zero);
//Sum3
pSrc = &pStage[n+3];
Sum3 = _mm_set_ps1(0);
pCoeffSrc = pCoeff;
for (k = 0; k < FilterLength >> 2; k++)
{
__m128 Coeff = _mm_load_ps(pCoeffSrc);
Vec = _mm_load_ps(pSrc);
Sum3 = _mm_fmadd_ps(Coeff, Vec, Sum3);
pCoeffSrc += 4;
pSrc += 4;
}
Sum3 = _mm_hadd_ps(Sum3, Zero);
Sum3 = _mm_hadd_ps(Sum3, Zero);
Vec = _mm_set_ps(Sum3.m128_f32[0], Sum2.m128_f32[0], Sum1.m128_f32[0], Sum0.m128_f32[0]);
_mm_store_ps(pDst, Vec);
pDst+=4;
}
}
The result of the each inner loop (4) is a scalar sum of a vector. Then I create a vector from 4 scalars by:
Vec = _mm_set_ps(Sum3.m128_f32[0], Sum2.m128_f32[0], Sum1.m128_f32[0], Sum0.m128_f32[0]);
Vec is stored in RAM by: _mm_store_ps(pDst, Vec);
Can I optimize this code ?
Thank you, Zvika