KissFFT Versus DSPIC - Rounding Errors

Question

Trying to get the KissFFT fixed point implementation to line up with the DSPIC. The issue is that the fixed point implementation in Kiss is a true fixed point but the dspic does the multiplies and sums in a 40 bit register then shifts down to 16 bits after rounding. The KissFFT can be 16/32 bit fixed point or float. So far the float is the closest matching but I need them to be exact. I'm not sure how many times each frequency bin is updated in the ASM code but as you can see the accumulator result is shifted and rounded each time the bin is updated. I don't have enough FFT knowledge to solve this. If anyone can point me in the correct direction it would be greatly appreciated.

Here is the ASM Code:

 .global _FFT
_FFT:
    push.d  w8

    push.d  w10
    push.d  w12
    push    w14
    push    CORCON
    mov     #0x00f1, w7
    mov     w7, CORCON
    push    PSVPAG
    push    w1                                  ; save return value
    mov     #0xff00, w7                         ; check if w3==COEFFS_IN_DATA
    cp      w7, w3
    bra     z, $+6
    bset    CORCON, #2
    mov     w3, PSVPAG
    mov     #1, w3
    sl      w3, w0, w3                          ; w3 = N (1<<log2N)
    mov     #0x8000, w14
    dec2    w0, w12                             ; w12 is # of non-trivial stages
    mov     #4, w0                              ; w0 = 4 * (1, 2, 4, ..., (N/2))
    mov     #0x0018, w9                         ; w9->w12
    mov     #0x8000, w6
    lsr     w3, w3                              ; start of outer loop, w3 = N/2, N/4, N/8, ..., 1
    sl      w3, #2, w1                          ; w1 = 4 * (N/2, N/4, N/8, ..., 1)
    mov     [w15-2], w10                        ; w10->start of butterfly
    lsr     w0, #2, w4                          ; w4 = groups per stage
    dec     w4, w4
    do      w4, $+88                            ; first butterfly in group has trivial multiplications
    add     w0, w2, w8
    mov     w10, w13
    add     w1, w10, w11
    mov     [w10++], w4
    mpy.n   w4*w6, a, [w11]+=2, w5              ; a = Ar/2
    msc     w5*w6, a, [w10]+=2, w7              ; a = (Ar+Br)/2
    mpy.n   w6*w7, b, [w11]+=2, w4              ; b = Ai/2
    msc     w4*w6, b, [w13]+=2                  ; b = (Ai+Bi)/2
    mac     w5*w6, a, [w13]+=2
    sub     w11, #4, w13
    mac     w5*w6, a, [w11]+=2, w5              ; a = (Ar-Br)/2
    mac     w4*w6, b
    mac     w4*w6, b, [w8]+=2, w6, [w11]+=2, w7, [w13]+=2  ; b = (Ai-Bi)/2
    sub     w3, #3, w4
    do      w4, $+28                            ; b = previous Bi, w5 = Br, w6 = Wr, w7 = Bi, w8-> Wi, w10-> Ar, w11-> next Br, w13-> previous Bi
    lac     [w10], a                            ; a=Ar
    add     w5, a                               ; a=Ar+Br
    subr    w5, [w10], w4                       ; w4=Ar-Br
    sac.r   a, #1, [w10]                        ; *w10++=__real__(A+B)
    lac     [++w10], a                          ; a=Ai
    add     w7, a                               ; a=Ai+Bi
    subr    w7, [w10], w5                       ; w5=Ai-Bi
    sac.r   a, #1, [w10++]                      ; *w10++=__imag__(A+B)
    mpy     w4*w6, a, [w8]-=2, w7               ; a=(Ar-Br)*Wr, w7=Wi
    msc     w5*w7, a, [w13]+=2                  ; a=(Ar-Br)*Wr-(Ai-Bi)*Wi, *w13++ = previous Bi
    add     w0, w8, w8                          ; w8->next Wr
    mpy     w5*w6, b, [w11]+=2, w5              ; b=(Ai-Bi)*Wr, w5=next Br, w11->next Bi
    mac     w4*w7, b, [w8]+=2, w6, [w11]+=2, w7, [w13]+=2   ; b=(Ai-Bi)*Wr+(Ar-Br)*Wi, w6=next Wr=*w8++, w7=next Bi=*w11++, *w13++=__real__(A-B)*W
    lac     [w10], a                            ; epilog
    add     w5, a
    subr    w5, [w10], w4
    sac.r   a, #1, [w10]
    lac     [++w10], a
    add     w7, a
    subr    w7, [w10], w5
    sac.r   a, #1, [w10++]
    mpy     w4*w6, a, [w8]+=2, w7
    msc     w5*w7, a, [w13]+=2
    mpy     w5*w6, b, [w9]+=4, w6
    mac     w4*w7, b, [w9]-=4, w6, [w13]+=2
    clr     a, [w13]+=2
    mov     w11, w10                            ; last instruction in group
    sl      w0, w0                              ; next stage, double twiddle factor offset
    dec     w12, w12
    bra     gt, $-104                           ; if w12 > 0, do next stage
    mov     [w15-2], w10                        ; last two stages are done simultaneously
    mov     [w15-2], w13
    add     w10, #8, w11
    lsr     w0, #2, w3
    dec     w3, w3
    clr     w8
    mov     #0x4000, w12
    clr     a, [w9]+=4, w6, [w10]+=2, w4        ; initialize Ar, w6=0x4000
    mov     [w10++], w5                         ; initialize Ai, w10->Br
    do      w3, $+58
    mov     #12, w0                             ; adjust DOSTART to run prolog only once
    add     DOSTARTL
    bra     NC, $+4
    inc     DOSTARTH
    sub     w4, [w11], w0                       ; w0 = Ar-Cr
    bra     $+10                                ; w4 = Ar, w5 = Ai, w6 = 0x4000, w8->w0, w9->w14, w10->Br, w11->Cr, w12= 0x4000, w13->last Di, w14= 0x8000
    add     #12, w11                            ; start of 22-cycle do loop
    msc     w5*w7, b, [w10]+=2, w4, [w13]+=2    ; b = new Di
    sub     w4, [w11], w0                       ; w0 = Ar-Cr
    clr     a, [w9]+=4, w6, [w10]+=2, w5, [w13]+=2
    add     w4, [w11], w4                       ; w4 = Ar+Cr
    sub     w5, [++w11], w1                     ; w1 = Ai-Ci
    add     w5, [w11++], w5                     ; w5 = Ai+Ci, w11->Dr
    mpy     w4*w6, a, [w10]+=2, w4              ; a = Ar+Cr, w4 = Br, *w13++ = Di
    mpy     w5*w6, b, [w9]-=4, w7, [w10]+=6, w5 ; b = Ai+Ci, w5 = Bi
    sub     w4, [w10], w3                       ; w3 = Br-Dr
    add     w4, [w10], w4                       ; w4 = Br+Dr
    sub     w5, [++w10], w2                     ; w2 = Bi-Di
    add     w5, [w10++], w5                     ; w5 = Bi+Di, w10->next Ar
    mac     w4*w6, a                            ; a = new Ar
    mac     w5*w6, b, [w13]+=2                  ; b = new Ai, *w13++ = Ar
    mac     w4*w7, a, [w8]+=2, w4, [w13]+=2     ; a = new Br, w4=Ar-Cr, *w13++ = Ai
    mac     w5*w7, b, [w8]+=2, w5, [w13]+=2     ; b = new Bi, w5=Ai-Ci, *w13++ = Br
    mpy     w4*w6, a, [w8]+=2, w4               ; a = Ar-Cr, w4 = Bi-Di
    mac     w4*w6, a, [w13]+=2                  ; a = new Cr, *w13++ = Bi
    mpy     w5*w6, b, [w8]-=6, w5               ; b = Ai-Ci, w5 = Br-Dr
    msc     w5*w6, b, [w13]+=2                  ; b = new Ci, w6 = *w10++, *w13++ = Cr
    mac     w4*w7, a, [w13]+=2                  ; a = new Dr (last instruction of do loop)
    msc     w5*w7, b, [w13]+=2                  ; epilog
    sac.r   b, [w13]
    pop     w0                                  ; cleanup
    pop     PSVPAG
    pop     CORCON
    pop     w14
    pop.d   w12
    pop.d   w10
    pop.d   w8
    return

Kiss Code

http://sourceforge.net/projects/kissfft/

This is where I think I need to modify kiss to line up with dspic

#   define S_MUL(a,b) ( (a)*(b) )
#define C_MUL(m,a,b) \
    do{ (m).r = (a).r*(b).r - (a).i*(b).i;\
        (m).i = (a).r*(b).i + (a).i*(b).r; }while(0)
#   define C_FIXDIV(c,div) /* NOOP */
#   define C_MULBYSCALAR( c, s ) \
    do{ (c).r *= (s);\
        (c).i *= (s); }while(0)


#define  C_ADD( res, a,b)\
    do { \
        CHECK_OVERFLOW_OP((a).r,+,(b).r)\
        CHECK_OVERFLOW_OP((a).i,+,(b).i)\
        (res).r=(a).r+(b).r;  (res).i=(a).i+(b).i; \
    }while(0)
#define  C_SUB( res, a,b)\
    do { \
        CHECK_OVERFLOW_OP((a).r,-,(b).r)\
        CHECK_OVERFLOW_OP((a).i,-,(b).i)\
        (res).r=(a).r-(b).r;  (res).i=(a).i-(b).i; \
    }while(0)
#define C_ADDTO( res , a)\
    do { \
        CHECK_OVERFLOW_OP((res).r,+,(a).r)\
        CHECK_OVERFLOW_OP((res).i,+,(a).i)\
        (res).r += (a).r;  (res).i += (a).i;\
    }while(0)

#define C_SUBFROM( res , a)\
    do {\
        CHECK_OVERFLOW_OP((res).r,-,(a).r)\
        CHECK_OVERFLOW_OP((res).i,-,(a).i)\
        (res).r -= (a).r;  (res).i -= (a).i; \
    }while(0)

I'm concerned at why... there is goal of needing to be exact, as rounding likely tricky to get the same, rather than implementing the use of a wider accumulator for a decreased error metric. Also after all that, you might not like the performance degradation. — chip_wrangler, Sep 11 '15 at 07:31

score 0 · Answer 1 · answered Oct 18 '16 at 21:07

The dsPIC DSP has a few settings that you can change, I would try disable super saturation in the CORCON register. bit ACCSAT.

You could also try find a Q15() implementation of the dspic fft and use the builtin functions to convert from Q15 to float. I think its _Q15ftoi() and _itofQ15()

Double check if your using the dsPICFJ series or the dsPICEP ? The CORCON register is different between them be careful that you check for the PSV bit.

KissFFT Versus DSPIC - Rounding Errors

1 Answers1