.set noreorder
.text
.globl fir_interp_2x_symm_2ch

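/*

typedef struct _fir_filter_ctx
{
    int32_t* coeffs;      // 42 symmetric-pair coefficients, followed by the center-tap coefficient
    int32_t* delays_l;    // left-channel delay line (ring buffer of 84 words)
    int32_t* delays_r;    // right-channel delay line (ring buffer of 84 words)
    int32_t  delays_ptr;  // current ring-buffer position, kept as a BYTE offset (expected 0, 4, ..., 332)
}fir_filter_ctx;

1 stereo sample in (in[0] = left, in[1] = right), 2 stereo samples out
(out[0], out[1] = symmetric-tap sums L/R; out[2], out[3] = center-tap products L/R).
Note: the routine never loads a result into $v0, so the int32_t return value is not meaningful.
int32_t fir_interp_2x_symm_2ch(int32_t* in, int32_t* out, fir_filter_ctx* ctx);
*/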

# ---------------------------------------------------------------------------
# fir_interp_2x_symm_2ch
#
# Pushes one stereo sample into the two per-channel ring buffers and writes
# four output words:
#   out[0], out[1] : left/right sums over the 42 symmetric tap pairs
#                    (each coefficient multiplies delays[ptrx] + delays[ptry])
#   out[2], out[3] : left/right products of the center-tap coefficient with
#                    the sample at the final ptry position
# Every result is taken from a DSP accumulator with EXTR_R.W (right shift by
# FIR_OUT_SHIFT with rounding).
#
# In:     a0 = address of input sample pair  (word 0 = left, word 1 = right)
#         a1 = address of output buffer      (4 words)
#         a2 = address of fir_filter_ctx
# Out:    memory at a1[0..3]; ctx->delays_ptr advanced by one slot (with wrap)
#         v0 is NOT set to a return value - on exit it holds the final ptrx
#         byte offset.  NOTE(review): the C prototype declares int32_t;
#         confirm that callers ignore the result.
# Writes: v0, v1, a3, t0, t1, t3-t7, t9, ac0-ac3.  No stack usage.
#
# All ring-buffer positions (delays_ptr, ptrx, ptry) are BYTE offsets, so
# they step by 4 and wrap at FIR_DELAYS_BYTES.
#
# The file is assembled with ".set noreorder": the instruction following
# every branch below is its delay slot and always executes.
# ---------------------------------------------------------------------------

.equ FIR_PAIRS_CNT,     42                      # symmetric coefficient pairs
.equ FIR_PAIRS_BYTES,   FIR_PAIRS_CNT * 4       # 168: offset of center-tap coeff
.equ FIR_DELAYS_BYTES,  FIR_PAIRS_CNT * 8       # 336: 84-word delay line, in bytes
.equ FIR_OUT_SHIFT,     24                      # EXTR_R.W right-shift amount

# Byte offsets of the fir_filter_ctx fields
.equ CTX_COEFFS,        0
.equ CTX_DELAYS_L,      4
.equ CTX_DELAYS_R,      8
.equ CTX_DELAYS_PTR,    12

    .ent fir_interp_2x_symm_2ch
fir_interp_2x_symm_2ch:
    # a3 = ring-buffer size in bytes; used as the wrap limit throughout
    li      $a3, FIR_DELAYS_BYTES

    # v0 = ptrx: slot that receives the new sample (current delays_ptr)
    lw      $v0, CTX_DELAYS_PTR($a2)

    # v1 = ptry: the following slot, wrapped to 0 at the end of the buffer
    addiu   $v1, $v0, 4
    slt     $t0, $v1, $a3                   # t0 = (ptry < FIR_DELAYS_BYTES)
    movz    $v1, $zero, $t0                 # ptry = 0 when it ran off the end

ptr_ready:
    # the advanced position becomes the stored delays_ptr for the next call
    sw      $v1, CTX_DELAYS_PTR($a2)

    # The tap loop is split in two so that neither index needs a wrap check
    # inside the loop body.  Work out how many coefficient bytes can be
    # consumed before the first wrap:
    #   t3 = ptrx + 4                  -> bytes until ptrx (descending) passes 0
    #   t4 = FIR_DELAYS_BYTES - t3     -> bytes until ptry (ascending) hits the end
    addiu   $t3, $v0, 4
    subu    $t4, $a3, $t3

    # t3 = min(t3, t4)
    slt     $t0, $t4, $t3
    movn    $t3, $t4, $t0

prepare_loop:
    lw      $t0, CTX_DELAYS_L($a2)          # t0 = delays_l base address
    lw      $t1, CTX_DELAYS_R($a2)          # t1 = delays_r base address

    # store the new input pair at delays[ptrx]
    addu    $t6, $t0, $v0                   # &delays_l[ptrx]
    lw      $t4, 0($a0)                     # left input sample
    sw      $t4, 0($t6)
    addu    $t6, $t1, $v0                   # &delays_r[ptrx]
    lw      $t4, 4($a0)                     # right input sample
    sw      $t4, 0($t6)

    # clear the two running accumulators (0 * 0 -> ac)
    mult    $ac0, $zero, $zero
    mult    $ac1, $zero, $zero

    # t5 walks the coefficient array; t9 marks the end of the pair
    # coefficients, which is also the address of the center-tap coefficient
    lw      $t5, CTX_COEFFS($a2)
    addiu   $t9, $t5, FIR_PAIRS_BYTES

    # skip the first loop when ptry has already wrapped (t3 == 0)
    beqz    $t3, wrap_ptrx
    # delay slot (runs on both paths): t3 = coefficient end address of the
    # first loop; when the branch is taken t3 is not used again
    addu    $t3, $t5, $t3

loop_before_wrap:
    lw      $t4, 0($t5)                     # t4 = coeff[i]
    addiu   $t5, $t5, 4                     # next coefficient (4 bytes)

    # left channel: ac0 += coeff * (delays_l[ptrx] + delays_l[ptry])
    lwx     $t6, $v0($t0)
    lwx     $t7, $v1($t0)
    madd    $ac0, $t6, $t4
    madd    $ac0, $t7, $t4
    # right channel: ac1 += coeff * (delays_r[ptrx] + delays_r[ptry])
    lwx     $t6, $v0($t1)
    lwx     $t7, $v1($t1)
    madd    $ac1, $t6, $t4
    madd    $ac1, $t7, $t4

    addiu   $v0, $v0, -4                    # ptrx moves down one slot
    bne     $t5, $t3, loop_before_wrap
    addiu   $v1, $v1, 4                     # delay slot: ptry moves up one slot

wrap_ptrx:
    # ptrx stepped below slot 0 (byte offset -4) -> continue at the last slot
    addiu   $t4, $zero, -4
    bne     $v0, $t4, wrap_ptry
    nop
    addiu   $v0, $a3, -4                    # ptrx = FIR_DELAYS_BYTES - 4

wrap_ptry:
    # ptry stepped past the last slot -> continue at slot 0
    bne     $v1, $a3, ptr_wrapped
    nop
    addu    $v1, $zero, $zero               # ptry = 0

ptr_wrapped:
    # nothing left for the second loop when all pair coefficients are used
    beq     $t5, $t9, loop_end
    # Intentionally no nop: the first lw of loop_after_wrap sits in the delay
    # slot.  On the taken path it only loads the word at t9 into t4, which is
    # loaded again at loop_end; on the fall-through path it is simply the
    # first instruction of the loop.

loop_after_wrap:
    lw      $t4, 0($t5)                     # t4 = coeff[i]
    addiu   $t5, $t5, 4                     # next coefficient (4 bytes)

    # left channel: ac0 += coeff * (delays_l[ptrx] + delays_l[ptry])
    lwx     $t6, $v0($t0)
    lwx     $t7, $v1($t0)
    madd    $ac0, $t6, $t4
    madd    $ac0, $t7, $t4
    # right channel: ac1 += coeff * (delays_r[ptrx] + delays_r[ptry])
    lwx     $t6, $v0($t1)
    lwx     $t7, $v1($t1)
    madd    $ac1, $t6, $t4
    madd    $ac1, $t7, $t4

    addiu   $v0, $v0, -4                    # ptrx moves down one slot
    bne     $t5, $t9, loop_after_wrap
    addiu   $v1, $v1, 4                     # delay slot: ptry moves up one slot

loop_end:
    # Second output pair: center-tap coefficient (stored right after the pair
    # coefficients, at t9) times the sample at the current ptry offset.
    # ptry is not wrap-checked here.
    lw      $t4, 0($t9)                     # t4 = center-tap coefficient

    lwx     $t6, $v1($t0)                   # t6 = delays_l[ptry]
    lwx     $t7, $v1($t1)                   # t7 = delays_r[ptry]
    mult    $ac2, $t6, $t4                  # plain multiply (overwrites ac2) - left
    mult    $ac3, $t7, $t4                  # plain multiply (overwrites ac3) - right

    # out[0], out[1] = symmetric-tap sums, shifted right with rounding
    EXTR_R.W $t0, $ac0, FIR_OUT_SHIFT
    EXTR_R.W $t1, $ac1, FIR_OUT_SHIFT
    sw      $t0, 0($a1)
    sw      $t1, 4($a1)

    # out[2], out[3] = center-tap products, shifted right with rounding
    EXTR_R.W $t0, $ac2, FIR_OUT_SHIFT
    EXTR_R.W $t1, $ac3, FIR_OUT_SHIFT
    sw      $t0, 8($a1)
    sw      $t1, 12($a1)

    jr      $ra
    nop                                     # delay slot

    .end fir_interp_2x_symm_2ch
