.set noreorder
.text
.globl fir_interp_2x_symm_2ch_2smpl

/*
     
typedef struct _fir_filter_ctx
{
    int32_t* coeffs;
    int32_t* delays_l;
    int32_t* delays_r;
    int32_t  delays_ptr;
}fir_filter_ctx;
     
2 stereo samples in, 4 out
int32_t fir_interp_2x_symm_2ch_2smpl(int32_t* in, int32_t* out, fir_filter_ctx* ctx);
*/

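# DELAYS_CNT = 84: highest delay-line index used below (byte offset 84*4 = 336), i.e. the code addresses slots 0..84 of each channel's delay line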
    
# -----------------------------------------------------------------------
# fir_interp_2x_symm_2ch_2smpl
#
# Symmetric FIR, interpolate-by-two, stereo, two input frames per call.
# Consumes two interleaved L/R input frames and writes four interleaved
# L/R output frames (eight 32-bit words).
#
# In:
#   a0 = input samples   (L0, R0, L1, R1)
#   a1 = output samples  (eight words, layout below)
#   a2 = fir_filter_ctx* (coeffs @0, delays_l @4, delays_r @8,
#                         delays_ptr @12; delays_ptr is a BYTE offset)
#
# Output layout (word index into a1):
#   [0],[1] = ac0, ac1 : symmetric sum, window ending one sample back
#   [2],[3] = centre coefficient * delay sample at ptry after the loops
#   [4],[5] = ac2, ac3 : symmetric sum, window ending at newest sample
#   [6],[7] = centre coefficient * the following delay sample
#
# Delay lines: the code addresses byte offsets 0..336 inclusive, so each
# channel buffer must hold at least 85 words.
# Coefficients: 42 words walked pairwise, plus one centre-tap word that
# is fetched by the delay slot of the last loop iteration (43 words read).
#
# Register roles:
#   a3      = byte offset of the last delay-line slot (336)
#   t0, t1  = delays_l / delays_r base addresses
#   v0      = ptrx, byte offset walking DOWN from the newest sample
#   v1      = ptry, byte offset walking UP from the oldest sample
#   t5      = coefficient cursor; t3 = cursor stop for the first loop
#   a0      = reused as the coefficient end address once inputs are read
#   t4      = current coefficient; t6..t9 = L/R samples at ptrx/ptry
#
# Clobbers: v0, v1, a0, a3, t0-t9, ac0-ac3.
# Return:   v0 is never set explicitly; it holds the final ptrx offset.
#
# File is assembled with .set noreorder: every branch delay slot below
# is filled by hand and must stay directly after its branch.
# -----------------------------------------------------------------------

    .equ    F2X_CTX_COEFFS,        0    # fir_filter_ctx.coeffs
    .equ    F2X_CTX_DELAYS_L,      4    # fir_filter_ctx.delays_l
    .equ    F2X_CTX_DELAYS_R,      8    # fir_filter_ctx.delays_r
    .equ    F2X_CTX_DELAYS_PTR,   12    # fir_filter_ctx.delays_ptr
    .equ    F2X_LAST_SLOT_OFF,   336    # byte offset of slot 84 (84 * 4)
    .equ    F2X_PAIR_COEFF_BYTES, 168   # paired coefficients (42 * 4)
    .equ    F2X_OUT_SHIFT,        24    # accumulator right shift on extract

fir_interp_2x_symm_2ch_2smpl:

    li      $a3, F2X_LAST_SLOT_OFF          # a3 = last valid ring offset

    lw      $v0, F2X_CTX_DELAYS_PTR($a2)    # v0 = write offset (bytes)
    lw      $t0, F2X_CTX_DELAYS_L($a2)      # t0 = delays_l base
    lw      $t1, F2X_CTX_DELAYS_R($a2)      # t1 = delays_r base

    # ---- store input frame 0 at the current write offset ----
    lw      $t4, 0($a0)                     # left sample
    addu    $t6, $t0, $v0
    sw      $t4, 0($t6)                     # delays_l[ptr] = in[0]
    lw      $t4, 4($a0)                     # right sample
    addu    $t6, $t1, $v0
    sw      $t4, 0($t6)                     # delays_r[ptr] = in[1]

    # Step to the next slot. Offset 336 is a real slot (the read loops
    # below load from it), so wrap only once the offset has gone PAST it.
    # The previous test wrapped at 336 itself, which skipped that slot on
    # writes while the loops kept reading it.
    addiu   $v0, $v0, 4
    slt     $t4, $a3, $v0                   # t4 = (last slot < ptr)
    movn    $v0, $zero, $t4                 # past the end -> back to 0

    # ---- store input frame 1; v0 stays on this slot as ptrx ----
    lw      $t4, 8($a0)                     # left sample
    addu    $t6, $t0, $v0
    sw      $t4, 0($t6)                     # delays_l[ptrx] = in[2]
    lw      $t4, 12($a0)                    # right sample
    addu    $t6, $t1, $v0
    sw      $t4, 0($t6)                     # delays_r[ptrx] = in[3]

    # v1 = slot after the newest sample: next write position, and the
    # oldest sample still held in the ring (ptry).
    addiu   $v1, $v0, 4
    slt     $t4, $a3, $v1                   # t4 = (last slot < ptr)
    movn    $v1, $zero, $t4                 # past the end -> back to 0

    sw      $v1, F2X_CTX_DELAYS_PTR($a2)    # persist write offset

ptr_ready:
    # v0 = ptrx (newest), v1 = ptry (oldest).
    # Work out how many coefficient bytes can be consumed before either
    # pointer reaches its end of the ring.
    addiu   $t3, $v0, 0                     # bytes until ptrx reaches 0
    subu    $t4, $a3, $v1                   # bytes until ptry reaches 336

    slt     $t2, $t4, $t3
    movn    $t3, $t4, $t2                   # t3 = min(t3, t4)

prepare_loop:
    lw      $t5, F2X_CTX_COEFFS($a2)        # t5 = coefficient cursor
    addiu   $a0, $t5, F2X_PAIR_COEFF_BYTES  # a0 = end of paired coeffs

    lw      $t4, 0($t5)                     # t4 = first coefficient

    # prime the sample registers
    lwx     $t6, $v0($t0)                   # t6 = delays_l[ptrx]
    lwx     $t7, $v1($t0)                   # t7 = delays_l[ptry]
    lwx     $t8, $v0($t1)                   # t8 = delays_r[ptrx]
    lwx     $t9, $v1($t1)                   # t9 = delays_r[ptry]

    # zero all four accumulators (0 * 0)
    mult    $ac0, $zero, $zero
    mult    $ac1, $zero, $zero
    mult    $ac2, $zero, $zero
    mult    $ac3, $zero, $zero

    # Nothing to do before the wrap when the distance is zero.
    beqz    $t3, wrap_ptrx
    nop
    # ptrx on slot 84 with ptry on slot 0: neither pointer wraps within
    # the coefficient run, so the second loop handles everything.
    beq     $t3, $a3, wrap_ptrx
    # (delay slot, executes either way) turn the byte count into the
    # coefficient address at which the first loop stops
    addu    $t3, $t5, $t3

loop_before_wrap:
    addiu   $v0, $v0, -4                    # ptrx one sample older
    addiu   $v1, $v1,  4                    # ptry one sample newer
    # previously loaded pair: ptry side feeds ac0/ac1, ptrx side ac2/ac3
    madd    $ac0, $t7, $t4
    madd    $ac2, $t6, $t4
    madd    $ac1, $t9, $t4
    madd    $ac3, $t8, $t4

    lwx     $t6, $v0($t0)                   # t6 = delays_l[ptrx]
    lwx     $t7, $v1($t0)                   # t7 = delays_l[ptry]
    lwx     $t8, $v0($t1)                   # t8 = delays_r[ptrx]
    lwx     $t9, $v1($t1)                   # t9 = delays_r[ptry]
    # fresh pair, same coefficient, sides swapped between accumulators
    madd    $ac0, $t6, $t4
    madd    $ac2, $t7, $t4
    madd    $ac1, $t8, $t4
    madd    $ac3, $t9, $t4

    addiu   $t5, $t5, 4                     # next coefficient

    bne     $t5, $t3, loop_before_wrap
    lw      $t4, 0($t5)                     # (delay slot) t4 = next coeff

wrap_ptrx:
    # ptrx sitting on slot 0: park it one past the last slot so that the
    # pre-decrement in the next loop lands on offset 336.
    bne     $v0, $zero, wrap_ptry
    nop
    addiu   $v0, $a3, 4

wrap_ptry:
    # ptry sitting on the last slot: park it one before slot 0 so that
    # the pre-increment in the next loop lands on offset 0.
    bne     $v1, $a3, ptr_wrapped
    nop
    addiu   $v1, $zero, -4

ptr_wrapped:
    # skip the second loop if every paired coefficient is already used
    beq     $t5, $a0, loop_end
    nop

loop_after_wrap:
    addiu   $v0, $v0, -4                    # ptrx one sample older
    addiu   $v1, $v1,  4                    # ptry one sample newer
    madd    $ac0, $t7, $t4
    madd    $ac2, $t6, $t4
    madd    $ac1, $t9, $t4
    madd    $ac3, $t8, $t4

    lwx     $t6, $v0($t0)                   # t6 = delays_l[ptrx]
    lwx     $t7, $v1($t0)                   # t7 = delays_l[ptry]
    lwx     $t8, $v0($t1)                   # t8 = delays_r[ptrx]
    lwx     $t9, $v1($t1)                   # t9 = delays_r[ptry]
    madd    $ac0, $t6, $t4
    madd    $ac2, $t7, $t4
    madd    $ac1, $t8, $t4
    madd    $ac3, $t9, $t4

    addiu   $t5, $t5, 4                     # next coefficient

    bne     $t5, $a0, loop_after_wrap
    lw      $t4, 0($t5)                     # (delay slot) t4 = next coeff

loop_end:
    # On exit t4 holds the word that follows the 42 paired coefficients
    # (the centre tap), fetched by the final delay-slot load.

    # first interpolated frame; extr_r.w = rounded right shift of the
    # accumulator (presumably coefficients carry 24 fractional bits -
    # TODO confirm against the coefficient generator)
    extr_r.w $t8, $ac0, F2X_OUT_SHIFT
    extr_r.w $t9, $ac1, F2X_OUT_SHIFT
    sw      $t8, 0($a1)                     # out[0] = left
    sw      $t9, 4($a1)                     # out[1] = right

    # second interpolated frame
    extr_r.w $t8, $ac2, F2X_OUT_SHIFT
    extr_r.w $t9, $ac3, F2X_OUT_SHIFT
    sw      $t8, 16($a1)                    # out[4] = left
    sw      $t9, 20($a1)                    # out[5] = right

    # centre tap applied to the sample ptry stopped on
    lwx     $t6, $v1($t0)                   # left delay sample
    lwx     $t7, $v1($t1)                   # right delay sample
    mult    $ac0, $t6, $t4                  # left  * centre coefficient
    mult    $ac1, $t7, $t4                  # right * centre coefficient

    extr_r.w $t8, $ac0, F2X_OUT_SHIFT
    extr_r.w $t9, $ac1, F2X_OUT_SHIFT
    sw      $t8, 8($a1)                     # out[2] = left
    sw      $t9, 12($a1)                    # out[3] = right

    # Step ptry to the following sample. Same ring rule as the writes:
    # offset 336 is a valid slot, wrap only after passing it. The old
    # test wrapped one slot early and read slot 0 in place of slot 84.
    addiu   $v1, $v1, 4
    slt     $t2, $a3, $v1                   # t2 = (last slot < ptry)
    movn    $v1, $zero, $t2                 # past the end -> back to 0

    # centre tap applied to that following sample
    lwx     $t6, $v1($t0)                   # left delay sample
    lwx     $t7, $v1($t1)                   # right delay sample
    mult    $ac2, $t6, $t4                  # left  * centre coefficient
    mult    $ac3, $t7, $t4                  # right * centre coefficient

    extr_r.w $t8, $ac2, F2X_OUT_SHIFT
    extr_r.w $t9, $ac3, F2X_OUT_SHIFT
    sw      $t8, 24($a1)                    # out[6] = left
    sw      $t9, 28($a1)                    # out[7] = right

    jr      $ra
    nop                                     # (delay slot)

    .end fir_interp_2x_symm_2ch_2smpl
