diff --git a/libavcodec/riscv/opusdsp_rvv.S b/libavcodec/riscv/opusdsp_rvv.S index 79ae86c30e..9a8914c78d 100644 --- a/libavcodec/riscv/opusdsp_rvv.S +++ b/libavcodec/riscv/opusdsp_rvv.S @@ -26,40 +26,34 @@ func ff_opus_postfilter_rvv, zve32f flw fa1, 4(a2) // g1 sub t0, a0, t1 flw fa2, 8(a2) // g2 + addi t1, t0, -2 * 4 // data - (period + 2) = initial &x4 + vsetivli zero, 4, e32, m4, ta, ma addi t0, t0, 2 * 4 // data - (period - 2) = initial &x0 - - flw ft4, -16(t0) + vle32.v v16, (t1) addi t3, a1, -2 // maximum parallelism w/o stepping our tail - flw ft3, -12(t0) - flw ft2, -8(t0) - flw ft1, -4(t0) 1: + vslidedown.vi v8, v16, 2 min t1, a3, t3 + vslide1down.vx v12, v16, zero vsetvli t1, t1, e32, m4, ta, ma vle32.v v0, (t0) // x0 sub a3, a3, t1 - vle32.v v28, (a0) + vslide1down.vx v4, v8, zero sh2add t0, t1, t0 - vfslide1up.vf v4, v0, ft1 + vle32.v v28, (a0) addi t2, t1, -4 - vfslide1up.vf v8, v4, ft2 - vfslide1up.vf v12, v8, ft3 - vfslide1up.vf v16, v12, ft4 + vslideup.vi v4, v0, 1 + vslideup.vi v8, v4, 1 + vslideup.vi v12, v8, 1 + vslideup.vi v16, v12, 1 vfadd.vv v20, v4, v12 vfadd.vv v24, v0, v16 - vslidedown.vx v12, v0, t2 + vslidedown.vx v16, v0, t2 vfmacc.vf v28, fa0, v8 - vslidedown.vi v4, v12, 2 vfmacc.vf v28, fa1, v20 - vslide1down.vx v8, v12, zero vfmacc.vf v28, fa2, v24 - vslide1down.vx v0, v4, zero vse32.v v28, (a0) - vfmv.f.s ft4, v12 sh2add a0, t1, a0 - vfmv.f.s ft2, v4 - vfmv.f.s ft3, v8 - vfmv.f.s ft1, v0 bnez a3, 1b ret