lavc/sbrdsp: R-V V hf_apply_noise functions

This is restricted to 128-bit vectors, as larger vector sizes could read past
the end of the noise array. Support for future hardware with larger vector
sizes is left for some other time.

hf_apply_noise_0_c: 2319.7
hf_apply_noise_0_rvv_f32: 1229.0
hf_apply_noise_1_c: 2539.0
hf_apply_noise_1_rvv_f32: 1244.7
hf_apply_noise_2_c: 2319.7
hf_apply_noise_2_rvv_f32: 1232.7
hf_apply_noise_3_c: 2541.2
hf_apply_noise_3_rvv_f32: 1244.2
2023-11-10 18:21:27 +02:00 · 2023-11-10 18:21:27 +02:00 · c536e92207
commit c536e92207
parent 20e6195c54
2 changed files with 84 additions and 0 deletions
--- a/libavcodec/riscv/sbrdsp_init.c
+++ b/libavcodec/riscv/sbrdsp_init.c
@@ -21,6 +21,7 @@
 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/riscv/cpu.h"
 #include "libavcodec/sbrdsp.h"
 void ff_sbr_sum64x5_rvv(float *z);
@@ -32,6 +33,14 @@ void ff_sbr_hf_gen_rvv(float (*X_high)[2], const float (*X_low)[2],
                       float bw, int start, int end);
 void ff_sbr_hf_g_filt_rvv(float (*Y)[2], const float (*X_high)[40][2],
                          const float *g_filt, int m_max, intptr_t ixh);
 /*
  * RVV implementations of the four hf_apply_noise variants (see sbrdsp_rvv.S).
  * As used by the assembly:
  *   Y   - array of float pairs, read and updated in place
  *   s   - one float per element ("s_m"); tested against 0.f per element
  *   f   - one float per element ("q_filt"); scales the noise table entry
  *   n   - starting index into ff_sbr_noise_table (entry n + 1 is the first
  *         one read; the index is kept modulo 512 between chunks)
  *   kx  - only its parity is used, and only by variants 1 and 3
  *   max - number of elements to process
  */
 void ff_sbr_hf_apply_noise_0_rvv(float (*Y)[2], const float *s,
                                 const float *f, int n, int kx, int max);
 void ff_sbr_hf_apply_noise_1_rvv(float (*Y)[2], const float *s,
                                 const float *f, int n, int kx, int max);
 void ff_sbr_hf_apply_noise_2_rvv(float (*Y)[2], const float *s,
                                 const float *f, int n, int kx, int max);
 void ff_sbr_hf_apply_noise_3_rvv(float (*Y)[2], const float *s,
                                 const float *f, int n, int kx, int max);
 av_cold void ff_sbrdsp_init_riscv(SBRDSPContext *c)
 {
@@ -44,6 +53,14 @@ av_cold void ff_sbrdsp_init_riscv(SBRDSPContext *c)
            c->sum_square = ff_sbr_sum_square_rvv;
            c->hf_gen = ff_sbr_hf_gen_rvv;
            c->hf_g_filt = ff_sbr_hf_g_filt_rvv;
            if (ff_get_rv_vlenb() <= 16) {
                c->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_rvv;
                c->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_rvv;
                if (flags & AV_CPU_FLAG_RVB_BASIC) {
                    c->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_rvv;
                    c->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_rvv;
                }
            }
        }
        c->autocorrelate = ff_sbr_autocorrelate_rvv;
    }
--- a/libavcodec/riscv/sbrdsp_rvv.S
+++ b/libavcodec/riscv/sbrdsp_rvv.S
@@ -243,3 +243,70 @@ func ff_sbr_hf_g_filt_rvv, zve32f
        ret
 endfunc
 // Shared loop body of the hf_apply_noise_N functions; N selects the variant.
 //
 // Register roles on entry:
 //   a0 = Y      (float pairs, updated in place)
 //   a1 = s_m    (one float per element)
 //   a2 = q_filt (one float per element)
 //   a3 = noise table index (entry a3 + 1 is the first one read)
 //   a5 = number of elements left to process
 //   odd variants only: t0 = chunk size and v4 = per-lane sign bits, both
 //   prepared by the entry point before expanding this macro
 //
 // Per element m, with e = the noise table entry for that element:
 //   even variants: Y[m][0] += (s_m[m] == 0) ? q_filt[m] * e[0] : S
 //                  Y[m][1] += q_filt[m] * e[1]      only if s_m[m] == 0
 //   odd variants:  Y[m][1] += (s_m[m] == 0) ? q_filt[m] * e[1] : S
 //                  Y[m][0] += q_filt[m] * e[0]      only if s_m[m] == 0
 //   where S is s_m[m], negated for variant 2, or with its sign XORed with
 //   the sign bit of v4 for the odd variants.
 //
 // The noise table is read as one contiguous run per chunk; the index is only
 // wrapped (modulo 512) between chunks.
 // NOTE(review): this presumably relies on the table being readable for one
 // chunk past the wrap point - the init code only enables these functions
 // when ff_get_rv_vlenb() <= 16; confirm against the table definition.
 //
 // Clobbers: a0-a3, a5, a6, t0, t6, ft0, v0, v8-v31. Ends with ret.
 .macro hf_apply_noise n
        lla     a6, ff_sbr_noise_table
        fmv.s.x ft0, zero              // ft0 = +0.f, comparand for s_m
        addi    a6, a6, 8              // skip one entry (2 floats): base of entry index + 1
 1:
 .if \n & 1
        min     t0, t0, a5 // preserve parity of t0 for v4 sign injector
        vsetvli zero, t0, e32, m4, ta, mu
 .else
        vsetvli t0, a5, e32, m4, ta, mu // t0 = elements handled this pass
 .endif
        sh3add  t6, a3, a6             // t6 = address of the first noise entry for this chunk
        vle32.v v8, (a1)      // s_m
        sub     a5, a5, t0             // remaining element count
        vle32.v v12, (a2)     // q_filt
        sh2add  a1, t0, a1             // advance s_m by t0 floats
        vmfeq.vf v0, v8, ft0  // s_m == 0.f
        vlseg2e32.v v24, (t6) // ff_sbr_noise_table: [0] fields -> v24, [1] fields -> v28
        sh2add  a2, t0, a2             // advance q_filt by t0 floats
 .if \n == 2
        vfneg.v v8, v8                 // variant 2 adds -s_m
 .endif
 .if \n & 1
        vfsgnjx.vv v8, v8, v4 // could equivalently use vxor.vv
 .endif
        add     a3, t0, a3
        vlseg2e32.v v16, (a0) // Y: [0] fields -> v16, [1] fields -> v20
        andi    a3, a3, 0x1ff          // wrap the noise index modulo 512
 .if \n & 1
        vfmul.vv v28, v12, v28         // q_filt * noise[1]
        vfmacc.vv v16, v12, v24, v0.t  // Y[0] += q_filt * noise[0] where s_m == 0 (mu keeps the rest)
        vmerge.vvm v28, v8, v28, v0    // s_m == 0 ? q_filt * noise[1] : signed s_m
        vfadd.vv v20, v20, v28         // Y[1] += selected term
 .else
        vfmul.vv v24, v12, v24         // q_filt * noise[0]
        vfmacc.vv v20, v12, v28, v0.t  // Y[1] += q_filt * noise[1] where s_m == 0 (mu keeps the rest)
        vmerge.vvm v24, v8, v24, v0    // s_m == 0 ? q_filt * noise[0] : signed s_m
        vfadd.vv v16, v16, v24         // Y[0] += selected term
 .endif
        vsseg2e32.v v16, (a0)          // store the updated pairs back to Y
        sh3add  a0, t0, a0             // advance Y by t0 pairs
        bnez    a5, 1b
        ret
 .endm
 // Variant 0: where s_m is non-zero, +s_m is added to the first component of
 // each Y pair; elsewhere q_filt-scaled noise is added to both components.
 // kx (a4) is not read by this variant.
 func ff_sbr_hf_apply_noise_0_rvv, zve32f
        hf_apply_noise 0
 endfunc
 // Variant 3: identical to variant 1 with the parity of kx inverted.
 // This function has no return of its own: it must stay immediately before
 // ff_sbr_hf_apply_noise_1_rvv, into which it falls through.
 // NOTE(review): presumably endfunc emits no instructions - confirm in the
 // func/endfunc macro definitions.
 func ff_sbr_hf_apply_noise_3_rvv, zve32f
       not     a4, a4 // invert parity of kx
       // fall through
 endfunc
 // Variant 1: where s_m is non-zero, s_m is added to the second component of
 // each Y pair with a sign that alternates per element, starting from the
 // parity of kx (a4); elsewhere q_filt-scaled noise is added to both
 // components.
 // Prepares t0 (chunk size = VLMAX) and v4 (sign bits) for the macro. The
 // macro clamps t0 with min instead of letting vsetvli pick a new length, so
 // that every chunk except possibly the last has VLMAX elements; at e32/m4
 // VLMAX is VLEN/8, an even count, which keeps the alternating signs in v4
 // aligned from one chunk to the next.
 func ff_sbr_hf_apply_noise_1_rvv, zve32f
        vsetvli t0, zero, e32, m4, ta, ma // t0 = VLMAX
        vid.v   v4                        // v4[i] = i
        vxor.vx v4, v4, a4
        vsll.vi v4, v4, 31 // v4[i] = ((kx ^ i) & 1) ? -0.f : +0.f
        hf_apply_noise 1
 endfunc
 // Variant 2: same as variant 0 except that -s_m (rather than +s_m) is added
 // to the first component of each Y pair where s_m is non-zero.
 // kx (a4) is not read by this variant.
 func ff_sbr_hf_apply_noise_2_rvv, zve32f
        hf_apply_noise 2
 endfunc