lavc/alacdsp: unroll RISC-V V loops

This increases the vector register group multiplier (LMUL) from 1 to the fastest measured setting for each function, as per T-Head C910 benchmarks:

alac_append_extra_bits_mono_c: 803.0
alac_append_extra_bits_stereo_c: 1604.2
alac_decorrelate_stereo_c: 1077.5

LMUL=1
alac_append_extra_bits_mono_rvv_i32: 418.2
alac_append_extra_bits_stereo_rvv_i32: 693.2
alac_decorrelate_stereo_rvv_i32: 673.5

LMUL=2
alac_append_extra_bits_mono_rvv_i32: 382.2
alac_append_extra_bits_stereo_rvv_i32: 648.2
alac_decorrelate_stereo_rvv_i32: 542.7

LMUL=4
alac_append_extra_bits_mono_rvv_i32: 241.5
alac_append_extra_bits_stereo_rvv_i32: 512.7
alac_decorrelate_stereo_rvv_i32: 364.2

LMUL=8
alac_append_extra_bits_mono_rvv_i32: 239.7
alac_append_extra_bits_stereo_rvv_i32: 497.2
alac_decorrelate_stereo_rvv_i32: 426.7
This commit is contained in:
Rémi Denis-Courmont 2023-07-13 23:16:03 +03:00
parent a28aa0475d
commit c541ecf0dc

View File

@@ -25,7 +25,7 @@ func ff_alac_decorrelate_stereo_rvv, zve32x
ld a4, 8(a0) ld a4, 8(a0)
ld a0, 0(a0) ld a0, 0(a0)
1: 1:
vsetvli t0, a1, e32, m1, ta, ma vsetvli t0, a1, e32, m4, ta, ma
vle32.v v24, (a4) vle32.v v24, (a4)
sub a1, a1, t0 sub a1, a1, t0
vle32.v v16, (a0) vle32.v v16, (a0)
@@ -47,7 +47,7 @@ func ff_alac_append_extra_bits_mono_rvv, zve32x
ld a0, (a0) ld a0, (a0)
ld a1, (a1) ld a1, (a1)
1: 1:
vsetvli t0, a4, e32, m1, ta, ma vsetvli t0, a4, e32, m8, ta, ma
vle32.v v16, (a0) vle32.v v16, (a0)
sub a4, a4, t0 sub a4, a4, t0
vle32.v v24, (a1) vle32.v v24, (a1)
@@ -67,7 +67,7 @@ func ff_alac_append_extra_bits_stereo_rvv, zve32x
ld a7, 8(a1) ld a7, 8(a1)
ld a1, (a1) ld a1, (a1)
1: 1:
vsetvli t0, a4, e32, m1, ta, ma vsetvli t0, a4, e32, m8, ta, ma
vle32.v v16, (a0) vle32.v v16, (a0)
sub a4, a4, t0 sub a4, a4, t0
vle32.v v0, (a6) vle32.v v0, (a6)