lavc/bswapdsp: purge RISC-V V bswap32

This cannot beat the Zbb implementation, and it is unlikely that any real-world CPU design would support V but not Zbb. The best loop rewrite that I could come up with (4 shifts, 2 ANDs, 3 ORs) is still ~40% slower than Zbb. A properly faster vector implementation should be feasible with the cryptographic vector extensions, but that is a story for another time.
2023-07-16 17:27:45 +03:00
parent 5de1db5370
commit 61e5ca4ded
2 changed files with 1 additions and 27 deletions
--- a/libavcodec/riscv/bswapdsp_init.c
+++ b/libavcodec/riscv/bswapdsp_init.c
@@ -26,7 +26,6 @@
 #include "libavcodec/bswapdsp.h"
 void ff_bswap32_buf_rvb(uint32_t *dst, const uint32_t *src, int len);
 void ff_bswap32_buf_rvv(uint32_t *dst, const uint32_t *src, int len);
 void ff_bswap16_buf_rvv(uint16_t *dst, const uint16_t *src, int len);
 av_cold void ff_bswapdsp_init_riscv(BswapDSPContext *c)
@@ -39,10 +38,8 @@ av_cold void ff_bswapdsp_init_riscv(BswapDSPContext *c)
            c->bswap_buf = ff_bswap32_buf_rvb;
 #endif
 #if HAVE_RVV
-        if (flags & AV_CPU_FLAG_RVV_I32) {
+        if (flags & AV_CPU_FLAG_RVV_I32)
            c->bswap_buf = ff_bswap32_buf_rvv;
            c->bswap16_buf = ff_bswap16_buf_rvv;
        }
 #endif
    }
 }
--- a/libavcodec/riscv/bswapdsp_rvv.S
+++ b/libavcodec/riscv/bswapdsp_rvv.S
@@ -21,29 +21,6 @@
 #include "config.h"
 #include "libavutil/riscv/asm.S"
 /*
  * void ff_bswap32_buf_rvv(uint32_t *dst, const uint32_t *src, int len)
  *
  * Byte-swaps `len` 32-bit words from src into dst.
  *
  * In (standard RISC-V calling convention):
  *   a0 = dst, a1 = src, a2 = len (number of 32-bit words)
  * Clobbers: t0-t5, v8-v11, vl/vtype.
  *
  * Method: a 4-field segment load de-interleaves the four bytes of every
  * word into v8..v11 (v8 = byte 0, ..., v11 = byte 3).  Each register is
  * then written back with a 4-byte stride at the mirrored byte offset
  * (v8 -> dst+3, v9 -> dst+2, v10 -> dst+1, v11 -> dst+0).
  *
  * Only base-ISA integer instructions are used for pointer arithmetic:
  * the function is declared for zve32x alone, so it must not depend on
  * Zba (sh2add).
  *
  * NOTE(review): the loop terminates only when a2 reaches exactly zero;
  * a negative len is not handled -- callers presumably pass len >= 0.
  */
 func ff_bswap32_buf_rvv, zve32x
         li      t4, 4                  // byte stride between output words
         addi    t1, a0, 1              // t1 = dst + 1 (receives byte 2)
         addi    t2, a0, 2              // t2 = dst + 2 (receives byte 1)
         addi    t3, a0, 3              // t3 = dst + 3 (receives byte 0)
 1:
         vsetvli    t0, a2, e8, m1, ta, ma // t0 = words handled this pass
         vlseg4e8.v v8, (a1)            // v8..v11 = bytes 0..3 of each word
         sub        a2, a2, t0          // a2 = words still to process
         slli       t5, t0, 2           // t5 = bytes consumed (4 per word)
         add        a1, a1, t5
         vsse8.v    v8, (t3), t4
         add        t3, t3, t5
         vsse8.v    v9, (t2), t4
         add        t2, t2, t5
         vsse8.v    v10, (t1), t4
         add        t1, t1, t5
         vsse8.v    v11, (a0), t4
         add        a0, a0, t5
         bnez       a2, 1b
         ret
 endfunc
 func ff_bswap16_buf_rvv, zve32x
 1:
        vsetvli t0, a2, e16, m8, ta, ma