lavc/vc1dsp: fix overflow in R-V V inv_trans_8

The last set of additions/subtractions can overflow 16 bits, as the results require 17 bits of precision. This uses widening adds and subtracts accordingly, to fix the MSS2 FATE tests. The problem potentially also affects inv_trans_4, with a very low probability, but this is not reproducible under FATE.
2024-06-27 21:21:29 +03:00
parent 2c900d4c11
commit 349c49fd1b
1 changed files with 21 additions and 16 deletions
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -141,44 +141,49 @@ func ff_vc1_inv_trans_8_rvv, zve32x
        vadd.vv v20, v20, v21
        vadd.vv v22, v22, v23
        vsll.vi v21, v3, 2
-        vadd.vv v16, v20, v22 # t1
+        vadd.vv v24, v20, v22 # t1
        vmul.vx v20, v1, t5
        vsll.vi v22, v5, 4
        vmul.vx v23, v7, t3
        vsub.vv v20, v20, v21
        vadd.vv v22, v22, v23
        vsll.vi v21, v3, 4
-        vsub.vv v17, v20, v22 # t2
+        vsub.vv v25, v20, v22 # t2
        vmul.vx v20, v1, t3
        vsll.vi v22, v5, 2
        vmul.vx v23, v7, t5
        vsub.vv v20, v20, v21
        vadd.vv v22, v22, v23
        vmul.vx v21, v3, t3
-        vadd.vv v18, v20, v22 # t3
+        vadd.vv v26, v20, v22 # t3
        vsll.vi v20, v1, 2
        vmul.vx v22, v5, t5
        vsll.vi v23, v7, 4
        vsub.vv v20, v20, v21
        vsub.vv v22, v22, v23
        vadd.vv v27, v20, v22 # t4
        srli    t2, t1, 2
-        vadd.vv v0, v28, v16
+        vwadd.vv  v8, v28, v24
-        vadd.vv v19, v20, v22 # t4
+        vwadd.vv  v10, v29, v25
-        vadd.vv v1, v29, v17
+        vwadd.vv  v12, v30, v26
-        vadd.vv v2, v30, v18
+        vwadd.vv  v14, v31, v27
        vadd.vv v3, v31, v19
        vsub.vv v4, v31, v19
        vsub.vv v5, v30, v18
        vsub.vv v6, v29, v17
        vsub.vv v7, v28, v16
        beqz    t2, 1f # faster than 4x add t2=zero
-        .irp    n,4,5,6,7
+        .irp    n,31,30,29,28
        vadd.vi v\n, v\n, 1
        .endr
 1:
-        .irp n,0,1,2,3,4,5,6,7
+        vwsub.vv  v16, v31, v27
-        vssra.vx v\n, v\n, t1
+        vwsub.vv  v18, v30, v26
-        .endr
+        vwsub.vv  v20, v29, v25
        vwsub.vv  v22, v28, v24
        vnclip.wx v0, v8, t1
        vnclip.wx v1, v10, t1
        vnclip.wx v2, v12, t1
        vnclip.wx v3, v14, t1
        vnclip.wx v4, v16, t1
        vnclip.wx v5, v18, t1
        vnclip.wx v6, v20, t1
        vnclip.wx v7, v22, t1
        jr      t0
 endfunc