lavc/vc1dsp: fix overflow in R-V V inv_trans_8

The last set of additions/subtractions can overflow the 16-bit limit, as
the results require 17 bits of precision. This uses widening adds and
subtractions, narrowed back by the final clipping shift, to fix the MSS2
FATE tests.

The problem potentially also affects inv_trans_4 with a very low
probability, but this is not reproducible under FATE.
This commit is contained in:
Rémi Denis-Courmont
2024-06-27 21:21:29 +03:00
parent 2c900d4c11
commit 349c49fd1b

View File

@@ -141,44 +141,49 @@ func ff_vc1_inv_trans_8_rvv, zve32x
vadd.vv v20, v20, v21 vadd.vv v20, v20, v21
vadd.vv v22, v22, v23 vadd.vv v22, v22, v23
vsll.vi v21, v3, 2 vsll.vi v21, v3, 2
vadd.vv v16, v20, v22 # t1 vadd.vv v24, v20, v22 # t1
vmul.vx v20, v1, t5 vmul.vx v20, v1, t5
vsll.vi v22, v5, 4 vsll.vi v22, v5, 4
vmul.vx v23, v7, t3 vmul.vx v23, v7, t3
vsub.vv v20, v20, v21 vsub.vv v20, v20, v21
vadd.vv v22, v22, v23 vadd.vv v22, v22, v23
vsll.vi v21, v3, 4 vsll.vi v21, v3, 4
vsub.vv v17, v20, v22 # t2 vsub.vv v25, v20, v22 # t2
vmul.vx v20, v1, t3 vmul.vx v20, v1, t3
vsll.vi v22, v5, 2 vsll.vi v22, v5, 2
vmul.vx v23, v7, t5 vmul.vx v23, v7, t5
vsub.vv v20, v20, v21 vsub.vv v20, v20, v21
vadd.vv v22, v22, v23 vadd.vv v22, v22, v23
vmul.vx v21, v3, t3 vmul.vx v21, v3, t3
vadd.vv v18, v20, v22 # t3 vadd.vv v26, v20, v22 # t3
vsll.vi v20, v1, 2 vsll.vi v20, v1, 2
vmul.vx v22, v5, t5 vmul.vx v22, v5, t5
vsll.vi v23, v7, 4 vsll.vi v23, v7, 4
vsub.vv v20, v20, v21 vsub.vv v20, v20, v21
vsub.vv v22, v22, v23 vsub.vv v22, v22, v23
vadd.vv v27, v20, v22 # t4
srli t2, t1, 2 srli t2, t1, 2
vadd.vv v0, v28, v16 vwadd.vv v8, v28, v24
vadd.vv v19, v20, v22 # t4 vwadd.vv v10, v29, v25
vadd.vv v1, v29, v17 vwadd.vv v12, v30, v26
vadd.vv v2, v30, v18 vwadd.vv v14, v31, v27
vadd.vv v3, v31, v19
vsub.vv v4, v31, v19
vsub.vv v5, v30, v18
vsub.vv v6, v29, v17
vsub.vv v7, v28, v16
beqz t2, 1f # faster than 4x add t2=zero beqz t2, 1f # faster than 4x add t2=zero
.irp n,4,5,6,7 .irp n,31,30,29,28
vadd.vi v\n, v\n, 1 vadd.vi v\n, v\n, 1
.endr .endr
1: 1:
.irp n,0,1,2,3,4,5,6,7 vwsub.vv v16, v31, v27
vssra.vx v\n, v\n, t1 vwsub.vv v18, v30, v26
.endr vwsub.vv v20, v29, v25
vwsub.vv v22, v28, v24
vnclip.wx v0, v8, t1
vnclip.wx v1, v10, t1
vnclip.wx v2, v12, t1
vnclip.wx v3, v14, t1
vnclip.wx v4, v16, t1
vnclip.wx v5, v18, t1
vnclip.wx v6, v20, t1
vnclip.wx v7, v22, t1
jr t0 jr t0
endfunc endfunc