aarch64: Make the indentation more consistent

Some functions have slightly different indentation styles; try
to match the surrounding code.

libavcodec/aarch64/vc1dsp_neon.S is skipped here, as it intentionally
uses a layered indentation style to visually show how different
unrolled/interleaved phases fit together.

Signed-off-by: Martin Storsjö <martin@martin.st>
Author: Martin Storsjö
Date:   2023-10-17 13:47:27 +03:00
Commit: 7f905f3672
Parent: 93cda5a9c2

7 changed files with 304 additions and 304 deletions
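
For reference, the layout convention these files generally follow (and which the patch normalizes toward) keeps assembler directives, labels, function and endfunc in the first column, while instructions and macro invocations share one common indentation level. The sketch below only illustrates that convention; the function name, registers and loop are placeholders, not taken from any of the changed files.

function ff_layout_example_neon, export=1  // function/endfunc stay in column 0
        mov x2, #4                         // instructions and macro invocations
1:                                         // share one indent; labels stay in column 0
        ld1 {v0.8h}, [x0]
        srshr v0.8h, v0.8h, #1
        st1 {v0.8h}, [x0], #16
        subs x2, x2, #1
        b.ne 1b
        ret
endfunc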


@@ -526,7 +526,7 @@ function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        ld1 {v17.8b}, [x4], x1
        ld1 {v19.8b}, [x4], x1
        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
        h264_loop_filter_chroma_intra
@@ -554,7 +554,7 @@ h_loop_filter_chroma420_intra:
        ld1 {v17.s}[1], [x4], x1
        ld1 {v19.s}[1], [x4], x1
        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
        h264_loop_filter_chroma_intra
@@ -1017,7 +1017,7 @@ function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
        ld1 {v16.8h}, [x4], x1
        ld1 {v19.8h}, [x9], x1
        transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
        h264_loop_filter_chroma_intra_10
@@ -1045,7 +1045,7 @@ h_loop_filter_chroma420_intra_10:
        ld1 {v19.4h}, [x4], x1
        ld1 {v19.d}[1], [x9], x1
        transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
        h264_loop_filter_chroma_intra_10


@@ -580,8 +580,8 @@ function \type\()_h264_qpel16_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

.macro h264_qpel8 type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
@@ -759,8 +759,8 @@ function ff_\type\()_h264_qpel8_mc33_neon, export=1
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

.macro h264_qpel16 type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
@@ -931,5 +931,5 @@ function ff_\type\()_h264_qpel16_mc33_neon, export=1
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg


@@ -239,23 +239,23 @@ function hevc_add_residual_32x32_16_neon, export=0
endfunc
.macro tr_4x4 in0, in1, in2, in3, out0, out1, out2, out3, shift
        sshll v20.4s, \in0, #6
        sshll v21.4s, \in0, #6
        smull v22.4s, \in1, v4.h[1]
        smull v23.4s, \in1, v4.h[3]
        smlal v20.4s, \in2, v4.h[0] //e0
        smlsl v21.4s, \in2, v4.h[0] //e1
        smlal v22.4s, \in3, v4.h[3] //o0
        smlsl v23.4s, \in3, v4.h[1] //o1
        add v24.4s, v20.4s, v22.4s
        sub v20.4s, v20.4s, v22.4s
        add v22.4s, v21.4s, v23.4s
        sub v21.4s, v21.4s, v23.4s
        sqrshrn \out0, v24.4s, #\shift
        sqrshrn \out3, v20.4s, #\shift
        sqrshrn \out1, v22.4s, #\shift
        sqrshrn \out2, v21.4s, #\shift
.endm
.macro idct_4x4 bitdepth
@@ -294,19 +294,19 @@ endfunc
// uses and clobbers v28-v31 as temp registers
.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
        sshll\p1 v28.4s, \in0, #6
        mov v29.16b, v28.16b
        smull\p1 v30.4s, \in1, v0.h[1]
        smull\p1 v31.4s, \in1, v0.h[3]
        smlal\p2 v28.4s, \in2, v0.h[0] //e0
        smlsl\p2 v29.4s, \in2, v0.h[0] //e1
        smlal\p2 v30.4s, \in3, v0.h[3] //o0
        smlsl\p2 v31.4s, \in3, v0.h[1] //o1
        add \out0, v28.4s, v30.4s
        add \out1, v29.4s, v31.4s
        sub \out2, v29.4s, v31.4s
        sub \out3, v28.4s, v30.4s
.endm
.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
@@ -362,11 +362,11 @@ endfunc
.macro idct_8x8 bitdepth
function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
        //x0 - coeffs
        mov x1, x0
        ld1 {v16.8h-v19.8h}, [x1], #64
        ld1 {v20.8h-v23.8h}, [x1]
        movrel x1, trans
        ld1 {v0.8h}, [x1]
        tr_8x4 7, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v20,.4h, v21,.4h, v22,.4h, v23,.4h
@@ -379,7 +379,7 @@ function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
        transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23
        mov x1, x0
        st1 {v16.8h-v19.8h}, [x1], #64
        st1 {v20.8h-v23.8h}, [x1]
@@ -388,8 +388,8 @@ endfunc
.endm
.macro butterfly e, o, tmp_p, tmp_m
        add \tmp_p, \e, \o
        sub \tmp_m, \e, \o
.endm
.macro tr16_8x4 in0, in1, in2, in3, offset
@@ -418,7 +418,7 @@ endfunc
        butterfly v25.4s, v29.4s, v17.4s, v22.4s
        butterfly v26.4s, v30.4s, v18.4s, v21.4s
        butterfly v27.4s, v31.4s, v19.4s, v20.4s
        add x4, sp, #\offset
        st1 {v16.4s-v19.4s}, [x4], #64
        st1 {v20.4s-v23.4s}, [x4]
.endm
@@ -435,14 +435,14 @@ endfunc
.endm
.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7, p
        sum_sub v21.4s, \in, \t0, \op0, \p
        sum_sub v22.4s, \in, \t1, \op1, \p
        sum_sub v23.4s, \in, \t2, \op2, \p
        sum_sub v24.4s, \in, \t3, \op3, \p
        sum_sub v25.4s, \in, \t4, \op4, \p
        sum_sub v26.4s, \in, \t5, \op5, \p
        sum_sub v27.4s, \in, \t6, \op6, \p
        sum_sub v28.4s, \in, \t7, \op7, \p
.endm
.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
@@ -528,20 +528,20 @@ endfunc
.macro tr_16x4 name, shift, offset, step
function func_tr_16x4_\name
        mov x1, x5
        add x3, x5, #(\step * 64)
        mov x2, #(\step * 128)
        load16 v16.d, v17.d, v18.d, v19.d
        movrel x1, trans
        ld1 {v0.8h}, [x1]
        tr16_8x4 v16, v17, v18, v19, \offset
        add x1, x5, #(\step * 32)
        add x3, x5, #(\step * 3 *32)
        mov x2, #(\step * 128)
        load16 v20.d, v17.d, v18.d, v19.d
        movrel x1, trans, 16
        ld1 {v1.8h}, [x1]
        smull v21.4s, v20.4h, v1.h[0]
        smull v22.4s, v20.4h, v1.h[1]
@@ -560,19 +560,19 @@ function func_tr_16x4_\name
        add_member v19.4h, v1.h[6], v1.h[3], v1.h[0], v1.h[2], v1.h[5], v1.h[7], v1.h[4], v1.h[1], +, -, +, -, +, +, -, +
        add_member v19.8h, v1.h[7], v1.h[6], v1.h[5], v1.h[4], v1.h[3], v1.h[2], v1.h[1], v1.h[0], +, -, +, -, +, -, +, -, 2
        add x4, sp, #\offset
        ld1 {v16.4s-v19.4s}, [x4], #64
        butterfly16 v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, v24.4s
.if \shift > 0
        scale v29, v30, v31, v24, v20.4s, v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v24, v2, v3, v4, v5, v6, v7
        mov x1, x6
        add x3, x6, #(24 +3*32)
        mov x2, #32
        mov x4, #-32
        store16 v29.d, v30.d, v31.d, v24.d, x4
.else
        store_to_stack \offset, (\offset + 240), v20.4s, v21.4s, v22.4s, v23.4s, v19.4s, v18.4s, v17.4s, v16.4s
.endif
        add x4, sp, #(\offset + 64)
@@ -582,13 +582,13 @@ function func_tr_16x4_\name
        scale v29, v30, v31, v20, v20.4s, v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v20, v2, v3, v4, v5, v6, v7
        add x1, x6, #8
        add x3, x6, #(16 + 3 * 32)
        mov x2, #32
        mov x4, #-32
        store16 v29.d, v30.d, v31.d, v20.d, x4
.else
        store_to_stack (\offset + 64), (\offset + 176), v20.4s, v25.4s, v26.4s, v27.4s, v19.4s, v18.4s, v17.4s, v16.4s
.endif
        ret
@@ -601,21 +601,21 @@ function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
        mov x15, x30
        // allocate a temp buffer
        sub sp, sp, #640
.irp i, 0, 1, 2, 3
        add x5, x0, #(8 * \i)
        add x6, sp, #(8 * \i * 16)
        bl func_tr_16x4_firstpass
.endr
.irp i, 0, 1, 2, 3
        add x5, sp, #(8 * \i)
        add x6, x0, #(8 * \i * 16)
        bl func_tr_16x4_secondpass_\bitdepth
.endr
        add sp, sp, #640
        ret x15
endfunc
@@ -644,10 +644,10 @@ endfunc
.endm
.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p
        sum_sub v24.4s, \in, \t0, \op0, \p
        sum_sub v25.4s, \in, \t1, \op1, \p
        sum_sub v26.4s, \in, \t2, \op2, \p
        sum_sub v27.4s, \in, \t3, \op3, \p
.endm
.macro butterfly32 in0, in1, in2, in3, out
@@ -841,85 +841,85 @@ idct_32x32 8
        idct_32x32 10
.macro tr4_luma_shift r0, r1, r2, r3, shift
        saddl v0.4s, \r0, \r2 // c0 = src0 + src2
        saddl v1.4s, \r2, \r3 // c1 = src2 + src3
        ssubl v2.4s, \r0, \r3 // c2 = src0 - src3
        smull v3.4s, \r1, v21.4h // c3 = 74 * src1
        saddl v7.4s, \r0, \r3 // src0 + src3
        ssubw v7.4s, v7.4s, \r2 // src0 - src2 + src3
        mul v7.4s, v7.4s, v18.4s // dst2 = 74 * (src0 - src2 + src3)
        mul v5.4s, v0.4s, v19.4s // 29 * c0
        mul v6.4s, v1.4s, v20.4s // 55 * c1
        add v5.4s, v5.4s, v6.4s // 29 * c0 + 55 * c1
        add v5.4s, v5.4s, v3.4s // dst0 = 29 * c0 + 55 * c1 + c3
        mul v1.4s, v1.4s, v19.4s // 29 * c1
        mul v6.4s, v2.4s, v20.4s // 55 * c2
        sub v6.4s, v6.4s, v1.4s // 55 * c2 - 29 * c1
        add v6.4s, v6.4s, v3.4s // dst1 = 55 * c2 - 29 * c1 + c3
        mul v0.4s, v0.4s, v20.4s // 55 * c0
        mul v2.4s, v2.4s, v19.4s // 29 * c2
        add v0.4s, v0.4s, v2.4s // 55 * c0 + 29 * c2
        sub v0.4s, v0.4s, v3.4s // dst3 = 55 * c0 + 29 * c2 - c3
        sqrshrn \r0, v5.4s, \shift
        sqrshrn \r1, v6.4s, \shift
        sqrshrn \r2, v7.4s, \shift
        sqrshrn \r3, v0.4s, \shift
.endm
function ff_hevc_transform_luma_4x4_neon_8, export=1
        ld1 {v28.4h-v31.4h}, [x0]
        movi v18.4s, #74
        movi v19.4s, #29
        movi v20.4s, #55
        movi v21.4h, #74
        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
        st1 {v28.4h-v31.4h}, [x0]
        ret
endfunc
// void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
.macro idct_dc size, bitdepth
function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
        ld1r {v4.8h}, [x0]
        srshr v4.8h, v4.8h, #1
        srshr v0.8h, v4.8h, #(14 - \bitdepth)
        srshr v1.8h, v4.8h, #(14 - \bitdepth)
.if \size > 4
        srshr v2.8h, v4.8h, #(14 - \bitdepth)
        srshr v3.8h, v4.8h, #(14 - \bitdepth)
.if \size > 16 /* dc 32x32 */
        mov x2, #4
1:
        subs x2, x2, #1
.endif
        add x12, x0, #64
        mov x13, #128
.if \size > 8 /* dc 16x16 */
        st1 {v0.8h-v3.8h}, [x0], x13
        st1 {v0.8h-v3.8h}, [x12], x13
        st1 {v0.8h-v3.8h}, [x0], x13
        st1 {v0.8h-v3.8h}, [x12], x13
        st1 {v0.8h-v3.8h}, [x0], x13
        st1 {v0.8h-v3.8h}, [x12], x13
.endif /* dc 8x8 */
        st1 {v0.8h-v3.8h}, [x0], x13
        st1 {v0.8h-v3.8h}, [x12], x13
.if \size > 16 /* dc 32x32 */
        bne 1b
.endif
.else /* dc 4x4 */
        st1 {v0.8h-v1.8h}, [x0]
.endif
        ret
endfunc


@@ -840,19 +840,19 @@ function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
endfunc
function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
        b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
endfunc
function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc
function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc
function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
@@ -1560,21 +1560,21 @@ endfunc
#if HAVE_I8MM
.macro calc_all2
        calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
        b.eq 2f
        calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
        b.eq 2f
        calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
        b.eq 2f
        calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
        b.eq 2f
        calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
        b.eq 2f
        calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
        b.eq 2f
        calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
        b.eq 2f
        calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
        b.hi 1b
.endm


@@ -34,13 +34,13 @@ endconst
function ff_opus_deemphasis_neon, export=1
        movrel x4, tab_st
        ld1 {v4.4s}, [x4]
        movrel x4, tab_x0
        ld1 {v5.4s}, [x4]
        movrel x4, tab_x1
        ld1 {v6.4s}, [x4]
        movrel x4, tab_x2
        ld1 {v7.4s}, [x4]
        fmul v0.4s, v4.4s, v0.s[0]


@@ -330,32 +330,32 @@ endfunc
        // v17: hev
        // convert to signed value:
        eor v3.16b, v3.16b, v21.16b // PS0 = P0 ^ 0x80
        eor v4.16b, v4.16b, v21.16b // QS0 = Q0 ^ 0x80
        movi v20.8h, #3
        ssubl v18.8h, v4.8b, v3.8b // QS0 - PS0
        ssubl2 v19.8h, v4.16b, v3.16b // (widened to 16bit)
        eor v2.16b, v2.16b, v21.16b // PS1 = P1 ^ 0x80
        eor v5.16b, v5.16b, v21.16b // QS1 = Q1 ^ 0x80
        mul v18.8h, v18.8h, v20.8h // w = 3 * (QS0 - PS0)
        mul v19.8h, v19.8h, v20.8h
        sqsub v20.16b, v2.16b, v5.16b // clamp(PS1-QS1)
        movi v22.16b, #4
        movi v23.16b, #3
.if \inner
        and v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1)
.endif
        saddw v18.8h, v18.8h, v20.8b // w += clamp(PS1-QS1)
        saddw2 v19.8h, v19.8h, v20.16b
        sqxtn v18.8b, v18.8h // narrow result back into v18
        sqxtn2 v18.16b, v19.8h
.if !\inner && !\simple
        eor v1.16b, v1.16b, v21.16b // PS2 = P2 ^ 0x80
        eor v6.16b, v6.16b, v21.16b // QS2 = Q2 ^ 0x80
.endif
        and v18.16b, v18.16b, v16.16b // w &= normal_limit
        // registers used at this point..
        // v0 -> P3 (don't corrupt)
@@ -375,44 +375,44 @@ endfunc
        // P0 = s2u(PS0 + c2);
.if \simple
        sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
        sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
        sshr v19.16b, v19.16b, #3 // c1 >>= 3
        sshr v20.16b, v20.16b, #3 // c2 >>= 3
        sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
        sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
        eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
        eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
        eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
        eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
.elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        // c3 = ((c1&~hev) + 1) >> 1;
        // Q1 = s2u(QS1 - c3);
        // P1 = s2u(PS1 + c3);
        sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
        sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
        sshr v19.16b, v19.16b, #3 // c1 >>= 3
        sshr v20.16b, v20.16b, #3 // c2 >>= 3
        sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
        sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
        bic v19.16b, v19.16b, v17.16b // c1 & ~hev
        eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
        srshr v19.16b, v19.16b, #1 // c3 >>= 1
        eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
        sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-c3)
        sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+c3)
        eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
        eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
.else
        and v20.16b, v18.16b, v17.16b // w & hev
        sqadd v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4)
        sqadd v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3)
        sshr v19.16b, v19.16b, #3 // c1 >>= 3
        sshr v20.16b, v20.16b, #3 // c2 >>= 3
        bic v18.16b, v18.16b, v17.16b // w &= ~hev
        sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
        sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
        // filter_mbedge:
        // a = clamp((27*w + 63) >> 7);
@@ -424,35 +424,35 @@ endfunc
        // a = clamp((9*w + 63) >> 7);
        // Q2 = s2u(QS2 - a);
        // P2 = s2u(PS2 + a);
        movi v17.8h, #63
        sshll v22.8h, v18.8b, #3
        sshll2 v23.8h, v18.16b, #3
        saddw v22.8h, v22.8h, v18.8b
        saddw2 v23.8h, v23.8h, v18.16b
        add v16.8h, v17.8h, v22.8h
        add v17.8h, v17.8h, v23.8h // 9*w + 63
        add v19.8h, v16.8h, v22.8h
        add v20.8h, v17.8h, v23.8h // 18*w + 63
        add v22.8h, v19.8h, v22.8h
        add v23.8h, v20.8h, v23.8h // 27*w + 63
        sqshrn v16.8b, v16.8h, #7
        sqshrn2 v16.16b, v17.8h, #7 // clamp(( 9*w + 63)>>7)
        sqshrn v19.8b, v19.8h, #7
        sqshrn2 v19.16b, v20.8h, #7 // clamp((18*w + 63)>>7)
        sqshrn v22.8b, v22.8h, #7
        sqshrn2 v22.16b, v23.8h, #7 // clamp((27*w + 63)>>7)
        sqadd v1.16b, v1.16b, v16.16b // PS2 = clamp(PS2+a)
        sqsub v6.16b, v6.16b, v16.16b // QS2 = clamp(QS2-a)
        sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+a)
        sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-a)
        sqadd v3.16b, v3.16b, v22.16b // PS0 = clamp(PS0+a)
        sqsub v4.16b, v4.16b, v22.16b // QS0 = clamp(QS0-a)
        eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
        eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
        eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
        eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
        eor v1.16b, v1.16b, v21.16b // P2 = PS2 ^ 0x80
        eor v6.16b, v6.16b, v21.16b // Q2 = QS2 ^ 0x80
.endif
.endm
@@ -507,48 +507,48 @@ function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub x0, x0, x2, lsl #2
        sub x1, x1, x2, lsl #2
        // Load pixels:
        ld1 {v0.d}[0], [x0], x2 // P3
        ld1 {v0.d}[1], [x1], x2 // P3
        ld1 {v1.d}[0], [x0], x2 // P2
        ld1 {v1.d}[1], [x1], x2 // P2
        ld1 {v2.d}[0], [x0], x2 // P1
        ld1 {v2.d}[1], [x1], x2 // P1
        ld1 {v3.d}[0], [x0], x2 // P0
        ld1 {v3.d}[1], [x1], x2 // P0
        ld1 {v4.d}[0], [x0], x2 // Q0
        ld1 {v4.d}[1], [x1], x2 // Q0
        ld1 {v5.d}[0], [x0], x2 // Q1
        ld1 {v5.d}[1], [x1], x2 // Q1
        ld1 {v6.d}[0], [x0], x2 // Q2
        ld1 {v6.d}[1], [x1], x2 // Q2
        ld1 {v7.d}[0], [x0] // Q3
        ld1 {v7.d}[1], [x1] // Q3
        dup v22.16b, w3 // flim_E
        dup v23.16b, w4 // flim_I
        vp8_loop_filter inner=\inner, hev_thresh=w5
        // back up to P2: u,v -= stride * 6
        sub x0, x0, x2, lsl #2
        sub x1, x1, x2, lsl #2
        sub x0, x0, x2, lsl #1
        sub x1, x1, x2, lsl #1
        // Store pixels:
        st1 {v1.d}[0], [x0], x2 // P2
        st1 {v1.d}[1], [x1], x2 // P2
        st1 {v2.d}[0], [x0], x2 // P1
        st1 {v2.d}[1], [x1], x2 // P1
        st1 {v3.d}[0], [x0], x2 // P0
        st1 {v3.d}[1], [x1], x2 // P0
        st1 {v4.d}[0], [x0], x2 // Q0
        st1 {v4.d}[1], [x1], x2 // Q0
        st1 {v5.d}[0], [x0], x2 // Q1
        st1 {v5.d}[1], [x1], x2 // Q1
        st1 {v6.d}[0], [x0] // Q2
        st1 {v6.d}[1], [x1] // Q2
        ret
endfunc
@@ -579,7 +579,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
        ld1 {v6.d}[1], [x0], x1
        ld1 {v7.d}[1], [x0], x1
        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
        dup v22.16b, w2 // flim_E
.if !\simple
@@ -590,7 +590,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
        sub x0, x0, x1, lsl #4 // backup 16 rows
        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
        // Store pixels:
        st1 {v0.d}[0], [x0], x1
@@ -624,24 +624,24 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub x1, x1, #4
        // Load pixels:
        ld1 {v0.d}[0], [x0], x2 // load u
        ld1 {v0.d}[1], [x1], x2 // load v
        ld1 {v1.d}[0], [x0], x2
        ld1 {v1.d}[1], [x1], x2
        ld1 {v2.d}[0], [x0], x2
        ld1 {v2.d}[1], [x1], x2
        ld1 {v3.d}[0], [x0], x2
        ld1 {v3.d}[1], [x1], x2
        ld1 {v4.d}[0], [x0], x2
        ld1 {v4.d}[1], [x1], x2
        ld1 {v5.d}[0], [x0], x2
        ld1 {v5.d}[1], [x1], x2
        ld1 {v6.d}[0], [x0], x2
        ld1 {v6.d}[1], [x1], x2
        ld1 {v7.d}[0], [x0], x2
        ld1 {v7.d}[1], [x1], x2
        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
        dup v22.16b, w3 // flim_E
        dup v23.16b, w4 // flim_I
@@ -651,25 +651,25 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub x0, x0, x2, lsl #3 // backup u 8 rows
        sub x1, x1, x2, lsl #3 // backup v 8 rows
        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
        // Store pixels:
        st1 {v0.d}[0], [x0], x2 // load u
        st1 {v0.d}[1], [x1], x2 // load v
        st1 {v1.d}[0], [x0], x2
        st1 {v1.d}[1], [x1], x2
        st1 {v2.d}[0], [x0], x2
        st1 {v2.d}[1], [x1], x2
        st1 {v3.d}[0], [x0], x2
        st1 {v3.d}[1], [x1], x2
        st1 {v4.d}[0], [x0], x2
        st1 {v4.d}[1], [x1], x2
        st1 {v5.d}[0], [x0], x2
        st1 {v5.d}[1], [x1], x2
        st1 {v6.d}[0], [x0], x2
        st1 {v6.d}[1], [x1], x2
        st1 {v7.d}[0], [x0]
        st1 {v7.d}[1], [x1]
        ret


@@ -729,9 +729,9 @@ FFT16_FN ns_float, 1
.endm
.macro SR_COMBINE_4 len, part, off
        add x10, x1, x21
        add x11, x1, x21, lsl #1
        add x12, x1, x22
        ldp q0, q1, [x1, #((0 + \part)*32 + \off)]
        ldp q4, q5, [x1, #((2 + \part)*32 + \off)]
@@ -759,9 +759,9 @@ FFT16_FN ns_float, 1
.endm
.macro SR_COMBINE_FULL len, off=0
        add x10, x1, x21
        add x11, x1, x21, lsl #1
        add x12, x1, x22
        SR_COMBINE_4 \len, 0, \off
        SR_COMBINE_4 \len, 1, \off