vp9: save one (PSIGNW) instruction in iadst16_1d sse2/ssse3.
This commit is contained in:
parent
b84232694e
commit
4b66274a86
@ -29,6 +29,7 @@ pw_11585x2: times 8 dw 23170
|
|||||||
pw_m11585x2: times 8 dw -23170
|
pw_m11585x2: times 8 dw -23170
|
||||||
pw_m11585_11585: times 4 dw -11585, 11585
|
pw_m11585_11585: times 4 dw -11585, 11585
|
||||||
pw_11585_11585: times 8 dw 11585
|
pw_11585_11585: times 8 dw 11585
|
||||||
|
pw_m11585_m11585: times 8 dw -11585
|
||||||
|
|
||||||
%macro VP9_IDCT_COEFFS 2-3 0
|
%macro VP9_IDCT_COEFFS 2-3 0
|
||||||
pw_%1x2: times 8 dw %1*2
|
pw_%1x2: times 8 dw %1*2
|
||||||
@ -1716,13 +1717,13 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
|
|||||||
SUMSUB_BA w, 7, 6, 4
|
SUMSUB_BA w, 7, 6, 4
|
||||||
pmulhrsw m7, [pw_m11585x2] ; m8=out7[w]
|
pmulhrsw m7, [pw_m11585x2] ; m8=out7[w]
|
||||||
pmulhrsw m6, [pw_11585x2] ; m1=out8[w]
|
pmulhrsw m6, [pw_11585x2] ; m1=out8[w]
|
||||||
|
SWAP 6, 7
|
||||||
SUMSUB_BA w, 3, 2, 4
|
SUMSUB_BA w, 3, 2, 4
|
||||||
pmulhrsw m3, [pw_11585x2] ; m3=out4[w]
|
pmulhrsw m3, [pw_11585x2] ; m3=out4[w]
|
||||||
pmulhrsw m2, [pw_11585x2] ; m2=out11[w]
|
pmulhrsw m2, [pw_11585x2] ; m2=out11[w]
|
||||||
%else
|
%else
|
||||||
SCRATCH 5, 8, tmpq+10*%%str
|
SCRATCH 5, 8, tmpq+10*%%str
|
||||||
PSIGNW m7, [pw_m1]
|
VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, m11585, [pd_8192], 5, 4
|
||||||
VP9_UNPACK_MULSUB_2W_4X 7, 6, 11585, 11585, [pd_8192], 5, 4
|
|
||||||
VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 4
|
VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 4
|
||||||
UNSCRATCH 5, 8, tmpq+10*%%str
|
UNSCRATCH 5, 8, tmpq+10*%%str
|
||||||
%endif
|
%endif
|
||||||
@ -1733,7 +1734,7 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
|
|||||||
%if %2 == 1
|
%if %2 == 1
|
||||||
%if ARCH_X86_64
|
%if ARCH_X86_64
|
||||||
mova m13, [tmpq+ 6*%%str]
|
mova m13, [tmpq+ 6*%%str]
|
||||||
TRANSPOSE8x8W 1, 11, 14, 0, 3, 15, 13, 7, 10
|
TRANSPOSE8x8W 1, 11, 14, 0, 3, 15, 13, 6, 10
|
||||||
mova [tmpq+ 0*16], m1
|
mova [tmpq+ 0*16], m1
|
||||||
mova [tmpq+ 2*16], m11
|
mova [tmpq+ 2*16], m11
|
||||||
mova [tmpq+ 4*16], m14
|
mova [tmpq+ 4*16], m14
|
||||||
@ -1745,10 +1746,10 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
|
|||||||
mova [tmpq+ 8*16], m3
|
mova [tmpq+ 8*16], m3
|
||||||
mova [tmpq+10*16], m15
|
mova [tmpq+10*16], m15
|
||||||
mova [tmpq+12*16], m13
|
mova [tmpq+12*16], m13
|
||||||
mova [tmpq+14*16], m7
|
mova [tmpq+14*16], m6
|
||||||
|
|
||||||
TRANSPOSE8x8W 6, 1, 11, 2, 9, 14, 0, 5, 10
|
TRANSPOSE8x8W 7, 1, 11, 2, 9, 14, 0, 5, 10
|
||||||
mova [tmpq+ 1*16], m6
|
mova [tmpq+ 1*16], m7
|
||||||
mova [tmpq+ 3*16], m1
|
mova [tmpq+ 3*16], m1
|
||||||
mova [tmpq+ 5*16], m11
|
mova [tmpq+ 5*16], m11
|
||||||
mova [tmpq+ 7*16], m2
|
mova [tmpq+ 7*16], m2
|
||||||
@ -1759,20 +1760,20 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
|
|||||||
%else
|
%else
|
||||||
mova [tmpq+12*%%str], m2
|
mova [tmpq+12*%%str], m2
|
||||||
mova [tmpq+ 1*%%str], m5
|
mova [tmpq+ 1*%%str], m5
|
||||||
mova [tmpq+15*%%str], m6
|
mova [tmpq+15*%%str], m7
|
||||||
mova m2, [tmpq+ 9*%%str]
|
mova m2, [tmpq+ 9*%%str]
|
||||||
mova m5, [tmpq+ 5*%%str]
|
mova m5, [tmpq+ 5*%%str]
|
||||||
mova m6, [tmpq+ 8*%%str]
|
mova m7, [tmpq+ 8*%%str]
|
||||||
TRANSPOSE8x8W 1, 2, 5, 0, 3, 6, 4, 7, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
|
TRANSPOSE8x8W 1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
|
||||||
mova [tmpq+ 0*16], m1
|
mova [tmpq+ 0*16], m1
|
||||||
mova [tmpq+ 2*16], m2
|
mova [tmpq+ 2*16], m2
|
||||||
mova [tmpq+ 4*16], m5
|
mova [tmpq+ 4*16], m5
|
||||||
mova [tmpq+ 6*16], m0
|
mova [tmpq+ 6*16], m0
|
||||||
mova [tmpq+10*16], m6
|
mova [tmpq+10*16], m7
|
||||||
mova m3, [tmpq+12*%%str]
|
mova m3, [tmpq+12*%%str]
|
||||||
mova [tmpq+12*16], m4
|
mova [tmpq+12*16], m4
|
||||||
mova m4, [tmpq+14*%%str]
|
mova m4, [tmpq+14*%%str]
|
||||||
mova [tmpq+14*16], m7
|
mova [tmpq+14*16], m6
|
||||||
|
|
||||||
mova m0, [tmpq+15*%%str]
|
mova m0, [tmpq+15*%%str]
|
||||||
mova m1, [tmpq+ 3*%%str]
|
mova m1, [tmpq+ 3*%%str]
|
||||||
@ -1805,7 +1806,7 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
|
|||||||
lea dstq, [dstq+strideq*2]
|
lea dstq, [dstq+strideq*2]
|
||||||
VP9_IDCT8_WRITEx2 3, 15, 10, 8, 4, ROUND_REG, 6
|
VP9_IDCT8_WRITEx2 3, 15, 10, 8, 4, ROUND_REG, 6
|
||||||
lea dstq, [dstq+strideq*2]
|
lea dstq, [dstq+strideq*2]
|
||||||
VP9_IDCT8_WRITEx2 12, 7, 10, 8, 4, ROUND_REG, 6
|
VP9_IDCT8_WRITEx2 12, 6, 10, 8, 4, ROUND_REG, 6
|
||||||
lea dstq, [dstq+strideq*2]
|
lea dstq, [dstq+strideq*2]
|
||||||
|
|
||||||
mova m1, [tmpq+ 3*%%str]
|
mova m1, [tmpq+ 3*%%str]
|
||||||
@ -1813,7 +1814,7 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
|
|||||||
mova m14, [tmpq+11*%%str]
|
mova m14, [tmpq+11*%%str]
|
||||||
mova m0, [tmpq+13*%%str]
|
mova m0, [tmpq+13*%%str]
|
||||||
|
|
||||||
VP9_IDCT8_WRITEx2 6, 1, 10, 8, 4, ROUND_REG, 6
|
VP9_IDCT8_WRITEx2 7, 1, 10, 8, 4, ROUND_REG, 6
|
||||||
lea dstq, [dstq+strideq*2]
|
lea dstq, [dstq+strideq*2]
|
||||||
VP9_IDCT8_WRITEx2 11, 2, 10, 8, 4, ROUND_REG, 6
|
VP9_IDCT8_WRITEx2 11, 2, 10, 8, 4, ROUND_REG, 6
|
||||||
lea dstq, [dstq+strideq*2]
|
lea dstq, [dstq+strideq*2]
|
||||||
@ -1823,9 +1824,9 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
|
|||||||
%else
|
%else
|
||||||
mova [tmpq+ 0*%%str], m2
|
mova [tmpq+ 0*%%str], m2
|
||||||
mova [tmpq+ 1*%%str], m5
|
mova [tmpq+ 1*%%str], m5
|
||||||
mova [tmpq+ 2*%%str], m6
|
mova [tmpq+ 2*%%str], m7
|
||||||
mova m2, [tmpq+ 9*%%str]
|
mova m2, [tmpq+ 9*%%str]
|
||||||
VP9_IDCT8_WRITEx2 1, 2, 5, 6, 4, ROUND_REG, 6
|
VP9_IDCT8_WRITEx2 1, 2, 5, 7, 4, ROUND_REG, 6
|
||||||
lea dstq, [dstq+strideq*2]
|
lea dstq, [dstq+strideq*2]
|
||||||
mova m5, [tmpq+ 5*%%str]
|
mova m5, [tmpq+ 5*%%str]
|
||||||
VP9_IDCT8_WRITEx2 5, 0, 1, 2, 4, ROUND_REG, 6
|
VP9_IDCT8_WRITEx2 5, 0, 1, 2, 4, ROUND_REG, 6
|
||||||
@ -1834,7 +1835,7 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
|
|||||||
VP9_IDCT8_WRITEx2 3, 5, 1, 2, 4, ROUND_REG, 6
|
VP9_IDCT8_WRITEx2 3, 5, 1, 2, 4, ROUND_REG, 6
|
||||||
lea dstq, [dstq+strideq*2]
|
lea dstq, [dstq+strideq*2]
|
||||||
mova m5, [tmpq+ 6*%%str]
|
mova m5, [tmpq+ 6*%%str]
|
||||||
VP9_IDCT8_WRITEx2 5, 7, 1, 2, 4, ROUND_REG, 6
|
VP9_IDCT8_WRITEx2 5, 6, 1, 2, 4, ROUND_REG, 6
|
||||||
lea dstq, [dstq+strideq*2]
|
lea dstq, [dstq+strideq*2]
|
||||||
|
|
||||||
mova m0, [tmpq+ 2*%%str]
|
mova m0, [tmpq+ 2*%%str]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user