avcodec/h264: sse2, avx h luma mbaff deblock/loop filter
x86-64 only.

Yorkfield:
 - sse2: ~2.17x (434 vs. 200 cycles)
Nehalem:
 - sse2: ~2.94x (409 vs. 139 cycles)
Skylake:
 - sse2: ~3.10x (370 vs. 119 cycles)
 - avx:  ~3.29x (370 vs. 112 cycles)
This commit is contained in:
parent
7627df15d4
commit
5336887867
@ -377,10 +377,99 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
%macro DEBLOCK_H_LUMA_MBAFF 0

;-----------------------------------------------------------------------------
; deblock_h_luma_mbaff_8
;   Luma loop filter across a vertical edge for the MBAFF case, 8-bit samples.
;   Works on 8 rows; each row contributes the 8 bytes at [pix - 4, pix + 4).
;   The rows are transposed into columns, filtered, transposed back and stored.
;
;   cglobal operands (x86inc order): 5 arguments, 9 GPRs, 10 vector registers,
;   8*16 bytes of stack, followed by the register names.
;   base3_ / stride3_ are not arguments; they are set up below as scratch.
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_, base3_, stride3_
    movsxd    stride_q, stride_d                ; sign-extend the 32-bit stride
    dec       alpha_d                           ; the compares below are against beta-1
    dec       beta_d                            ; (and presumably alpha-1 in LOAD_MASK)
    lea       stride3_q, [3*stride_q]
    mov       base3_q, pix_q
    add       base3_q, stride3_q                ; base3 = pix + 3*stride

    ; Rows 0-2 are addressed from pix, rows 3-7 from base3.
    movq      m0, [pix_q - 4]
    movq      m1, [pix_q + stride_q - 4]
    movq      m2, [pix_q + 2*stride_q - 4]
    movq      m3, [base3_q - 4]
    movq      m4, [base3_q + stride_q - 4]
    movq      m5, [base3_q + 2*stride_q - 4]
    movq      m6, [base3_q + stride3_q - 4]
    movq      m7, [base3_q + 4*stride_q - 4]

    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7

    ; Spill each transposed register to its own 16-byte stack slot
    ; (only the low 8 bytes of every slot are written).
%assign i 0
%rep 8
    movq      [rsp + 16*i], m %+ i
%assign i i+1
%endrep

    ; Column layout after the transpose:
    ;   p2 = m1  [rsp + 16]
    ;   p1 = m2  [rsp + 32]
    ;   p0 = m3  [rsp + 48]
    ;   q0 = m4  [rsp + 64]
    ;   q1 = m5  [rsp + 80]
    ;   q2 = m6  [rsp + 96]

    ; Register renaming only (no data movement). With the layout above this
    ; yields m0 = p1, m1 = p0, m2 = q0, m3 = q1.
    SWAP 0, 2
    SWAP 1, 3
    SWAP 2, 4
    SWAP 3, 5

    LOAD_MASK alpha_d, beta_d                   ; NOTE(review): result is consumed from m7 below
    movd      m8, [tc0_q]                       ; 4 tc0 bytes
    punpcklbw m8, m8                            ; each tc0 byte duplicated -> 8 bytes
    pcmpeqb   m9, m9                            ; m9 = all ones
    pcmpeqb   m9, m8                            ; m9 = 0xFF where the tc0 byte is 0xFF
    pandn     m9, m7                            ; m9 = m7 & ~m9
    pand      m8, m9                            ; keep tc0 only where m9 is set

    ; ---- p side ----
    mova      m3, [rsp + 16]                    ; p2
    DIFF_GT2  m1, m3, m5, m6, m7                ; |p2-p0| > beta-1
    pand      m6, m9
    psubb     m7, m8, m6                        ; m7 = m8 - m6
    pand      m6, m8
    LUMA_Q1   m0, m3, [rsp + 16], [rsp + 32], m6, m4

    ; ---- q side ----
    mova      m4, [rsp + 96]                    ; q2
    DIFF_GT2  m2, m4, m5, m6, m3                ; |q2-q0| > beta-1
    pand      m6, m9
    pand      m8, m6
    psubb     m7, m6
    mova      m3, [rsp + 80]                    ; q1 slot
    LUMA_Q1   m3, m4, [rsp + 96], [rsp + 80], m8, m6

    DEBLOCK_P0_Q0
    ; Rename again so the registers line up with rows 3 and 4 of the transpose
    ; input, then reload the remaining six columns from their stack slots.
    SWAP 1, 3
    SWAP 2, 4
    movq      m0, [rsp]
    movq      m1, [rsp + 16]
    movq      m2, [rsp + 32]
    movq      m5, [rsp + 80]
    movq      m6, [rsp + 96]
    movq      m7, [rsp + 112]

    ; Back to row order and write the 8 rows out to where they were read from.
    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
    movq      [pix_q - 4], m0
    movq      [pix_q + stride_q - 4], m1
    movq      [pix_q + 2*stride_q - 4], m2
    movq      [base3_q - 4], m3
    movq      [base3_q + stride_q - 4], m4
    movq      [base3_q + 2*stride_q - 4], m5
    movq      [base3_q + stride3_q - 4], m6
    movq      [base3_q + 4*stride_q - 4], m7

    RET

%endmacro
|
||||
|
||||
; Emit the luma deblocking functions once per instruction set.
; INIT_XMM selects the target before each pair of macro expansions.
INIT_XMM sse2
DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA

; The AVX variants are only assembled when HAVE_AVX_EXTERNAL is set.
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
%endif
|
||||
|
||||
|
@ -137,6 +137,9 @@ LF_IFUNC(h, chroma422_intra, depth, avx) \
|
||||
LF_FUNC(v, chroma, depth, avx) \
|
||||
LF_IFUNC(v, chroma_intra, depth, avx)
|
||||
|
||||
/* Horizontal luma MBAFF loop filter, depth 8, SSE2 and AVX variants.
 * NOTE(review): LF_FUNC is defined outside this view; presumably it declares
 * the ff_deblock_h_luma_mbaff_8_{sse2,avx} symbols used by the init code. */
LF_FUNC(h, luma_mbaff, 8, sse2)
LF_FUNC(h, luma_mbaff, 8, avx)

/* LF_FUNCS instantiated for uint8_t at depth 8 and uint16_t at depth 10. */
LF_FUNCS(uint8_t,   8)
LF_FUNCS(uint16_t, 10)
|
||||
|
||||
@ -297,6 +300,10 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
|
||||
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
|
||||
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
|
||||
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
|
||||
|
||||
#if ARCH_X86_64
|
||||
c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2;
|
||||
#endif
|
||||
}
|
||||
if (EXTERNAL_SSSE3(cpu_flags)) {
|
||||
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
|
||||
@ -307,6 +314,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
|
||||
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
|
||||
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
|
||||
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
|
||||
#if ARCH_X86_64
|
||||
c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
|
||||
#endif
|
||||
}
|
||||
} else if (bit_depth == 10) {
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
|
@ -265,6 +265,21 @@
|
||||
SWAP %12, %15
|
||||
%endmacro
|
||||
|
||||
; TRANSPOSE_8X8B r0, r1, r2, r3, r4, r5, r6, r7
;   Byte transpose of an 8x8 block whose rows sit in the low 8 bytes of the
;   eight given registers. Results are left in the same eight registers.
;   Rejected at assembly time when mmsize == 8.
%macro TRANSPOSE_8X8B 8
%if mmsize == 8
    %error "This macro does not support mmsize == 8"
%endif
    ; Interleave the low bytes of each pair of adjacent inputs.
    punpcklbw m%1, m%2
    punpcklbw m%3, m%4
    punpcklbw m%5, m%6
    punpcklbw m%7, m%8
    ; Word-level 4x4 transpose of the four interleaved registers. %2 is handed
    ; over as the fifth operand -- presumably scratch; confirm in TRANSPOSE4x4W.
    TRANSPOSE4x4W %1, %3, %5, %7, %2
    ; Fill the even-numbered operands from the odd-numbered ones via MOVHL
    ; (defined elsewhere; presumably moves the high half down).
    MOVHL     m%2, m%1
    MOVHL     m%4, m%3
    MOVHL     m%6, m%5
    MOVHL     m%8, m%7
%endmacro
|
||||
|
||||
; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
|
||||
%macro PABSW 2
|
||||
%if cpuflag(ssse3)
|
||||
|
Loading…
x
Reference in New Issue
Block a user