x86/aacpsdsp: add ff_ps_hybrid_analysis_ileave_sse
About 2x faster than the C version.
This commit is contained in:
@@ -166,6 +166,112 @@ align 16
|
|||||||
jl .loop
|
jl .loop
|
||||||
REP_RET
|
REP_RET
|
||||||
|
|
||||||
|
;**********************************************************
|
||||||
|
;void ps_hybrid_analysis_ileave_sse(float (*out)[32][2],
|
||||||
|
; float L[2][38][64],
|
||||||
|
; int i, int len)
;
; Interleaves column i..63 of the two [38][64] input planes into out[i..63]:
; each output pair is { plane0[j][i], plane1[j][i] } for consecutive rows j.
; The second argument (L in the C prototype) is named "in" in the cglobal line.
; The len argument is never read: lend is overwritten with a constant below.
; Columns are processed 4 at a time; when the remaining column count is odd
; one column is peeled first, and when bit 1 of the count is set two columns
; are peeled next, before entering the 4-column loop.
; NOTE(review): there is no check for an empty range (i == 64); the loops are
; do-while shaped and would run once - presumably callers never pass that.
|
||||||
|
;**********************************************************
|
||||||
|
INIT_XMM sse
|
||||||
|
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp ; x86inc: 3 args loaded, 7 GPRs, 5 xmm regs
|
||||||
|
movsxdifnidn iq, id ; x86inc helper: sign-extend i to pointer width where needed
|
||||||
|
mov lend, 32 << 3 ; len = 256 bytes: one 64-float input row == one out[i] row (32*2 floats)
|
||||||
|
lea inq, [inq+iq*4] ; in += i floats (start at column i)
|
||||||
|
mov tmpd, id
|
||||||
|
shl tmpd, 8 ; tmp = i * 256 bytes
|
||||||
|
add outq, tmpq ; out += i rows of 256 bytes
|
||||||
|
mov tmpd, 64
|
||||||
|
sub tmpd, id
|
||||||
|
mov id, tmpd ; i = 64 - i = number of columns left to process
|
||||||
|
|
||||||
|
test id, 1
|
||||||
|
jne .loop4 ; odd count: do a single column first
|
||||||
|
test id, 2
|
||||||
|
jne .loop8 ; count has bit 1 set: do two columns first
|
||||||
|
|
||||||
|
align 16
|
||||||
|
.loop16: ; outer loop: 4 columns per pass
|
||||||
|
mov in0q, inq ; in0 = first input plane, current column
|
||||||
|
mov in1q, 38*64*4 ; byte size of one [38][64] float plane
|
||||||
|
add in1q, in0q ; in1 = same position in the second plane
|
||||||
|
mov tmpd, lend ; byte counter for one output row (256)
|
||||||
|
|
||||||
|
.inner_loop16: ; 2 input rows x 4 columns per iteration
|
||||||
|
movaps m0, [in0q] ; plane 0, row j, 4 columns (movaps needs 16-byte alignment)
|
||||||
|
movaps m1, [in1q] ; plane 1, row j
|
||||||
|
movaps m2, [in0q+lenq] ; plane 0, row j+1
|
||||||
|
movaps m3, [in1q+lenq] ; plane 1, row j+1
|
||||||
|
TRANSPOSE4x4PS 0, 1, 2, 3, 4 ; 4x4 float transpose of m0..m3 (m4 passed as scratch): one register per column
|
||||||
|
movaps [outq], m0 ; output row for column i
|
||||||
|
movaps [outq+lenq], m1 ; column i+1
|
||||||
|
movaps [outq+lenq*2], m2 ; column i+2
|
||||||
|
movaps [outq+3*32*2*4], m3 ; column i+3 (3*32*2*4 == 3*len; no *3 scale in addressing)
|
||||||
|
lea in0q, [in0q+lenq*2] ; advance both planes by two rows
|
||||||
|
lea in1q, [in1q+lenq*2]
|
||||||
|
add outq, mmsize ; next 16 bytes within the output rows
|
||||||
|
sub tmpd, mmsize
|
||||||
|
jg .inner_loop16 ; 256/16 = 16 iterations
|
||||||
|
add inq, 16 ; next 4 input columns
|
||||||
|
add outq, 3*32*2*4 ; inner loop already advanced one row; skip the other 3
|
||||||
|
sub id, 4
|
||||||
|
jg .loop16
|
||||||
|
RET
|
||||||
|
|
||||||
|
align 16
|
||||||
|
.loop8: ; 2 columns per pass
|
||||||
|
mov in0q, inq ; in0 = first input plane, current column
|
||||||
|
mov in1q, 38*64*4 ; byte size of one [38][64] float plane
|
||||||
|
add in1q, in0q ; in1 = same position in the second plane
|
||||||
|
mov tmpd, lend ; byte counter for one output row (256)
|
||||||
|
|
||||||
|
.inner_loop8: ; 2 input rows x 2 columns per iteration
|
||||||
|
movlps m0, [in0q] ; low half: plane 0, row j, 2 columns
|
||||||
|
movlps m1, [in1q] ; low half: plane 1, row j
|
||||||
|
movhps m0, [in0q+lenq] ; high half: plane 0, row j+1
|
||||||
|
movhps m1, [in1q+lenq] ; high half: plane 1, row j+1
|
||||||
|
SBUTTERFLYPS 0, 1, 2 ; interleave floats of m0/m1 (m2 passed as scratch)
|
||||||
|
SBUTTERFLYPD 0, 1, 2 ; interleave 64-bit pairs so each register holds one column
|
||||||
|
movaps [outq], m0 ; output row for column i
|
||||||
|
movaps [outq+lenq], m1 ; column i+1
|
||||||
|
lea in0q, [in0q+lenq*2] ; advance both planes by two rows
|
||||||
|
lea in1q, [in1q+lenq*2]
|
||||||
|
add outq, mmsize
|
||||||
|
sub tmpd, mmsize
|
||||||
|
jg .inner_loop8
|
||||||
|
add inq, 8 ; next 2 input columns
|
||||||
|
add outq, lenq ; inner loop advanced one row; skip the second one
|
||||||
|
sub id, 2
|
||||||
|
jg .loop16 ; anything left is handled 4 columns at a time
|
||||||
|
RET
|
||||||
|
|
||||||
|
align 16
|
||||||
|
.loop4: ; single column
|
||||||
|
mov in0q, inq ; in0 = first input plane, current column
|
||||||
|
mov in1q, 38*64*4 ; byte size of one [38][64] float plane
|
||||||
|
add in1q, in0q ; in1 = same position in the second plane
|
||||||
|
mov tmpd, lend ; byte counter for one output row (256)
|
||||||
|
|
||||||
|
.inner_loop4: ; 2 input rows x 1 column per iteration
|
||||||
|
movss m0, [in0q] ; plane 0, row j (upper lanes zeroed by the load)
|
||||||
|
movss m1, [in1q] ; plane 1, row j
|
||||||
|
movss m2, [in0q+lenq] ; plane 0, row j+1
|
||||||
|
movss m3, [in1q+lenq] ; plane 1, row j+1
|
||||||
|
movlhps m0, m1 ; m0 = { p0[j], 0, p1[j], 0 }
|
||||||
|
movlhps m2, m3 ; m2 = { p0[j+1], 0, p1[j+1], 0 }
|
||||||
|
shufps m0, m2, q2020 ; take lanes 0 and 2 of each: { p0[j], p1[j], p0[j+1], p1[j+1] }
|
||||||
|
movaps [outq], m0
|
||||||
|
lea in0q, [in0q+lenq*2] ; advance both planes by two rows
|
||||||
|
lea in1q, [in1q+lenq*2]
|
||||||
|
add outq, mmsize
|
||||||
|
sub tmpd, mmsize
|
||||||
|
jg .inner_loop4
|
||||||
|
add inq, 4 ; next input column; outq already sits on the next output row
|
||||||
|
sub id, 1
|
||||||
|
test id, 2
|
||||||
|
jne .loop8 ; two columns to peel before the 4-wide loop
|
||||||
|
cmp id, 4
|
||||||
|
jge .loop16 ; at least 4 columns left
|
||||||
|
RET
|
||||||
|
|
||||||
;***********************************************************
|
;***********************************************************
|
||||||
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
|
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
|
||||||
; float (*in)[32][2],
|
; float (*in)[32][2],
|
||||||
|
@@ -44,6 +44,8 @@ void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2],
|
|||||||
int i, int len);
|
int i, int len);
|
||||||
void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
|
void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
|
||||||
int i, int len);
|
int i, int len);
|
||||||
|
void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2], float L[2][38][64],
|
||||||
|
int i, int len);
|
||||||
|
|
||||||
av_cold void ff_psdsp_init_x86(PSDSPContext *s)
|
av_cold void ff_psdsp_init_x86(PSDSPContext *s)
|
||||||
{
|
{
|
||||||
@@ -52,6 +54,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
|
|||||||
if (EXTERNAL_SSE(cpu_flags)) {
|
if (EXTERNAL_SSE(cpu_flags)) {
|
||||||
s->add_squares = ff_ps_add_squares_sse;
|
s->add_squares = ff_ps_add_squares_sse;
|
||||||
s->mul_pair_single = ff_ps_mul_pair_single_sse;
|
s->mul_pair_single = ff_ps_mul_pair_single_sse;
|
||||||
|
s->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_sse;
|
||||||
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
|
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
|
||||||
s->hybrid_analysis = ff_ps_hybrid_analysis_sse;
|
s->hybrid_analysis = ff_ps_hybrid_analysis_sse;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user