From be3809a521fecfd3a61db99d660f243bd32b30bb Mon Sep 17 00:00:00 2001 From: James Almer Date: Fri, 2 Jun 2017 19:17:28 -0300 Subject: [PATCH] x86/aacpsdsp: optimize ff_ps_stereo_interpolate_sse3 Move the unpacking outside of the loop. 5% to 10% faster. Suggested-by: ubitux Signed-off-by: James Almer --- libavcodec/x86/aacpsdsp.asm | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm index bb8a7f5df0..4548bb4257 100644 --- a/libavcodec/x86/aacpsdsp.asm +++ b/libavcodec/x86/aacpsdsp.asm @@ -93,6 +93,10 @@ cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n movaps m1, [h_stepq] cmp nd, 0 jle .ret + unpcklps m4, m0, m0 + unpckhps m0, m0 + unpcklps m5, m1, m1 + unpckhps m1, m1 shl nd, 3 add lq, nq add rq, nq @@ -100,15 +104,12 @@ cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n align 16 .loop: + addps m4, m5 addps m0, m1 movddup m2, [lq+nq] movddup m3, [rq+nq] - movaps m4, m0 - movaps m5, m0 - unpcklps m4, m4 - unpckhps m5, m5 mulps m2, m4 - mulps m3, m5 + mulps m3, m0 addps m2, m3 movsd [lq+nq], m2 movhps [rq+nq], m2