From 554d8190624f25cefe079bd7b9ad61a2ade8541a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20B=C5=93sch?= Date: Sat, 23 Aug 2014 20:03:10 +0200 Subject: [PATCH] avutil/pixelutils: faster pixelutils_sad_16x16 501 to 439 decicycles. See 45c7f3997ea11c3d1007b2126b1c0049a8c27105. --- libavutil/x86/pixelutils.asm | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm index 15213d92d8..7522f24a42 100644 --- a/libavutil/x86/pixelutils.asm +++ b/libavutil/x86/pixelutils.asm @@ -109,18 +109,24 @@ cglobal pixelutils_sad_16x16, 4,4,0, src1, stride1, src2, stride2 ;------------------------------------------------------------------------------- INIT_XMM sse2 cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2 - pxor m4, m4 -%rep 8 - movu m0, [src1q] - movu m1, [src1q + stride1q] + movu m4, [src1q] movu m2, [src2q] + movu m1, [src1q + stride1q] + movu m3, [src2q + stride2q] + psadbw m4, m2 + psadbw m1, m3 + paddw m4, m1 +%rep 7 + lea src1q, [src1q + 2*stride1q] + lea src2q, [src2q + 2*stride2q] + movu m0, [src1q] + movu m2, [src2q] + movu m1, [src1q + stride1q] movu m3, [src2q + stride2q] psadbw m0, m2 psadbw m1, m3 paddw m4, m0 paddw m4, m1 - lea src1q, [src1q + 2*stride1q] - lea src2q, [src2q + 2*stride2q] %endrep movhlps m0, m4 paddw m4, m0