From 58dabf7bf2fdd08f79173da0df613127ff783028 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Reimar=20D=C3=B6ffinger?= Date: Fri, 3 Feb 2012 21:23:49 +0100 Subject: [PATCH] Fix png decoding on x86. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Line sizes are only 8-byte aligned, so use unaligned loads for add_bytes_l2 pointers. Increasing the alignment requirement to 16 seemed a bit extreme (png may be used for rather small sizes). Also fix a mov that had its arguments swapped, leading to add_bytes_l2 being applied on up to 8 bytes too few. Signed-off-by: Reimar Döffinger --- libavcodec/pngdsp.h | 4 ++-- libavcodec/x86/pngdsp.asm | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libavcodec/pngdsp.h b/libavcodec/pngdsp.h index 98d29a8a2f..f89a93a45a 100644 --- a/libavcodec/pngdsp.h +++ b/libavcodec/pngdsp.h @@ -26,8 +26,8 @@ typedef struct PNGDSPContext { void (*add_bytes_l2)(uint8_t *dst /* align 16 */, - uint8_t *src1 /* align 16 */, - uint8_t *src2 /* align 16 */, int w); + uint8_t *src1, + uint8_t *src2, int w); /* this might write to dst[w] */ void (*add_paeth_prediction)(uint8_t *dst, uint8_t *src, diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm index f3ec717177..9c588a97e9 100644 --- a/libavcodec/x86/pngdsp.asm +++ b/libavcodec/x86/pngdsp.asm @@ -43,12 +43,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i and waq, ~(mmsize*2-1) jmp .end_v .loop_v: - mova m0, [src1q+iq] - mova m1, [src1q+iq+mmsize] - paddb m0, [src2q+iq] - paddb m1, [src2q+iq+mmsize] - mova [dstq+iq ], m0 - mova [dstq+iq+mmsize], m1 + movu m0, [src2q+iq] + movu m1, [src2q+iq+mmsize] + paddb m0, [src1q+iq] + paddb m1, [src1q+iq+mmsize] + movu [dstq+iq ], m0 + movu [dstq+iq+mmsize], m1 add iq, mmsize*2 .end_v: cmp iq, waq @@ -56,12 +56,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i %if mmsize == 16 ; vector loop - mov wq, waq + mov waq, wq and waq, ~7 jmp .end_l .loop_l: - 
movq mm0, [src1q+iq] - paddb mm0, [src2q+iq] + movq mm0, [src2q+iq] + paddb mm0, [src1q+iq] movq [dstq+iq ], mm0 add iq, 8 .end_l: