avcodec/x86/lossless_videodsp : add avx2 version for add_left_pred
This commit is contained in:
parent
cfbcea1cca
commit
4353c35067
@ -114,40 +114,54 @@ MEDIAN_PRED
|
||||
add dstq, wq
|
||||
neg wq
|
||||
%%.loop:
|
||||
pshufb xm0, xm5
|
||||
%if %2
|
||||
mova m1, [srcq+wq]
|
||||
%else
|
||||
movu m1, [srcq+wq]
|
||||
%endif
|
||||
mova m2, m1
|
||||
psllw m1, 8
|
||||
psllw m2, m1, 8
|
||||
paddb m1, m2
|
||||
mova m2, m1
|
||||
pshufb m1, m3
|
||||
pshufb m2, m1, m3
|
||||
paddb m1, m2
|
||||
pshufb m0, m5
|
||||
mova m2, m1
|
||||
pshufb m1, m4
|
||||
pshufb m2, m1, m4
|
||||
paddb m1, m2
|
||||
%if mmsize == 16
|
||||
mova m2, m1
|
||||
pshufb m1, m6
|
||||
%if mmsize >= 16
|
||||
pshufb m2, m1, m6
|
||||
paddb m1, m2
|
||||
%endif
|
||||
paddb m0, m1
|
||||
paddb xm0, xm1
|
||||
%if %1
|
||||
mova [dstq+wq], m0
|
||||
mova [dstq+wq], xm0
|
||||
%else
|
||||
movq [dstq+wq], m0
|
||||
movhps [dstq+wq+8], m0
|
||||
movq [dstq+wq], xm0
|
||||
movhps [dstq+wq+8], xm0
|
||||
%endif
|
||||
|
||||
%if mmsize == 32
|
||||
vextracti128 xm2, m1, 1 ; get second lane of the ymm
|
||||
pshufb xm0, xm5 ; set alls val to last val of the first lane
|
||||
paddb xm0, xm2
|
||||
;store val
|
||||
%if %1
|
||||
mova [dstq+wq+16], xm0
|
||||
%else;
|
||||
movq [dstq+wq+16], xm0
|
||||
movhps [dstq+wq+16+8], xm0
|
||||
%endif
|
||||
%endif
|
||||
add wq, mmsize
|
||||
jl %%.loop
|
||||
%if mmsize == 32
|
||||
mov eax, [dstq -1]
|
||||
and eax, 0xff
|
||||
%else;
|
||||
mov eax, mmsize-1
|
||||
sub eax, wd
|
||||
movd m1, eax
|
||||
pshufb m0, m1
|
||||
movd eax, m0
|
||||
%endif
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
@ -166,15 +180,15 @@ cglobal add_left_pred, 3,3,7, dst, src, w, left
|
||||
|
||||
%macro ADD_LEFT_PRED_UNALIGNED 0
|
||||
cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
|
||||
mova m5, [pb_15]
|
||||
mova m6, [pb_zzzzzzzz77777777]
|
||||
mova m4, [pb_zzzz3333zzzzbbbb]
|
||||
mova m3, [pb_zz11zz55zz99zzdd]
|
||||
movd m0, leftm
|
||||
pslldq m0, 15
|
||||
test srcq, 15
|
||||
mova xm5, [pb_15]
|
||||
VBROADCASTI128 m6, [pb_zzzzzzzz77777777]
|
||||
VBROADCASTI128 m4, [pb_zzzz3333zzzzbbbb]
|
||||
VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd]
|
||||
movd xm0, leftm
|
||||
pslldq xm0, 15
|
||||
test srcq, mmsize - 1
|
||||
jnz .src_unaligned
|
||||
test dstq, 15
|
||||
test dstq, mmsize - 1
|
||||
jnz .dst_unaligned
|
||||
ADD_LEFT_LOOP 1, 1
|
||||
.dst_unaligned:
|
||||
@ -186,6 +200,11 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
|
||||
INIT_XMM ssse3
|
||||
ADD_LEFT_PRED_UNALIGNED
|
||||
|
||||
%if HAVE_AVX2_EXTERNAL
|
||||
INIT_YMM avx2
|
||||
ADD_LEFT_PRED_UNALIGNED
|
||||
%endif
|
||||
|
||||
;------------------------------------------------------------------------------
|
||||
; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
|
||||
;------------------------------------------------------------------------------
|
||||
|
@ -38,6 +38,8 @@ int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t w, int left);
|
||||
int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t w, int left);
|
||||
int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t w, int left);
|
||||
|
||||
int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
|
||||
int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
|
||||
@ -118,5 +120,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
|
||||
}
|
||||
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
||||
c->add_bytes = ff_add_bytes_avx2;
|
||||
c->add_left_pred = ff_add_left_pred_unaligned_avx2;
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user