diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 4e448623af..e1120b7e15 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -23,7 +23,7 @@ OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o -OBJS-$(CONFIG_LPC) += x86/lpc.o +OBJS-$(CONFIG_LPC) += x86/lpc_init.o OBJS-$(CONFIG_MDCT15) += x86/mdct15_init.o OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o @@ -127,6 +127,7 @@ X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o X86ASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o X86ASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o X86ASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o +X86ASM-OBJS-$(CONFIG_LPC) += x86/lpc.o X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm new file mode 100644 index 0000000000..26101b4e25 --- /dev/null +++ b/libavcodec/x86/lpc.asm @@ -0,0 +1,241 @@ +;****************************************************************************** +;* Copyright (c) Lynne +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +one_tab: times 4 dq 1.0 +seq_tab_avx2: dq 3.0, 2.0, 1.0, 0.0 +sub_tab: dq -1.0, -2.0, -3.0, -4.0 +add_tab_avx2: times 4 dq 4.0 +dec_tab_avx2: times 4 dq -4.0 +add_tab_sse2: times 2 dq 2.0 +dec_tab_sse2: times 2 dq -2.0 +dec_tab_scalar: times 2 dq -1.0 +seq_tab_sse2: dq 1.0, 0.0 + +SECTION .text + +%macro APPLY_WELCH_FN 0 +cglobal lpc_apply_welch_window, 3, 5, 8, data, len, out, off1, off2 + cmp lenq, 0 + je .end + cmp lenq, 1 + je .one + + movapd m6, [one_tab] + + movd xm1, lend + cvtdq2pd xm1, xm1 ; len +%if cpuflag(avx2) + vbroadcastsd m1, xm1 +%else + shufpd m1, m1, 00b +%endif + + addpd m0, m6, m6 ; 2.0 + subpd m1, m6 ; len - 1 + divpd m0, m1 ; 2.0 / (len - 1) + + mov off1q, lenq + and off1q, 1 + je .even + + movapd m5, m0 + addpd m0, [sub_tab] + + lea off2q, [lenq*4 - mmsize/2] + sub lenq, mmsize/4 ; avoid overwriting + xor off1q, off1q + + cmp lenq, mmsize/4 + jl .scalar_o + +%if cpuflag(avx2) + movapd m7, [dec_tab_avx2] +%else + movapd m7, [dec_tab_sse2] +%endif + +.loop_o: + movapd m1, m6 + mulpd m2, m0, m0 + subpd m1, m2 +%if cpuflag(avx2) + vpermpd m2, m1, q0123 +%else + shufpd m2, m1, m1, 01b +%endif + + cvtdq2pd m3, [dataq + off1q] + cvtdq2pd m4, [dataq + off2q] + + mulpd m1, m3 + mulpd m2, m4 + + movupd [outq + off1q*2], m1 + movupd [outq + off2q*2], m2 + + addpd m0, m7 + add off1q, mmsize/2 + sub off2q, mmsize/2 + sub lenq, mmsize/4 + jg .loop_o + + add lend, (mmsize/4 - 1) + cmp lend, 0 + je .end + sub lenq, (mmsize/4 - 1) + +.scalar_o: + movapd xm7, [dec_tab_scalar] + subpd xm0, xm7 + + ; Set offsets + add off2q, (mmsize/4) + 4*cpuflag(avx2) + add lenq, mmsize/4 - 2 + +.loop_o_scalar: + movapd xm1, xm6 + mulpd xm2, xm0, xm0 + subpd xm1, xm2 + + cvtdq2pd xm3, [dataq + off1q - (mmsize/4) + 4*cpuflag(avx2)] + cvtdq2pd xm4, [dataq + off2q - (mmsize/4) + 4*cpuflag(avx2)] + + mulpd xm3, xm1 + mulpd xm4, xm1 + + movhpd [outq + off1q*2], xm3 + movhpd [outq + off2q*2], xm4 + + addpd xm0, xm7 + + add off1q, 4 + sub off2q, 4 + + sub lenq, 2 + jg .loop_o_scalar + RET + +.even: +%if cpuflag(avx2) + addpd m0, [seq_tab_avx2] +%else + addpd m0, [seq_tab_sse2] +%endif + + mov off1d, lend + shr off1d, 1 + movd xm1, off1d + cvtdq2pd xm1, xm1 ; len/2 +%if cpuflag(avx2) + vbroadcastsd m1, xm1 +%else + shufpd m1, m1, 00b +%endif + subpd m0, m1 + +%if cpuflag(avx2) + movapd m7, [add_tab_avx2] +%else + movapd m7, [add_tab_sse2] +%endif + + lea off2q, [lenq*2] + lea off1q, [lenq*2 - mmsize/2] + sub lenq, mmsize/4 + + cmp lenq, mmsize/4 + jl .scalar_e + +.loop_e: + movapd m1, m6 + mulpd m2, m0, m0 + subpd m1, m2 +%if cpuflag(avx2) + vpermpd m2, m1, q0123 +%else + shufpd m2, m1, m1, 01b +%endif + + cvtdq2pd m3, [dataq + off1q] + cvtdq2pd m4, [dataq + off2q] + + mulpd m1, m3 + mulpd m2, m4 + + movupd [outq + off1q*2], m1 + movupd [outq + off2q*2], m2 + + addpd m0, m7 + add off2q, mmsize/2 + sub off1q, mmsize/2 + sub lenq, mmsize/4 + jge .loop_e + +.scalar_e: + subpd m0, m7 + movapd m7, [dec_tab_scalar] + subpd m0, m7 + subpd m0, m7 + subpd m0, m7 + + add off1q, (mmsize/2) + sub off2q, (mmsize/2) - 4 - 8*cpuflag(avx2) + + addpd xm0, [sub_tab] + +.loop_e_scalar: + movapd xm1, xm6 + mulpd xm2, xm0, xm0 + subpd xm1, xm2 + + cvtdq2pd m3, [dataq + off1q - 4] + cvtdq2pd m4, [dataq + off2q - 4] + + mulpd m3, m1 + mulpd m4, m1 + + movhpd [outq + off1q*2], xm3 + movhpd [outq + off2q*2], xm4 + + subpd xm0, xm7 + + add off2q, 4 + sub off1q, 4 + jge .loop_e_scalar + RET + +.one: + xorpd xm0, xm0 + movhpd [outq], xm0 +.end: + RET +%endmacro + +INIT_XMM sse2 +APPLY_WELCH_FN + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +APPLY_WELCH_FN +%endif diff --git a/libavcodec/x86/lpc.c b/libavcodec/x86/lpc_init.c similarity index 64% rename from libavcodec/x86/lpc.c rename to libavcodec/x86/lpc_init.c index 40fc29fc0f..df77c966c6 100644 --- a/libavcodec/x86/lpc.c +++ b/libavcodec/x86/lpc_init.c @@ -20,65 +20,19 @@ */ #include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/mem_internal.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/lpc.h" +void ff_lpc_apply_welch_window_sse2(const int32_t *data, int len, + double *w_data); +void ff_lpc_apply_welch_window_avx2(const int32_t *data, int len, + double *w_data); + DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 }; -DECLARE_ASM_CONST(16, double, pd_2)[2] = { 2.0, 2.0 }; #if HAVE_SSE2_INLINE -static void lpc_apply_welch_window_sse2(const int32_t *data, int len, - double *w_data) -{ - double c = 2.0 / (len-1.0); - int n2 = len>>1; - x86_reg i = -n2*sizeof(int32_t); - x86_reg j = n2*sizeof(int32_t); - __asm__ volatile( - "movsd %4, %%xmm7 \n\t" - "movapd "MANGLE(pd_1)", %%xmm6 \n\t" - "movapd "MANGLE(pd_2)", %%xmm5 \n\t" - "movlhps %%xmm7, %%xmm7 \n\t" - "subpd %%xmm5, %%xmm7 \n\t" - "addsd %%xmm6, %%xmm7 \n\t" - "test $1, %5 \n\t" - "jz 2f \n\t" -#define WELCH(MOVPD, offset)\ - "1: \n\t"\ - "movapd %%xmm7, %%xmm1 \n\t"\ - "mulpd %%xmm1, %%xmm1 \n\t"\ - "movapd %%xmm6, %%xmm0 \n\t"\ - "subpd %%xmm1, %%xmm0 \n\t"\ - "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\ - "cvtpi2pd (%3,%0), %%xmm2 \n\t"\ - "cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\ - "mulpd %%xmm0, %%xmm2 \n\t"\ - "mulpd %%xmm1, %%xmm3 \n\t"\ - "movapd %%xmm2, (%2,%0,2) \n\t"\ - MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\ - "subpd %%xmm5, %%xmm7 \n\t"\ - "sub $8, %1 \n\t"\ - "add $8, %0 \n\t"\ - "jl 1b \n\t"\ - - WELCH("movupd", -1) - "jmp 3f \n\t" - "2: \n\t" - WELCH("movapd", -2) - "3: \n\t" - :"+&r"(i), "+&r"(j) - :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len) - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm5", "%xmm6", "%xmm7") - ); -#undef WELCH -} - static void lpc_compute_autocorr_sse2(const double *data, int len, int lag, double *autoc) { @@ -151,12 +105,16 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag, av_cold void ff_lpc_init_x86(LPCContext *c) { -#if HAVE_SSE2_INLINE int cpu_flags = av_get_cpu_flags(); - if (INLINE_SSE2_SLOW(cpu_flags)) { - c->lpc_apply_welch_window = lpc_apply_welch_window_sse2; - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; - } -#endif /* HAVE_SSE2_INLINE */ +#if HAVE_SSE2_INLINE + if (INLINE_SSE2_SLOW(cpu_flags)) + c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; +#endif + + if (EXTERNAL_SSE2(cpu_flags)) + c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2; + + if (EXTERNAL_AVX2(cpu_flags)) + c->lpc_apply_welch_window = ff_lpc_apply_welch_window_avx2; } diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index ac02670e64..f330d3a8ab 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -11,6 +11,7 @@ AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o +AVCODECOBJS-$(CONFIG_LPC) += lpc.o AVCODECOBJS-$(CONFIG_ME_CMP) += motion.o AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 6b4a0f22b2..8fd9bba0b0 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -135,6 +135,9 @@ static const struct { #if CONFIG_LLVIDENCDSP { "llviddspenc", checkasm_check_llviddspenc }, #endif + #if CONFIG_LPC + { "lpc", checkasm_check_lpc }, + #endif #if CONFIG_ME_CMP { "motion", checkasm_check_motion }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 171dd06b47..97e909170f 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -68,6 +68,7 @@ void checkasm_check_idctdsp(void); void checkasm_check_jpeg2000dsp(void); void checkasm_check_llviddsp(void); void checkasm_check_llviddspenc(void); +void checkasm_check_lpc(void); void checkasm_check_motion(void); void checkasm_check_nlmeans(void); void checkasm_check_opusdsp(void); diff --git a/tests/checkasm/lpc.c b/tests/checkasm/lpc.c new file mode 100644 index 0000000000..b68ce05bfa --- /dev/null +++ b/tests/checkasm/lpc.c @@ -0,0 +1,80 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "libavutil/mem_internal.h" + +#include "libavcodec/lpc.h" + +#include "checkasm.h" + +#define randomize_int32(buf, len) \ + do { \ + for (int i = 0; i < len; i++) { \ + int32_t f = rnd() >> 8; \ + buf[i] = f; \ + } \ + } while (0) + +#define EPS 0.005 + +static void test_window(int len) +{ + LOCAL_ALIGNED(16, int32_t, src, [5000]); + LOCAL_ALIGNED(16, double, dst0, [5000]); + LOCAL_ALIGNED(16, double, dst1, [5000]); + + declare_func(void, int32_t *in, int len, double *out); + + randomize_int32(src, len); + + call_ref(src, len, dst0); + call_new(src, len, dst1); + + if (!double_near_abs_eps_array(dst0, dst1, EPS, len)) + fail(); + + bench_new(src, len, dst1); +} + +void checkasm_check_lpc(void) +{ + LPCContext ctx; + ff_lpc_init(&ctx, 32, 16, FF_LPC_TYPE_DEFAULT); + + if (check_func(ctx.lpc_apply_welch_window, "apply_welch_window_even")) { + for (int i = 0; i < 64; i += 2) + test_window(i); + } + report("apply_welch_window_even"); + + if (check_func(ctx.lpc_apply_welch_window, "apply_welch_window_odd")) { + for (int i = 1; i < 64; i += 2) + test_window(i); + } + report("apply_welch_window_odd"); + + if (check_func(ctx.lpc_apply_welch_window, "apply_welch_window_4096")) + test_window(4096); + report("apply_welch_window_4096"); + + if (check_func(ctx.lpc_apply_welch_window, "apply_welch_window_4097")) + test_window(4097); + report("apply_welch_window_4097"); + + ff_lpc_end(&ctx); +}