x86/tx_float: add asm call versions of the 2pt and 4pt transforms
Verified to be working.
This commit is contained in:
@@ -682,15 +682,27 @@ SECTION .text
|
|||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
INIT_XMM sse3
|
INIT_XMM sse3
|
||||||
|
; fft2_asm_float: the "asm call" version of the 2-point transform added by
; this change. It performs the same load / FFT2 / store sequence as
; fft2_float, then advances both pointers and returns with a bare `ret`.
; All three cglobal counts are 0; ctx/out/in/stride only give names to the
; argument registers (see x86inc's cglobal description).
; NOTE(review): presumably the calling assembly already holds valid pointers
; in inq/outq when it branches here -- confirm against the call sites.
cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride
|
||||||
|
; Load one full vector from the input; movaps needs an aligned address.
movaps m0, [inq]
|
||||||
|
; FFT2 is a macro defined outside this hunk; m1 is handed to it as its
; second operand (presumably scratch -- verify in the macro definition).
FFT2 m0, m1
|
||||||
|
; Store the result vector to the output.
movaps [outq], m0
|
||||||
|
; Step both pointers forward by one vector (mmsize bytes). fft2_float does
; not do this. NOTE(review): looks like the caller relies on the updated
; pointers to continue with the following data -- confirm.
add inq, mmsize*1
|
||||||
|
add outq, mmsize*1
|
||||||
|
; Lowercase `ret`, not the RET macro used by fft2_float.
ret
|
||||||
|
|
||||||
; fft2_float: 2-point transform declared with 4 arguments, 4 general-purpose
; registers and 2 vector registers (ctx, out, in, stride). Only `in` and
; `out` are referenced in the body. Every line of this function appears
; twice below because the diff rendering shows the unchanged code in both
; the old and the new column.
cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
|
cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
|
||||||
; Load one vector of input (aligned access).
movaps m0, [inq]
|
movaps m0, [inq]
|
||||||
; FFT2 is defined outside this hunk; m1 is its second operand.
FFT2 m0, m1
|
FFT2 m0, m1
|
||||||
; Write the transformed vector to the output.
movaps [outq], m0
|
movaps [outq], m0
|
||||||
; Return through the RET macro; unlike fft2_asm_float, the pointers are left
; unmodified.
RET
|
RET
|
||||||
|
|
||||||
%macro FFT4 2
|
%macro FFT4_FN 3
|
||||||
INIT_XMM sse2
|
INIT_XMM sse2
|
||||||
|
%if %3
|
||||||
|
cglobal fft4_ %+ %1 %+ _asm_float, 0, 0, 0, ctx, out, in, stride
|
||||||
|
%else
|
||||||
cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
|
cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
|
||||||
|
%endif
|
||||||
movaps m0, [inq + 0*mmsize]
|
movaps m0, [inq + 0*mmsize]
|
||||||
movaps m1, [inq + 1*mmsize]
|
movaps m1, [inq + 1*mmsize]
|
||||||
|
|
||||||
@@ -708,11 +720,19 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
|
|||||||
movaps [outq + 0*mmsize], m2
|
movaps [outq + 0*mmsize], m2
|
||||||
movaps [outq + 1*mmsize], m0
|
movaps [outq + 1*mmsize], m0
|
||||||
|
|
||||||
|
%if %3
|
||||||
|
add inq, mmsize*2
|
||||||
|
add outq, mmsize*2
|
||||||
|
ret
|
||||||
|
%else
|
||||||
RET
|
RET
|
||||||
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
; Instantiations of the 4-point transform. In this side-by-side rendering
; the two-argument `FFT4 ...` lines are the pre-change form and the
; three-argument `FFT4_FN ...` lines are the form after the change.
;   %1  is pasted into the symbol name: fft4_<%1>_float or
;       fft4_<%1>_asm_float.
;   %2  is consumed in a part of the macro body that is not shown here.
;       NOTE(review): presumably an inverse-transform flag (0 with fwd,
;       1 with inv) -- confirm in the macro body.
;   %3  non-zero selects the asm-call variant: cglobal counts of 0, in/out
;       pointers advanced by mmsize*2, and a bare `ret` instead of RET.
FFT4 fwd, 0
|
FFT4_FN fwd, 0, 0
|
||||||
FFT4 inv, 1
|
FFT4_FN fwd, 0, 1
|
||||||
|
FFT4_FN inv, 1, 0
|
||||||
|
FFT4_FN inv, 1, 1
|
||||||
|
|
||||||
%macro FFT8_SSE_FN 1
|
%macro FFT8_SSE_FN 1
|
||||||
INIT_XMM sse3
|
INIT_XMM sse3
|
||||||
|
|||||||
@@ -45,6 +45,9 @@ TX_DECL_FN(fft_sr_ns, avx2)
|
|||||||
|
|
||||||
TX_DECL_FN(mdct_sr_inv, avx2)
|
TX_DECL_FN(mdct_sr_inv, avx2)
|
||||||
|
|
||||||
|
/* Declarations for the asm-call variants added by this change. The
 * name / instruction-set pairs line up with the assembly side:
 * fft2_asm_float is emitted under INIT_XMM sse3, and the
 * fft4_<fwd|inv>_asm_float functions under INIT_XMM sse2.
 * NOTE(review): TX_DECL_FN is defined outside this hunk; presumably it
 * expands to the prototype of the matching assembly symbol -- confirm. */
TX_DECL_FN(fft2_asm, sse3)
|
||||||
|
TX_DECL_FN(fft4_fwd_asm, sse2)
|
||||||
|
TX_DECL_FN(fft4_inv_asm, sse2)
|
||||||
TX_DECL_FN(fft8_asm, sse3)
|
TX_DECL_FN(fft8_asm, sse3)
|
||||||
TX_DECL_FN(fft8_asm, avx)
|
TX_DECL_FN(fft8_asm, avx)
|
||||||
TX_DECL_FN(fft16_asm, avx)
|
TX_DECL_FN(fft16_asm, avx)
|
||||||
@@ -101,8 +104,14 @@ static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
|
|||||||
|
|
||||||
const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
|
const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
|
||||||
TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, sse3, SSE3, AV_TX_INPLACE, 0),
|
TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, sse3, SSE3, AV_TX_INPLACE, 0),
|
||||||
|
TX_DEF(fft2_asm, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3,
|
||||||
|
AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0),
|
||||||
TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
||||||
TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0),
|
TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0),
|
||||||
|
TX_DEF(fft4_fwd_asm, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2,
|
||||||
|
AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0),
|
||||||
|
TX_DEF(fft4_inv_asm, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2,
|
||||||
|
AV_TX_INPLACE | FF_TX_INVERSE_ONLY | FF_TX_ASM_CALL, 0),
|
||||||
TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
||||||
TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0),
|
TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0),
|
||||||
TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0),
|
TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0),
|
||||||
|
|||||||
Reference in New Issue
Block a user