diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index b3a85a7cb9..5e0c438b9c 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -22,11 +22,10 @@
 ; based upon and compare.
 
 ; Intra-asm call convention:
-;       272 bytes of stack available
-;       First 10 GPRs available
+;       320 bytes of stack available
+;       14 GPRs available (last 4 must not be clobbered)
+;       Additionally, don't clobber ctx, in, out, len, lut
 ;       All vector regs available
-;       Don't clobber ctx, len, lut
-;       in and out must point to the end
 
 ; TODO:
 ;       carry over registers from smaller transforms to save on ~8 loads/stores
@@ -686,8 +685,6 @@ cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride
     movaps m0, [inq]
     FFT2 m0, m1
     movaps [outq], m0
-    add inq, mmsize*1
-    add outq, mmsize*1
     ret
 
 cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
@@ -721,8 +718,6 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
     movaps [outq + 1*mmsize], m0
 
 %if %3
-    add inq, mmsize*2
-    add outq, mmsize*2
     ret
 %else
     RET
@@ -764,8 +759,6 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
     movups [outq + 3*mmsize], m1
 
 %if %1
-    add inq, mmsize*4
-    add outq, mmsize*4
     ret
 %else
     RET
@@ -806,8 +799,6 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
     vextractf128 [outq + 16*3], m0, 1
 
 %if %1
-    add inq, mmsize*2
-    add outq, mmsize*2
     ret
 %else
     RET
@@ -857,8 +848,6 @@ cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
     vextractf128 [outq + 16*7], m1, 1
 
 %if %2
-    add inq, mmsize*4
-    add outq, mmsize*4
     ret
 %else
     RET
@@ -943,8 +932,6 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
     vextractf128 [outq + 16*15], m5, 1
 
 %if %2
-    add inq, mmsize*8
-    add outq, mmsize*8
     ret
 %else
     RET
@@ -1282,12 +1269,13 @@ FFT_SPLIT_RADIX_DEF 131072
     add outq, 8*mmsize
     add rtabq, 4*mmsize
     sub itabq, 4*mmsize
-    sub lenq, 4*mmsize
+    sub tgtq, 4*mmsize
     jg .synth_deinterleave
 
 %if %2
-    mov lenq, tgtq
-    add outq, offq
+    sub outq, tmpq
+    neg tmpq
+    lea inq, [inq + tmpq*4]
     ret
 %else
     RET
@@ -1369,7 +1357,7 @@ FFT_SPLIT_RADIX_DEF 131072
     vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1
 
 %if %2
-    add outq, 16*mmsize
+    sub inq, 16*mmsize
     ret
 %else
     RET