x86/tx_float: mark AVX2 functions as AVXSLOW
Makes Bulldozer prefer the AVX functions over the AVX2 ones, which are 64% slower there. AVX: 117653 decicycles in av_tx (fft), 1048535 runs, 41 skips; AVX2: 193385 decicycles in av_tx (fft), 1048561 runs, 15 skips. The only difference between the two is that vgatherdpd is used in the AVX2 version. We don't want to mark them with the new SLOW_GATHER flag, however, since gathers are still faster than plain loads on Haswell/Zen 2/3.
This commit is contained in:
parent
7e35e0224c
commit
35080149ef
@ -100,11 +100,11 @@ DECL_CD_DEF(fft_sr_ns_float_avx, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i
|
||||
|
||||
#if HAVE_AVX2_EXTERNAL
|
||||
DECL_CD_DEF(fft_sr_float_avx2, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
|
||||
288, AV_CPU_FLAG_AVX2,
|
||||
288, AV_CPU_FLAG_AVX2 | AV_CPU_FLAG_AVXSLOW,
|
||||
FF_TX_OUT_OF_PLACE)
|
||||
|
||||
DECL_CD_DEF(fft_sr_ns_float_avx2, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
|
||||
352, AV_CPU_FLAG_AVX2,
|
||||
352, AV_CPU_FLAG_AVX2 | AV_CPU_FLAG_AVXSLOW,
|
||||
FF_TX_OUT_OF_PLACE | AV_TX_INPLACE | FF_TX_PRESHUFFLE)
|
||||
#endif
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user