diff --git a/libavutil/aarch64/tx_float_init.c b/libavutil/aarch64/tx_float_init.c index e7b73b4bf9..8300472c4c 100644 --- a/libavutil/aarch64/tx_float_init.c +++ b/libavutil/aarch64/tx_float_init.c @@ -37,12 +37,11 @@ static av_cold int neon_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale) { - const int inv_lookup = opts ? opts->invert_lookup : 1; ff_tx_init_tabs_float(len); if (cd->max_len == 2) - return ff_tx_gen_ptwo_revtab(s, inv_lookup); + return ff_tx_gen_ptwo_revtab(s, opts); else - return ff_tx_gen_split_radix_parity_revtab(s, len, inv, inv_lookup, 8, 0); + return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, 8, 0); } const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = { diff --git a/libavutil/tx.c b/libavutil/tx.c index ff81d235ba..8027e983ba 100644 --- a/libavutil/tx.c +++ b/libavutil/tx.c @@ -39,11 +39,41 @@ static av_always_inline int mulinv(int n, int m) return 0; } +int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts, + int d1, int d2) +{ + const int sl = d1*d2; + + s->map = av_malloc(s->len*sizeof(*s->map)); + if (!s->map) + return AVERROR(ENOMEM); + + for (int k = 0; k < s->len; k += sl) { + if (s->inv || (opts && opts->map_dir == FF_TX_MAP_SCATTER)) { + for (int m = 0; m < d2; m++) + for (int n = 0; n < d1; n++) + s->map[k + ((m*d1 + n*d2) % (sl))] = m*d1 + n; + } else { + for (int m = 0; m < d2; m++) + for (int n = 0; n < d1; n++) + s->map[k + m*d1 + n] = (m*d1 + n*d2) % (sl); + } + + if (s->inv) + for (int w = 1; w <= ((sl) >> 1); w++) + FFSWAP(int, s->map[k + w], s->map[k + sl - w]); + } + + s->map_dir = opts ? 
opts->map_dir : FF_TX_MAP_GATHER; + + return 0; +} + /* Guaranteed to work for any n, m where gcd(n, m) == 1 */ -int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m) +int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts, + int inv, int n, int m) { int *in_map, *out_map; - const int inv = s->inv; const int len = n*m; /* Will not be equal to s->len for MDCTs */ int m_inv, n_inv; @@ -61,14 +91,22 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m) out_map = s->map + len; /* Ruritanian map for input, CRT map for output, can be swapped */ - for (int j = 0; j < m; j++) { - for (int i = 0; i < n; i++) { - in_map[j*n + i] = (i*m + j*n) % len; - out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j; + if (opts && opts->map_dir == FF_TX_MAP_SCATTER) { + for (int j = 0; j < m; j++) { + for (int i = 0; i < n; i++) { + in_map[(i*m + j*n) % len] = j*n + i; + out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j; + } + } + } else { + for (int j = 0; j < m; j++) { + for (int i = 0; i < n; i++) { + in_map[j*n + i] = (i*m + j*n) % len; + out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j; + } } } - /* Change transform direction by reversing all ACs */ if (inv) { for (int i = 0; i < m; i++) { int *in = &in_map[i*n + 1]; /* Skip the DC */ @@ -77,17 +115,7 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m) } } - /* Our 15-point transform is also a compound one, so embed its input map */ - if (n == 15) { - for (int k = 0; k < m; k++) { - int tmp[15]; - memcpy(tmp, &in_map[k*15], 15*sizeof(*tmp)); - for (int i = 0; i < 5; i++) { - for (int j = 0; j < 3; j++) - in_map[k*15 + i*3 + j] = tmp[(i*3 + j*5) % 15]; - } - } - } + s->map_dir = opts ? 
opts->map_dir : FF_TX_MAP_GATHER; return 0; } @@ -103,21 +131,23 @@ static inline int split_radix_permutation(int i, int len, int inv) return split_radix_permutation(i, len, inv) * 4 + 1 - 2*(!(i & len) ^ inv); } -int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup) +int ff_tx_gen_ptwo_revtab(AVTXContext *s, FFTXCodeletOptions *opts) { int len = s->len; if (!(s->map = av_malloc(len*sizeof(*s->map)))) return AVERROR(ENOMEM); - if (invert_lookup) { - for (int i = 0; i < s->len; i++) - s->map[i] = -split_radix_permutation(i, len, s->inv) & (len - 1); - } else { + if (opts && opts->map_dir == FF_TX_MAP_SCATTER) { for (int i = 0; i < s->len; i++) s->map[-split_radix_permutation(i, len, s->inv) & (len - 1)] = i; + } else { + for (int i = 0; i < s->len; i++) + s->map[i] = -split_radix_permutation(i, len, s->inv) & (len - 1); } + s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER; + return 0; } @@ -207,7 +237,8 @@ static void parity_revtab_generator(int *revtab, int n, int inv, int offset, } int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int len, int inv, - int inv_lookup, int basis, int dual_stride) + FFTXCodeletOptions *opts, + int basis, int dual_stride) { basis >>= 1; if (len < basis) @@ -220,7 +251,10 @@ int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int len, int inv, av_assert0(dual_stride <= basis); parity_revtab_generator(s->map, len, inv, 0, 0, 0, len, - basis, dual_stride, inv_lookup != 0); + basis, dual_stride, + opts ? opts->map_dir == FF_TX_MAP_GATHER : FF_TX_MAP_GATHER); + + s->map_dir = opts ? 
opts->map_dir : FF_TX_MAP_GATHER; return 0; } @@ -656,6 +690,33 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type, ret = cd->init(sctx, cd, flags, opts, len, inv, scale); if (ret >= 0) { + if (opts && opts->map_dir != FF_TX_MAP_NONE && + sctx->map_dir == FF_TX_MAP_NONE) { + /* If a specific map direction was requested, and it doesn't + * exist, create one.*/ + sctx->map = av_malloc(len*sizeof(*sctx->map)); + if (!sctx->map) { + ret = AVERROR(ENOMEM); + goto end; + } + + for (int i = 0; i < len; i++) + sctx->map[i] = i; + } else if (opts && (opts->map_dir != sctx->map_dir)) { + int *tmp = av_malloc(len*sizeof(*sctx->map)); + if (!tmp) { + ret = AVERROR(ENOMEM); + goto end; + } + + memcpy(tmp, sctx->map, len*sizeof(*sctx->map)); + + for (int i = 0; i < len; i++) + sctx->map[tmp[i]] = i; + + av_free(tmp); /* tmp comes from av_malloc(), so it must be released with av_free(), not free() */ + } + s->nb_sub++; goto end; } diff --git a/libavutil/tx_priv.h b/libavutil/tx_priv.h index 80d045f6af..207f79dfb8 100644 --- a/libavutil/tx_priv.h +++ b/libavutil/tx_priv.h @@ -158,10 +158,23 @@ typedef enum FFTXCodeletPriority { FF_TX_PRIO_MAX = 32768, /* For custom implementations/ASICs */ } FFTXCodeletPriority; +typedef enum FFTXMapDirection { + /* No map. Make a map up. */ + FF_TX_MAP_NONE = 0, + + /* Lookup table must be applied via dst[i] = src[lut[i]]; */ + FF_TX_MAP_GATHER, + + /* Lookup table must be applied via dst[lut[i]] = src[i]; */ + FF_TX_MAP_SCATTER, +} FFTXMapDirection; + /* Codelet options */ typedef struct FFTXCodeletOptions { - int invert_lookup; /* If codelet is flagged as FF_TX_CODELET_PRESHUFFLE, - invert the lookup direction for the map generated */ + /* Request a specific lookup table direction. Codelets MUST put the + * direction in AVTXContext. If the codelet does not respect this, a + * conversion will be performed. */ + FFTXMapDirection map_dir; } FFTXCodeletOptions; /* Maximum number of factors a codelet may have. Arbitrary. 
*/ @@ -234,11 +247,32 @@ struct AVTXContext { enum AVTXType type; /* Type of transform */ uint64_t flags; /* A combination of AVTXFlags and * codelet flags used when creating */ + FFTXMapDirection map_dir; /* Direction of AVTXContext->map */ float scale_f; double scale_d; void *opaque; /* Free to use by implementations */ }; +/* This function embeds a Ruritanian PFA input map into an existing lookup table + * to avoid double permutation. This allows for compound factors to be + * synthesized as fast PFA FFTs and embedded into either other or standalone + * transforms. + * The output CRT map must still be pre-baked into the transform. */ +#define TX_EMBED_INPUT_PFA_MAP(map, tot_len, d1, d2) \ + do { \ + int mtmp[(d1)*(d2)]; \ + for (int k = 0; k < tot_len; k += (d1)*(d2)) { \ + memcpy(mtmp, &map[k], (d1)*(d2)*sizeof(*mtmp)); \ + for (int m = 0; m < (d2); m++) \ + for (int n = 0; n < (d1); n++) \ + map[k + m*(d1) + n] = mtmp[(m*(d1) + n*(d2)) % ((d1)*(d2))]; \ + } \ + } while (0) + +/* This function generates a Ruritanian PFA input map into s->map. */ +int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts, + int d1, int d2); + /* Create a subtransform in the current context with the given parameters. * The flags parameter from FFTXCodelet.init() should be preserved as much * as that's possible. @@ -250,11 +284,18 @@ int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type, /* Clear the context by freeing all tables, maps and subtransforms. */ void ff_tx_clear_ctx(AVTXContext *s); +/* Generate a default map (0->len or 0, (len-1)->1 for inverse transforms) + * for a context. */ +int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts); + /* * Generates the PFA permutation table into AVTXContext->pfatab. The end table * is appended to the start table. + * The `inv` flag should only be enabled if the lookup tables of subtransforms + * won't get flattened. 
*/ -int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m); +int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts, + int inv, int n, int m); /* * Generates a standard-ish (slightly modified) Split-Radix revtab into @@ -262,7 +303,7 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m); * If it's set to 0, it has to be applied like out[map[i]] = in[i], otherwise * if it's set to 1, has to be applied as out[i] = in[map[i]] */ -int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup); +int ff_tx_gen_ptwo_revtab(AVTXContext *s, FFTXCodeletOptions *opts); /* * Generates an index into AVTXContext->inplace_idx that if followed in the @@ -303,7 +344,8 @@ int ff_tx_gen_inplace_map(AVTXContext *s, int len); * to out[i] = src[map[i]]. */ int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int len, int inv, - int inv_lookup, int basis, int dual_stride); + FFTXCodeletOptions *opts, + int basis, int dual_stride); /* Typed init function to initialize shared tables. Will initialize all tables * for all factors of a length. */ diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c index c157719d73..38ab517f66 100644 --- a/libavutil/tx_template.c +++ b/libavutil/tx_template.c @@ -479,30 +479,15 @@ static av_cold int TX_NAME(ff_tx_fft_factor_init)(AVTXContext *s, int len, int inv, const void *scale) { + int ret = 0; TX_TAB(ff_tx_init_tabs)(len); - if (flags & FF_TX_PRESHUFFLE) { - s->map = av_malloc(len*sizeof(s->map)); - s->map[0] = 0; /* DC is always at the start */ - if (inv) /* Reversing the ACs flips the transform direction */ - for (int i = 1; i < len; i++) - s->map[i] = len - i; - else - for (int i = 1; i < len; i++) - s->map[i] = i; - } + if (len == 15) + ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5); + else if (flags & FF_TX_PRESHUFFLE) + ret = ff_tx_gen_default_map(s, opts); - /* Our 15-point transform is actually a 5x3 PFA, so embed its input map. 
*/ - if (len == 15) { - int tmp[15]; - memcpy(tmp, s->map, 15*sizeof(*tmp)); - for (int i = 0; i < 5; i++) { - for (int j = 0; j < 3; j++) - s->map[i*3 + j] = tmp[(i*3 + j*5) % 15]; - } - } - - return 0; + return ret; } #define DECL_FACTOR_S(n) \ @@ -605,7 +590,7 @@ static av_cold int TX_NAME(ff_tx_fft_sr_codelet_init)(AVTXContext *s, const void *scale) { TX_TAB(ff_tx_init_tabs)(len); - return ff_tx_gen_ptwo_revtab(s, opts ? opts->invert_lookup : 1); + return ff_tx_gen_ptwo_revtab(s, opts); } #define DECL_SR_CODELET_DEF(n) \ @@ -742,7 +727,9 @@ static av_cold int TX_NAME(ff_tx_fft_init)(AVTXContext *s, { int ret; int is_inplace = !!(flags & AV_TX_INPLACE); - FFTXCodeletOptions sub_opts = { .invert_lookup = !is_inplace }; + FFTXCodeletOptions sub_opts = { + .map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER, + }; flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ flags |= AV_TX_INPLACE; /* in-place */ @@ -974,7 +961,9 @@ static av_cold int TX_NAME(ff_tx_fft_pfa_init)(AVTXContext *s, sub_len, inv, scale))) return ret; - if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len))) + /* Generate PFA map */ + if ((ret = ff_tx_gen_compound_mapping(s, opts, 0, + cd->factors[0], sub_len))) return ret; if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) @@ -1128,7 +1117,9 @@ static av_cold int TX_NAME(ff_tx_mdct_init)(AVTXContext *s, const void *scale) { int ret; - FFTXCodeletOptions sub_opts = { .invert_lookup = inv }; + FFTXCodeletOptions sub_opts = { + .map_dir = !inv ? 
FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER, + }; s->scale_d = *((SCALE_TYPE *)scale); s->scale_f = s->scale_d; @@ -1328,7 +1319,7 @@ static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s, const void *scale) { int ret, sub_len; - FFTXCodeletOptions sub_opts = { .invert_lookup = 0 }; + FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER }; len >>= 1; sub_len = len / cd->factors[0]; @@ -1344,9 +1335,13 @@ static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s, sub_len, inv, scale))) return ret; - if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len))) + if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len))) return ret; + /* Our 15-point transform is also a compound one, so embed its input map */ + if (cd->factors[0] == 15) + TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5); + if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL))) return ret; diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c index 97ee44defa..d3c0beb50f 100644 --- a/libavutil/x86/tx_float_init.c +++ b/libavutil/x86/tx_float_init.c @@ -75,12 +75,11 @@ static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \ int len, int inv, \ const void *scale) \ { \ - const int inv_lookup = opts ? opts->invert_lookup : 1; \ ff_tx_init_tabs_float(len); \ if (cd->max_len == 2) \ - return ff_tx_gen_ptwo_revtab(s, inv_lookup); \ + return ff_tx_gen_ptwo_revtab(s, opts); \ else \ - return ff_tx_gen_split_radix_parity_revtab(s, len, inv, inv_lookup, \ + return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, \ basis, interleave); \ } @@ -91,27 +90,27 @@ static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale) { + int ret; + + /* The transformations below are performed in the gather domain, + * so override the option and let the infrastructure convert the map + * to SCATTER if needed. 
*/ + FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER }; + TX_TAB(ff_tx_init_tabs)(len); - s->map = av_malloc(len*sizeof(s->map)); - s->map[0] = 0; /* DC is always at the start */ - if (inv) /* Reversing the ACs flips the transform direction */ - for (int i = 1; i < len; i++) - s->map[i] = len - i; + if (len == 15) + ret = ff_tx_gen_pfa_input_map(s, &sub_opts, 3, 5); else - for (int i = 1; i < len; i++) - s->map[i] = i; + ret = ff_tx_gen_default_map(s, &sub_opts); + + if (ret < 0) + return ret; if (len == 15) { int cnt = 0, tmp[15]; - /* Our 15-point transform is actually a 5x3 PFA, so embed its input map. */ - memcpy(tmp, s->map, 15*sizeof(*tmp)); - for (int i = 0; i < 5; i++) - for (int j = 0; j < 3; j++) - s->map[i*3 + j] = tmp[(i*3 + j*5) % 15]; - - /* Special 15-point assembly permutation */ + /* Special permutation to simplify loads in the pre-permuted version */ memcpy(tmp, s->map, 15*sizeof(*tmp)); for (int i = 1; i < 15; i += 3) { s->map[cnt] = tmp[i]; @@ -139,7 +138,7 @@ static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd, int len, int inv, const void *scale) { int ret; - FFTXCodeletOptions sub_opts = { .invert_lookup = 1 }; + FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER }; s->scale_d = *((SCALE_TYPE *)scale); s->scale_f = s->scale_d; @@ -177,7 +176,7 @@ static av_cold int fft_pfa_init(AVTXContext *s, { int ret; int sub_len = len / cd->factors[0]; - FFTXCodeletOptions sub_opts = { .invert_lookup = 0 }; + FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER }; flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ flags |= AV_TX_INPLACE; /* in-place */ @@ -188,13 +187,18 @@ static av_cold int fft_pfa_init(AVTXContext *s, sub_len, inv, scale))) return ret; - if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len))) + if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len))) return ret; if (cd->factors[0] == 15) { + int tmp[15]; + + /* Our 15-point transform is 
also a compound one, so embed its input map */ + TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5); + + /* Special permutation to simplify loads in the pre-permuted version */ for (int k = 0; k < s->sub[0].len; k++) { int cnt = 0; - int tmp[15]; memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp)); for (int i = 1; i < 15; i += 3) { s->map[k*15 + cnt] = tmp[i];