From e4faea9ca79ce1f277d8499962c7de64f26b0436 Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Thu, 15 Aug 2024 09:11:00 -0400
Subject: [PATCH 01/10] clusterlin bench: have low/high iter benchmarks instead
 of per-iter

---
 src/bench/cluster_linearize.cpp | 58 ++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 22 deletions(-)
diff --git a/src/bench/cluster_linearize.cpp b/src/bench/cluster_linearize.cpp
index de857419090..080bd86b7a5 100644
--- a/src/bench/cluster_linearize.cpp
+++ b/src/bench/cluster_linearize.cpp
@@ -111,17 +111,18 @@ DepGraph<SetType> MakeHardGraph(ClusterIndex ntx)
     return depgraph;
 }
 
-/** Benchmark that does search-based candidate finding with 10000 iterations.
+/** Benchmark that does search-based candidate finding with a specified number of iterations.
  *
- * Its goal is measuring how much time every additional search iteration in linearization costs.
+ * Its goal is measuring how much time every additional search iteration in linearization costs,
+ * by running with a low and a high count, subtracting the results, and dividing by the
+ * difference in iteration count.
  */
 template<typename SetType>
-void BenchLinearizePerIterWorstCase(ClusterIndex ntx, benchmark::Bench& bench)
+void BenchLinearizeWorstCase(ClusterIndex ntx, benchmark::Bench& bench, uint64_t iter_limit)
 {
     const auto depgraph = MakeHardGraph<SetType>(ntx);
-    const auto iter_limit = std::min<uint64_t>(10000, uint64_t{1} << (ntx / 2 - 1));
     uint64_t rng_seed = 0;
-    bench.batch(iter_limit).unit("iters").run([&] {
+    bench.run([&] {
         SearchCandidateFinder finder(depgraph, rng_seed++);
         auto [candidate, iters_performed] = finder.FindCandidateSet(iter_limit, {});
         assert(iters_performed == iter_limit);
@@ -132,11 +133,12 @@ void BenchLinearizePerIterWorstCase(ClusterIndex ntx, benchmark::Bench& bench)
  *
  * Its goal is measuring how much time linearization may take without any search iterations.
  *
- * If P is the resulting time of BenchLinearizePerIterWorstCase, and N is the resulting time of
- * BenchLinearizeNoItersWorstCase*, then an invocation of Linearize with max_iterations=m should
- * take no more than roughly N+m*P time. This may however be an overestimate, as the worst cases
- * do not coincide (the ones that are worst for linearization without any search happen to be ones
- * that do not need many search iterations).
+ * If P is the benchmarked per-iteration time (obtained by running BenchLinearizeWorstCase for a
+ * high and a low iteration count, subtracting them, and dividing by the difference in count), and
+ * N is the resulting time of BenchLinearizeNoItersWorstCase*, then an invocation of Linearize with
+ * max_iterations=m should take no more than roughly N+m*P time. This may however be an
+ * overestimate, as the worst cases do not coincide (the ones that are worst for linearization
+ * without any search happen to be ones that do not need many search iterations).
  *
  * This benchmark exercises a worst case for AncestorCandidateFinder, but for which improvement is
  * cheap.
@@ -207,12 +209,18 @@ void BenchMergeLinearizationsWorstCase(ClusterIndex ntx, benchmark::Bench& bench
 
 } // namespace
 
-static void LinearizePerIter16TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<16>>(16, bench); }
-static void LinearizePerIter32TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<32>>(32, bench); }
-static void LinearizePerIter48TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<48>>(48, bench); }
-static void LinearizePerIter64TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<64>>(64, bench); }
-static void LinearizePerIter75TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<75>>(75, bench); }
-static void LinearizePerIter99TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<99>>(99, bench); }
+static void Linearize16TxWorstCase20Iters(benchmark::Bench& bench) { BenchLinearizeWorstCase<BitSet<16>>(16, bench, 20); }
+static void Linearize16TxWorstCase120Iters(benchmark::Bench& bench) { BenchLinearizeWorstCase<BitSet<16>>(16, bench, 120); }
+static void Linearize32TxWorstCase5000Iters(benchmark::Bench& bench) { BenchLinearizeWorstCase<BitSet<32>>(32, bench, 5000); }
+static void Linearize32TxWorstCase15000Iters(benchmark::Bench& bench) { BenchLinearizeWorstCase<BitSet<32>>(32, bench, 15000); }
+static void Linearize48TxWorstCase5000Iters(benchmark::Bench& bench) { BenchLinearizeWorstCase<BitSet<48>>(48, bench, 5000); }
+static void Linearize48TxWorstCase15000Iters(benchmark::Bench& bench) { BenchLinearizeWorstCase<BitSet<48>>(48, bench, 15000); }
+static void Linearize64TxWorstCase5000Iters(benchmark::Bench& bench) { BenchLinearizeWorstCase<BitSet<64>>(64, bench, 5000); }
+static void Linearize64TxWorstCase15000Iters(benchmark::Bench& bench) { BenchLinearizeWorstCase<BitSet<64>>(64, bench, 15000); }
+static void Linearize75TxWorstCase5000Iters(benchmark::Bench& bench) { BenchLinearizeWorstCase<BitSet<75>>(75, bench, 5000); }
+static void Linearize75TxWorstCase15000Iters(benchmark::Bench& bench) { BenchLinearizeWorstCase<BitSet<75>>(75, bench, 15000); }
+static void Linearize99TxWorstCase5000Iters(benchmark::Bench& bench) { BenchLinearizeWorstCase<BitSet<99>>(99, bench, 5000); }
+static void Linearize99TxWorstCase15000Iters(benchmark::Bench& bench) { BenchLinearizeWorstCase<BitSet<99>>(99, bench, 15000); }
 
 static void LinearizeNoIters16TxWorstCaseAnc(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseAnc<BitSet<16>>(16, bench); }
 static void LinearizeNoIters32TxWorstCaseAnc(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseAnc<BitSet<32>>(32, bench); }
@@ -242,12 +250,18 @@ static void MergeLinearizations64TxWorstCase(benchmark::Bench& bench) { BenchMer
 static void MergeLinearizations75TxWorstCase(benchmark::Bench& bench) { BenchMergeLinearizationsWorstCase<BitSet<75>>(75, bench); }
 static void MergeLinearizations99TxWorstCase(benchmark::Bench& bench) { BenchMergeLinearizationsWorstCase<BitSet<99>>(99, bench); }
 
-BENCHMARK(LinearizePerIter16TxWorstCase, benchmark::PriorityLevel::HIGH);
-BENCHMARK(LinearizePerIter32TxWorstCase, benchmark::PriorityLevel::HIGH);
-BENCHMARK(LinearizePerIter48TxWorstCase, benchmark::PriorityLevel::HIGH);
-BENCHMARK(LinearizePerIter64TxWorstCase, benchmark::PriorityLevel::HIGH);
-BENCHMARK(LinearizePerIter75TxWorstCase, benchmark::PriorityLevel::HIGH);
-BENCHMARK(LinearizePerIter99TxWorstCase, benchmark::PriorityLevel::HIGH);
+BENCHMARK(Linearize16TxWorstCase20Iters, benchmark::PriorityLevel::HIGH);
+BENCHMARK(Linearize16TxWorstCase120Iters, benchmark::PriorityLevel::HIGH);
+BENCHMARK(Linearize32TxWorstCase5000Iters, benchmark::PriorityLevel::HIGH);
+BENCHMARK(Linearize32TxWorstCase15000Iters, benchmark::PriorityLevel::HIGH);
+BENCHMARK(Linearize48TxWorstCase5000Iters, benchmark::PriorityLevel::HIGH);
+BENCHMARK(Linearize48TxWorstCase15000Iters, benchmark::PriorityLevel::HIGH);
+BENCHMARK(Linearize64TxWorstCase5000Iters, benchmark::PriorityLevel::HIGH);
+BENCHMARK(Linearize64TxWorstCase15000Iters, benchmark::PriorityLevel::HIGH);
+BENCHMARK(Linearize75TxWorstCase5000Iters, benchmark::PriorityLevel::HIGH);
+BENCHMARK(Linearize75TxWorstCase15000Iters, benchmark::PriorityLevel::HIGH);
+BENCHMARK(Linearize99TxWorstCase5000Iters, benchmark::PriorityLevel::HIGH);
+BENCHMARK(Linearize99TxWorstCase15000Iters, benchmark::PriorityLevel::HIGH);
 
 BENCHMARK(LinearizeNoIters16TxWorstCaseAnc, benchmark::PriorityLevel::HIGH);
 BENCHMARK(LinearizeNoIters32TxWorstCaseAnc, benchmark::PriorityLevel::HIGH);

From 85a285a306100d1815c4ad0f4e52ccbcae8b0fbc Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Mon, 13 May 2024 16:50:34 -0400
Subject: [PATCH 02/10] clusterlin: separate initial search entries per
 component (optimization)

Before this commit, the worst case for linearization involves clusters which
break apart into several smaller components after the first candidate is
included in the output linearization.

Address this by never considering work items that span multiple components
of what remains of the cluster.
---
 src/cluster_linearize.h             | 21 +++++++---
 src/test/fuzz/cluster_linearize.cpp | 62 ++++++++++++++++++++++++-----
 2 files changed, 67 insertions(+), 16 deletions(-)

diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index 607ae681d25..a52ea5c3799 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -613,11 +613,20 @@ public:
         VecDeque<WorkItem> queue;
         queue.reserve(std::max<size_t>(256, 2 * m_todo.Count()));
 
-        // Create an initial entry with m_todo as undecided. Also use it as best if not provided,
-        // so that during the work processing loop below, and during the add_fn/split_fn calls, we
-        // do not need to deal with the best=empty case.
-        if (best.feerate.IsEmpty()) best = SetInfo(m_depgraph, m_todo);
-        queue.emplace_back(SetInfo<SetType>{}, SetType{m_todo});
+        // Create initial entries per connected component of m_todo. While clusters themselves are
+        // generally connected, this is not necessarily true after some parts have already been
+        // removed from m_todo. Without this, effort can be wasted on searching "inc" sets that
+        // span multiple components.
+        auto to_cover = m_todo;
+        do {
+            auto component = m_depgraph.FindConnectedComponent(to_cover);
+            to_cover -= component;
+            // If best is not provided, set it to the first component, so that during the work
+            // processing loop below, and during the add_fn/split_fn calls, we do not need to deal
+            // with the best=empty case.
+            if (best.feerate.IsEmpty()) best = SetInfo(m_depgraph, component);
+            queue.emplace_back(/*inc=*/SetInfo<SetType>{}, /*und=*/std::move(component));
+        } while (to_cover.Any());
 
         /** Local copy of the iteration limit. */
         uint64_t iterations_left = max_iterations;
@@ -645,7 +654,7 @@ public:
             // space runs out (see below), we know that no reallocation of the queue should ever
             // occur.
             Assume(queue.size() < queue.capacity());
-            queue.emplace_back(std::move(inc), std::move(und));
+            queue.emplace_back(/*inc=*/std::move(inc), /*und=*/std::move(und));
         };
 
         /** Internal process function. It takes an existing work item, and splits it in two: one
diff --git a/src/test/fuzz/cluster_linearize.cpp b/src/test/fuzz/cluster_linearize.cpp
index 2dfdfbb41de..abbcab0907b 100644
--- a/src/test/fuzz/cluster_linearize.cpp
+++ b/src/test/fuzz/cluster_linearize.cpp
@@ -165,6 +165,23 @@ std::pair<std::vector<ClusterIndex>, bool> SimpleLinearize(const DepGraph<SetTyp
     return {std::move(linearization), optimal};
 }
 
+/** Stitch connected components together in a DepGraph, guaranteeing its corresponding cluster is connected. */
+template<typename BS>
+void MakeConnected(DepGraph<BS>& depgraph)
+{
+    auto todo = BS::Fill(depgraph.TxCount());
+    auto comp = depgraph.FindConnectedComponent(todo);
+    Assume(depgraph.IsConnected(comp));
+    todo -= comp;
+    while (todo.Any()) {
+        auto nextcomp = depgraph.FindConnectedComponent(todo);
+        Assume(depgraph.IsConnected(nextcomp));
+        depgraph.AddDependency(comp.Last(), nextcomp.First());
+        todo -= nextcomp;
+        comp = nextcomp;
+    }
+}
+
 /** Given a dependency graph, and a todo set, read a topological subset of todo from reader. */
 template<typename SetType>
 SetType ReadTopologicalSet(const DepGraph<SetType>& depgraph, const SetType& todo, SpanReader& reader)
@@ -369,6 +386,20 @@ FUZZ_TARGET(clusterlin_components)
     assert(depgraph.FindConnectedComponent(todo).None());
 }
 
+FUZZ_TARGET(clusterlin_make_connected)
+{
+    // Verify that MakeConnected makes graphs connected.
+
+    SpanReader reader(buffer);
+    DepGraph<TestBitSet> depgraph;
+    try {
+        reader >> Using<DepGraphFormatter>(depgraph);
+    } catch (const std::ios_base::failure&) {}
+    MakeConnected(depgraph);
+    SanityCheck(depgraph);
+    assert(depgraph.IsConnected());
+}
+
 FUZZ_TARGET(clusterlin_chunking)
 {
     // Verify the correctness of the ChunkLinearization function.
@@ -468,13 +499,17 @@ FUZZ_TARGET(clusterlin_search_finder)
     // and comparing with the results from SimpleCandidateFinder, ExhaustiveCandidateFinder, and
     // AncestorCandidateFinder.
 
-    // Retrieve an RNG seed and a depgraph from the fuzz input.
+    // Retrieve an RNG seed, a depgraph, and whether to make it connected, from the fuzz input.
     SpanReader reader(buffer);
     DepGraph<TestBitSet> depgraph;
     uint64_t rng_seed{0};
+    uint8_t make_connected{1};
     try {
-        reader >> Using<DepGraphFormatter>(depgraph) >> rng_seed;
+        reader >> Using<DepGraphFormatter>(depgraph) >> rng_seed >> make_connected;
     } catch (const std::ios_base::failure&) {}
+    // The most complicated graphs are connected ones (other ones just split up). Optionally force
+    // the graph to be connected.
+    if (make_connected) MakeConnected(depgraph);
 
     // Instantiate ALL the candidate finders.
     SearchCandidateFinder src_finder(depgraph, rng_seed);
@@ -513,9 +548,11 @@ FUZZ_TARGET(clusterlin_search_finder)
             assert(found.transactions.IsSupersetOf(depgraph.Ancestors(i) & todo));
         }
 
-        // At most 2^N-1 iterations can be required: the number of non-empty subsets a graph with N
-        // transactions has.
-        assert(iterations_done <= ((uint64_t{1} << todo.Count()) - 1));
+        // At most 2^(N-1) iterations can be required: the maximum number of non-empty topological
+        // subsets a (connected) cluster with N transactions can have. Even when the cluster is no
+        // longer connected after removing certain transactions, this holds, because the connected
+        // components are searched separately.
+        assert(iterations_done <= (uint64_t{1} << (todo.Count() - 1)));
 
         // Perform quality checks only if SearchCandidateFinder claims an optimal result.
         if (iterations_done < max_iterations) {
@@ -685,14 +722,19 @@ FUZZ_TARGET(clusterlin_linearize)
 {
     // Verify the behavior of Linearize().
 
-    // Retrieve an RNG seed, an iteration count, and a depgraph from the fuzz input.
+    // Retrieve an RNG seed, an iteration count, a depgraph, and whether to make it connected from
+    // the fuzz input.
     SpanReader reader(buffer);
     DepGraph<TestBitSet> depgraph;
     uint64_t rng_seed{0};
     uint64_t iter_count{0};
+    uint8_t make_connected{1};
     try {
-        reader >> VARINT(iter_count) >> Using<DepGraphFormatter>(depgraph) >> rng_seed;
+        reader >> VARINT(iter_count) >> Using<DepGraphFormatter>(depgraph) >> rng_seed >> make_connected;
     } catch (const std::ios_base::failure&) {}
+    // The most complicated graphs are connected ones (other ones just split up). Optionally force
+    // the graph to be connected.
+    if (make_connected) MakeConnected(depgraph);
 
     // Optionally construct an old linearization for it.
     std::vector<ClusterIndex> old_linearization;
@@ -721,10 +763,10 @@ FUZZ_TARGET(clusterlin_linearize)
     }
 
     // If the iteration count is sufficiently high, an optimal linearization must be found.
-    // Each linearization step can use up to 2^k iterations, with steps k=1..n. That sum is
-    // 2 * (2^n - 1)
+    // Each linearization step can use up to 2^(k-1) iterations, with steps k=1..n. That sum is
+    // 2^n - 1.
     const uint64_t n = depgraph.TxCount();
-    if (n <= 18 && iter_count > 2U * ((uint64_t{1} << n) - 1U)) {
+    if (n <= 19 && iter_count > (uint64_t{1} << n)) {
         assert(optimal);
     }
 

From b80e6dfe780b3678bb41f2d9d63816f097529b2e Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Thu, 8 Aug 2024 21:42:28 -0400
Subject: [PATCH 03/10] clusterlin: add reordering support for DepGraph

Add a DepGraph(depgraph, reordering) function that constructs a new DepGraph
corresponding to an old one, but with its transactions in a modified order
(given as a vector from old to new positions).

Also use this reordering feature inside DepGraphFormatter::Unser, which needs
a small modification so that its reordering mapping is old-to-new (rather than
the new-to-old it used before).
---
 src/cluster_linearize.h           | 22 ++++++++++++++++++
 src/test/util/cluster_linearize.h | 38 ++++++++++++-------------------
 2 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index a52ea5c3799..7e74460c7e8 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -118,6 +118,28 @@ public:
         }
     }
 
+    /** Construct a DepGraph object given another DepGraph and a mapping from old to new.
+     *
+     * Complexity: O(N^2) where N=depgraph.TxCount().
+     */
+    DepGraph(const DepGraph<SetType>& depgraph, Span<const ClusterIndex> mapping) noexcept : entries(depgraph.TxCount())
+    {
+        Assert(mapping.size() == depgraph.TxCount());
+        // Fill in fee, size, ancestors.
+        for (ClusterIndex i = 0; i < depgraph.TxCount(); ++i) {
+            const auto& input = depgraph.entries[i];
+            auto& output = entries[mapping[i]];
+            output.feerate = input.feerate;
+            for (auto j : input.ancestors) output.ancestors.Set(mapping[j]);
+        }
+        // Fill in descendant information.
+        for (ClusterIndex i = 0; i < entries.size(); ++i) {
+            for (auto j : entries[i].ancestors) {
+                entries[j].descendants.Set(i);
+            }
+        }
+    }
+
     /** Get the number of transactions in the graph. Complexity: O(1). */
     auto TxCount() const noexcept { return entries.size(); }
     /** Get the feerate of a given transaction i. Complexity: O(1). */
diff --git a/src/test/util/cluster_linearize.h b/src/test/util/cluster_linearize.h
index 9477d2ed41f..5336d6015c1 100644
--- a/src/test/util/cluster_linearize.h
+++ b/src/test/util/cluster_linearize.h
@@ -186,7 +186,7 @@ struct DepGraphFormatter
         /** The dependency graph which we deserialize into first, with transactions in
          *  topological serialization order, not original cluster order. */
         DepGraph<SetType> topo_depgraph;
-        /** Mapping from cluster order to serialization order, used later to reconstruct the
+        /** Mapping from serialization order to cluster order, used later to reconstruct the
          *  cluster order. */
         std::vector<ClusterIndex> reordering;
 
@@ -205,9 +205,9 @@ struct DepGraphFormatter
                 coded_fee &= 0xFFFFFFFFFFFFF; // Enough for fee between -21M...21M BTC.
                 static_assert(0xFFFFFFFFFFFFF > uint64_t{2} * 21000000 * 100000000);
                 auto fee = UnsignedToSigned(coded_fee);
-                // Extend topo_depgraph with the new transaction (at the end).
+                // Extend topo_depgraph with the new transaction (preliminarily at the end).
                 auto topo_idx = topo_depgraph.AddTransaction({fee, size});
-                reordering.push_back(topo_idx);
+                reordering.push_back(reordering.size());
                 // Read dependency information.
                 uint64_t diff = 0; //!< How many potential parents we have to skip.
                 s >> VARINT(diff);
@@ -226,31 +226,23 @@ struct DepGraphFormatter
                         --diff;
                     }
                 }
-                // If we reach this point, we can interpret the remaining skip value as how far from the
-                // end of reordering topo_idx should be placed (wrapping around), so move it to its
-                // correct location. The preliminary reordering.push_back(topo_idx) above was to make
-                // sure that if a deserialization exception occurs, topo_idx still appears somewhere.
+                // If we reach this point, we can interpret the remaining skip value as how far
+                // from the end of reordering the new transaction should be placed (wrapping
+                // around), so remove the preliminary position it was put in above (which was to
+                // make sure that if a deserialization exception occurs, the new transaction still
+                // has some entry in reordering).
                 reordering.pop_back();
-                reordering.insert(reordering.end() - (diff % (reordering.size() + 1)), topo_idx);
+                ClusterIndex insert_distance = diff % (reordering.size() + 1);
+                // And then update reordering to reflect this new transaction's insertion.
+                for (auto& pos : reordering) {
+                    pos += (pos >= reordering.size() - insert_distance);
+                }
+                reordering.push_back(reordering.size() - insert_distance);
             }
         } catch (const std::ios_base::failure&) {}
 
         // Construct the original cluster order depgraph.
-        depgraph = {};
-        // Add transactions to depgraph in the original cluster order.
-        for (auto topo_idx : reordering) {
-            depgraph.AddTransaction(topo_depgraph.FeeRate(topo_idx));
-        }
-        // Translate dependencies from topological to cluster order.
-        for (ClusterIndex idx = 0; idx < reordering.size(); ++idx) {
-            ClusterIndex topo_idx = reordering[idx];
-            for (ClusterIndex dep_idx = 0; dep_idx < reordering.size(); ++dep_idx) {
-                ClusterIndex dep_topo_idx = reordering[dep_idx];
-                if (topo_depgraph.Ancestors(topo_idx)[dep_topo_idx]) {
-                    depgraph.AddDependency(dep_idx, idx);
-                }
-            }
-        }
+        depgraph = DepGraph(topo_depgraph, reordering);
     }
 };
 

From 9e43e4ce109e98a1ea3f54bbb4de86bc1b92ae4f Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Thu, 8 Aug 2024 21:42:30 -0400
Subject: [PATCH 04/10] clusterlin: use feerate-sorted depgraph in
 SearchCandidateFinder

This is a requirement for a future commit, which will rely on quickly iterating
over transaction sets in decreasing individual feerate order.
---
 src/cluster_linearize.h | 75 ++++++++++++++++++++++++++++++++---------
 1 file changed, 59 insertions(+), 16 deletions(-)

diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index 7e74460c7e8..ed6abfa4a35 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -563,23 +563,60 @@ class SearchCandidateFinder
 {
     /** Internal RNG. */
     InsecureRandomContext m_rng;
-    /** Internal dependency graph for the cluster. */
-    const DepGraph<SetType>& m_depgraph;
-    /** Which transactions are left to do (sorted indices). */
+    /** m_sorted_to_original[i] is the original position that sorted transaction position i had. */
+    std::vector<ClusterIndex> m_sorted_to_original;
+    /** m_original_to_sorted[i] is the sorted position original transaction position i has. */
+    std::vector<ClusterIndex> m_original_to_sorted;
+    /** Internal dependency graph for the cluster (with transactions in decreasing individual
+     *  feerate order). */
+    DepGraph<SetType> m_sorted_depgraph;
+    /** Which transactions are left to do (indices in m_sorted_depgraph's order). */
     SetType m_todo;
 
+    /** Given a set of transactions with sorted indices, get their original indices. */
+    SetType SortedToOriginal(const SetType& arg) const noexcept
+    {
+        SetType ret;
+        for (auto pos : arg) ret.Set(m_sorted_to_original[pos]);
+        return ret;
+    }
+
+    /** Given a set of transactions with original indices, get their sorted indices. */
+    SetType OriginalToSorted(const SetType& arg) const noexcept
+    {
+        SetType ret;
+        for (auto pos : arg) ret.Set(m_original_to_sorted[pos]);
+        return ret;
+    }
+
 public:
     /** Construct a candidate finder for a graph.
      *
      * @param[in] depgraph   Dependency graph for the to-be-linearized cluster.
      * @param[in] rng_seed   A random seed to control the search order.
      *
-     * Complexity: O(1).
+     * Complexity: O(N^2) where N=depgraph.TxCount().
      */
-    SearchCandidateFinder(const DepGraph<SetType>& depgraph LIFETIMEBOUND, uint64_t rng_seed) noexcept :
+    SearchCandidateFinder(const DepGraph<SetType>& depgraph, uint64_t rng_seed) noexcept :
         m_rng(rng_seed),
-        m_depgraph(depgraph),
-        m_todo(SetType::Fill(depgraph.TxCount())) {}
+        m_sorted_to_original(depgraph.TxCount()),
+        m_original_to_sorted(depgraph.TxCount()),
+        m_todo(SetType::Fill(depgraph.TxCount()))
+    {
+        // Determine reordering mapping, by sorting by decreasing feerate.
+        std::iota(m_sorted_to_original.begin(), m_sorted_to_original.end(), ClusterIndex{0});
+        std::sort(m_sorted_to_original.begin(), m_sorted_to_original.end(), [&](auto a, auto b) {
+            auto feerate_cmp = depgraph.FeeRate(a) <=> depgraph.FeeRate(b);
+            if (feerate_cmp == 0) return a < b;
+            return feerate_cmp > 0;
+        });
+        // Compute reverse mapping.
+        for (ClusterIndex i = 0; i < depgraph.TxCount(); ++i) {
+            m_original_to_sorted[m_sorted_to_original[i]] = i;
+        }
+        // Compute reordered dependency graph.
+        m_sorted_depgraph = DepGraph(depgraph, m_original_to_sorted);
+    }
 
     /** Check whether any unlinearized transactions remain. */
     bool AllDone() const noexcept
@@ -608,6 +645,9 @@ public:
     {
         Assume(!AllDone());
 
+        // Convert the provided best to internal sorted indices.
+        best.transactions = OriginalToSorted(best.transactions);
+
         /** Type for work queue items. */
         struct WorkItem
         {
@@ -641,12 +681,12 @@ public:
         // span multiple components.
         auto to_cover = m_todo;
         do {
-            auto component = m_depgraph.FindConnectedComponent(to_cover);
+            auto component = m_sorted_depgraph.FindConnectedComponent(to_cover);
             to_cover -= component;
             // If best is not provided, set it to the first component, so that during the work
             // processing loop below, and during the add_fn/split_fn calls, we do not need to deal
             // with the best=empty case.
-            if (best.feerate.IsEmpty()) best = SetInfo(m_depgraph, component);
+            if (best.feerate.IsEmpty()) best = SetInfo(m_sorted_depgraph, component);
             queue.emplace_back(/*inc=*/SetInfo<SetType>{}, /*und=*/std::move(component));
         } while (to_cover.Any());
 
@@ -695,13 +735,13 @@ public:
             const ClusterIndex split = elem.und.First();
 
             // Add a work item corresponding to exclusion of the split transaction.
-            const auto& desc = m_depgraph.Descendants(split);
+            const auto& desc = m_sorted_depgraph.Descendants(split);
             add_fn(/*inc=*/elem.inc,
                    /*und=*/elem.und - desc);
 
             // Add a work item corresponding to inclusion of the split transaction.
-            const auto anc = m_depgraph.Ancestors(split) & m_todo;
-            add_fn(/*inc=*/elem.inc.Add(m_depgraph, anc),
+            const auto anc = m_sorted_depgraph.Ancestors(split) & m_todo;
+            add_fn(/*inc=*/elem.inc.Add(m_sorted_depgraph, anc),
                    /*und=*/elem.und - anc);
 
             // Account for the performed split.
@@ -744,7 +784,9 @@ public:
             split_fn(std::move(elem));
         }
 
-        // Return the found best set and the number of iterations performed.
+        // Return the found best set (converted to the original transaction indices), and the
+        // number of iterations performed.
+        best.transactions = SortedToOriginal(best.transactions);
         return {std::move(best), max_iterations - iterations_left};
     }
 
@@ -754,9 +796,10 @@ public:
      */
     void MarkDone(const SetType& done) noexcept
     {
-        Assume(done.Any());
-        Assume(done.IsSubsetOf(m_todo));
-        m_todo -= done;
+        const auto done_sorted = OriginalToSorted(done);
+        Assume(done_sorted.Any());
+        Assume(done_sorted.IsSubsetOf(m_todo));
+        m_todo -= done_sorted;
     }
 };
 

From 2965fbf203f0b244814d7159381a2e9472bc1f97 Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Thu, 9 May 2024 15:22:49 -0400
Subject: [PATCH 05/10] clusterlin: track upper bound potential set for work
 items (optimization)

In each work item, keep track of a conservative overestimate of the best
possible feerate that can be reached from it, and then use these to avoid
exploring hopeless work items.
---
 src/cluster_linearize.h             | 67 +++++++++++++++++++++++++----
 src/test/fuzz/cluster_linearize.cpp |  4 +-
 2 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index ed6abfa4a35..2da9e1ebcc2 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -279,6 +279,14 @@ struct SetInfo
     explicit SetInfo(const DepGraph<SetType>& depgraph, const SetType& txn) noexcept :
         transactions(txn), feerate(depgraph.FeeRate(txn)) {}
 
+    /** Add a transaction to this SetInfo (which must not yet be in it). */
+    void Set(const DepGraph<SetType>& depgraph, ClusterIndex pos) noexcept
+    {
+        Assume(!transactions[pos]);
+        transactions.Set(pos);
+        feerate += depgraph.FeeRate(pos);
+    }
+
     /** Add the transactions of other to this SetInfo (no overlap allowed). */
     SetInfo& operator|=(const SetInfo& other) noexcept
     {
@@ -658,16 +666,24 @@ public:
             /** Set of undecided transactions. This must be a subset of m_todo, and have no overlap
              *  with inc. The set (inc | und) must be topologically valid. */
             SetType und;
+            /** (Only when inc is not empty) The best feerate of any superset of inc that is also a
+             *  subset of (inc | und), without requiring it to be topologically valid. It forms a
+             *  conservative upper bound on how good a set this work item can give rise to. */
+            FeeFrac pot_feerate;
 
             /** Construct a new work item. */
-            WorkItem(SetInfo<SetType>&& i, SetType&& u) noexcept :
-                inc(std::move(i)), und(std::move(u)) {}
+            WorkItem(SetInfo<SetType>&& i, SetType&& u, FeeFrac&& p_f) noexcept :
+                inc(std::move(i)), und(std::move(u)), pot_feerate(std::move(p_f))
+            {
+                Assume(pot_feerate.IsEmpty() == inc.feerate.IsEmpty());
+            }
 
             /** Swap two WorkItems. */
             void Swap(WorkItem& other) noexcept
             {
                 swap(inc, other.inc);
                 swap(und, other.und);
+                swap(pot_feerate, other.pot_feerate);
             }
         };
 
@@ -687,7 +703,9 @@ public:
             // processing loop below, and during the add_fn/split_fn calls, we do not need to deal
             // with the best=empty case.
             if (best.feerate.IsEmpty()) best = SetInfo(m_sorted_depgraph, component);
-            queue.emplace_back(/*inc=*/SetInfo<SetType>{}, /*und=*/std::move(component));
+            queue.emplace_back(/*inc=*/SetInfo<SetType>{},
+                               /*und=*/std::move(component),
+                               /*pot_feerate=*/FeeFrac{});
         } while (to_cover.Any());
 
         /** Local copy of the iteration limit. */
@@ -700,23 +718,44 @@ public:
          * - und: the "und" value for the new work item ((inc | und) must be topological).
          */
         auto add_fn = [&](SetInfo<SetType> inc, SetType und) noexcept {
+            /** SetInfo object with the set whose feerate will become the new work item's
+             *  pot_feerate. It starts off equal to inc. */
+            auto pot = inc;
             if (!inc.feerate.IsEmpty()) {
+                // Add entries to pot.
+                for (auto pos : und) {
+                    // Determine if adding transaction pos to pot (ignoring topology) would improve
+                    // it. If not, we're done updating pot. This relies on the fact that
+                    // m_sorted_depgraph, and thus the transactions iterated over, are in decreasing
+                    // individual feerate order.
+                    if (!(m_sorted_depgraph.FeeRate(pos) >> pot.feerate)) break;
+                    pot.Set(m_sorted_depgraph, pos);
+                }
+
                 // If inc's feerate is better than best's, remember it as our new best.
                 if (inc.feerate > best.feerate) {
                     best = inc;
                 }
+
+                // If no potential transactions exist beyond the already included ones, no
+                // improvement is possible anymore.
+                if (pot.feerate.size == inc.feerate.size) return;
+                // At this point und must be non-empty. If it were empty then pot would equal inc.
+                Assume(und.Any());
             } else {
                 Assume(inc.transactions.None());
+                // If inc is empty, we just make sure there are undecided transactions left to
+                // split on.
+                if (und.None()) return;
             }
 
-            // Make sure there are undecided transactions left to split on.
-            if (und.None()) return;
-
             // Actually construct a new work item on the queue. Due to the switch to DFS when queue
             // space runs out (see below), we know that no reallocation of the queue should ever
             // occur.
             Assume(queue.size() < queue.capacity());
-            queue.emplace_back(/*inc=*/std::move(inc), /*und=*/std::move(und));
+            queue.emplace_back(/*inc=*/std::move(inc),
+                               /*und=*/std::move(und),
+                               /*pot_feerate=*/std::move(pot.feerate));
         };
 
         /** Internal process function. It takes an existing work item, and splits it in two: one
@@ -730,9 +769,21 @@ public:
             Assume(elem.inc.transactions.IsSubsetOf(m_todo) && elem.und.IsSubsetOf(m_todo));
             // Included transactions cannot be undecided.
             Assume(!elem.inc.transactions.Overlaps(elem.und));
+            // If pot is empty, then so is inc.
+            Assume(elem.inc.feerate.IsEmpty() == elem.pot_feerate.IsEmpty());
+
+            const ClusterIndex first = elem.und.First();
+            if (!elem.inc.feerate.IsEmpty()) {
+                // We can ignore any queue item whose potential feerate isn't better than the best
+                // seen so far.
+                if (elem.pot_feerate <= best.feerate) return;
+            } else {
+                // In case inc is empty use a simpler alternative check.
+                if (m_sorted_depgraph.FeeRate(first) <= best.feerate) return;
+            }
 
             // Pick the first undecided transaction as the one to split on.
-            const ClusterIndex split = elem.und.First();
+            const ClusterIndex split = first;
 
             // Add a work item corresponding to exclusion of the split transaction.
             const auto& desc = m_sorted_depgraph.Descendants(split);
diff --git a/src/test/fuzz/cluster_linearize.cpp b/src/test/fuzz/cluster_linearize.cpp
index abbcab0907b..1eb3ff05542 100644
--- a/src/test/fuzz/cluster_linearize.cpp
+++ b/src/test/fuzz/cluster_linearize.cpp
@@ -429,7 +429,7 @@ FUZZ_TARGET(clusterlin_chunking)
         SetInfo<TestBitSet> accumulator, best;
         for (ClusterIndex idx : linearization) {
             if (todo[idx]) {
-                accumulator |= SetInfo(depgraph, idx);
+                accumulator.Set(depgraph, idx);
                 if (best.feerate.IsEmpty() || accumulator.feerate >> best.feerate) {
                     best = accumulator;
                 }
@@ -658,7 +658,7 @@ FUZZ_TARGET(clusterlin_linearization_chunking)
             SetInfo<TestBitSet> accumulator, best;
             for (auto j : linearization) {
                 if (todo[j] && !combined[j]) {
-                    accumulator |= SetInfo(depgraph, j);
+                    accumulator.Set(depgraph, j);
                     if (best.feerate.IsEmpty() || accumulator.feerate > best.feerate) {
                         best = accumulator;
                     }

From 6060a948caf6dbc7505658d6cac750e25698eaf9 Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Thu, 5 Sep 2024 12:55:49 -0400
Subject: [PATCH 06/10] clusterlin bench: add example hard cluster benchmarks

Co-Authored-By: Suhas Daftuar <sdaftuar@gmail.com>
---
 src/bench/cluster_linearize.cpp   | 91 +++++++++++++++++++++++++++++++
 src/test/util/cluster_linearize.h |  4 +-
 2 files changed, 93 insertions(+), 2 deletions(-)

diff --git a/src/bench/cluster_linearize.cpp b/src/bench/cluster_linearize.cpp
index 080bd86b7a5..4942b1a67e3 100644
--- a/src/bench/cluster_linearize.cpp
+++ b/src/bench/cluster_linearize.cpp
@@ -4,7 +4,9 @@
 
 #include <bench/bench.h>
 #include <cluster_linearize.h>
+#include <test/util/cluster_linearize.h>
 #include <util/bitset.h>
+#include <util/strencodings.h>
 
 #include <algorithm>
 #include <cassert>
@@ -12,6 +14,7 @@
 #include <vector>
 
 using namespace cluster_linearize;
+using namespace util::hex_literals;
 
 namespace {
 
@@ -207,6 +210,43 @@ void BenchMergeLinearizationsWorstCase(ClusterIndex ntx, benchmark::Bench& bench
     });
 }
 
+template<size_t N> // N: number of bytes in the serialized cluster
+void BenchLinearizeOptimally(benchmark::Bench& bench, const std::array<uint8_t, N>& serialized) // benchmark Linearize() on the cluster encoded in `serialized`
+{
+    // Probe pass: deserialize once into the widest set type used below, only to learn the transaction count.
+    ClusterIndex num_tx{0};
+    {
+        SpanReader reader{serialized};
+        DepGraph<BitSet<128>> depgraph;
+        reader >> Using<DepGraphFormatter>(depgraph);
+        num_tx = depgraph.TxCount();
+        assert(num_tx < 128); // NOTE(review): strict '<' although the dispatch below also accepts exactly 128 -- confirm intent
+    }
+
+    SpanReader reader{serialized}; // fresh reader over the same bytes; the probe's reader went out of scope above
+    auto runner_fn = [&]<typename SetType>() noexcept { // deserialize as DepGraph<SetType>, then time Linearize() on it
+        DepGraph<SetType> depgraph;
+        reader >> Using<DepGraphFormatter>(depgraph); // reads from the captured `reader`; only one branch below invokes runner_fn
+        uint64_t rng_seed = 0; // incremented per run, so every bench.run invocation uses a different seed
+        bench.run([&] {
+            auto res = Linearize(depgraph, /*max_iterations=*/10000000, rng_seed++);
+            assert(res.second); // presumably the "result is optimal" flag of Linearize's return value -- confirm
+        });
+    };
+
+    if (num_tx <= 32) { // dispatch on the smallest of the listed BitSet widths that fits num_tx
+        runner_fn.template operator()<BitSet<32>>();
+    } else if (num_tx <= 64) {
+        runner_fn.template operator()<BitSet<64>>();
+    } else if (num_tx <= 96) {
+        runner_fn.template operator()<BitSet<96>>();
+    } else if (num_tx <= 128) {
+        runner_fn.template operator()<BitSet<128>>();
+    } else {
+        assert(false); // not reachable while the probe's assert above holds
+    }
+}
+
 } // namespace
 
 static void Linearize16TxWorstCase20Iters(benchmark::Bench& bench) { BenchLinearizeWorstCase<BitSet<16>>(16, bench, 20); }
@@ -250,6 +290,45 @@ static void MergeLinearizations64TxWorstCase(benchmark::Bench& bench) { BenchMer
 static void MergeLinearizations75TxWorstCase(benchmark::Bench& bench) { BenchMergeLinearizationsWorstCase<BitSet<75>>(75, bench); }
 static void MergeLinearizations99TxWorstCase(benchmark::Bench& bench) { BenchMergeLinearizationsWorstCase<BitSet<99>>(99, bench); }
 
+// The following example clusters were constructed by replaying historical mempool activity, and
+// selecting for ones that take many iterations (after the introduction of some but not all
+// linearization algorithm optimizations).
+
+/* 2023-05-05T23:12:21Z 71, 521780, 543141,*/
+static constexpr auto BENCH_EXAMPLE_00 = "801081a5360092239efc6201810982ab58029b6b98c86803800eed7804800ecb7e058f2f878778068030d43407853e81902a08962a81d176098010b6620a8010b2280b8010da3a0c9f069da9580d800db11e0e9d719ad37a0f967897ed5210990e99fc0e11812c81982012804685823e0f0a893982b6040a10804682c146110a6e80db5c120a8010819806130a8079858f0c140a8054829a120c12803483a1760c116f81843c0d11718189000e11800d81ac2c0f11800d81e50e10117181c77c1111822e87f2601012815983d17211127180f2121212811584a21e1312800e80d1781412813c83e81815126f80ef5016126f80ff6c16126f80f66017126e80fd541812800d81942a1912800e80dd781a12800d81f96c1b12805282e7581b127180fd721c1271a918230b805fc11a220d8118a15a2d036f80e5002011817684d8241e346f80e1181c37805082fc04260024800d81f8621734803382b354270b12805182ca2e162f800e80d52e0d32803dc360201b850e818c400b318c49808a5a290210805181d65823142a800d81a34e0850800e81fb3c0851886994fc0a280b00082c805482d208032e28805e83ba380059801081cd4a0159811884f770002e0015e17280e49024300a0000000000000031803dcb48014200"_hex_u8;
+/* 2023-12-06T09:30:01Z 81, 141675, 647053,*/
+static constexpr auto BENCH_EXAMPLE_01 = "b348f1fc4000f365818a9e2c01b44cf7ca0002b004f0b02003b33ef8ae3004b334f9e87005800d81c85e06b368fae26007b05ef2e14208be1a8093a50409b15cf5ee500a802c80a1420b802dea440c802ce50a0d802cdc320e802cd7220f802dd72210805380f74a118174f370126e96b32812127182c4701312817389d26414128035848c221512800e82bf3816126f81e4341712801082b228181280518af57418128040859a0019127182d0401a12803e858b641b127182c4421c126f82b3481d12811486b6301e12821d89e7281f126e8a8b421f127182d6642012806284c12021126e81d34822126e86a76222126e86d8102212805187b6542312800d82fc002412803d848e0e2512801082d27a26126e8589642612800e83a9602712800e83bd0028126e81ef1a29116e858d7228126f82db5e2912801083843c2a127181c93c2b126e85d0162b127181c5622c126e84f8262c12800f8392202d12800e82b66c2e126e81d0082f12803282d50430126e84f9003012805f84be6c3112846e88df0e2b12804080d44c340a8b31898808350a800ed760350b801083a1182b517182817e2a51800e82b6582951803583cb52420030806284cb6c204f7181d300204f82688ce0303e001d800e82bb200f488010808a182822a3289cd63041000a6fcd100a408a7caaa7024800002f803584e0741e27288f3386dd783b001000802683f27e004b8c44bcd0763f0000000000000000000100000e00"_hex_u8;
+/* 2023-04-04T00:26:50Z 90, 99930, 529375,*/
+static constexpr auto BENCH_EXAMPLE_02 = "815b80b61e00800da63001cd378da70e028010991a03800e9d3e0480109708058010991a068010973a07da738fa72408de7491831009b35b88f0080a9d4485de180b71974e0c71974e0d80108e500eb27988a75a0f719632108061a56c11801087761280108a1413807893441480538c1415a606828806168010893e1780548c40188e4b80bb2c196eab3e1718805ed60e18188051c97a19188010cf781a1871b11e1b1871c5281c1880508080581d186e80b13c1e188035cf421f18805fe0482018804caa661f198035a9001f156e80cb701d1871a2281e1871ad281f18817380a16020186f98642118805ee04821198010b6702219800ea12623196eb67024198035808b0025196fa65c26198054ba1c2719807680bf7c28198053cd782919803d80b80429198051db5a2a198040d3742b19976584bb1c28196efc1c281971b21a29198052bc762a1971a2502b196eb73c2c19976381ab0c2a18806290543409862081c3423b00336fbc70224d80109e7c1c52805ebd5c1942800eb57016468034ba423405158118da28350416927480f4743000159f6a81c9462e00188051ec5e380e00800e9e420775800d9e26007c906c82f754251d0025870480f12c14280023800d9e26027e9e1385ed08102900001a804fac7a018001719856028001800da87e0180039b1a868b60064102246e9f42018005800da87e028005850d81d600026d862381a2200e0008230015831480a5480342000524803eeb32006e873582a4700a0100351300"_hex_u8;
+/* 2023-05-08T15:51:59Z 87, 76869, 505222,*/
+static constexpr auto BENCH_EXAMPLE_03 = "c040b9e15a00b10eac842601805f85931802c104bae17403ae50aaa336049d76a9bf7005c55bbeab6606ae2aa9c72c07805e81992e08af7dab817a096e80a7e4520909803e92bd780a097185c76c0b096e98e7380b09850bb9953c0c09803389f6260d096f859d620e09803f88d3000f0971829c6e1009837690f6481109806285931811097181f56814076ea09b74120980408eb73213096f87853214096f86e2701509803f8c860016098a6fe6c3721709814f92a204180980628a8a441909803285df681a0980348498661b096e8290781c096e978e081c097187da1a1d097186c05c1e097185893c1f09805f8ad9002009800d84e74e21097183a67a22097182e23423097184b53a23096ea393062309840faddd46240980618eb732250980548bee6a2609807986883c2709718298402809815388b6582909805384ec742a097181b9142b096e97b5262b096e85e14e2c0980518abb5c2d09805489e75a2e09803187e3382f097180eb1c34046f87c34a2f098309a5c54430097186911831098054899c083209801083bc1033097081e02a3409805f848f0c35096e80d4343a057180c37040006f80a22438097180a0503f03816f8381444003803f80ef003f05800580a4283f066ef72845016efb91663e09923d808d8216470041803584837c46012f9247dc86684501268267a09610450222862184db68440712803585ea40440113835d97887805800b8723c7a40a4b00022f81529ae2143c0c1f80548b8f381b311980408e955c055e802589dc10037e801083b54602658010848130006700"_hex_u8;
+/* 2023-05-01T19:32:10Z 35, 55747, 504128,*/
+static constexpr auto BENCH_EXAMPLE_04 = "801af95c00801af72801801af95c02873e85f2180202873e85f2180202873e85f21802028018fb2802068018fb2803068018fb2804068018fb2805068018fb2806068018fb2807068018fb2808068018fb2809068018fb280a068018fb280a058018fb280b058018fb280c058018fb280d058018fb280e058018fb280f058018fb2810058018fb2811058018fb2812058018fb2813058018fb2814058018fb2815058018fb2815048018fb2816048018fb2817048018fb2818048018fb2819048018fb281a048018fb281b04810d80d9481f00000100"_hex_u8;
+/* 2023-02-27T17:06:38Z 60, 55680, 502749,*/
+static constexpr auto BENCH_EXAMPLE_05 = "b5108ab56600b26d89f85601b07383b01602b22683c96003b34a83d82e04b12f83b53a05b20e83c75a066e80840a06068040be0007066fb10608066fb2120906800eba320a06842b80b05a0a066eff420b067199300b068124c3140c0680618085180d066faa1c0e068010b4440f068051af541006800da1781106857881946812066eee1613068052b31014068324808d361506806180885c150671b03216066ef11017068052b63218066ef3521806803f80865419066e93441a068035a13e1b0680628085181c06806ec4481d068117e72c1e06719c721f068077c42420068159808d1821066eef0c21058010b90022056f9908230571993024058010b00a25058010b00a260580608087402705803fc10027068032b42828068051b6322906800db11e212a8324808d361933803ff400192f826381a7141a2f8032ac08152a800db54c044e8323808d3630010002018158d84000042d821cea12002807853580d462002d01891181d022002e00"_hex_u8;
+/* 2023-04-20T22:25:49Z 99, 49100, 578622,*/
+static constexpr auto BENCH_EXAMPLE_06 = "bf3c87c14c008010955a01b21d85e07002800d946c036e8e3404b77f86c26605b33c85f55e06bd06879852078010970a08bd4b87cf00098123a7720ab2158687680b8054d4440b0a8062fa4c0c0a71ac400d0a80628081540e0a8010a2580f0a8054b676100a8032b85c110a6e9a40120a6e809012130a817f80c31e140a8175808674150a719d46160a8172d86415098033c1481609800da4181709800ada2e1809803dc85219098034b4041a096ef5501b098052d67c1c098051d3281d09800ebc4a1e098175808c641f098061c55020098078c85021096e8081141f0b6faf1e200b8061da68210b8062f000220b800ebc20230b8035d058240b8053de32250b8050b610250b6fad32260b803dc276270b803d80a610280b6ef812290b8052b6322a0b800eb57e2b0b8052bd062c0b719e522d0b71a3762e0b8010bb1e2f0b80109a78310a80109962320a8051a60c330a6f9f3e320b6e808b24330b719e40340b8117cc50350b803d80971a360b8051b930370b6f9e0a380b719b10390b8052a6003a0b6e808c76390a7195603a0a6f935c3b0a8054a31a3c0a803ce30c3b0b803fa3003c0b800dbe2a3d0b8f3480a84244058005851a44069d1bf824400b83098f284507719c723d4f6f9c1c3449719c722f4f6eb23c304f8061c5502e528061da682b4e8118bb724e022a8054b35028476e941c1d51815be02c4f01148557808e3a4f070e8104af464e001180329d364e010d805f9f6a421b9c3387aa744c0d4d71ac400b800881748098444710338173809b780b80008054d444292c12821dc040550403078b4682b4664517003f00"_hex_u8;
+/* 2023-06-05T19:56:12Z 52, 44896, 540514,*/
+static constexpr auto BENCH_EXAMPLE_07 = "b317998a4000b40098d53e01b45b99814802b7289b940003b3699a9d1204b6619a807a05814682cb78050571d854060571d8540705800e808d7a0805803480c06a09056e8189280a056ffd060b05800d80ea7a0c05803c80b80c0c03803e80d86e0d036ed2280e03811581804a0f036fd34e1003805380eb6811036e81f60e12038010ec101204805f80e83a13048033809534140471e00a15048010f95816046e81fa301704805180a74c1705800d808f1018056fd55c1905800e8091481a056e80a76e1b05805f80e2741c0571809b021c05826382c8401d0571df201e05800e809d2c1f05850083e87c1f05811580af68200571f20a21056ff9042205803e80df1e23056e81956c24056e9f542604805180e83829000e800e8080621325803380b0402a020d6ef8100e2c8c4889a96a2c000f803580ce4c2c000b6e9f54062a803480c96406260500"_hex_u8;
+/* 2023-12-05T23:48:44Z 69, 44283, 586734,*/
+static constexpr auto BENCH_EXAMPLE_08 = "83728ce80000b90befca1001806083b24002b40de6da3203b545e9c35c04b34beede3005b068e8883006d41c80b1e14c07b337e7841208b26beadb2e096e83892e090980518487380a096e82815c0a096e81ce3c0b097181db200c097181d4020d09810084ed600e096e96b0100f0971819a0210086e93da2e0f09803583ee5e1009803583c66c1109800d82bb6e1209800d81d56a1309803c82e622140971819f521509803d84a55c15057181d6161605806283ac5217056e949c5a18056e89e8641806815889e23419067181de321a066e8af2641a076e82a70a1b07803583f2081c076f81e76e1d076e81d33e1e07800d83b8761e086e82a5541f087181de302008805f84ad0021086e81c74022086e81bd3e23086e9288182408806184b3102409803283816025096e91ed662609830a88e70827096e81d14a27097181ce6028096e8cf03829097181883832016f81835c3103806181e0103203804180b8103204863584fe183304800de66434046e9e4c34056e81d6742f429213c0eb602e3d6483b06c283a6e81d73c263d6e82f9581831805485ab360e37805080c62609398b3189880838010603916db1f3583a03000110873199f8623c000000011100"_hex_u8;
+/* 2023-04-14T19:36:52Z 77, 20418, 501117,*/
+static constexpr auto BENCH_EXAMPLE_09 = "bf2989d00400815bca5c01af1e86f97602800d9d6c03800d8a3404b47988866e05b36287f92e0680109f68078010991a08805ecf1208076e80933e09078062d01c0a078054b6760b078053b6760c076f9c1c0d078054b6760e0771af260f0771b17e10078032f57011078035d56812078054e1581307886b83dc301407817480d13013068005a6001406803d80821a15066ef3201606800ea2181706800da628180671ab1219068054db0c1a06719b001b06815b80a11c1c068050b9301d066fac2a1e068033ab481f06719b1020068035ab721e07803dc2761f0771ae3c20078040f60e210771ce282207800ea4322307882a81a66024078035ad4625076efe7e26078162808e1827078118bb7228076eac7428088010bf58290871a04c2a0871bc722b086fa8382c08803d80a0142d088035d6282e088051c30c2f086efc623008800d9f6231086f986432088117bb7237028010a63034068010c84e2740800ea64c2237832c80933e1f3b830880c454390208813c80955c3905068032c73611348010a03c093c837a808a101b278050ac34093a8051ac34291b8f3b8187401d28881a82cb3a3a0a37977b86d20843000028996686a7083f030f8078d3761b27106e995a08499070839b5a1131000b00"_hex_u8;
+/* 2023-11-07T17:59:35Z 48, 4792, 498995,*/
+static constexpr auto BENCH_EXAMPLE_10 = "875f89aa1000b51ec09d7201c55cc7a72e02a11aa1fb3203b233a7f95204800ef56205b33ea9d13006803e80b26e07d90ec9dd4008b45eabbe6c09806080ca000a815984e8680a0a6f80925e0a0a803f80e1660c09937c94b7420d086e82f5640a086e80997e0b086f808d320c08800580a5640d086f8089100e08804080c9060f088115819a1c10086e82961a0f0a805f81bc0a100a6ff826110a6ef53e120a807584c60c110a6e818f32120a803c81c246130a805481d508140a8159838410150a7180a55c160a6f80821c170a6fe6101c066fe6101d06805080f854190a6e81b27c1a0a8155819c701e06805180ae0c21046e8b9a222501805180f53422001680f26880f8a62a220116803580da582007058153838e6e21000c800d80a712033a807681ae1c23000308834a82d36023020205815981e03a051a08001700"_hex_u8;
+
+static void LinearizeOptimallyExample00(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_00); }
+static void LinearizeOptimallyExample01(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_01); }
+static void LinearizeOptimallyExample02(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_02); }
+static void LinearizeOptimallyExample03(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_03); }
+static void LinearizeOptimallyExample04(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_04); }
+static void LinearizeOptimallyExample05(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_05); }
+static void LinearizeOptimallyExample06(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_06); }
+static void LinearizeOptimallyExample07(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_07); }
+static void LinearizeOptimallyExample08(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_08); }
+static void LinearizeOptimallyExample09(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_09); }
+static void LinearizeOptimallyExample10(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_10); }
+
 BENCHMARK(Linearize16TxWorstCase20Iters, benchmark::PriorityLevel::HIGH);
 BENCHMARK(Linearize16TxWorstCase120Iters, benchmark::PriorityLevel::HIGH);
 BENCHMARK(Linearize32TxWorstCase5000Iters, benchmark::PriorityLevel::HIGH);
@@ -290,3 +369,15 @@ BENCHMARK(MergeLinearizations48TxWorstCase, benchmark::PriorityLevel::HIGH);
 BENCHMARK(MergeLinearizations64TxWorstCase, benchmark::PriorityLevel::HIGH);
 BENCHMARK(MergeLinearizations75TxWorstCase, benchmark::PriorityLevel::HIGH);
 BENCHMARK(MergeLinearizations99TxWorstCase, benchmark::PriorityLevel::HIGH);
+
+BENCHMARK(LinearizeOptimallyExample00, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample01, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample02, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample03, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample04, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample05, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample06, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample07, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample08, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample09, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample10, benchmark::PriorityLevel::HIGH);
diff --git a/src/test/util/cluster_linearize.h b/src/test/util/cluster_linearize.h
index 5336d6015c1..b86ebcd78b2 100644
--- a/src/test/util/cluster_linearize.h
+++ b/src/test/util/cluster_linearize.h
@@ -102,7 +102,7 @@ bool IsAcyclic(const DepGraph<SetType>& depgraph) noexcept
 struct DepGraphFormatter
 {
     /** Convert x>=0 to 2x (even), x<0 to -2x-1 (odd). */
-    static uint64_t SignedToUnsigned(int64_t x) noexcept
+    [[maybe_unused]] static uint64_t SignedToUnsigned(int64_t x) noexcept
     {
         if (x < 0) {
             return 2 * uint64_t(-(x + 1)) + 1;
@@ -112,7 +112,7 @@ struct DepGraphFormatter
     }
 
     /** Convert even x to x/2 (>=0), odd x to -(x/2)-1 (<0). */
-    static int64_t UnsignedToSigned(uint64_t x) noexcept
+    [[maybe_unused]] static int64_t UnsignedToSigned(uint64_t x) noexcept
     {
         if (x & 1) {
             return -int64_t(x / 2) - 1;

From e20fda77a2da1da3deb273d83700a0b7139422ab Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Thu, 9 May 2024 14:30:05 -0400
Subject: [PATCH 07/10] clusterlin: reduce computation of unnecessary pot sets
 (optimization)

Keep track of which transactions in the graph have an individual
feerate that is better than the best included set so far. Others do not
need to be added to the pot set, as they cannot possibly help beating
best.
---
 src/cluster_linearize.h | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index 2da9e1ebcc2..4053f90eaca 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -668,7 +668,10 @@ public:
             SetType und;
             /** (Only when inc is not empty) The best feerate of any superset of inc that is also a
              *  subset of (inc | und), without requiring it to be topologically valid. It forms a
-             *  conservative upper bound on how good a set this work item can give rise to. */
+             *  conservative upper bound on how good a set this work item can give rise to.
+             *  Transactions whose feerate is below best's are ignored when determining this value,
+             *  which means it may technically be an underestimate, but if so, this work item
+             *  cannot result in something that beats best anyway. */
             FeeFrac pot_feerate;
 
             /** Construct a new work item. */
@@ -711,8 +714,16 @@ public:
         /** Local copy of the iteration limit. */
         uint64_t iterations_left = max_iterations;
 
+        /** The set of transactions in m_todo which have feerate > best's. */
+        SetType imp = m_todo;
+        while (imp.Any()) {
+            ClusterIndex check = imp.Last();
+            if (m_sorted_depgraph.FeeRate(check) >> best.feerate) break;
+            imp.Reset(check);
+        }
+
         /** Internal function to add an item to the queue of elements to explore if there are any
-         *  transactions left to split on, and to update best.
+         *  transactions left to split on, and to update best/imp.
          *
          * - inc: the "inc" value for the new work item (must be topological).
          * - und: the "und" value for the new work item ((inc | und) must be topological).
@@ -722,8 +733,11 @@ public:
              *  pot_feerate. It starts off equal to inc. */
             auto pot = inc;
             if (!inc.feerate.IsEmpty()) {
-                // Add entries to pot.
-                for (auto pos : und) {
+                // Add entries to pot. We iterate over all undecided transactions whose feerate is
+                // higher than best. While undecided transactions of lower feerate may improve pot,
+                // the resulting pot feerate cannot possibly exceed best's (and this item will be
+                // skipped in split_fn anyway).
+                for (auto pos : imp & und) {
                     // Determine if adding transaction pos to pot (ignoring topology) would improve
                     // it. If not, we're done updating pot. This relies on the fact that
                     // m_sorted_depgraph, and thus the transactions iterated over, are in decreasing
@@ -735,6 +749,12 @@ public:
                 // If inc's feerate is better than best's, remember it as our new best.
                 if (inc.feerate > best.feerate) {
                     best = inc;
+                    // See if we can remove any entries from imp now.
+                    while (imp.Any()) {
+                        ClusterIndex check = imp.Last();
+                        if (m_sorted_depgraph.FeeRate(check) >> best.feerate) break;
+                        imp.Reset(check);
+                    }
                 }
 
                 // If no potential transactions exist beyond the already included ones, no
@@ -774,6 +794,9 @@ public:
 
             const ClusterIndex first = elem.und.First();
             if (!elem.inc.feerate.IsEmpty()) {
+                // If no undecided transactions remain with feerate higher than best, this entry
+                // cannot be improved beyond best.
+                if (!elem.und.Overlaps(imp)) return;
                 // We can ignore any queue item whose potential feerate isn't better than the best
                 // seen so far.
                 if (elem.pot_feerate <= best.feerate) return;

From 71f26293988019d2035bcc55af7b440b494b56bc Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Thu, 9 May 2024 13:53:27 -0400
Subject: [PATCH 08/10] clusterlin: include topological pot subsets
 automatically (optimization)

Automatically add topologically-valid subsets of the potential set pot
to inc. It can be proven that these must be part of the best reachable
topologically-valid set from that work item.

This is a crucial optimization that (apparently) reduces the maximum
number of iterations from ~2^(N-1) to ~sqrt(2^N).

Co-Authored-By: Suhas Daftuar <sdaftuar@gmail.com>
---
 src/bench/cluster_linearize.cpp     | 21 ++++++++++++++++++---
 src/cluster_linearize.h             | 29 ++++++++++++++++++++++++++---
 src/test/fuzz/cluster_linearize.cpp | 13 +++++++++++++
 3 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/src/bench/cluster_linearize.cpp b/src/bench/cluster_linearize.cpp
index 4942b1a67e3..c5262ffe607 100644
--- a/src/bench/cluster_linearize.cpp
+++ b/src/bench/cluster_linearize.cpp
@@ -48,8 +48,8 @@ DepGraph<SetType> MakeWideGraph(ClusterIndex ntx)
     return depgraph;
 }
 
-// Construct a difficult graph. These need at least sqrt(2^(n-1)) iterations in the best
-// known algorithms (purely empirically determined).
+// Construct a difficult graph. These need at least sqrt(2^(n-1)) iterations in the implemented
+// algorithm (purely empirically determined).
 template<typename SetType>
 DepGraph<SetType> MakeHardGraph(ClusterIndex ntx)
 {
@@ -316,7 +316,14 @@ static constexpr auto BENCH_EXAMPLE_08 = "83728ce80000b90befca1001806083b24002b4
 static constexpr auto BENCH_EXAMPLE_09 = "bf2989d00400815bca5c01af1e86f97602800d9d6c03800d8a3404b47988866e05b36287f92e0680109f68078010991a08805ecf1208076e80933e09078062d01c0a078054b6760b078053b6760c076f9c1c0d078054b6760e0771af260f0771b17e10078032f57011078035d56812078054e1581307886b83dc301407817480d13013068005a6001406803d80821a15066ef3201606800ea2181706800da628180671ab1219068054db0c1a06719b001b06815b80a11c1c068050b9301d066fac2a1e068033ab481f06719b1020068035ab721e07803dc2761f0771ae3c20078040f60e210771ce282207800ea4322307882a81a66024078035ad4625076efe7e26078162808e1827078118bb7228076eac7428088010bf58290871a04c2a0871bc722b086fa8382c08803d80a0142d088035d6282e088051c30c2f086efc623008800d9f6231086f986432088117bb7237028010a63034068010c84e2740800ea64c2237832c80933e1f3b830880c454390208813c80955c3905068032c73611348010a03c093c837a808a101b278050ac34093a8051ac34291b8f3b8187401d28881a82cb3a3a0a37977b86d20843000028996686a7083f030f8078d3761b27106e995a08499070839b5a1131000b00"_hex_u8;
 /* 2023-11-07T17:59:35Z 48, 4792, 498995,*/
 static constexpr auto BENCH_EXAMPLE_10 = "875f89aa1000b51ec09d7201c55cc7a72e02a11aa1fb3203b233a7f95204800ef56205b33ea9d13006803e80b26e07d90ec9dd4008b45eabbe6c09806080ca000a815984e8680a0a6f80925e0a0a803f80e1660c09937c94b7420d086e82f5640a086e80997e0b086f808d320c08800580a5640d086f8089100e08804080c9060f088115819a1c10086e82961a0f0a805f81bc0a100a6ff826110a6ef53e120a807584c60c110a6e818f32120a803c81c246130a805481d508140a8159838410150a7180a55c160a6f80821c170a6fe6101c066fe6101d06805080f854190a6e81b27c1a0a8155819c701e06805180ae0c21046e8b9a222501805180f53422001680f26880f8a62a220116803580da582007058153838e6e21000c800d80a712033a807681ae1c23000308834a82d36023020205815981e03a051a08001700"_hex_u8;
-
+/* 2023-11-16T10:47:08Z 77, 473962, 486863,*/
+static constexpr auto BENCH_EXAMPLE_11 = "801980c06000801980c06001801980c06002801980c06003801980c06004801980c06005801980c06006801980c06007801980c06008801980c06009801980c0600a801980c0600b801980c0600c801980c0600d801980c0600e801980c0600f801980c060108019d12c11800f80b1601111800f80b1601111801080b1601111800f80b160100e800f80b160100f801980c060110f800f80b160140d801180b1601111801180b160100d801180b160120c801180b1600f10801180b1600f11801980c0601011800f80b160140e800f80b160110f801980c060170a801180b1601210801980c060140f800f80b1601311801980c0602005801180b1601f07800f80b1601b0c800fca7c1611812081f9601638812081f9601637812081fb001636801080b160142f801980c0600e2a801080b1600f2a801180b1600d25801980c0600e25800f80b1600d27801980c0600e27801980c0600d27801180b1600e26812080b1500c27812081f960201025812081f960200f27812081fc201d101c812081fc201d101d812081fc201d0f1f812081fc201d0f20812081f9601b1016800f80b1600a35800f80b1600a36800f80b1600e32801080b160122f812081f960280040812081fc20121d1b812081f960112713812081f960160d37812081fc20140d2b812081f960130d2d812081fc20130c2c812081fb001b0157812081fb001a0245812081fc20140030812081fc20092747812081fb000b152500"_hex_u8;
+/* 2023-10-06T20:44:09Z 40, 341438, 341438,*/
+static constexpr auto BENCH_EXAMPLE_12 = "80318f4c0080318f4c0180318f4c0280318f4c0380318f4c0480318f4c0580318f4c0680318f4c078033a57807078033a57807078033a57807078033a57807078033a57807078033a57807078033a57807078033a578070780318f4c0e0180318f4c0d0380318f4c0c0580318f4c0b078033a57803128033a57803128033a57803128033a578031280318f4c0412810b9c28140300810c9c281303028033a57802188033a57802188033a5780218810c9c280b01108033a578001c810c9c2807050f8033a578001b810c98040700158033a578001c810c98040301158033a5780019806ca1240101118033a578001300"_hex_u8;
+/* 2023-11-15T21:40:46Z 96, 23608, 138286,*/
+static constexpr auto BENCH_EXAMPLE_13 = "8060829f4000b157bab07a01b27cc2b16802b22fbce54603826480a95804803da81a05bc7bcac93806800de55207800daf0608805bc71809805bc7180a800d9d4a0b805bbc700c8152d7180d805bb9380e850a8886260f800d80d33410bf38d3d55011b41dc4eb6012bd70d2ce2e138d3596af7812137180cd501313805e81f7281413718092001513803d81f90016136e8b916c1713801081861a17106e80cd2a18106f80cc3c19106e80cf161911800d80fe781b107180d87c1c106e80fb081d10803e8286701d11800d81c4781f10804082a6002010801081912e21107180ff0021116e81da4a2310850b8b864023116e89db3224116e84ff7e2610897c95993427106f80bb1a240b803581c272250b8032828c10260b6e80d42a270b804082b35a280b800d80fe3e290b805cc0282312821d8697022b0b6e8add562c0b805281c8063007811883f1082313800d80fe3e24137180c9142513800d8380102613803382c00e2713805eb32228136e8494542913800e8186742913806082b74c2a1380528285782b13800d818f7a2c136e84a5562d1380508286702e136f80a46e3e04803f8191364102805481ad4c3d076e809a5a3e077180fe4032136e838b7233138c4790cf384106853584ab624206805b80932a4801806280966c48028168ef04400b7181bd524903806282db5c375b9316acbf703a599c68c5a454385c6e81d63e364a6f80ff64334e817485a6784f023171819536234e800d81826e1e498053829a12420018834c87cb14291d2e840e8bc94c1d2825800d81b7220368811783fe0e271f1f811783e758380f001ecd55809edf6e56000000003a815984ba76008010d54d80aebb4e2c22000000000000002c807682f150007a00"_hex_u8;
+/* 2023-12-06T09:18:20Z 93, 68130, 122830,*/
+static constexpr auto BENCH_EXAMPLE_14 = "b26beadb2e00800d80ca0a01d41c80b1e14c02b068e8883003800d81af1604b34beede30056e80b14006b151f5d46c07b93e8085b02608b30cf98b1009b14ef6b3040ab176f6ab480bb7078082b8640c800d81c6460d802c80a8080e802c80a8080f802c80a14210802ce50a11802cd722127181ce6012126e81d14a13126e9b8b00141282428dd42c15128051828408150e6e81bd3e150f805f84ad00160f7181de30170f6e81c740180f800d83b876190f6e82a5541a0f6e81d33e1a106e82a70a1b106f81e76e1c10803583f2081d106e82d9401e106e96e4441f107181de321e12815889e2341f127182d60c20126e979d4e21126e8282262410800d82972c25106f838a5822126f82842a23127182d24a2412803e84bc2a2512800d83c81a26126e84f8142712805085a22c27126e889e6a2812801083aa50281280348598102912801082d5522a126e85865c2b127182c7602b1282468c82042c126e84972c2d12805485d93a2d12801083c7322e12815386e1582f126e84fb0c30126f82eb6c3011813a85b47a3111803f869f5c3211805181ed30370d6e84bf0a3411804180e1383809815883aa183a08815a8392203e05807681f140380c6e9e4c4005805485ab363255805183856030406e82f9582c45805185c1001b4f82418df1001a4e803283c50e430026800d83a6201a4b836886be3044010b8b318988084c0101803183a6120776800d828a1e087682338ae050301c33873199f8624d010032813986bc663c1034800d83a5220a6f800d82be52048000805183e364084907800d83cc4a018005815987b41e1832000017884b9dce72035035803284c11e00800885769d9538192f0000000002001000"_hex_u8;
 static void LinearizeOptimallyExample00(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_00); }
 static void LinearizeOptimallyExample01(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_01); }
 static void LinearizeOptimallyExample02(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_02); }
@@ -328,6 +335,10 @@ static void LinearizeOptimallyExample07(benchmark::Bench& bench) { BenchLineariz
 static void LinearizeOptimallyExample08(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_08); }
 static void LinearizeOptimallyExample09(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_09); }
 static void LinearizeOptimallyExample10(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_10); }
+static void LinearizeOptimallyExample11(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_11); }
+static void LinearizeOptimallyExample12(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_12); }
+static void LinearizeOptimallyExample13(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_13); }
+static void LinearizeOptimallyExample14(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_14); }
 
 BENCHMARK(Linearize16TxWorstCase20Iters, benchmark::PriorityLevel::HIGH);
 BENCHMARK(Linearize16TxWorstCase120Iters, benchmark::PriorityLevel::HIGH);
@@ -381,3 +392,7 @@ BENCHMARK(LinearizeOptimallyExample07, benchmark::PriorityLevel::HIGH);
 BENCHMARK(LinearizeOptimallyExample08, benchmark::PriorityLevel::HIGH);
 BENCHMARK(LinearizeOptimallyExample09, benchmark::PriorityLevel::HIGH);
 BENCHMARK(LinearizeOptimallyExample10, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample11, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample12, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample13, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample14, benchmark::PriorityLevel::HIGH);
diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index 4053f90eaca..b421ba6b401 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -647,7 +647,7 @@ public:
      *                              be <= max_iterations. If strictly < max_iterations, the
      *                              returned subset is optimal.
      *
-     * Complexity: O(N * min(max_iterations, 2^N)) where N=depgraph.TxCount().
+     * Complexity: possibly O(N * min(max_iterations, sqrt(2^N))) where N=depgraph.TxCount().
      */
     std::pair<SetInfo<SetType>, uint64_t> FindCandidateSet(uint64_t max_iterations, SetInfo<SetType> best) noexcept
     {
@@ -723,7 +723,8 @@ public:
         }
 
         /** Internal function to add an item to the queue of elements to explore if there are any
-         *  transactions left to split on, and to update best/imp.
+         *  transactions left to split on, possibly improving it before doing so, and to update
+         *  best/imp.
          *
          * - inc: the "inc" value for the new work item (must be topological).
          * - und: the "und" value for the new work item ((inc | und) must be topological).
@@ -746,6 +747,28 @@ public:
                     pot.Set(m_sorted_depgraph, pos);
                 }
 
+                // The "jump ahead" optimization: whenever pot has a topologically-valid subset,
+                // that subset can be added to inc. Any subset of (pot - inc) has the property that
+                // its feerate exceeds that of any set compatible with this work item (superset of
+                // inc, subset of (inc | und)). Thus, if T is a topological subset of pot, and B is
+                // the best topologically-valid set compatible with this work item, and (T - B) is
+                // non-empty, then (T | B) is better than B and also topological. This is in
+                // contradiction with the assumption that B is best. Thus, (T - B) must be empty,
+                // i.e., T must be a subset of B.
+                //
+                // See https://delvingbitcoin.org/t/how-to-linearize-your-cluster/303 section 2.4.
+                const auto init_inc = inc.transactions;
+                for (auto pos : pot.transactions - inc.transactions) {
+                    // If the transaction's ancestors are a subset of pot, we can add it together
+                    // with its ancestors to inc. Just update the transactions here; the feerate
+                    // update happens below.
+                    auto anc_todo = m_sorted_depgraph.Ancestors(pos) & m_todo;
+                    if (anc_todo.IsSubsetOf(pot.transactions)) inc.transactions |= anc_todo;
+                }
+                // Finally update und and inc's feerate to account for the added transactions.
+                und -= inc.transactions;
+                inc.feerate += m_sorted_depgraph.FeeRate(inc.transactions - init_inc);
+
                 // If inc's feerate is better than best's, remember it as our new best.
                 if (inc.feerate > best.feerate) {
                     best = inc;
@@ -892,7 +915,7 @@ public:
  *                                - A boolean indicating whether the result is guaranteed to be
  *                                  optimal.
  *
- * Complexity: O(N * min(max_iterations + N, 2^N)) where N=depgraph.TxCount().
+ * Complexity: possibly O(N * min(max_iterations + N, sqrt(2^N))) where N=depgraph.TxCount().
  */
 template<typename SetType>
 std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& depgraph, uint64_t max_iterations, uint64_t rng_seed, Span<const ClusterIndex> old_linearization = {}) noexcept
diff --git a/src/test/fuzz/cluster_linearize.cpp b/src/test/fuzz/cluster_linearize.cpp
index 1eb3ff05542..b1e46c8685e 100644
--- a/src/test/fuzz/cluster_linearize.cpp
+++ b/src/test/fuzz/cluster_linearize.cpp
@@ -553,6 +553,12 @@ FUZZ_TARGET(clusterlin_search_finder)
         // longer connected after removing certain transactions, this holds, because the connected
         // components are searched separately.
         assert(iterations_done <= (uint64_t{1} << (todo.Count() - 1)));
+        // Additionally, test that no more than sqrt(2^N)+1 iterations are required. This is just
+        // an empirical bound that seems to hold, without proof. Still, add a test for it so we
+        // can learn about counterexamples if they exist.
+        if (iterations_done >= 1 && todo.Count() <= 63) {
+            Assume((iterations_done - 1) * (iterations_done - 1) <= uint64_t{1} << todo.Count());
+        }
 
         // Perform quality checks only if SearchCandidateFinder claims an optimal result.
         if (iterations_done < max_iterations) {
@@ -769,6 +775,13 @@ FUZZ_TARGET(clusterlin_linearize)
     if (n <= 19 && iter_count > (uint64_t{1} << n)) {
         assert(optimal);
     }
+    // Additionally, if the assumption of sqrt(2^k)+1 iterations per step holds, the maximum number
+    // of iterations is also bounded by (2 + sqrt(2)) * (sqrt(2^n) - 1) + n, which is less than
+    // (2 + sqrt(2)) * sqrt(2^n) + n. Subtracting n and squaring gives
+    // (6 + 4 * sqrt(2)) * 2^n < 12 * 2^n.
+    if (n <= 35 && iter_count > n && (iter_count - n) * (iter_count - n) >= uint64_t{12} << n) {
+        Assume(optimal);
+    }
 
     // If Linearize claims optimal result, run quality tests.
     if (optimal) {

From bd044356edb6c8978df600f33b0073f772ee047c Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Thu, 9 May 2024 13:45:38 -0400
Subject: [PATCH 09/10] clusterlin: improve heuristic to decide split
 transaction (optimization)

Empirically, this approach seems to be more efficient in common real-life
clusters, and does not change the worst case.

Co-Authored-By: Suhas Daftuar <sdaftuar@gmail.com>
---
 src/bench/cluster_linearize.cpp | 21 +++++++++++++++++++
 src/cluster_linearize.h         | 37 +++++++++++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/src/bench/cluster_linearize.cpp b/src/bench/cluster_linearize.cpp
index c5262ffe607..cf071dda2d1 100644
--- a/src/bench/cluster_linearize.cpp
+++ b/src/bench/cluster_linearize.cpp
@@ -324,6 +324,17 @@ static constexpr auto BENCH_EXAMPLE_12 = "80318f4c0080318f4c0180318f4c0280318f4c
 static constexpr auto BENCH_EXAMPLE_13 = "8060829f4000b157bab07a01b27cc2b16802b22fbce54603826480a95804803da81a05bc7bcac93806800de55207800daf0608805bc71809805bc7180a800d9d4a0b805bbc700c8152d7180d805bb9380e850a8886260f800d80d33410bf38d3d55011b41dc4eb6012bd70d2ce2e138d3596af7812137180cd501313805e81f7281413718092001513803d81f90016136e8b916c1713801081861a17106e80cd2a18106f80cc3c19106e80cf161911800d80fe781b107180d87c1c106e80fb081d10803e8286701d11800d81c4781f10804082a6002010801081912e21107180ff0021116e81da4a2310850b8b864023116e89db3224116e84ff7e2610897c95993427106f80bb1a240b803581c272250b8032828c10260b6e80d42a270b804082b35a280b800d80fe3e290b805cc0282312821d8697022b0b6e8add562c0b805281c8063007811883f1082313800d80fe3e24137180c9142513800d8380102613803382c00e2713805eb32228136e8494542913800e8186742913806082b74c2a1380528285782b13800d818f7a2c136e84a5562d1380508286702e136f80a46e3e04803f8191364102805481ad4c3d076e809a5a3e077180fe4032136e838b7233138c4790cf384106853584ab624206805b80932a4801806280966c48028168ef04400b7181bd524903806282db5c375b9316acbf703a599c68c5a454385c6e81d63e364a6f80ff64334e817485a6784f023171819536234e800d81826e1e498053829a12420018834c87cb14291d2e840e8bc94c1d2825800d81b7220368811783fe0e271f1f811783e758380f001ecd55809edf6e56000000003a815984ba76008010d54d80aebb4e2c22000000000000002c807682f150007a00"_hex_u8;
 /* 2023-12-06T09:18:20Z 93, 68130, 122830,*/
 static constexpr auto BENCH_EXAMPLE_14 = "b26beadb2e00800d80ca0a01d41c80b1e14c02b068e8883003800d81af1604b34beede30056e80b14006b151f5d46c07b93e8085b02608b30cf98b1009b14ef6b3040ab176f6ab480bb7078082b8640c800d81c6460d802c80a8080e802c80a8080f802c80a14210802ce50a11802cd722127181ce6012126e81d14a13126e9b8b00141282428dd42c15128051828408150e6e81bd3e150f805f84ad00160f7181de30170f6e81c740180f800d83b876190f6e82a5541a0f6e81d33e1a106e82a70a1b106f81e76e1c10803583f2081d106e82d9401e106e96e4441f107181de321e12815889e2341f127182d60c20126e979d4e21126e8282262410800d82972c25106f838a5822126f82842a23127182d24a2412803e84bc2a2512800d83c81a26126e84f8142712805085a22c27126e889e6a2812801083aa50281280348598102912801082d5522a126e85865c2b127182c7602b1282468c82042c126e84972c2d12805485d93a2d12801083c7322e12815386e1582f126e84fb0c30126f82eb6c3011813a85b47a3111803f869f5c3211805181ed30370d6e84bf0a3411804180e1383809815883aa183a08815a8392203e05807681f140380c6e9e4c4005805485ab363255805183856030406e82f9582c45805185c1001b4f82418df1001a4e803283c50e430026800d83a6201a4b836886be3044010b8b318988084c0101803183a6120776800d828a1e087682338ae050301c33873199f8624d010032813986bc663c1034800d83a5220a6f800d82be52048000805183e364084907800d83cc4a018005815987b41e1832000017884b9dce72035035803284c11e00800885769d9538192f0000000002001000"_hex_u8;
+/* 2023-12-14T02:02:29Z 55, 247754, 247754,*/
+static constexpr auto BENCH_EXAMPLE_15 = "801980c06000801980c06001801980c06002801980c06003801980c06004801980c06005801980c06006801980c06007801980c06008801980c06009801980c0600a801980c0600b801980c0600c801980c0600d801980c0600e801180b1600e0e801180b1600e0e801180b1600e0e801180b1600e0e801180b1600e0e801180b1600e0e801180b1600d07801180b1600f06801180b1600c0a801180b1600f08801180b1600c0c801180b1600c0d801180b1600c0e801180b160100b801180b1601309812081fc200e2a812081fc200e29812081fc200e28812081fc200e0e18812081fc200e0e17801980c060042e812081fc200e0d07812081fc200e0d08812081fc200e0c0a812081fc200e0d0a801980c060081e812081fc200f0c0c812081fc200f0c0d812081fc200f0c0e801180b160083a801180b1600426801980c0600b20801980c0600a22812081fc200f0b30801180b160022b801180b160022b812081fc20062422812081fc2006220b812081fc200c0a1e812081fc2012041a00"_hex_u8;
+/* 2023-12-14T15:17:20Z 76, 102600, 103935,*/
+static constexpr auto BENCH_EXAMPLE_16 = "801980c06000801980c06001801980c06002801980c06003801980c06004801180b1600404801180b1600404801180b1600404801980c0600504801980c0600802801980c0600803801180b1600704801980c0600804801280b1600804812081fc200810812081fc20080f812081fc20080e801180b160080c800f80b160080d801980c060090d801180b160090e801980c0600a0e812181fc200a0c801180b1600a0d812181fd400a0c801980c0600a1c801980c0600916801180b1600719801180b160061b801980c0600d15801980c0600717812081fc200718801980c0600716801180b160072d801180b1600722801180b1600525801980c060091b801980c060071e801080b160071f801280b160061d812081fc20063a812181f960160815801280b1600525801980c0600625801180b1600626801980c0600726801980c0600536801180b160032b801980c060042b801280b160032d801980c060033e801180b160043e812181fc20100c27801080b160042f801980c0600342801180b1600442812081fc20150d25800f80b1600245812081fd40120619812081fc20040243812081fc20120c2c812081fd40120a1d812181fb00100623812081fc20030347812081fc20072126801980c0600236812081fc20040d2b812081fc20120328801980c0600237801180b1600337812081fc20052230801180b1600239812081fc2008242c812081fd4005112d812081fb00070b32812081f96011034700"_hex_u8;
+/* 2023-12-15T07:12:29Z 98, 112693, 112730,*/
+static constexpr auto BENCH_EXAMPLE_17 = "801980c06000801980c06001801980c06002801980c06003801980c06004801980c06005801980c06006801180b1600606801180b1600606801180b1600606801180b1600606801280b1600606801180b1600606801180b1600606801980c0600d00801980c0600b03801980c0600b04801980c0600f01812081fc200a16812081fc200a15812081fc200a14812081fc200a13812081fd400a12812181fc200a11812181fc200a0f801180b1600a10801180b1600a10801980c0600a10801180b1600b10801180b1600b10801980c0600621801980c0600915801980c060041b801180b160051b801980c0600f12801980c0600f13801980c0600d15801980c0600c17801980c060072e800f80b160082e812181fc200d150e801980c0600922801180b1600923801980c0600823801180b1600623801180b1600a20801180b1600e1c801180b1600b20801180b1600b21801980c0600a3e800f80b1600b3e801980c0600931801180b1600a31812181fc20140325801180b1600a30801180b160054c801180b160043b801980c0600336812181fc200253812081f960090944812081fc2007003c801980c0600339801180b1600433801980c0600453801980c0600340801980c060033d801080b160043d812081f960070854801980c060045a801180b160055a801180b1600545801980c0600643801980c0600641801280b1600739801180b1600562812081fc20121f27812181fc20210137812181fc2016112f801980c0600259801980c0600156812181fc20053a31801180b160025c801180b1600257801980c0600357812081fc200d2d1e812181fc20102444812181fc20035a801180b160035b801980c0600751812181fc2007392a812181fc20025f801980c060045e801180b1600350812081fc20070f6f801180b1600263812181fc201b1322812181fc2011283b812081fc2002442100"_hex_u8;
+/* 2023-12-16T02:25:33Z 99, 112399, 112399,*/
+static constexpr auto BENCH_EXAMPLE_18 = "801980c06000801980c06001801980c06002801980c06003801980c06004801980c06005801980c06006801980c06007801180b16008801180b16009801180b1600a801180b1600a0a801180b1600a0a801180b1600a0a801180b1600a0a801980c0600d06801180b1600b09801980c0601005801180b1600c0a801980c0600d0a801980c0601106801180b1600e0a801980c0601207801980c0601207801180b160100a812081e668100a812081e668100a812081e668100a801980c0601407801980c0601606812081fc201226812081fc201225812081fc201224812081fc201223801180b1600e21801980c0600b1e801180b1600c1e801180b1601316801980c060091b801980c0601312801980c0600a1c801180b160190e801180b1601315801180b1600e1b801180b1601713801180b1600f1c801980c0600d34801980c0600d30801980c060102e801980c060122d801980c0600b2a801980c0600b2a801980c0600b2b801180b1601122801180b1600e26801180b1601025801180b1600f26812081fc20280032812081fc20270034812081fc20250034801180b1600d4b801980c0600d457a809a000d46801980c0601044801980c0600e46801180b1600f43801180b160123f801180b160123e801180b1601130801180b1601131801180b1601131812081fc20230a36801980c0600a5a801180b1600a5b801980c0600a5b801180b1600b5b801980c0600b5a801180b1600f57801180b1600d3f801980c0600669801980c0600568801980c0600466801180b1600945801180b1600649801180b1600945812081fc2018234b812081fc20142534812081fc20142532812081fc20142530801180b160074d801180b1600a4b801180b1600a4a812081fc20221662812081fc200c0472812081fc20072e42812081fc20062c23812081fc20100572812081fc200f036c812081fc2001345100"_hex_u8;
+/* 2023-03-31T19:24:02Z 78, 90393, 152832,*/
+static constexpr auto BENCH_EXAMPLE_19 = "800dd042008028b13c018028b13c028028b13c038029b13c048029b13c058029b13c0680299948078029b13c088029b13c09802899480a802899480b8028b13c0c80299e700d802899480e802999480f8029b13c10802999481180299948128028b13c138029b13c1480289e701580289948168028b13c1780289948188028994819802899481a802999481b802999481c802899481d802999481e8028b13c1f8029b13c20802999482180299948228028b13c2380298c242480289948258029b13c2680288c242780298c242880299e70298f5a80ea762a824780aa00292a82038090402429813fcf00152a8203809040142a813ff700112982038090402d002d813ff70028002c8203809040270024824780aa00270025820380904025002882038090401e022a82038090401d042782038090401c01298203809040190029813ff700170028813ff700140128807b9258120128841280f6402c01002e82038090402b00062b820380904027000031813ff70011192d82038090401d000129851981a9403a0000003b82038090400c182e813ff7000b0f2982038090401314141b807b925805192b84568190001121000334807bdd400149824780aa00001f2a813ff700003d0b8203809040050d1915807bdd4001498728828f400b010004050501000a050c851981a9400104050b061a0400"_hex_u8;
+
 static void LinearizeOptimallyExample00(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_00); }
 static void LinearizeOptimallyExample01(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_01); }
 static void LinearizeOptimallyExample02(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_02); }
@@ -339,6 +350,11 @@ static void LinearizeOptimallyExample11(benchmark::Bench& bench) { BenchLineariz
 static void LinearizeOptimallyExample12(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_12); }
 static void LinearizeOptimallyExample13(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_13); }
 static void LinearizeOptimallyExample14(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_14); }
+static void LinearizeOptimallyExample15(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_15); }
+static void LinearizeOptimallyExample16(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_16); }
+static void LinearizeOptimallyExample17(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_17); }
+static void LinearizeOptimallyExample18(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_18); }
+static void LinearizeOptimallyExample19(benchmark::Bench& bench) { BenchLinearizeOptimally(bench, BENCH_EXAMPLE_19); }
 
 BENCHMARK(Linearize16TxWorstCase20Iters, benchmark::PriorityLevel::HIGH);
 BENCHMARK(Linearize16TxWorstCase120Iters, benchmark::PriorityLevel::HIGH);
@@ -396,3 +412,8 @@ BENCHMARK(LinearizeOptimallyExample11, benchmark::PriorityLevel::HIGH);
 BENCHMARK(LinearizeOptimallyExample12, benchmark::PriorityLevel::HIGH);
 BENCHMARK(LinearizeOptimallyExample13, benchmark::PriorityLevel::HIGH);
 BENCHMARK(LinearizeOptimallyExample14, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample15, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample16, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample17, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample18, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeOptimallyExample19, benchmark::PriorityLevel::HIGH);
diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index b421ba6b401..e3c79ab3ffb 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -828,8 +828,41 @@ public:
                 if (m_sorted_depgraph.FeeRate(first) <= best.feerate) return;
             }
 
-            // Pick the first undecided transaction as the one to split on.
-            const ClusterIndex split = first;
+            // Decide which transaction to split on. Splitting is how new work items are added, and
+            // how progress is made. One split transaction is chosen among the queue item's
+            // undecided ones, and:
+            // - A work item is (potentially) added with that transaction plus its remaining
+            //   descendants excluded (removed from the und set).
+            // - A work item is (potentially) added with that transaction plus its remaining
+            //   ancestors included (added to the inc set).
+            //
+            // To decide what to split on, consider the undecided ancestors of the highest
+            // individual feerate undecided transaction. Pick the one which reduces the search space
+            // most. Let I(t) be the size of the undecided set after including t, and E(t) the size
+            // of the undecided set after excluding t. Then choose the split transaction t such
+            // that 2^I(t) + 2^E(t) is minimal, tie-breaking by highest individual feerate for t.
+            ClusterIndex split = 0;
+            const auto select = elem.und & m_sorted_depgraph.Ancestors(first);
+            Assume(select.Any());
+            std::optional<std::pair<ClusterIndex, ClusterIndex>> split_counts;
+            for (auto t : select) {
+                // Call max = max(I(t), E(t)) and min = min(I(t), E(t)). Let counts = {max,min}.
+                // Sorting by the tuple counts is equivalent to sorting by 2^I(t) + 2^E(t). This
+                // expression is equal to 2^max + 2^min = 2^max * (1 + 1/2^(max - min)). The second
+                // factor (1 + 1/2^(max - min)) there is in (1,2]. Thus increasing max will always
+                // increase it, even when min decreases. Because of this, we can first sort by max.
+                std::pair<ClusterIndex, ClusterIndex> counts{
+                    (elem.und - m_sorted_depgraph.Ancestors(t)).Count(),
+                    (elem.und - m_sorted_depgraph.Descendants(t)).Count()};
+                if (counts.first < counts.second) std::swap(counts.first, counts.second);
+                // Remember the t with the lowest counts.
+                if (!split_counts.has_value() || counts < *split_counts) {
+                    split = t;
+                    split_counts = counts;
+                }
+            }
+            // Since there was at least one transaction in select, we must always find one.
+            Assume(split_counts.has_value());
 
             // Add a work item corresponding to exclusion of the split transaction.
             const auto& desc = m_sorted_depgraph.Descendants(split);

From 9ad2fe7e69e9e69949ebbb280a15756dc3301f09 Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Thu, 25 Jul 2024 00:13:11 -0400
Subject: [PATCH 10/10] clusterlin: only start/use search when enough
 iterations left

---
 src/cluster_linearize.h             | 40 ++++++++++++++++++++++++-----
 src/test/fuzz/cluster_linearize.cpp | 19 ++++++++++----
 2 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index e3c79ab3ffb..e964849f228 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -536,6 +536,12 @@ public:
         return m_todo.None();
     }
 
+    /** Count the number of remaining unlinearized transactions. */
+    ClusterIndex NumRemaining() const noexcept
+    {
+        return m_todo.Count();
+    }
+
     /** Find the best (highest-feerate, smallest among those in case of a tie) ancestor set
      *  among the remaining transactions. Requires !AllDone().
      *
@@ -960,10 +966,20 @@ std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& de
     std::vector<ClusterIndex> linearization;
 
     AncestorCandidateFinder anc_finder(depgraph);
-    SearchCandidateFinder src_finder(depgraph, rng_seed);
+    std::optional<SearchCandidateFinder<SetType>> src_finder;
     linearization.reserve(depgraph.TxCount());
     bool optimal = true;
 
+    // Treat the initialization of SearchCandidateFinder as taking N^2/64 (rounded up) iterations
+    // (largely due to the cost of constructing the internal sorted-by-feerate DepGraph inside
+    // SearchCandidateFinder), a rough approximation based on benchmarks. If we don't have that
+    // many, don't start it.
+    uint64_t start_iterations = (uint64_t{depgraph.TxCount()} * depgraph.TxCount() + 63) / 64;
+    if (iterations_left > start_iterations) {
+        iterations_left -= start_iterations;
+        src_finder.emplace(depgraph, rng_seed);
+    }
+
     /** Chunking of what remains of the old linearization. */
     LinearizationChunking old_chunking(depgraph, old_linearization);
 
@@ -976,12 +992,22 @@ std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& de
         auto best = anc_finder.FindCandidateSet();
         if (!best_prefix.feerate.IsEmpty() && best_prefix.feerate >= best.feerate) best = best_prefix;
 
-        // Invoke bounded search to update best, with up to half of our remaining iterations as
-        // limit.
-        uint64_t max_iterations_now = (iterations_left + 1) / 2;
         uint64_t iterations_done_now = 0;
-        std::tie(best, iterations_done_now) = src_finder.FindCandidateSet(max_iterations_now, best);
-        iterations_left -= iterations_done_now;
+        uint64_t max_iterations_now = 0;
+        if (src_finder) {
+            // Treat the invocation of SearchCandidateFinder::FindCandidateSet() as costing N/4
+            // up-front (rounded up) iterations (largely due to the cost of connected-component
+            // splitting), a rough approximation based on benchmarks.
+            uint64_t base_iterations = (anc_finder.NumRemaining() + 3) / 4;
+            if (iterations_left > base_iterations) {
+                // Invoke bounded search to update best, with up to half of our remaining
+                // iterations as limit.
+                iterations_left -= base_iterations;
+                max_iterations_now = (iterations_left + 1) / 2;
+                std::tie(best, iterations_done_now) = src_finder->FindCandidateSet(max_iterations_now, best);
+                iterations_left -= iterations_done_now;
+            }
+        }
 
         if (iterations_done_now == max_iterations_now) {
             optimal = false;
@@ -999,7 +1025,7 @@ std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& de
         // Update state to reflect best is no longer to be linearized.
         anc_finder.MarkDone(best.transactions);
         if (anc_finder.AllDone()) break;
-        src_finder.MarkDone(best.transactions);
+        if (src_finder) src_finder->MarkDone(best.transactions);
         if (old_chunking.NumChunksLeft() > 0) {
             old_chunking.MarkDone(best.transactions);
         }
diff --git a/src/test/fuzz/cluster_linearize.cpp b/src/test/fuzz/cluster_linearize.cpp
index b1e46c8685e..d91f85d867b 100644
--- a/src/test/fuzz/cluster_linearize.cpp
+++ b/src/test/fuzz/cluster_linearize.cpp
@@ -458,6 +458,7 @@ FUZZ_TARGET(clusterlin_ancestor_finder)
     while (todo.Any()) {
         // Call the ancestor finder's FindCandidateSet for what remains of the graph.
         assert(!anc_finder.AllDone());
+        assert(todo.Count() == anc_finder.NumRemaining());
         auto best_anc = anc_finder.FindCandidateSet();
         // Sanity check the result.
         assert(best_anc.transactions.Any());
@@ -489,6 +490,7 @@ FUZZ_TARGET(clusterlin_ancestor_finder)
         anc_finder.MarkDone(del_set);
     }
     assert(anc_finder.AllDone());
+    assert(anc_finder.NumRemaining() == 0);
 }
 
 static constexpr auto MAX_SIMPLE_ITERATIONS = 300000;
@@ -523,6 +525,7 @@ FUZZ_TARGET(clusterlin_search_finder)
         assert(!smp_finder.AllDone());
         assert(!exh_finder.AllDone());
         assert(!anc_finder.AllDone());
+        assert(anc_finder.NumRemaining() == todo.Count());
 
         // For each iteration, read an iteration count limit from the fuzz input.
         uint64_t max_iterations = 1;
@@ -605,6 +608,7 @@ FUZZ_TARGET(clusterlin_search_finder)
     assert(smp_finder.AllDone());
     assert(exh_finder.AllDone());
     assert(anc_finder.AllDone());
+    assert(anc_finder.NumRemaining() == 0);
 }
 
 FUZZ_TARGET(clusterlin_linearization_chunking)
@@ -775,11 +779,16 @@ FUZZ_TARGET(clusterlin_linearize)
     if (n <= 19 && iter_count > (uint64_t{1} << n)) {
         assert(optimal);
     }
-    // Additionally, if the assumption of sqrt(2^k)+1 iterations per step holds, the maximum number
-    // of iterations is also bounded by (2 + sqrt(2)) * (sqrt(2^n) - 1) + n, which is less than
-    // (2 + sqrt(2)) * sqrt(2^n) + n. Subtracting n and squaring gives
-    // (6 + 4 * sqrt(2)) * 2^n < 12 * 2^n.
-    if (n <= 35 && iter_count > n && (iter_count - n) * (iter_count - n) >= uint64_t{12} << n) {
+    // Additionally, if the assumption of sqrt(2^k)+1 iterations per step holds, plus ceil(k/4)
+    // start-up cost per step, plus ceil(n^2/64) start-up cost overall, we can compute the upper
+    // bound for a whole linearization (summing for k=1..n) using the Python expression
+    // [sum((k+3)//4 + int(math.sqrt(2**k)) + 1 for k in range(1, n + 1)) + (n**2 + 63) // 64 for n in range(0, 35)]:
+    static constexpr uint64_t MAX_OPTIMAL_ITERS[] = {
+        0, 4, 8, 12, 18, 26, 37, 51, 70, 97, 133, 182, 251, 346, 480, 666, 927, 1296, 1815, 2545,
+        3576, 5031, 7087, 9991, 14094, 19895, 28096, 39690, 56083, 79263, 112041, 158391, 223936,
+        316629, 447712
+    };
+    if (n < std::size(MAX_OPTIMAL_ITERS) && iter_count >= MAX_OPTIMAL_ITERS[n]) {
         Assume(optimal);
     }