From a6e07e769a1af652a14e533f6d3558ccdefb1de5 Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Mon, 29 Jan 2024 22:05:53 -0500
Subject: [PATCH 01/12] clusterlin: introduce cluster_linearize.h with Cluster
 and DepGraph types

This primarily adds the DepGraph class, which encapsulates precomputed
ancestor/descendant information for a given transaction cluster, with a
number of utility features (inspectors for set feerates, computing
reduced parents/children, adding transactions, adding dependencies), which
will become needed in future commits.
---
 src/Makefile.am         |   1 +
 src/cluster_linearize.h | 171 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 172 insertions(+)
 create mode 100644 src/cluster_linearize.h
diff --git a/src/Makefile.am b/src/Makefile.am
index 72dd942c401..36de5dd1509 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -132,6 +132,7 @@ BITCOIN_CORE_H = \
   chainparamsseeds.h \
   checkqueue.h \
   clientversion.h \
+  cluster_linearize.h \
   coins.h \
   common/args.h \
   common/bloom.h \
diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
new file mode 100644
index 00000000000..2e230bcd638
--- /dev/null
+++ b/src/cluster_linearize.h
@@ -0,0 +1,171 @@
+// Copyright (c) The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#ifndef BITCOIN_CLUSTER_LINEARIZE_H
+#define BITCOIN_CLUSTER_LINEARIZE_H
+
+#include <stdint.h>
+#include <vector>
+#include <utility>
+
+#include <util/feefrac.h>
+
+namespace cluster_linearize {
+
+/** Data type to represent cluster input.
+ *
+ * cluster[i].first is tx_i's fee and size.
+ * cluster[i].second[j] is true iff tx_i spends one or more of tx_j's outputs.
+ */
+template<typename SetType>
+using Cluster = std::vector<std::pair<FeeFrac, SetType>>;
+
+/** Data type to represent transaction indices in clusters. */
+using ClusterIndex = uint32_t;
+
+/** Data structure that holds a transaction graph's preprocessed data (fee, size, ancestors,
+ *  descendants). */
+template<typename SetType>
+class DepGraph
+{
+    /** Information about a single transaction. */
+    struct Entry
+    {
+        /** Fee and size of transaction itself. */
+        FeeFrac feerate;
+        /** All ancestors of the transaction (including itself). */
+        SetType ancestors;
+        /** All descendants of the transaction (including itself). */
+        SetType descendants;
+
+        /** Equality operator (primarily for testing purposes). */
+        friend bool operator==(const Entry&, const Entry&) noexcept = default;
+
+        /** Construct an empty entry. */
+        Entry() noexcept = default;
+        /** Construct an entry with a given feerate, ancestor set, descendant set. */
+        Entry(const FeeFrac& f, const SetType& a, const SetType& d) noexcept : feerate(f), ancestors(a), descendants(d) {}
+    };
+
+    /** Data for each transaction, in the same order as the Cluster it was constructed from. */
+    std::vector<Entry> entries;
+
+public:
+    /** Equality operator (primarily for testing purposes). */
+    friend bool operator==(const DepGraph&, const DepGraph&) noexcept = default;
+
+    // Defaulted constructors and assignment operators.
+    DepGraph() noexcept = default;
+    DepGraph(const DepGraph&) noexcept = default;
+    DepGraph(DepGraph&&) noexcept = default;
+    DepGraph& operator=(const DepGraph&) noexcept = default;
+    DepGraph& operator=(DepGraph&&) noexcept = default;
+
+    /** Construct a DepGraph object for ntx transactions, with no dependencies.
+     *
+     * Complexity: O(N) where N=ntx.
+     **/
+    explicit DepGraph(ClusterIndex ntx) noexcept
+    {
+        Assume(ntx <= SetType::Size());
+        entries.resize(ntx);
+        for (ClusterIndex i = 0; i < ntx; ++i) {
+            entries[i].ancestors = SetType::Singleton(i);
+            entries[i].descendants = SetType::Singleton(i);
+        }
+    }
+
+    /** Construct a DepGraph object given a cluster.
+     *
+     * Complexity: O(N^2) where N=cluster.size().
+     */
+    explicit DepGraph(const Cluster<SetType>& cluster) noexcept : entries(cluster.size())
+    {
+        for (ClusterIndex i = 0; i < cluster.size(); ++i) {
+            // Fill in fee and size.
+            entries[i].feerate = cluster[i].first;
+            // Fill in direct parents as ancestors.
+            entries[i].ancestors = cluster[i].second;
+            // Make sure transactions are ancestors of themselves.
+            entries[i].ancestors.Set(i);
+        }
+
+        // Propagate ancestor information.
+        for (ClusterIndex i = 0; i < entries.size(); ++i) {
+            // At this point, entries[a].ancestors[b] is true iff b is an ancestor of a and there
+            // is a path from a to b through the subgraph consisting of {a, b} union
+            // {0, 1, ..., (i-1)}.
+            SetType to_merge = entries[i].ancestors;
+            for (ClusterIndex j = 0; j < entries.size(); ++j) {
+                if (entries[j].ancestors[i]) {
+                    entries[j].ancestors |= to_merge;
+                }
+            }
+        }
+
+        // Fill in descendant information by transposing the ancestor information.
+        for (ClusterIndex i = 0; i < entries.size(); ++i) {
+            for (auto j : entries[i].ancestors) {
+                entries[j].descendants.Set(i);
+            }
+        }
+    }
+
+    /** Get the number of transactions in the graph. Complexity: O(1). */
+    auto TxCount() const noexcept { return entries.size(); }
+    /** Get the feerate of a given transaction i. Complexity: O(1). */
+    const FeeFrac& FeeRate(ClusterIndex i) const noexcept { return entries[i].feerate; }
+    /** Get the ancestors of a given transaction i. Complexity: O(1). */
+    const SetType& Ancestors(ClusterIndex i) const noexcept { return entries[i].ancestors; }
+    /** Get the descendants of a given transaction i. Complexity: O(1). */
+    const SetType& Descendants(ClusterIndex i) const noexcept { return entries[i].descendants; }
+
+    /** Add a new unconnected transaction to this transaction graph (at the end), and return its
+     *  ClusterIndex.
+     *
+     * Complexity: O(1) (amortized, due to resizing of backing vector).
+     */
+    ClusterIndex AddTransaction(const FeeFrac& feefrac) noexcept
+    {
+        Assume(TxCount() < SetType::Size());
+        ClusterIndex new_idx = TxCount();
+        entries.emplace_back(feefrac, SetType::Singleton(new_idx), SetType::Singleton(new_idx));
+        return new_idx;
+    }
+
+    /** Modify this transaction graph, adding a dependency between a specified parent and child.
+     *
+     * Complexity: O(N) where N=TxCount().
+     **/
+    void AddDependency(ClusterIndex parent, ClusterIndex child) noexcept
+    {
+        // Bail out if dependency is already implied.
+        if (entries[child].ancestors[parent]) return;
+        // To each ancestor of the parent, add as descendants the descendants of the child.
+        const auto& chl_des = entries[child].descendants;
+        for (auto anc_of_par : Ancestors(parent)) {
+            entries[anc_of_par].descendants |= chl_des;
+        }
+        // To each descendant of the child, add as ancestors the ancestors of the parent.
+        const auto& par_anc = entries[parent].ancestors;
+        for (auto dec_of_chl : Descendants(child)) {
+            entries[dec_of_chl].ancestors |= par_anc;
+        }
+    }
+
+    /** Compute the aggregate feerate of a set of nodes in this graph.
+     *
+     * Complexity: O(N) where N=elems.Count().
+     **/
+    FeeFrac FeeRate(const SetType& elems) const noexcept
+    {
+        FeeFrac ret;
+        for (auto pos : elems) ret += entries[pos].feerate;
+        return ret;
+    }
+};
+
+} // namespace cluster_linearize
+
+#endif // BITCOIN_CLUSTER_LINEARIZE_H

From 58f7e01db4bad6d958d44f2bcdfd9df9e22931a4 Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Wed, 8 May 2024 20:52:56 -0400
Subject: [PATCH 02/12] tests: framework for testing DepGraph class

This introduces a bespoke fuzzing-focused serialization format for DepGraphs,
tests that this format can represent any acyclic graph and that it roundtrips,
and then uses it to test the correctness of DepGraph itself.

This forms the basis for future fuzz tests that need to work with interesting
graphs.
---
 src/Makefile.test.include            |   2 +
 src/Makefile.test_util.include       |   1 +
 src/test/cluster_linearize_tests.cpp | 138 +++++++++++
 src/test/fuzz/cluster_linearize.cpp  |  87 +++++++
 src/test/util/cluster_linearize.h    | 336 +++++++++++++++++++++++++++
 5 files changed, 564 insertions(+)
 create mode 100644 src/test/cluster_linearize_tests.cpp
 create mode 100644 src/test/fuzz/cluster_linearize.cpp
 create mode 100644 src/test/util/cluster_linearize.h

diff --git a/src/Makefile.test.include b/src/Makefile.test.include
index 0993a65efff..3d044983699 100644
--- a/src/Makefile.test.include
+++ b/src/Makefile.test.include
@@ -83,6 +83,7 @@ BITCOIN_TESTS =\
   test/bloom_tests.cpp \
   test/bswap_tests.cpp \
   test/checkqueue_tests.cpp \
+  test/cluster_linearize_tests.cpp \
   test/coins_tests.cpp \
   test/coinstatsindex_tests.cpp \
   test/common_url_tests.cpp \
@@ -302,6 +303,7 @@ test_fuzz_fuzz_SOURCES = \
  test/fuzz/buffered_file.cpp \
  test/fuzz/chain.cpp \
  test/fuzz/checkqueue.cpp \
+ test/fuzz/cluster_linearize.cpp \
  test/fuzz/coins_view.cpp \
  test/fuzz/coinscache_sim.cpp \
  test/fuzz/connman.cpp \
diff --git a/src/Makefile.test_util.include b/src/Makefile.test_util.include
index 960eb078c8a..0c0e849fba3 100644
--- a/src/Makefile.test_util.include
+++ b/src/Makefile.test_util.include
@@ -10,6 +10,7 @@ EXTRA_LIBRARIES += \
 TEST_UTIL_H = \
   test/util/blockfilter.h \
   test/util/chainstate.h \
+  test/util/cluster_linearize.h \
   test/util/coins.h \
   test/util/index.h \
   test/util/json.h \
diff --git a/src/test/cluster_linearize_tests.cpp b/src/test/cluster_linearize_tests.cpp
new file mode 100644
index 00000000000..d15e783ea15
--- /dev/null
+++ b/src/test/cluster_linearize_tests.cpp
@@ -0,0 +1,138 @@
+// Copyright (c) The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#include <cluster_linearize.h>
+#include <test/util/cluster_linearize.h>
+#include <test/util/setup_common.h>
+#include <util/bitset.h>
+#include <util/strencodings.h>
+
+#include <vector>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_FIXTURE_TEST_SUITE(cluster_linearize_tests, BasicTestingSetup)
+
+using namespace cluster_linearize;
+
+namespace {
+
+template<typename SetType>
+void TestDepGraphSerialization(const Cluster<SetType>& cluster, const std::string& hexenc)
+{
+    DepGraph depgraph(cluster);
+
+    // Run normal sanity and correspondence checks, which includes a round-trip test.
+    VerifyDepGraphFromCluster(cluster, depgraph);
+
+    // There may be multiple serializations of the same graph, but DepGraphFormatter's serializer
+    // only produces one of those. Verify that hexenc matches that canonical serialization.
+    std::vector<unsigned char> encoding;
+    VectorWriter writer(encoding, 0);
+    writer << Using<DepGraphFormatter>(depgraph);
+    BOOST_CHECK_EQUAL(HexStr(encoding), hexenc);
+
+    // Test that deserializing that encoding yields depgraph. This is effectively already implied
+    // by the round-trip test above (if depgraph is acyclic), but verify it explicitly again here.
+    SpanReader reader(encoding);
+    DepGraph<SetType> depgraph_read;
+    reader >> Using<DepGraphFormatter>(depgraph_read);
+    BOOST_CHECK(depgraph == depgraph_read);
+}
+
+} // namespace
+
+BOOST_AUTO_TEST_CASE(depgraph_ser_tests)
+{
+    // Empty cluster.
+    TestDepGraphSerialization<TestBitSet>(
+        {},
+        "00" /* end of graph */);
+
+    // Transactions: A(fee=0,size=1).
+    TestDepGraphSerialization<TestBitSet>(
+        {{{0, 1}, {}}},
+        "01" /* A size */
+        "00" /* A fee */
+        "00" /* A insertion position (no skips): A */
+        "00" /* end of graph */);
+
+    // Transactions: A(fee=42,size=11), B(fee=-13,size=7), B depends on A.
+    TestDepGraphSerialization<TestBitSet>(
+        {{{42, 11}, {}}, {{-13, 7}, {0}}},
+        "0b" /* A size */
+        "54" /* A fee */
+        "00" /* A insertion position (no skips): A */
+        "07" /* B size */
+        "19" /* B fee */
+        "00" /* B->A dependency (no skips) */
+        "00" /* B insertion position (no skips): A,B */
+        "00" /* end of graph */);
+
+    // Transactions: A(64,128), B(128,256), C(1,1), C depends on A and B.
+    TestDepGraphSerialization<TestBitSet>(
+        {{{64, 128}, {}}, {{128, 256}, {}}, {{1, 1}, {0, 1}}},
+        "8000" /* A size */
+        "8000" /* A fee */
+        "00"   /* A insertion position (no skips): A */
+        "8100" /* B size */
+        "8100" /* B fee */
+        "01"   /* B insertion position (skip B->A dependency): A,B */
+        "01"   /* C size */
+        "02"   /* C fee */
+        "00"   /* C->B dependency (no skips) */
+        "00"   /* C->A dependency (no skips) */
+        "00"   /* C insertion position (no skips): A,B,C */
+        "00"   /* end of graph */);
+
+    // Transactions: A(-57,113), B(57,114), C(-58,115), D(58,116). Deps: B->A, C->A, D->C, in order
+    // [B,A,C,D]. This exercises non-topological ordering (internally serialized as A,B,C,D).
+    TestDepGraphSerialization<TestBitSet>(
+        {{{57, 114}, {1}}, {{-57, 113}, {}}, {{-58, 115}, {1}}, {{58, 116}, {2}}},
+        "71" /* A size */
+        "71" /* A fee */
+        "00" /* A insertion position (no skips): A */
+        "72" /* B size */
+        "72" /* B fee */
+        "00" /* B->A dependency (no skips) */
+        "01" /* B insertion position (skip A): B,A */
+        "73" /* C size */
+        "73" /* C fee */
+        "01" /* C->A dependency (skip C->B dependency) */
+        "00" /* C insertion position (no skips): B,A,C */
+        "74" /* D size */
+        "74" /* D fee */
+        "00" /* D->C dependency (no skips) */
+        "01" /* D insertion position (skip D->B dependency, D->A is implied): B,A,C,D */
+        "00" /* end of graph */);
+
+    // Transactions: A(1,2), B(3,1), C(2,1), D(1,3), E(1,1). Deps: C->A, D->A, D->B, E->D.
+    // In order: [D,A,B,E,C]. Internally serialized in order A,B,C,D,E.
+    TestDepGraphSerialization<TestBitSet>(
+        {{{1, 3}, {1, 2}}, {{1, 2}, {}}, {{3, 1}, {}}, {{1, 1}, {0}}, {{2, 1}, {1}}},
+        "02" /* A size */
+        "02" /* A fee */
+        "00" /* A insertion position (no skips): A */
+        "01" /* B size */
+        "06" /* B fee */
+        "01" /* B insertion position (skip B->A dependency): A,B */
+        "01" /* C size */
+        "04" /* C fee */
+        "01" /* C->A dependency (skip C->B dependency) */
+        "00" /* C insertion position (no skips): A,B,C */
+        "03" /* D size */
+        "02" /* D fee */
+        "01" /* D->B dependency (skip D->C dependency) */
+        "00" /* D->A dependency (no skips) */
+        "03" /* D insertion position (skip C,B,A): D,A,B,C */
+        "01" /* E size */
+        "02" /* E fee */
+        "00" /* E->D dependency (no skips) */
+        "02" /* E insertion position (skip E->C dependency, E->B and E->A are implied,
+                skip insertion C): D,A,B,E,C */
+        "00" /* end of graph */
+    );
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/test/fuzz/cluster_linearize.cpp b/src/test/fuzz/cluster_linearize.cpp
new file mode 100644
index 00000000000..fd75fd5b081
--- /dev/null
+++ b/src/test/fuzz/cluster_linearize.cpp
@@ -0,0 +1,87 @@
+// Copyright (c) The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#include <cluster_linearize.h>
+#include <serialize.h>
+#include <streams.h>
+#include <test/fuzz/fuzz.h>
+#include <test/fuzz/FuzzedDataProvider.h>
+#include <test/util/cluster_linearize.h>
+#include <util/bitset.h>
+#include <util/feefrac.h>
+
+#include <stdint.h>
+#include <vector>
+#include <utility>
+
+FUZZ_TARGET(clusterlin_add_dependency)
+{
+    // Verify that computing a DepGraph from a cluster and building it step by step using
+    // AddDependency have the same effect.
+
+    // Construct a cluster of a certain length, with no dependencies.
+    FuzzedDataProvider provider(buffer.data(), buffer.size());
+    auto num_tx = provider.ConsumeIntegralInRange<ClusterIndex>(2, 32);
+    Cluster<TestBitSet> cluster(num_tx, std::pair{FeeFrac{0, 1}, TestBitSet{}});
+    // Construct the corresponding DepGraph object (also no dependencies).
+    DepGraph depgraph(cluster);
+    SanityCheck(depgraph);
+    // Read (parent, child) pairs, and add them to the cluster and depgraph.
+    LIMITED_WHILE(provider.remaining_bytes() > 0, TestBitSet::Size() * TestBitSet::Size()) {
+        auto parent = provider.ConsumeIntegralInRange<ClusterIndex>(0, num_tx - 1);
+        auto child = provider.ConsumeIntegralInRange<ClusterIndex>(0, num_tx - 2);
+        child += (child >= parent);
+        cluster[child].second.Set(parent);
+        depgraph.AddDependency(parent, child);
+        assert(depgraph.Ancestors(child)[parent]);
+        assert(depgraph.Descendants(parent)[child]);
+    }
+    // Sanity check the result.
+    SanityCheck(depgraph);
+    // Verify that the resulting DepGraph matches one recomputed from the cluster.
+    assert(DepGraph(cluster) == depgraph);
+}
+
+FUZZ_TARGET(clusterlin_cluster_serialization)
+{
+    // Verify that any graph of transactions has its ancestry correctly computed by DepGraph, and
+    // if it is a DAG, that it can be serialized as a DepGraph in a way that roundtrips. This
+    // guarantees that any acyclic cluster has a corresponding DepGraph serialization.
+
+    FuzzedDataProvider provider(buffer.data(), buffer.size());
+
+    // Construct a cluster in a naive way (using a FuzzedDataProvider-based serialization).
+    Cluster<TestBitSet> cluster;
+    auto num_tx = provider.ConsumeIntegralInRange<ClusterIndex>(1, 32);
+    cluster.resize(num_tx);
+    for (ClusterIndex i = 0; i < num_tx; ++i) {
+        cluster[i].first.size = provider.ConsumeIntegralInRange<int32_t>(1, 0x3fffff);
+        cluster[i].first.fee = provider.ConsumeIntegralInRange<int64_t>(-0x8000000000000, 0x7ffffffffffff);
+        for (ClusterIndex j = 0; j < num_tx; ++j) {
+            if (i == j) continue;
+            if (provider.ConsumeBool()) cluster[i].second.Set(j);
+        }
+    }
+
+    // Construct dependency graph, and verify it matches the cluster (which includes a round-trip
+    // check for the serialization).
+    DepGraph depgraph(cluster);
+    VerifyDepGraphFromCluster(cluster, depgraph);
+}
+
+FUZZ_TARGET(clusterlin_depgraph_serialization)
+{
+    // Verify that any deserialized depgraph is acyclic and roundtrips to an identical depgraph.
+
+    // Construct a graph by deserializing.
+    SpanReader reader(buffer);
+    DepGraph<TestBitSet> depgraph;
+    try {
+        reader >> Using<DepGraphFormatter>(depgraph);
+    } catch (const std::ios_base::failure&) {}
+    SanityCheck(depgraph);
+
+    // Verify the graph is a DAG.
+    assert(IsAcyclic(depgraph));
+}
diff --git a/src/test/util/cluster_linearize.h b/src/test/util/cluster_linearize.h
new file mode 100644
index 00000000000..771fbd543a7
--- /dev/null
+++ b/src/test/util/cluster_linearize.h
@@ -0,0 +1,336 @@
+// Copyright (c) The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#ifndef BITCOIN_TEST_UTIL_CLUSTER_LINEARIZE_H
+#define BITCOIN_TEST_UTIL_CLUSTER_LINEARIZE_H
+
+#include <cluster_linearize.h>
+#include <serialize.h>
+#include <streams.h>
+#include <util/bitset.h>
+#include <util/feefrac.h>
+
+#include <stdint.h>
+#include <numeric>
+#include <vector>
+#include <utility>
+
+namespace {
+
+using namespace cluster_linearize;
+
+using TestBitSet = BitSet<32>;
+
+/** Check if a graph is acyclic. */
+template<typename SetType>
+bool IsAcyclic(const DepGraph<SetType>& depgraph) noexcept
+{
+    for (ClusterIndex i = 0; i < depgraph.TxCount(); ++i) {
+        if ((depgraph.Ancestors(i) & depgraph.Descendants(i)) != SetType::Singleton(i)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/** A formatter for a bespoke serialization for acyclic DepGraph objects.
+ *
+ * The serialization format outputs information about transactions in a topological order (parents
+ * before children), together with position information so transactions can be moved back to their
+ * correct position on deserialization.
+ *
+ * - For each transaction t in the DepGraph (in some topological order):
+ *   - The size: VARINT(t.size), which cannot be 0.
+ *   - The fee: VARINT(SignedToUnsigned(t.fee)), see below for SignedToUnsigned.
+ *   - For each direct dependency:
+ *     - VARINT(skip)
+ *   - The position of t in the cluster: VARINT(skip)
+ * - The end of the graph: VARINT(0)
+ *
+ * The list of skip values encodes the dependencies of t, as well as its position in the cluster.
+ * Each skip value is the number of possibilities that were available, but were not taken. These
+ * possibilities are, in order:
+ * - For each previous transaction in the graph, in reverse serialization order, whether it is a
+ *   direct parent of t (but excluding transactions which are already implied to be dependencies
+ *   by parent relations that were serialized before it).
+ * - The various insertion positions in the cluster, from the very end of the cluster, to the
+ *   front.
+ *
+ * Let's say you have a 7-transaction cluster, consisting of transactions F,A,C,B,G,E,D, but
+ * serialized in order A,B,C,D,E,F,G, because that happens to be a topological ordering. By the
+ * time G gets serialized, what has been serialized already represents the cluster F,A,C,B,E,D (in
+ * that order). G has B and E as direct parents, and E depends on C.
+ *
+ * In this case, the possibilities are, in order:
+ * - [ ] the dependency G->F
+ * - [X] the dependency G->E
+ * - [ ] the dependency G->D
+ * - [X] the dependency G->B
+ * - [ ] the dependency G->A
+ * - [ ] put G at the end of the cluster
+ * - [ ] put G before D
+ * - [X] put G before E
+ * - [ ] put G before B
+ * - [ ] put G before C
+ * - [ ] put G before A
+ * - [ ] put G before F
+ *
+ * The skip values in this case are 1 (G->F), 1 (G->D), 3 (G->A, G at end, G before D). No skip
+ * after 3 is needed (or permitted), because there can only be one position for G. Also note that
+ * G->C is not included in the list of possibilities, as it is implied by the included G->E and
+ * E->C that came before it. On deserialization, if the last skip value was 8 or larger (putting
+ * G before the beginning of the cluster), it is interpreted as wrapping around back to the end.
+ *
+ *
+ * Rationale:
+ * - Why VARINTs? They are flexible enough to represent large numbers where needed, but more
+ *   compact for smaller numbers. The serialization format is designed so that simple structures
+ *   involve smaller numbers, so smaller size maps to simpler graphs.
+ * - Why use SignedToUnsigned? It results in small unsigned values for signed values with small
+ *   absolute value. This way we can encode negative fees in graphs, but still let small negative
+ *   numbers have small encodings.
+ * - Why are the parents emitted in reverse order compared to the transactions themselves? This
+ *   naturally lets us skip parents-of-parents, as they will be reflected as implied dependencies.
+ * - Why encode skip values and not a bitmask to convey the list positions? It turns out that the
+ *   most complex graphs (in terms of linearization complexity) are ones with ~1 dependency per
+ *   transaction. The current encoding uses ~1 byte per transaction for dependencies in this case,
+ *   while a bitmask would require ~N/2 bits per transaction.
+ */
+
+struct DepGraphFormatter
+{
+    /** Convert x>=0 to 2x (even), x<0 to -2x-1 (odd). */
+    static uint64_t SignedToUnsigned(int64_t x) noexcept
+    {
+        if (x < 0) {
+            return 2 * uint64_t(-(x + 1)) + 1;
+        } else {
+            return 2 * uint64_t(x);
+        }
+    }
+
+    /** Convert even x to x/2 (>=0), odd x to -(x/2)-1 (<0). */
+    static int64_t UnsignedToSigned(uint64_t x) noexcept
+    {
+        if (x & 1) {
+            return -int64_t(x / 2) - 1;
+        } else {
+            return int64_t(x / 2);
+        }
+    }
+
+    template <typename Stream, typename SetType>
+    static void Ser(Stream& s, const DepGraph<SetType>& depgraph)
+    {
+        /** Construct a topological order to serialize the transactions in. */
+        std::vector<ClusterIndex> topo_order(depgraph.TxCount());
+        std::iota(topo_order.begin(), topo_order.end(), ClusterIndex{0});
+        std::sort(topo_order.begin(), topo_order.end(), [&](ClusterIndex a, ClusterIndex b) {
+            auto anc_a = depgraph.Ancestors(a).Count(), anc_b = depgraph.Ancestors(b).Count();
+            if (anc_a != anc_b) return anc_a < anc_b;
+            return a < b;
+        });
+
+        /** Which transactions the deserializer already knows when it has deserialized what has
+         *  been serialized here so far, and in what order. */
+        std::vector<ClusterIndex> rebuilt_order;
+        rebuilt_order.reserve(depgraph.TxCount());
+
+        // Loop over the transactions in topological order.
+        for (ClusterIndex topo_idx = 0; topo_idx < topo_order.size(); ++topo_idx) {
+            /** Which depgraph index we are currently writing. */
+            ClusterIndex idx = topo_order[topo_idx];
+            // Write size, which must be larger than 0.
+            s << VARINT_MODE(depgraph.FeeRate(idx).size, VarIntMode::NONNEGATIVE_SIGNED);
+            // Write fee, encoded as an unsigned varint (odd=negative, even=non-negative).
+            s << VARINT(SignedToUnsigned(depgraph.FeeRate(idx).fee));
+            // Write dependency information.
+            SetType written_parents;
+            uint64_t diff = 0; //!< How many potential parent/child relations we have skipped over.
+            for (ClusterIndex dep_dist = 0; dep_dist < topo_idx; ++dep_dist) {
+                /** Which depgraph index we are currently considering as parent of idx. */
+                ClusterIndex dep_idx = topo_order[topo_idx - 1 - dep_dist];
+                // Ignore transactions which are already known to be ancestors.
+                if (depgraph.Descendants(dep_idx).Overlaps(written_parents)) continue;
+                if (depgraph.Ancestors(idx)[dep_idx]) {
+                    // When an actual parent is encountered, encode how many non-parents were skipped
+                    // before it.
+                    s << VARINT(diff);
+                    diff = 0;
+                    written_parents.Set(dep_idx);
+                } else {
+                    // When a non-parent is encountered, increment the skip counter.
+                    ++diff;
+                }
+            }
+            // Write position information.
+            ClusterIndex insert_distance = 0;
+            while (insert_distance < rebuilt_order.size()) {
+                // Loop to find how far from the end in rebuilt_order to insert.
+                if (idx > *(rebuilt_order.end() - 1 - insert_distance)) break;
+                ++insert_distance;
+            }
+            rebuilt_order.insert(rebuilt_order.end() - insert_distance, idx);
+            s << VARINT(diff + insert_distance);
+        }
+
+        // Output a final 0 to denote the end of the graph.
+        s << uint8_t{0};
+    }
+
+    template <typename Stream, typename SetType>
+    void Unser(Stream& s, DepGraph<SetType>& depgraph)
+    {
+        /** The dependency graph which we deserialize into first, with transactions in
+         *  topological serialization order, not original cluster order. */
+        DepGraph<SetType> topo_depgraph;
+        /** Mapping from cluster order to serialization order, used later to reconstruct the
+         *  cluster order. */
+        std::vector<ClusterIndex> reordering;
+
+        // Read transactions in topological order.
+        try {
+            while (true) {
+                // Read size. Size 0 signifies the end of the DepGraph.
+                int32_t size;
+                s >> VARINT_MODE(size, VarIntMode::NONNEGATIVE_SIGNED);
+                size &= 0x3FFFFF; // Enough for size up to 4M.
+                static_assert(0x3FFFFF >= 4000000);
+                if (size == 0 || topo_depgraph.TxCount() == SetType::Size()) break;
+                // Read fee, encoded as an unsigned varint (odd=negative, even=non-negative).
+                uint64_t coded_fee;
+                s >> VARINT(coded_fee);
+                coded_fee &= 0xFFFFFFFFFFFFF; // Enough for fee between -21M...21M BTC.
+                static_assert(0xFFFFFFFFFFFFF > uint64_t{2} * 21000000 * 100000000);
+                auto fee = UnsignedToSigned(coded_fee);
+                // Extend topo_depgraph with the new transaction (at the end).
+                auto topo_idx = topo_depgraph.AddTransaction({fee, size});
+                reordering.push_back(topo_idx);
+                // Read dependency information.
+                uint64_t diff = 0; //!< How many potential parents we have to skip.
+                s >> VARINT(diff);
+                for (ClusterIndex dep_dist = 0; dep_dist < topo_idx; ++dep_dist) {
+                    /** Which topo_depgraph index we are currently considering as parent of topo_idx. */
+                    ClusterIndex dep_topo_idx = topo_idx - 1 - dep_dist;
+                    // Ignore transactions which are already known ancestors of topo_idx.
+                    if (topo_depgraph.Descendants(dep_topo_idx)[topo_idx]) continue;
+                    if (diff == 0) {
+                        // When the skip counter has reached 0, add an actual dependency.
+                        topo_depgraph.AddDependency(dep_topo_idx, topo_idx);
+                        // And read the number of skips after it.
+                        s >> VARINT(diff);
+                    } else {
+                        // Otherwise, dep_topo_idx is not a parent. Decrement and continue.
+                        --diff;
+                    }
+                }
+                // If we reach this point, we can interpret the remaining skip value as how far from the
+                // end of reordering topo_idx should be placed (wrapping around), so move it to its
+                // correct location. The preliminary reordering.push_back(topo_idx) above was to make
+                // sure that if a deserialization exception occurs, topo_idx still appears somewhere.
+                reordering.pop_back();
+                reordering.insert(reordering.end() - (diff % (reordering.size() + 1)), topo_idx);
+            }
+        } catch (const std::ios_base::failure&) {}
+
+        // Construct the original cluster order depgraph.
+        depgraph = {};
+        // Add transactions to depgraph in the original cluster order.
+        for (auto topo_idx : reordering) {
+            depgraph.AddTransaction(topo_depgraph.FeeRate(topo_idx));
+        }
+        // Translate dependencies from topological to cluster order.
+        for (ClusterIndex idx = 0; idx < reordering.size(); ++idx) {
+            ClusterIndex topo_idx = reordering[idx];
+            for (ClusterIndex dep_idx = 0; dep_idx < reordering.size(); ++dep_idx) {
+                ClusterIndex dep_topo_idx = reordering[dep_idx];
+                if (topo_depgraph.Ancestors(topo_idx)[dep_topo_idx]) {
+                    depgraph.AddDependency(dep_idx, idx);
+                }
+            }
+        }
+    }
+};
+
+/** Perform a sanity/consistency check on a DepGraph. */
+template<typename SetType>
+void SanityCheck(const DepGraph<SetType>& depgraph)
+{
+    // Consistency check between ancestors internally.
+    for (ClusterIndex i = 0; i < depgraph.TxCount(); ++i) {
+        // Transactions include themselves as ancestors.
+        assert(depgraph.Ancestors(i)[i]);
+        // If a is an ancestor of b, then b's ancestors must include all of a's ancestors.
+        for (auto a : depgraph.Ancestors(i)) {
+            assert(depgraph.Ancestors(i).IsSupersetOf(depgraph.Ancestors(a)));
+        }
+    }
+    // Consistency check between ancestors and descendants.
+    for (ClusterIndex i = 0; i < depgraph.TxCount(); ++i) {
+        for (ClusterIndex j = 0; j < depgraph.TxCount(); ++j) {
+            assert(depgraph.Ancestors(i)[j] == depgraph.Descendants(j)[i]);
+        }
+    }
+    // If DepGraph is acyclic, serialize + deserialize must roundtrip.
+    if (IsAcyclic(depgraph)) {
+        std::vector<unsigned char> ser;
+        VectorWriter writer(ser, 0);
+        writer << Using<DepGraphFormatter>(depgraph);
+        SpanReader reader(ser);
+        DepGraph<TestBitSet> decoded_depgraph;
+        reader >> Using<DepGraphFormatter>(decoded_depgraph);
+        assert(depgraph == decoded_depgraph);
+        assert(reader.empty());
+        // It must also deserialize correctly without the terminal 0 byte (as the deserializer
+        // will upon EOF still return what it read so far).
+        assert(ser.size() >= 1 && ser.back() == 0);
+        ser.pop_back();
+        reader = SpanReader{ser};
+        decoded_depgraph = {};
+        reader >> Using<DepGraphFormatter>(decoded_depgraph);
+        assert(depgraph == decoded_depgraph);
+        assert(reader.empty());
+    }
+}
+
+/** Verify that a DepGraph corresponds to the information in a cluster. */
+template<typename SetType>
+void VerifyDepGraphFromCluster(const Cluster<SetType>& cluster, const DepGraph<SetType>& depgraph)
+{
+    // Sanity check the depgraph, which includes a check for correspondence between ancestors and
+    // descendants, so it suffices to check just ancestors below.
+    SanityCheck(depgraph);
+    // Verify transaction count.
+    assert(cluster.size() == depgraph.TxCount());
+    // Verify feerates.
+    for (ClusterIndex i = 0; i < depgraph.TxCount(); ++i) {
+        assert(depgraph.FeeRate(i) == cluster[i].first);
+    }
+    // Verify ancestors.
+    for (ClusterIndex i = 0; i < depgraph.TxCount(); ++i) {
+        // Start with the transaction having itself as ancestor.
+        auto ancestors = SetType::Singleton(i);
+        // Add parents of ancestors to the set of ancestors until it stops changing.
+        while (true) {
+            const auto old_ancestors = ancestors;
+            for (auto ancestor : ancestors) {
+                ancestors |= cluster[ancestor].second;
+            }
+            if (old_ancestors == ancestors) break;
+        }
+        // Compare against depgraph.
+        assert(depgraph.Ancestors(i) == ancestors);
+        // Some additional sanity tests:
+        // - Every transaction has itself as ancestor.
+        assert(ancestors[i]);
+        // - Every transaction has its direct parents as ancestors.
+        for (auto parent : cluster[i].second) {
+            assert(ancestors[parent]);
+        }
+    }
+}
+
+} // namespace
+
+#endif // BITCOIN_TEST_UTIL_CLUSTER_LINEARIZE_H

From 4828079db327bf2aeaed744843a415d1654e8796 Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Wed, 8 May 2024 18:56:59 -0400
Subject: [PATCH 03/12] clusterlin: add AncestorCandidateFinder class

This is a class that encapsulates precomputed ancestor set feerates, and
presents an interface for getting the best remaining ancestor set.
---
 src/cluster_linearize.h             | 117 ++++++++++++++++++++++++++++
 src/test/fuzz/cluster_linearize.cpp |  72 +++++++++++++++++
 2 files changed, 189 insertions(+)

diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index 2e230bcd638..03ee894ae32 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -5,6 +5,7 @@
 #ifndef BITCOIN_CLUSTER_LINEARIZE_H
 #define BITCOIN_CLUSTER_LINEARIZE_H
 
+#include <optional>
 #include <stdint.h>
 #include <vector>
 #include <utility>
@@ -166,6 +167,122 @@ public:
     }
 };
 
+/** A set of transactions together with their aggregate feerate. */
+template<typename SetType>
+struct SetInfo
+{
+    /** The transactions in the set. */
+    SetType transactions;
+    /** Their combined fee and size. */
+    FeeFrac feerate;
+
+    /** Construct a SetInfo for a specified set and feerate. */
+    SetInfo(const SetType& txn, const FeeFrac& fr) noexcept : transactions(txn), feerate(fr) {}
+
+    /** Construct a SetInfo for a set of transactions in a depgraph. */
+    explicit SetInfo(const DepGraph<SetType>& depgraph, const SetType& txn) noexcept :
+        transactions(txn), feerate(depgraph.FeeRate(txn)) {}
+
+    /** Permit equality testing. */
+    friend bool operator==(const SetInfo&, const SetInfo&) noexcept = default;
+};
+
+/** Class encapsulating the state needed to find the best remaining ancestor set.
+ *
+ * It is initialized for an entire DepGraph, and parts of the graph can be dropped by calling
+ * MarkDone.
+ *
+ * As long as any part of the graph remains, FindCandidateSet() can be called which will return a
+ * SetInfo with the highest-feerate ancestor set that remains (an ancestor set is a single
+ * transaction together with all its remaining ancestors).
+ */
+template<typename SetType>
+class AncestorCandidateFinder
+{
+    /** Internal dependency graph. */
+    const DepGraph<SetType>& m_depgraph;
+    /** Which transactions are left to include. */
+    SetType m_todo;
+    /** Precomputed ancestor-set feerates (only kept up-to-date for indices in m_todo). */
+    std::vector<FeeFrac> m_ancestor_set_feerates;
+
+public:
+    /** Construct an AncestorCandidateFinder for a given cluster.
+     *
+     * Complexity: O(N^2) where N=depgraph.TxCount().
+     */
+    AncestorCandidateFinder(const DepGraph<SetType>& depgraph LIFETIMEBOUND) noexcept :
+        m_depgraph(depgraph),
+        m_todo{SetType::Fill(depgraph.TxCount())},
+        m_ancestor_set_feerates(depgraph.TxCount())
+    {
+        // Precompute ancestor-set feerates.
+        for (ClusterIndex i = 0; i < depgraph.TxCount(); ++i) {
+            /** The remaining ancestors for transaction i. */
+            SetType anc_to_add = m_depgraph.Ancestors(i);
+            FeeFrac anc_feerate;
+            // Reuse accumulated feerate from first ancestor, if usable.
+            Assume(anc_to_add.Any());
+            ClusterIndex first = anc_to_add.First();
+            if (first < i) {
+                anc_feerate = m_ancestor_set_feerates[first];
+                Assume(!anc_feerate.IsEmpty());
+                anc_to_add -= m_depgraph.Ancestors(first);
+            }
+            // Add in other ancestors (which necessarily include i itself).
+            Assume(anc_to_add[i]);
+            anc_feerate += m_depgraph.FeeRate(anc_to_add);
+            // Store the result.
+            m_ancestor_set_feerates[i] = anc_feerate;
+        }
+    }
+
+    /** Remove a set of transactions from the set of to-be-linearized ones.
+     *
+     * The same transaction may not be MarkDone()'d twice.
+     *
+     * Complexity: O(N*M) where N=depgraph.TxCount(), M=select.Count().
+     */
+    void MarkDone(SetType select) noexcept
+    {
+        Assume(select.Any());
+        Assume(select.IsSubsetOf(m_todo));
+        m_todo -= select;
+        for (auto i : select) {
+            auto feerate = m_depgraph.FeeRate(i);
+            for (auto j : m_depgraph.Descendants(i) & m_todo) {
+                m_ancestor_set_feerates[j] -= feerate;
+            }
+        }
+    }
+
+    /** Check whether any unlinearized transactions remain. */
+    bool AllDone() const noexcept
+    {
+        return m_todo.None();
+    }
+
+    /** Find the best (highest-feerate, smallest among those in case of a tie) ancestor set
+     *  among the remaining transactions. Requires !AllDone().
+     *
+     * Complexity: O(N) where N=depgraph.TxCount().
+     */
+    SetInfo<SetType> FindCandidateSet() const noexcept
+    {
+        Assume(!AllDone());
+        std::optional<ClusterIndex> best;
+        for (auto i : m_todo) {
+            if (best.has_value()) {
+                Assume(!m_ancestor_set_feerates[i].IsEmpty());
+                if (!(m_ancestor_set_feerates[i] > m_ancestor_set_feerates[*best])) continue;
+            }
+            best = i;
+        }
+        Assume(best.has_value());
+        return {m_depgraph.Ancestors(*best) & m_todo, m_ancestor_set_feerates[*best]};
+    }
+};
+
 } // namespace cluster_linearize
 
 #endif // BITCOIN_CLUSTER_LINEARIZE_H
diff --git a/src/test/fuzz/cluster_linearize.cpp b/src/test/fuzz/cluster_linearize.cpp
index fd75fd5b081..880fcb79aa6 100644
--- a/src/test/fuzz/cluster_linearize.cpp
+++ b/src/test/fuzz/cluster_linearize.cpp
@@ -15,6 +15,30 @@
 #include <vector>
 #include <utility>
 
+using namespace cluster_linearize;
+
+namespace {
+
+/** Given a dependency graph, and a todo set, read a topological subset of todo from reader. */
+template<typename SetType>
+SetType ReadTopologicalSet(const DepGraph<SetType>& depgraph, const SetType& todo, SpanReader& reader)
+{
+    uint64_t mask{0};
+    try {
+        reader >> VARINT(mask);
+    } catch(const std::ios_base::failure&) {}
+    SetType ret;
+    for (auto i : todo) {
+        if (!ret[i]) {
+            if (mask & 1) ret |= depgraph.Ancestors(i);
+            mask >>= 1;
+        }
+    }
+    return ret & todo;
+}
+
+} // namespace
+
 FUZZ_TARGET(clusterlin_add_dependency)
 {
     // Verify that computing a DepGraph from a cluster, or building it step by step using AddDependency
@@ -85,3 +109,51 @@ FUZZ_TARGET(clusterlin_depgraph_serialization)
     // Verify the graph is a DAG.
     assert(IsAcyclic(depgraph));
 }
+
+FUZZ_TARGET(clusterlin_ancestor_finder)
+{
+    // Verify that AncestorCandidateFinder works as expected.
+
+    // Retrieve a depgraph from the fuzz input.
+    SpanReader reader(buffer);
+    DepGraph<TestBitSet> depgraph;
+    try {
+        reader >> Using<DepGraphFormatter>(depgraph);
+    } catch (const std::ios_base::failure&) {}
+
+    AncestorCandidateFinder anc_finder(depgraph);
+    auto todo = TestBitSet::Fill(depgraph.TxCount());
+    while (todo.Any()) {
+        // Call the ancestor finder's FindCandidateSet for what remains of the graph.
+        assert(!anc_finder.AllDone());
+        auto best_anc = anc_finder.FindCandidateSet();
+        // Sanity check the result.
+        assert(best_anc.transactions.Any());
+        assert(best_anc.transactions.IsSubsetOf(todo));
+        assert(depgraph.FeeRate(best_anc.transactions) == best_anc.feerate);
+        // Check that it is topologically valid.
+        for (auto i : best_anc.transactions) {
+            assert((depgraph.Ancestors(i) & todo).IsSubsetOf(best_anc.transactions));
+        }
+
+        // Compute all remaining ancestor sets.
+        std::optional<SetInfo<TestBitSet>> real_best_anc;
+        for (auto i : todo) {
+            SetInfo info(depgraph, todo & depgraph.Ancestors(i));
+            if (!real_best_anc.has_value() || info.feerate > real_best_anc->feerate) {
+                real_best_anc = info;
+            }
+        }
+        // The set returned by anc_finder must equal the real best ancestor set.
+        assert(real_best_anc.has_value());
+        assert(*real_best_anc == best_anc);
+
+        // Find a topologically valid subset of transactions to remove from the graph.
+        auto del_set = ReadTopologicalSet(depgraph, todo, reader);
+        // If we did not find anything, use best_anc itself, because we should remove something.
+        if (del_set.None()) del_set = best_anc.transactions;
+        todo -= del_set;
+        anc_finder.MarkDone(del_set);
+    }
+    assert(anc_finder.AllDone());
+}

From 2a41f151afb82466486402e250327e22319c754e Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Wed, 8 May 2024 18:09:34 -0400
Subject: [PATCH 04/12] clusterlin: add SearchCandidateFinder class

Similar to AncestorCandidateFinder, this encapsulates the state needed for
finding good candidate sets using a search algorithm.
---
 src/cluster_linearize.h             | 171 +++++++++++++++++++++
 src/test/fuzz/cluster_linearize.cpp | 221 ++++++++++++++++++++++++++++
 2 files changed, 392 insertions(+)

diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index 03ee894ae32..39b6881544e 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -5,6 +5,8 @@
 #ifndef BITCOIN_CLUSTER_LINEARIZE_H
 #define BITCOIN_CLUSTER_LINEARIZE_H
 
+#include <algorithm>
+#include <numeric>
 #include <optional>
 #include <stdint.h>
 #include <vector>
@@ -176,6 +178,9 @@ struct SetInfo
     /** Their combined fee and size. */
     FeeFrac feerate;
 
+    /** Construct a SetInfo for the empty set. */
+    SetInfo() noexcept = default;
+
     /** Construct a SetInfo for a specified set and feerate. */
     SetInfo(const SetType& txn, const FeeFrac& fr) noexcept : transactions(txn), feerate(fr) {}
 
@@ -183,6 +188,13 @@ struct SetInfo
     explicit SetInfo(const DepGraph<SetType>& depgraph, const SetType& txn) noexcept :
         transactions(txn), feerate(depgraph.FeeRate(txn)) {}
 
+    /** Construct a new SetInfo equal to this, with more transactions added (which may overlap
+     *  with the existing transactions in the SetInfo). */
+    [[nodiscard]] SetInfo Add(const DepGraph<SetType>& depgraph, const SetType& txn) const noexcept
+    {
+        return {transactions | txn, feerate + depgraph.FeeRate(txn - transactions)};
+    }
+
     /** Permit equality testing. */
     friend bool operator==(const SetInfo&, const SetInfo&) noexcept = default;
 };
@@ -283,6 +295,165 @@ public:
     }
 };
 
+/** Class encapsulating the state needed to perform search for good candidate sets.
+ *
+ * It is initialized for an entire DepGraph, and parts of the graph can be dropped by calling
+ * MarkDone().
+ *
+ * As long as any part of the graph remains, FindCandidateSet() can be called to perform a search
+ * over the set of topologically-valid subsets of that remainder, with a limit on how many
+ * combinations are tried.
+ */
+template<typename SetType>
+class SearchCandidateFinder
+{
+    /** Internal dependency graph for the cluster. */
+    const DepGraph<SetType>& m_depgraph;
+    /** Which transactions are left to do (sorted indices). */
+    SetType m_todo;
+
+public:
+    /** Construct a candidate finder for a graph.
+     *
+     * @param[in] depgraph   Dependency graph for the to-be-linearized cluster.
+     *
+     * Complexity: O(1).
+     */
+    SearchCandidateFinder(const DepGraph<SetType>& depgraph LIFETIMEBOUND) noexcept :
+        m_depgraph(depgraph),
+        m_todo(SetType::Fill(depgraph.TxCount())) {}
+
+    /** Check whether any unlinearized transactions remain. */
+    bool AllDone() const noexcept
+    {
+        return m_todo.None();
+    }
+
+    /** Find a high-feerate topologically-valid subset of what remains of the cluster.
+     *  Requires !AllDone().
+     *
+     * @param[in] max_iterations  The maximum number of optimization steps that will be performed.
+     * @param[in] best            A set/feerate pair with an already-known good candidate. This may
+     *                            be empty.
+     * @return                    A pair of:
+     *                            - The best (highest feerate, smallest size as tiebreaker)
+     *                              topologically valid subset (and its feerate) that was
+     *                              encountered during search. It will be at least as good as the
+     *                              best passed in (if not empty).
+     *                            - The number of optimization steps that were performed. This will
+     *                              be <= max_iterations. If strictly < max_iterations, the
+     *                              returned subset is optimal.
+     *
+     * Complexity: O(N * min(max_iterations, 2^N)) where N=depgraph.TxCount().
+     */
+    std::pair<SetInfo<SetType>, uint64_t> FindCandidateSet(uint64_t max_iterations, SetInfo<SetType> best) noexcept
+    {
+        Assume(!AllDone());
+
+        /** Type for work queue items. */
+        struct WorkItem
+        {
+            /** Set of transactions definitely included (and its feerate). This must be a subset
+             *  of m_todo, and be topologically valid (includes all in-m_todo ancestors of
+             *  itself). */
+            SetInfo<SetType> inc;
+            /** Set of undecided transactions. This must be a subset of m_todo, and have no overlap
+             *  with inc. The set (inc | und) must be topologically valid. */
+            SetType und;
+
+            /** Construct a new work item. */
+            WorkItem(SetInfo<SetType>&& i, SetType&& u) noexcept :
+                inc(std::move(i)), und(std::move(u)) {}
+        };
+
+        /** The queue of work items. */
+        std::vector<WorkItem> queue;
+
+        // Create an initial entry with m_todo as undecided. Also use it as best if not provided,
+        // so that during the work processing loop below, and during the add_fn/split_fn calls, we
+        // do not need to deal with the best=empty case.
+        if (best.feerate.IsEmpty()) best = SetInfo(m_depgraph, m_todo);
+        queue.emplace_back(SetInfo<SetType>{}, SetType{m_todo});
+
+        /** Local copy of the iteration limit. */
+        uint64_t iterations_left = max_iterations;
+
+        /** Internal function to add an item to the queue of elements to explore if there are any
+         *  transactions left to split on, and to update best.
+         *
+         * - inc: the "inc" value for the new work item (must be topological).
+         * - und: the "und" value for the new work item ((inc | und) must be topological).
+         */
+        auto add_fn = [&](SetInfo<SetType> inc, SetType und) noexcept {
+            if (!inc.feerate.IsEmpty()) {
+                // If inc's feerate is better than best's, remember it as our new best.
+                if (inc.feerate > best.feerate) {
+                    best = inc;
+                }
+            } else {
+                Assume(inc.transactions.None());
+            }
+
+            // Make sure there are undecided transactions left to split on.
+            if (und.None()) return;
+
+            // Actually construct a new work item on the queue.
+            queue.emplace_back(std::move(inc), std::move(und));
+        };
+
+        /** Internal process function. It takes an existing work item, and splits it in two: one
+         *  with a particular transaction (and its ancestors) included, and one with that
+         *  transaction (and its descendants) excluded. */
+        auto split_fn = [&](WorkItem&& elem) noexcept {
+            // Any queue element must have undecided transactions left, otherwise there is nothing
+            // to explore anymore.
+            Assume(elem.und.Any());
+            // The included and undecided set are all subsets of m_todo.
+            Assume(elem.inc.transactions.IsSubsetOf(m_todo) && elem.und.IsSubsetOf(m_todo));
+            // Included transactions cannot be undecided.
+            Assume(!elem.inc.transactions.Overlaps(elem.und));
+
+            // Pick the first undecided transaction as the one to split on.
+            const ClusterIndex split = elem.und.First();
+
+            // Add a work item corresponding to exclusion of the split transaction.
+            const auto& desc = m_depgraph.Descendants(split);
+            add_fn(/*inc=*/elem.inc,
+                   /*und=*/elem.und - desc);
+
+            // Add a work item corresponding to inclusion of the split transaction.
+            const auto anc = m_depgraph.Ancestors(split) & m_todo;
+            add_fn(/*inc=*/elem.inc.Add(m_depgraph, anc),
+                   /*und=*/elem.und - anc);
+
+            // Account for the performed split.
+            --iterations_left;
+        };
+
+        // Work processing loop.
+        while (!queue.empty()) {
+            if (!iterations_left) break;
+            auto elem = queue.back();
+            queue.pop_back();
+            split_fn(std::move(elem));
+        }
+
+        // Return the found best set and the number of iterations performed.
+        return {std::move(best), max_iterations - iterations_left};
+    }
+
+    /** Remove a subset of transactions from the cluster being linearized.
+     *
+     * Complexity: O(N) where N=done.Count().
+     */
+    void MarkDone(const SetType& done) noexcept
+    {
+        Assume(done.Any());
+        Assume(done.IsSubsetOf(m_todo));
+        m_todo -= done;
+    }
+};
+
 } // namespace cluster_linearize
 
 #endif // BITCOIN_CLUSTER_LINEARIZE_H
diff --git a/src/test/fuzz/cluster_linearize.cpp b/src/test/fuzz/cluster_linearize.cpp
index 880fcb79aa6..931862b12dd 100644
--- a/src/test/fuzz/cluster_linearize.cpp
+++ b/src/test/fuzz/cluster_linearize.cpp
@@ -19,6 +19,127 @@ using namespace cluster_linearize;
 
 namespace {
 
+/** A simple finder class for candidate sets.
+ *
+ * This class matches SearchCandidateFinder in interface and behavior, though with fewer
+ * optimizations.
+ */
+template<typename SetType>
+class SimpleCandidateFinder
+{
+    /** Internal dependency graph. */
+    const DepGraph<SetType>& m_depgraph;
+    /** Which transactions are left to include. */
+    SetType m_todo;
+
+public:
+    /** Construct a SimpleCandidateFinder for a given graph. */
+    SimpleCandidateFinder(const DepGraph<SetType>& depgraph LIFETIMEBOUND) noexcept :
+        m_depgraph(depgraph), m_todo{SetType::Fill(depgraph.TxCount())} {}
+
+    /** Remove a set of transactions from the set of to-be-linearized ones. */
+    void MarkDone(SetType select) noexcept { m_todo -= select; }
+
+    /** Determine whether unlinearized transactions remain. */
+    bool AllDone() const noexcept { return m_todo.None(); }
+
+    /** Find a candidate set using at most max_iterations iterations, and return it together with
+     *  the number of iterations performed. If that is less than max_iterations, the result is optimal.
+     *
+     * Complexity: O(N * M), where M is the number of connected topological subsets of the cluster.
+     *             That number is bounded by M <= 2^(N-1).
+     */
+    std::pair<SetInfo<SetType>, uint64_t> FindCandidateSet(uint64_t max_iterations) const noexcept
+    {
+        uint64_t iterations_left = max_iterations;
+        // Queue of work units. Each consists of:
+        // - inc: set of transactions definitely included
+        // - und: set of transactions that can be added to inc still
+        std::vector<std::pair<SetType, SetType>> queue;
+        // Initially we have just one queue element, with the entire graph in und.
+        queue.emplace_back(SetType{}, m_todo);
+        // Best solution so far.
+        SetInfo best(m_depgraph, m_todo);
+        // Process the queue.
+        while (!queue.empty() && iterations_left) {
+            --iterations_left;
+            // Pop top element of the queue.
+            auto [inc, und] = queue.back();
+            queue.pop_back();
+            // Look for a transaction to consider adding/removing.
+            bool inc_none = inc.None();
+            for (auto split : und) {
+                // If inc is empty, consider any split transaction. Otherwise only consider
+                // transactions that share ancestry with inc so far (which means only connected
+                // sets will be considered).
+                if (inc_none || inc.Overlaps(m_depgraph.Ancestors(split))) {
+                    // Add a queue entry with split included.
+                    SetInfo new_inc(m_depgraph, inc | (m_todo & m_depgraph.Ancestors(split)));
+                    queue.emplace_back(new_inc.transactions, und - new_inc.transactions);
+                    // Add a queue entry with split excluded.
+                    queue.emplace_back(inc, und - m_depgraph.Descendants(split));
+                    // Update statistics to account for the candidate new_inc.
+                    if (new_inc.feerate > best.feerate) best = new_inc;
+                    break;
+                }
+            }
+        }
+        return {std::move(best), max_iterations - iterations_left};
+    }
+};
+
+/** A very simple finder class for optimal candidate sets, which tries every subset.
+ *
+ * It is even simpler than SimpleCandidateFinder, and is primarily included here to test the
+ * correctness of SimpleCandidateFinder, which is then used to test the correctness of
+ * SearchCandidateFinder.
+ */
+template<typename SetType>
+class ExhaustiveCandidateFinder
+{
+    /** Internal dependency graph. */
+    const DepGraph<SetType>& m_depgraph;
+    /** Which transactions are left to include. */
+    SetType m_todo;
+
+public:
+    /** Construct an ExhaustiveCandidateFinder for a given graph. */
+    ExhaustiveCandidateFinder(const DepGraph<SetType>& depgraph LIFETIMEBOUND) noexcept :
+        m_depgraph(depgraph), m_todo{SetType::Fill(depgraph.TxCount())} {}
+
+    /** Remove a set of transactions from the set of to-be-linearized ones. */
+    void MarkDone(SetType select) noexcept { m_todo -= select; }
+
+    /** Determine whether unlinearized transactions remain. */
+    bool AllDone() const noexcept { return m_todo.None(); }
+
+    /** Find the optimal remaining candidate set.
+     *
+     * Complexity: O(N * 2^N).
+     */
+    SetInfo<SetType> FindCandidateSet() const noexcept
+    {
+        // Best solution so far.
+        SetInfo<SetType> best{m_todo, m_depgraph.FeeRate(m_todo)};
+        // The number of combinations to try.
+        uint64_t limit = (uint64_t{1} << m_todo.Count()) - 1;
+        // Try the ancestor closure of every non-empty proper subset of m_todo (all of m_todo is already best).
+        for (uint64_t x = 1; x < limit; ++x) {
+            // If bit number b is set in x, then the remaining ancestors of the b'th remaining
+            // transaction in m_todo are included.
+            SetType txn;
+            auto x_shifted{x};
+            for (auto i : m_todo) {
+                if (x_shifted & 1) txn |= m_depgraph.Ancestors(i);
+                x_shifted >>= 1;
+            }
+            SetInfo cur(m_depgraph, txn & m_todo);
+            if (cur.feerate > best.feerate) best = cur;
+        }
+        return best;
+    }
+};
+
 /** Given a dependency graph, and a todo set, read a topological subset of todo from reader. */
 template<typename SetType>
 SetType ReadTopologicalSet(const DepGraph<SetType>& depgraph, const SetType& todo, SpanReader& reader)
@@ -157,3 +278,103 @@ FUZZ_TARGET(clusterlin_ancestor_finder)
     }
     assert(anc_finder.AllDone());
 }
+
+static constexpr auto MAX_SIMPLE_ITERATIONS = 300000;
+
+FUZZ_TARGET(clusterlin_search_finder)
+{
+    // Verify that SearchCandidateFinder works as expected by sanity checking the results
+    // and comparing with the results from SimpleCandidateFinder, ExhaustiveCandidateFinder, and
+    // AncestorCandidateFinder.
+
+    // Retrieve a depgraph from the fuzz input.
+    SpanReader reader(buffer);
+    DepGraph<TestBitSet> depgraph;
+    try {
+        reader >> Using<DepGraphFormatter>(depgraph);
+    } catch (const std::ios_base::failure&) {}
+
+    // Instantiate ALL the candidate finders.
+    SearchCandidateFinder src_finder(depgraph);
+    SimpleCandidateFinder smp_finder(depgraph);
+    ExhaustiveCandidateFinder exh_finder(depgraph);
+    AncestorCandidateFinder anc_finder(depgraph);
+
+    auto todo = TestBitSet::Fill(depgraph.TxCount());
+    while (todo.Any()) {
+        assert(!src_finder.AllDone());
+        assert(!smp_finder.AllDone());
+        assert(!exh_finder.AllDone());
+        assert(!anc_finder.AllDone());
+
+        // For each iteration, read an iteration count limit from the fuzz input.
+        uint64_t max_iterations = 1;
+        try {
+            reader >> VARINT(max_iterations);
+        } catch (const std::ios_base::failure&) {}
+        max_iterations &= 0xfffff;
+
+        // Read an initial subset from the fuzz input.
+        SetInfo init_best(depgraph, ReadTopologicalSet(depgraph, todo, reader));
+
+        // Call the search finder's FindCandidateSet for what remains of the graph.
+        auto [found, iterations_done] = src_finder.FindCandidateSet(max_iterations, init_best);
+
+        // Sanity check the result.
+        assert(iterations_done <= max_iterations);
+        assert(found.transactions.Any());
+        assert(found.transactions.IsSubsetOf(todo));
+        assert(depgraph.FeeRate(found.transactions) == found.feerate);
+        if (!init_best.feerate.IsEmpty()) assert(found.feerate >= init_best.feerate);
+        // Check that it is topologically valid.
+        for (auto i : found.transactions) {
+            assert(found.transactions.IsSupersetOf(depgraph.Ancestors(i) & todo));
+        }
+
+        // At most 2^N-1 iterations can be required: the number of non-empty subsets a graph with N
+        // transactions has.
+        assert(iterations_done <= ((uint64_t{1} << todo.Count()) - 1));
+
+        // Perform quality checks only if SearchCandidateFinder claims an optimal result.
+        if (iterations_done < max_iterations) {
+            // Compare with SimpleCandidateFinder.
+            auto [simple, simple_iters] = smp_finder.FindCandidateSet(MAX_SIMPLE_ITERATIONS);
+            assert(found.feerate >= simple.feerate);
+            if (simple_iters < MAX_SIMPLE_ITERATIONS) {
+                assert(found.feerate == simple.feerate);
+            }
+
+            // Compare with AncestorCandidateFinder.
+            auto anc = anc_finder.FindCandidateSet();
+            assert(found.feerate >= anc.feerate);
+
+            // Compare with ExhaustiveCandidateFinder. This quickly gets computationally expensive
+            // for large clusters (O(2^n)), so only do it for sufficiently small ones.
+            if (todo.Count() <= 12) {
+                auto exhaustive = exh_finder.FindCandidateSet();
+                assert(exhaustive.feerate == found.feerate);
+                // Also compare ExhaustiveCandidateFinder with SimpleCandidateFinder (this is
+                // primarily a test for SimpleCandidateFinder's correctness).
+                assert(exhaustive.feerate >= simple.feerate);
+                if (simple_iters < MAX_SIMPLE_ITERATIONS) {
+                    assert(exhaustive.feerate == simple.feerate);
+                }
+            }
+        }
+
+        // Find a topologically valid subset of transactions to remove from the graph.
+        auto del_set = ReadTopologicalSet(depgraph, todo, reader);
+        // If we did not find anything, use found itself, because we should remove something.
+        if (del_set.None()) del_set = found.transactions;
+        todo -= del_set;
+        src_finder.MarkDone(del_set);
+        smp_finder.MarkDone(del_set);
+        exh_finder.MarkDone(del_set);
+        anc_finder.MarkDone(del_set);
+    }
+
+    assert(src_finder.AllDone());
+    assert(smp_finder.AllDone());
+    assert(exh_finder.AllDone());
+    assert(anc_finder.AllDone());
+}

From ee0ddfe4f626bfb4b58927db89d317cb3531813f Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Tue, 9 Jul 2024 20:38:12 -0400
Subject: [PATCH 05/12] clusterlin: add chunking algorithm

A fuzz test is added which verifies several of its expected properties, including
correctness.
---
 src/cluster_linearize.h             | 32 ++++++++++++
 src/test/fuzz/cluster_linearize.cpp | 80 +++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+)

diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index 39b6881544e..ba60c49ed02 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -184,10 +184,23 @@ struct SetInfo
     /** Construct a SetInfo for a specified set and feerate. */
     SetInfo(const SetType& txn, const FeeFrac& fr) noexcept : transactions(txn), feerate(fr) {}
 
+    /** Construct a singleton SetInfo for transaction pos in depgraph, using its feerate. */
+    explicit SetInfo(const DepGraph<SetType>& depgraph, ClusterIndex pos) noexcept :
+        transactions(SetType::Singleton(pos)), feerate(depgraph.FeeRate(pos)) {}
+
     /** Construct a SetInfo for a set of transactions in a depgraph. */
     explicit SetInfo(const DepGraph<SetType>& depgraph, const SetType& txn) noexcept :
         transactions(txn), feerate(depgraph.FeeRate(txn)) {}
 
+    /** Merge other into this SetInfo: union the sets and sum the feerates (sets must be disjoint). */
+    SetInfo& operator|=(const SetInfo& other) noexcept
+    {
+        Assume(!transactions.Overlaps(other.transactions)); // Shared entries would be counted twice below.
+        transactions |= other.transactions;
+        feerate += other.feerate;
+        return *this;
+    }
+
     /** Construct a new SetInfo equal to this, with more transactions added (which may overlap
      *  with the existing transactions in the SetInfo). */
     [[nodiscard]] SetInfo Add(const DepGraph<SetType>& depgraph, const SetType& txn) const noexcept
@@ -199,6 +212,25 @@ struct SetInfo
     friend bool operator==(const SetInfo&, const SetInfo&) noexcept = default;
 };
 
+/** Compute the feerates of the chunks of linearization, in linearization order. */
+template<typename SetType>
+std::vector<FeeFrac> ChunkLinearization(const DepGraph<SetType>& depgraph, Span<const ClusterIndex> linearization) noexcept
+{
+    std::vector<FeeFrac> ret; // Chunk feerates found so far; only its back is inspected.
+    for (ClusterIndex i : linearization) {
+        /** The new chunk to be added, initially just the feerate of transaction i. */
+        auto new_chunk = depgraph.FeeRate(i);
+        // While the new chunk compares higher (>>) than the last chunk so far, absorb that chunk.
+        while (!ret.empty() && new_chunk >> ret.back()) {
+            new_chunk += ret.back();
+            ret.pop_back();
+        }
+        // Append the (possibly merged) new chunk as the new last chunk.
+        ret.push_back(std::move(new_chunk));
+    }
+    return ret;
+}
+
 /** Class encapsulating the state needed to find the best remaining ancestor set.
  *
  * It is initialized for an entire DepGraph, and parts of the graph can be dropped by calling
diff --git a/src/test/fuzz/cluster_linearize.cpp b/src/test/fuzz/cluster_linearize.cpp
index 931862b12dd..f93eb9c8e3b 100644
--- a/src/test/fuzz/cluster_linearize.cpp
+++ b/src/test/fuzz/cluster_linearize.cpp
@@ -158,6 +158,44 @@ SetType ReadTopologicalSet(const DepGraph<SetType>& depgraph, const SetType& tod
     return ret & todo;
 }
 
+/** Construct a topologically valid linearization of depgraph, steered by data from reader. */
+template<typename BS>
+std::vector<ClusterIndex> ReadLinearization(const DepGraph<BS>& depgraph, SpanReader& reader)
+{
+    std::vector<ClusterIndex> linearization;
+    TestBitSet todo = TestBitSet::Fill(depgraph.TxCount()); // NOTE(review): TestBitSet, not BS.
+    // In every iteration one topologically-valid transaction is appended to linearization.
+    while (todo.Any()) {
+        // Collect every remaining transaction whose only remaining ancestor is itself.
+        TestBitSet potential_next;
+        for (auto j : todo) {
+            if ((depgraph.Ancestors(j) & todo) == TestBitSet::Singleton(j)) {
+                potential_next.Set(j);
+            }
+        }
+        // There must always be one (otherwise there is a cycle in the graph).
+        assert(potential_next.Any());
+        // Read a number from reader, and interpret it as index into potential_next.
+        uint64_t idx{0};
+        try {
+            reader >> VARINT(idx);
+        } catch (const std::ios_base::failure&) {} // Read failures are ignored.
+        idx %= potential_next.Count(); // Count() is nonzero per the assert above.
+        // Walk potential_next to find the idx'th element.
+        for (auto j : potential_next) {
+            if (idx == 0) {
+                // When found, add it to linearization and remove it from todo.
+                linearization.push_back(j);
+                assert(todo[j]);
+                todo.Reset(j);
+                break;
+            }
+            --idx;
+        }
+    }
+    return linearization;
+}
+
 } // namespace
 
 FUZZ_TARGET(clusterlin_add_dependency)
@@ -231,6 +269,48 @@ FUZZ_TARGET(clusterlin_depgraph_serialization)
     assert(IsAcyclic(depgraph));
 }
 
+FUZZ_TARGET(clusterlin_chunking)
+{
+    // Verify the correctness of the ChunkLinearization function.
+
+    // Construct a graph by deserializing.
+    SpanReader reader(buffer);
+    DepGraph<TestBitSet> depgraph;
+    try {
+        reader >> Using<DepGraphFormatter>(depgraph);
+    } catch (const std::ios_base::failure&) {} // Deserialization failures are ignored.
+
+    // Read a valid linearization for depgraph.
+    auto linearization = ReadLinearization(depgraph, reader);
+
+    // Invoke the chunking function.
+    auto chunking = ChunkLinearization(depgraph, linearization);
+
+    // Verify that no chunk compares higher (>>) than the chunk before it.
+    for (size_t i = 1; i < chunking.size(); ++i) {
+        assert(!(chunking[i] >> chunking[i - 1]));
+    }
+
+    // Naively recompute the chunks (each is the highest-feerate prefix of what remains).
+    auto todo = TestBitSet::Fill(depgraph.TxCount());
+    for (const auto& chunk_feerate : chunking) {
+        assert(todo.Any());
+        SetInfo<TestBitSet> accumulator, best;
+        for (ClusterIndex idx : linearization) {
+            if (todo[idx]) {
+                accumulator |= SetInfo(depgraph, idx);
+                if (best.feerate.IsEmpty() || accumulator.feerate >> best.feerate) {
+                    best = accumulator; // Strict comparison: on ties the shorter prefix is kept.
+                }
+            }
+        }
+        assert(chunk_feerate == best.feerate);
+        assert(best.transactions.IsSubsetOf(todo));
+        todo -= best.transactions;
+    }
+    assert(todo.None()); // The chunks must account for every transaction.
+}
+
 FUZZ_TARGET(clusterlin_ancestor_finder)
 {
     // Verify that AncestorCandidateFinder works as expected.

From 46aad9b09986feb1d54fc4229a0d224da94fb80a Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Wed, 8 May 2024 17:28:39 -0400
Subject: [PATCH 06/12] clusterlin: add Linearize function

This adds a first version of the overall linearization interface, which given
a DepGraph constructs a good linearization, by incrementally including good
candidate sets (found using AncestorCandidateFinder and SearchCandidateFinder).
---
 src/cluster_linearize.h             | 67 ++++++++++++++++++++++
 src/test/fuzz/cluster_linearize.cpp | 89 +++++++++++++++++++++++++++++
 src/test/util/cluster_linearize.h   | 17 ++++++
 3 files changed, 173 insertions(+)

diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index ba60c49ed02..cb6187eeca3 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -167,6 +167,22 @@ public:
         for (auto pos : elems) ret += entries[pos].feerate;
         return ret;
     }
+
+    /** Append the entries of select to list in a topologically valid order.
+     * Only the appended tail is reordered: by ascending ancestor count, ties by index.
+     * Complexity: O(select.Count() * log(select.Count())).
+     */
+    void AppendTopo(std::vector<ClusterIndex>& list, const SetType& select) const noexcept
+    {
+        ClusterIndex old_len = list.size(); // Start of the appended tail.
+        for (auto i : select) list.push_back(i);
+        std::sort(list.begin() + old_len, list.end(), [&](ClusterIndex a, ClusterIndex b) noexcept {
+            const auto a_anc_count = entries[a].ancestors.Count();
+            const auto b_anc_count = entries[b].ancestors.Count();
+            if (a_anc_count != b_anc_count) return a_anc_count < b_anc_count;
+            return a < b; // Equal ancestor counts: order by index.
+        });
+    }
 };
 
 /** A set of transactions together with their aggregate feerate. */
@@ -486,6 +502,57 @@ public:
     }
 };
 
+/** Find a linearization for a cluster.
+ *
+ * @param[in] depgraph            Dependency graph of the cluster to be linearized.
+ * @param[in] max_iterations      Upper bound on the number of optimization steps that will be done.
+ * @return                        A pair of:
+ *                                - The resulting linearization.
+ *                                - A boolean indicating whether the result is guaranteed to be
+ *                                  optimal.
+ *
+ * Complexity: O(N * min(max_iterations + N, 2^N)) where N=depgraph.TxCount().
+ */
+template<typename SetType>
+std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& depgraph, uint64_t max_iterations) noexcept
+{
+    if (depgraph.TxCount() == 0) return {{}, true};
+
+    uint64_t iterations_left = max_iterations;
+    std::vector<ClusterIndex> linearization;
+
+    AncestorCandidateFinder anc_finder(depgraph);
+    SearchCandidateFinder src_finder(depgraph);
+    linearization.reserve(depgraph.TxCount());
+    bool optimal = true;
+
+    while (true) {
+        // Initialize best as the best remaining ancestor set.
+        auto best = anc_finder.FindCandidateSet();
+
+        // Invoke bounded search to update best, with half of our remaining iterations (rounded
+        // up) as limit.
+        uint64_t max_iterations_now = (iterations_left + 1) / 2;
+        uint64_t iterations_done_now = 0;
+        std::tie(best, iterations_done_now) = src_finder.FindCandidateSet(max_iterations_now, best);
+        iterations_left -= iterations_done_now;
+
+        if (iterations_done_now == max_iterations_now) {
+            optimal = false; // The search used its entire limit, so optimality is not guaranteed.
+        }
+
+        // Add to output in topological order.
+        depgraph.AppendTopo(linearization, best.transactions);
+
+        // Update state to reflect best is no longer to be linearized.
+        anc_finder.MarkDone(best.transactions);
+        if (anc_finder.AllDone()) break; // Nothing left; src_finder needs no further update.
+        src_finder.MarkDone(best.transactions);
+    }
+
+    return {std::move(linearization), optimal};
+}
+
 } // namespace cluster_linearize
 
 #endif // BITCOIN_CLUSTER_LINEARIZE_H
diff --git a/src/test/fuzz/cluster_linearize.cpp b/src/test/fuzz/cluster_linearize.cpp
index f93eb9c8e3b..e4d9a59b8d3 100644
--- a/src/test/fuzz/cluster_linearize.cpp
+++ b/src/test/fuzz/cluster_linearize.cpp
@@ -11,6 +11,7 @@
 #include <util/bitset.h>
 #include <util/feefrac.h>
 
+#include <algorithm>
 #include <stdint.h>
 #include <vector>
 #include <utility>
@@ -140,6 +141,29 @@ public:
     }
 };
 
+/** A simple linearization algorithm.
+ *
+ * This matches Linearize() in interface and behavior, though with fewer optimizations, and using
+ * just SimpleCandidateFinder rather than AncestorCandidateFinder and SearchCandidateFinder.
+ */
+template<typename SetType>
+std::pair<std::vector<ClusterIndex>, bool> SimpleLinearize(const DepGraph<SetType>& depgraph, uint64_t max_iterations)
+{
+    std::vector<ClusterIndex> linearization;
+    SimpleCandidateFinder finder(depgraph);
+    SetType todo = SetType::Fill(depgraph.TxCount());
+    bool optimal = true;
+    while (todo.Any()) {
+        auto [candidate, iterations_done] = finder.FindCandidateSet(max_iterations); // Whole remaining budget.
+        if (iterations_done == max_iterations) optimal = false; // Budget used up in this step.
+        depgraph.AppendTopo(linearization, candidate.transactions);
+        todo -= candidate.transactions;
+        finder.MarkDone(candidate.transactions);
+        max_iterations -= iterations_done; // Charge this step against the budget.
+    }
+    return {std::move(linearization), optimal};
+}
+
 /** Given a dependency graph, and a todo set, read a topological subset of todo from reader. */
 template<typename SetType>
 SetType ReadTopologicalSet(const DepGraph<SetType>& depgraph, const SetType& todo, SpanReader& reader)
@@ -458,3 +482,68 @@ FUZZ_TARGET(clusterlin_search_finder)
     assert(exh_finder.AllDone());
     assert(anc_finder.AllDone());
 }
+
+FUZZ_TARGET(clusterlin_linearize)
+{
+    // Verify the behavior of Linearize().
+
+    // Retrieve an iteration count, and a depgraph from the fuzz input.
+    SpanReader reader(buffer);
+    DepGraph<TestBitSet> depgraph;
+    uint64_t iter_count{0};
+    try {
+        reader >> VARINT(iter_count) >> Using<DepGraphFormatter>(depgraph);
+    } catch (const std::ios_base::failure&) {}
+
+    // Invoke Linearize().
+    iter_count &= 0x7ffff;
+    auto [linearization, optimal] = Linearize(depgraph, iter_count);
+    SanityCheck(depgraph, linearization);
+    auto chunking = ChunkLinearization(depgraph, linearization);
+
+    // If the iteration count is sufficiently high, an optimal linearization must be found.
+    // Each linearization step can use up to 2^k iterations, with steps k=1..n. That sum is
+    // 2 * (2^n - 1).
+    const uint64_t n = depgraph.TxCount();
+    if (n <= 18 && iter_count > 2U * ((uint64_t{1} << n) - 1U)) {
+        assert(optimal);
+    }
+
+    // If Linearize claims optimal result, run quality tests.
+    if (optimal) {
+        // It must be as good as SimpleLinearize.
+        auto [simple_linearization, simple_optimal] = SimpleLinearize(depgraph, MAX_SIMPLE_ITERATIONS);
+        SanityCheck(depgraph, simple_linearization);
+        auto simple_chunking = ChunkLinearization(depgraph, simple_linearization);
+        auto cmp = CompareChunks(chunking, simple_chunking);
+        assert(cmp >= 0);
+        // If SimpleLinearize finds the optimal result too, they must be equal (if not,
+        // SimpleLinearize is broken).
+        if (simple_optimal) assert(cmp == 0);
+
+        // Only for very small clusters, test every topologically-valid permutation.
+        if (depgraph.TxCount() <= 7) {
+            std::vector<ClusterIndex> perm_linearization(depgraph.TxCount());
+            for (ClusterIndex i = 0; i < depgraph.TxCount(); ++i) perm_linearization[i] = i; // Sorted start.
+            // Iterate over all permutations; only topologically valid ones are compared below.
+            do {
+                // Determine whether perm_linearization is topological.
+                TestBitSet perm_done;
+                bool perm_is_topo{true};
+                for (auto i : perm_linearization) {
+                    perm_done.Set(i); // Set first: Ancestors(i) appears to include i itself.
+                    if (!depgraph.Ancestors(i).IsSubsetOf(perm_done)) {
+                        perm_is_topo = false;
+                        break;
+                    }
+                }
+                // If so, verify that the obtained linearization is as good as the permutation.
+                if (perm_is_topo) {
+                    auto perm_chunking = ChunkLinearization(depgraph, perm_linearization);
+                    auto cmp = CompareChunks(chunking, perm_chunking);
+                    assert(cmp >= 0);
+                }
+            } while(std::next_permutation(perm_linearization.begin(), perm_linearization.end()));
+        }
+    }
+}
diff --git a/src/test/util/cluster_linearize.h b/src/test/util/cluster_linearize.h
index 771fbd543a7..508a08133c3 100644
--- a/src/test/util/cluster_linearize.h
+++ b/src/test/util/cluster_linearize.h
@@ -7,6 +7,7 @@
 
 #include <cluster_linearize.h>
 #include <serialize.h>
+#include <span.h>
 #include <streams.h>
 #include <util/bitset.h>
 #include <util/feefrac.h>
@@ -331,6 +332,22 @@ void VerifyDepGraphFromCluster(const Cluster<SetType>& cluster, const DepGraph<S
     }
 }
 
+/** Assert that linearization is a complete, duplicate-free, topological ordering of depgraph. */
+template<typename SetType>
+void SanityCheck(const DepGraph<SetType>& depgraph, Span<const ClusterIndex> linearization)
+{
+    // Check completeness.
+    assert(linearization.size() == depgraph.TxCount());
+    TestBitSet done; // Transactions seen so far. NOTE(review): TestBitSet, not SetType.
+    for (auto i : linearization) {
+        // Check transaction position is in range.
+        assert(i < depgraph.TxCount());
+        // Check topology and lack of duplicates: of i's ancestors, only i itself may be missing.
+        assert((depgraph.Ancestors(i) - done) == TestBitSet::Singleton(i));
+        done.Set(i);
+    }
+}
+
 } // namespace
 
 #endif // BITCOIN_TEST_UTIL_CLUSTER_LINEARIZE_H

From d9b235e7d288814e8ee248b68d91eb68866b32bd Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Wed, 15 May 2024 21:09:31 -0400
Subject: [PATCH 07/12] bench: Candidate finding and linearization benchmarks

Add benchmarks for known bad graphs for the purpose of search (as
an upper bound on work per search iteration) and ancestor sorting
(as an upper bound on linearization work with no search iterations).
---
 src/Makefile.bench.include      |   1 +
 src/bench/cluster_linearize.cpp | 158 ++++++++++++++++++++++++++++++++
 2 files changed, 159 insertions(+)
 create mode 100644 src/bench/cluster_linearize.cpp

diff --git a/src/Makefile.bench.include b/src/Makefile.bench.include
index 7e3aa369c7c..fe6333d8c0b 100644
--- a/src/Makefile.bench.include
+++ b/src/Makefile.bench.include
@@ -25,6 +25,7 @@ bench_bench_bitcoin_SOURCES = \
   bench/checkblock.cpp \
   bench/checkblockindex.cpp \
   bench/checkqueue.cpp \
+  bench/cluster_linearize.cpp \
   bench/crypto_hash.cpp \
   bench/data.cpp \
   bench/data.h \
diff --git a/src/bench/cluster_linearize.cpp b/src/bench/cluster_linearize.cpp
new file mode 100644
index 00000000000..cf94580ab20
--- /dev/null
+++ b/src/bench/cluster_linearize.cpp
@@ -0,0 +1,158 @@
+// Copyright (c) The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#include <bench/bench.h>
+
+#include <util/bitset.h>
+#include <cluster_linearize.h>
+
+using namespace cluster_linearize;
+
+namespace {
+
+/** Construct a linear graph. These are pessimal for AncestorCandidateFinder, as they maximize
+ *  the number of ancestor set feerate updates. The best ancestor set is always the topmost
+ *  remaining transaction, whose removal requires updating all remaining transactions' ancestor
+ *  set feerates. */
+template<typename SetType>
+DepGraph<SetType> MakeLinearGraph(ClusterIndex ntx)
+{
+    DepGraph<SetType> depgraph;
+    for (ClusterIndex i = 0; i < ntx; ++i) {
+        depgraph.AddTransaction({-int32_t(i), 1}); // Values 0, -1, -2, ... (presumably fee), with 1.
+        if (i > 0) depgraph.AddDependency(i - 1, i); // Link each transaction to its predecessor.
+    }
+    return depgraph;
+}
+
+// Construct a difficult graph; its shape depends on whether ntx is odd or even. These need at
+// least sqrt(2^(n-1)) iterations in the best known algorithms (purely empirically determined).
+template<typename SetType>
+DepGraph<SetType> MakeHardGraph(ClusterIndex ntx)
+{
+    DepGraph<SetType> depgraph;
+    for (ClusterIndex i = 0; i < ntx; ++i) {
+        if (ntx & 1) {
+            // Odd cluster size.
+            //
+            // Mermaid diagram code for the resulting cluster for 11 transactions:
+            // ```mermaid
+            // graph BT
+            // T0["T0: 1/2"];T1["T1: 14/2"];T2["T2: 6/1"];T3["T3: 5/1"];T4["T4: 7/1"];
+            // T5["T5: 5/1"];T6["T6: 7/1"];T7["T7: 5/1"];T8["T8: 7/1"];T9["T9: 5/1"];
+            // T10["T10: 7/1"];
+            // T1-->T0;T1-->T2;T3-->T2;T4-->T3;T4-->T5;T6-->T5;T4-->T7;T8-->T7;T4-->T9;T10-->T9;
+            // ```
+            if (i == 0) {
+                depgraph.AddTransaction({1, 2});
+            } else if (i == 1) {
+                depgraph.AddTransaction({14, 2});
+                depgraph.AddDependency(0, 1);
+            } else if (i == 2) {
+                depgraph.AddTransaction({6, 1});
+                depgraph.AddDependency(2, 1);
+            } else if (i == 3) {
+                depgraph.AddTransaction({5, 1});
+                depgraph.AddDependency(2, 3);
+            } else if ((i & 1) == 0) {
+                depgraph.AddTransaction({7, 1});
+                depgraph.AddDependency(i - 1, i);
+            } else {
+                depgraph.AddTransaction({5, 1});
+                depgraph.AddDependency(i, 4); // Matches the T4-->Ti edges in the diagram above.
+            }
+        } else {
+            // Even cluster size.
+            //
+            // Mermaid diagram code for the resulting cluster for 10 transactions:
+            // ```mermaid
+            // graph BT
+            // T0["T0: 1"];T1["T1: 3"];T2["T2: 1"];T3["T3: 4"];T4["T4: 0"];T5["T5: 4"];T6["T6: 0"];
+            // T7["T7: 4"];T8["T8: 0"];T9["T9: 4"];
+            // T1-->T0;T2-->T0;T3-->T2;T3-->T4;T5-->T4;T3-->T6;T7-->T6;T3-->T8;T9-->T8;
+            // ```
+            if (i == 0) {
+                depgraph.AddTransaction({1, 1});
+            } else if (i == 1) {
+                depgraph.AddTransaction({3, 1});
+                depgraph.AddDependency(0, 1);
+            } else if (i == 2) {
+                depgraph.AddTransaction({1, 1});
+                depgraph.AddDependency(0, 2);
+            } else if (i & 1) {
+                depgraph.AddTransaction({4, 1});
+                depgraph.AddDependency(i - 1, i);
+            } else {
+                depgraph.AddTransaction({0, 1});
+                depgraph.AddDependency(i, 3); // Matches the T3-->Ti edges in the diagram above.
+            }
+        }
+    }
+    return depgraph;
+}
+
+/** Benchmark search-based candidate finding with min(10000, 2^(ntx/2 - 1)) iterations.
+ *
+ * Its goal is measuring how much time every additional search iteration in linearization costs.
+ */
+template<typename SetType>
+void BenchLinearizePerIterWorstCase(ClusterIndex ntx, benchmark::Bench& bench)
+{
+    const auto depgraph = MakeHardGraph<SetType>(ntx);
+    const auto iter_limit = std::min<uint64_t>(10000, uint64_t{1} << (ntx / 2 - 1));
+    bench.batch(iter_limit).unit("iters").run([&] {
+        SearchCandidateFinder finder(depgraph);
+        auto [candidate, iters_performed] = finder.FindCandidateSet(iter_limit, {});
+        assert(iters_performed == iter_limit);
+    });
+}
+
+/** Benchmark for linearization of a linear graph (MakeLinearGraph) using just ancestor sort.
+ *
+ * Its goal is measuring how much time linearization may take without any search iterations.
+ *
+ * If P is the resulting time of BenchLinearizePerIterWorstCase, and N is the resulting time of
+ * BenchLinearizeNoItersWorstCase, then an invocation of Linearize with max_iterations=m should
+ * take no more than roughly N+m*P time. This may however be an overestimate, as the worst cases
+ * do not coincide (the ones that are worst for linearization without any search happen to be ones
+ * that do not need many search iterations).
+ */
+template<typename SetType>
+void BenchLinearizeNoItersWorstCase(ClusterIndex ntx, benchmark::Bench& bench)
+{
+    const auto depgraph = MakeLinearGraph<SetType>(ntx);
+    bench.run([&] {
+        Linearize(depgraph, /*max_iterations=*/0);
+    });
+}
+
+} // namespace
+
+static void LinearizePerIter16TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<16>>(16, bench); }
+static void LinearizePerIter32TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<32>>(32, bench); }
+static void LinearizePerIter48TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<48>>(48, bench); }
+static void LinearizePerIter64TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<64>>(64, bench); }
+static void LinearizePerIter75TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<75>>(75, bench); }
+static void LinearizePerIter99TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<99>>(99, bench); }
+
+static void LinearizeNoIters16TxWorstCase(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCase<BitSet<16>>(16, bench); }
+static void LinearizeNoIters32TxWorstCase(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCase<BitSet<32>>(32, bench); }
+static void LinearizeNoIters48TxWorstCase(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCase<BitSet<48>>(48, bench); }
+static void LinearizeNoIters64TxWorstCase(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCase<BitSet<64>>(64, bench); }
+static void LinearizeNoIters75TxWorstCase(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCase<BitSet<75>>(75, bench); }
+static void LinearizeNoIters99TxWorstCase(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCase<BitSet<99>>(99, bench); }
+
+BENCHMARK(LinearizePerIter16TxWorstCase, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizePerIter32TxWorstCase, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizePerIter48TxWorstCase, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizePerIter64TxWorstCase, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizePerIter75TxWorstCase, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizePerIter99TxWorstCase, benchmark::PriorityLevel::HIGH);
+
+BENCHMARK(LinearizeNoIters16TxWorstCase, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters32TxWorstCase, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters48TxWorstCase, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters64TxWorstCase, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters75TxWorstCase, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters99TxWorstCase, benchmark::PriorityLevel::HIGH);

From 991ff9a9a4f2171ab99cb0ca1d70ebbc0db9d388 Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Thu, 9 May 2024 11:53:02 -0400
Subject: [PATCH 08/12] clusterlin: use bounded BFS exploration (optimization)

Switch to BFS exploration of the search tree in SearchCandidateFinder
instead of DFS exploration. This appears to behave better for real
world clusters.

As BFS has the downside of needing far larger search queues, switch
back to DFS temporarily when the queue grows too large.
---
 src/cluster_linearize.h | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index cb6187eeca3..52880529f6f 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -13,6 +13,7 @@
 #include <utility>
 
 #include <util/feefrac.h>
+#include <util/vecdeque.h>
 
 namespace cluster_linearize {
 
@@ -415,7 +416,8 @@ public:
         };
 
         /** The queue of work items. */
-        std::vector<WorkItem> queue;
+        VecDeque<WorkItem> queue;
+        queue.reserve(std::max<size_t>(256, 2 * m_todo.Count()));
 
         // Create an initial entry with m_todo as undecided. Also use it as best if not provided,
         // so that during the work processing loop below, and during the add_fn/split_fn calls, we
@@ -445,7 +447,10 @@ public:
             // Make sure there are undecided transactions left to split on.
             if (und.None()) return;
 
-            // Actually construct a new work item on the queue.
+            // Actually construct a new work item on the queue. Due to the switch to DFS when queue
+            // space runs out (see below), we know that no reallocation of the queue should ever
+            // occur.
+            Assume(queue.size() < queue.capacity());
             queue.emplace_back(std::move(inc), std::move(und));
         };
 
@@ -479,10 +484,33 @@ public:
         };
 
         // Work processing loop.
+        //
+        // New work items are always added at the back of the queue, but items to process use a
+        // hybrid approach where they can be taken from the front or the back.
+        //
+        // Depth-first search (DFS) corresponds to always taking from the back of the queue. This
+        // is very memory-efficient (linear in the number of transactions). Breadth-first search
+        // (BFS) corresponds to always taking from the front, which potentially uses more memory
+        // (up to exponential in the transaction count), but seems to work better in practice.
+        //
+        // The approach here combines the two: use BFS until the queue grows too large, at which
+        // point we temporarily switch to DFS until the size shrinks again.
         while (!queue.empty()) {
+            // Processing the first queue item, and then using DFS for everything it gives rise to,
+            // may increase the queue size by the number of undecided elements in there, minus 1
+            // for the first queue item being removed. Thus, only when that pushes the queue over
+            // its capacity can we not process from the front (BFS), and should we use DFS.
+            while (queue.size() - 1 + queue.front().und.Count() > queue.capacity()) {
+                if (!iterations_left) break;
+                auto elem = queue.back();
+                queue.pop_back();
+                split_fn(std::move(elem));
+            }
+
+            // Process one entry from the front of the queue (BFS exploration)
             if (!iterations_left) break;
-            auto elem = queue.back();
-            queue.pop_back();
+            auto elem = queue.front();
+            queue.pop_front();
             split_fn(std::move(elem));
         }
 

From d5918dc3c6d9480c8a5e295db0f4d4892b0138f6 Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Thu, 9 May 2024 21:41:52 -0400
Subject: [PATCH 09/12] clusterlin: randomize the SearchCandidateFinder search
 order

To make search non-deterministic, change the BFS logic from always picking
the first queue item to randomly picking the first or second queue item.
---
 src/bench/cluster_linearize.cpp     |  6 +++--
 src/cluster_linearize.h             | 37 +++++++++++++++++++++++++----
 src/test/fuzz/cluster_linearize.cpp | 14 ++++++-----
 3 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/src/bench/cluster_linearize.cpp b/src/bench/cluster_linearize.cpp
index cf94580ab20..bfab5c729bd 100644
--- a/src/bench/cluster_linearize.cpp
+++ b/src/bench/cluster_linearize.cpp
@@ -101,8 +101,9 @@ void BenchLinearizePerIterWorstCase(ClusterIndex ntx, benchmark::Bench& bench)
 {
     const auto depgraph = MakeHardGraph<SetType>(ntx);
     const auto iter_limit = std::min<uint64_t>(10000, uint64_t{1} << (ntx / 2 - 1));
+    uint64_t rng_seed = 0;
     bench.batch(iter_limit).unit("iters").run([&] {
-        SearchCandidateFinder finder(depgraph);
+        SearchCandidateFinder finder(depgraph, rng_seed++);
         auto [candidate, iters_performed] = finder.FindCandidateSet(iter_limit, {});
         assert(iters_performed == iter_limit);
     });
@@ -122,8 +123,9 @@ template<typename SetType>
 void BenchLinearizeNoItersWorstCase(ClusterIndex ntx, benchmark::Bench& bench)
 {
     const auto depgraph = MakeLinearGraph<SetType>(ntx);
+    uint64_t rng_seed = 0;
     bench.run([&] {
-        Linearize(depgraph, /*max_iterations=*/0);
+        Linearize(depgraph, /*max_iterations=*/0, rng_seed++);
     });
 }
 
diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index 52880529f6f..f689e7e33a5 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -12,6 +12,7 @@
 #include <vector>
 #include <utility>
 
+#include <random.h>
 #include <util/feefrac.h>
 #include <util/vecdeque.h>
 
@@ -225,6 +226,13 @@ struct SetInfo
         return {transactions | txn, feerate + depgraph.FeeRate(txn - transactions)};
     }
 
+    /** Swap two SetInfo objects by swapping their transactions and feerate members. */
+    friend void swap(SetInfo& a, SetInfo& b) noexcept
+    {
+        swap(a.transactions, b.transactions);
+        swap(a.feerate, b.feerate);
+    }
+
     /** Permit equality testing. */
     friend bool operator==(const SetInfo&, const SetInfo&) noexcept = default;
 };
@@ -356,6 +364,8 @@ public:
 template<typename SetType>
 class SearchCandidateFinder
 {
+    /** Internal RNG. */
+    InsecureRandomContext m_rng;
     /** Internal dependency graph for the cluster. */
     const DepGraph<SetType>& m_depgraph;
     /** Which transactions are left to do (sorted indices). */
@@ -365,10 +375,12 @@ public:
     /** Construct a candidate finder for a graph.
      *
      * @param[in] depgraph   Dependency graph for the to-be-linearized cluster.
+     * @param[in] rng_seed   A random seed to control the search order.
      *
      * Complexity: O(1).
      */
-    SearchCandidateFinder(const DepGraph<SetType>& depgraph LIFETIMEBOUND) noexcept :
+    SearchCandidateFinder(const DepGraph<SetType>& depgraph LIFETIMEBOUND, uint64_t rng_seed) noexcept :
+        m_rng(rng_seed),
         m_depgraph(depgraph),
         m_todo(SetType::Fill(depgraph.TxCount())) {}
 
@@ -413,6 +425,13 @@ public:
             /** Construct a new work item. */
             WorkItem(SetInfo<SetType>&& i, SetType&& u) noexcept :
                 inc(std::move(i)), und(std::move(u)) {}
+
+            /** Swap two WorkItems. */
+            void Swap(WorkItem& other) noexcept
+            {
+                swap(inc, other.inc);
+                swap(und, other.und);
+            }
         };
 
         /** The queue of work items. */
@@ -493,9 +512,14 @@ public:
         // (BFS) corresponds to always taking from the front, which potentially uses more memory
         // (up to exponential in the transaction count), but seems to work better in practice.
         //
-        // The approach here combines the two: use BFS until the queue grows too large, at which
-        // point we temporarily switch to DFS until the size shrinks again.
+        // The approach here combines the two: use BFS (plus random swapping) until the queue grows
+        // too large, at which point we temporarily switch to DFS until the size shrinks again.
         while (!queue.empty()) {
+            // Randomly swap the first two items to randomize the search order.
+            if (queue.size() > 1 && m_rng.randbool()) {
+                queue[0].Swap(queue[1]);
+            }
+
             // Processing the first queue item, and then using DFS for everything it gives rise to,
             // may increase the queue size by the number of undecided elements in there, minus 1
             // for the first queue item being removed. Thus, only when that pushes the queue over
@@ -534,6 +558,9 @@ public:
  *
  * @param[in] depgraph            Dependency graph of the cluster to be linearized.
  * @param[in] max_iterations      Upper bound on the number of optimization steps that will be done.
+ * @param[in] rng_seed            A random number seed to control search order. This prevents peers
+ *                                from predicting exactly which clusters would be hard for us to
+ *                                linearize.
  * @return                        A pair of:
  *                                - The resulting linearization.
  *                                - A boolean indicating whether the result is guaranteed to be
@@ -542,7 +569,7 @@ public:
  * Complexity: O(N * min(max_iterations + N, 2^N)) where N=depgraph.TxCount().
  */
 template<typename SetType>
-std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& depgraph, uint64_t max_iterations) noexcept
+std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& depgraph, uint64_t max_iterations, uint64_t rng_seed) noexcept
 {
     if (depgraph.TxCount() == 0) return {{}, true};
 
@@ -550,7 +577,7 @@ std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& de
     std::vector<ClusterIndex> linearization;
 
     AncestorCandidateFinder anc_finder(depgraph);
-    SearchCandidateFinder src_finder(depgraph);
+    SearchCandidateFinder src_finder(depgraph, rng_seed);
     linearization.reserve(depgraph.TxCount());
     bool optimal = true;
 
diff --git a/src/test/fuzz/cluster_linearize.cpp b/src/test/fuzz/cluster_linearize.cpp
index e4d9a59b8d3..6157291364b 100644
--- a/src/test/fuzz/cluster_linearize.cpp
+++ b/src/test/fuzz/cluster_linearize.cpp
@@ -391,15 +391,16 @@ FUZZ_TARGET(clusterlin_search_finder)
     // and comparing with the results from SimpleCandidateFinder, ExhaustiveCandidateFinder, and
     // AncestorCandidateFinder.
 
-    // Retrieve a depgraph from the fuzz input.
+    // Retrieve an RNG seed and a depgraph from the fuzz input.
     SpanReader reader(buffer);
     DepGraph<TestBitSet> depgraph;
+    uint64_t rng_seed{0};
     try {
-        reader >> Using<DepGraphFormatter>(depgraph);
+        reader >> Using<DepGraphFormatter>(depgraph) >> rng_seed;
     } catch (const std::ios_base::failure&) {}
 
     // Instantiate ALL the candidate finders.
-    SearchCandidateFinder src_finder(depgraph);
+    SearchCandidateFinder src_finder(depgraph, rng_seed);
     SimpleCandidateFinder smp_finder(depgraph);
     ExhaustiveCandidateFinder exh_finder(depgraph);
     AncestorCandidateFinder anc_finder(depgraph);
@@ -487,17 +488,18 @@ FUZZ_TARGET(clusterlin_linearize)
 {
     // Verify the behavior of Linearize().
 
-    // Retrieve an iteration count, and a depgraph from the fuzz input.
+    // Retrieve an iteration count, a depgraph, and an RNG seed from the fuzz input.
     SpanReader reader(buffer);
     DepGraph<TestBitSet> depgraph;
+    uint64_t rng_seed{0};
     uint64_t iter_count{0};
     try {
-        reader >> VARINT(iter_count) >> Using<DepGraphFormatter>(depgraph);
+        reader >> VARINT(iter_count) >> Using<DepGraphFormatter>(depgraph) >> rng_seed;
     } catch (const std::ios_base::failure&) {}
 
     // Invoke Linearize().
     iter_count &= 0x7ffff;
-    auto [linearization, optimal] = Linearize(depgraph, iter_count);
+    auto [linearization, optimal] = Linearize(depgraph, iter_count, rng_seed);
     SanityCheck(depgraph, linearization);
     auto chunking = ChunkLinearization(depgraph, linearization);
 

From 97d98718b005adc0bdf513d724874601d8aa13ad Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Wed, 10 Jul 2024 11:08:42 -0400
Subject: [PATCH 10/12] clusterlin: add LinearizationChunking class

It encapsulates a given linearization in chunked form, permitting arbitrary
subsets of transactions to be removed from the linearization. Its purpose
is adding the Intersect function, which is a crucial operation that will
be used in a further commit to make Linearize improve existing linearizations.
---
 src/cluster_linearize.h             | 109 ++++++++++++++++++++++++++
 src/test/fuzz/cluster_linearize.cpp | 117 ++++++++++++++++++++++++++++
 2 files changed, 226 insertions(+)

diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index f689e7e33a5..4238295630c 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -13,6 +13,7 @@
 #include <utility>
 
 #include <random.h>
+#include <span.h>
 #include <util/feefrac.h>
 #include <util/vecdeque.h>
 
@@ -256,6 +257,114 @@ std::vector<FeeFrac> ChunkLinearization(const DepGraph<SetType>& depgraph, Span<
     return ret;
 }
 
+/** Data structure encapsulating the chunking of a linearization, permitting removal of subsets. */
+template<typename SetType>
+class LinearizationChunking
+{
+    /** The depgraph this linearization is for. */
+    const DepGraph<SetType>& m_depgraph;
+
+    /** The linearization we started from. */
+    Span<const ClusterIndex> m_linearization;
+
+    /** Chunk sets and their feerates, of what remains of the linearization. */
+    std::vector<SetInfo<SetType>> m_chunks;
+
+    /** Which transactions remain in the linearization. */
+    SetType m_todo;
+
+    /** Fill the m_chunks variable. */
+    void BuildChunks() noexcept
+    {
+        // Caller must clear m_chunks.
+        Assume(m_chunks.empty());
+
+        // Iterate over the entries in m_linearization. This is effectively the same
+        // algorithm as ChunkLinearization, but supports skipping parts of the linearization and
+        // keeps track of the sets themselves instead of just their feerates.
+        for (auto idx : m_linearization) {
+            if (!m_todo[idx]) continue;
+            // Start with an initial chunk containing just element idx.
+            SetInfo add(m_depgraph, idx);
+            // Absorb existing final chunks into add while they have lower feerate.
+            while (!m_chunks.empty() && add.feerate >> m_chunks.back().feerate) {
+                add |= m_chunks.back();
+                m_chunks.pop_back();
+            }
+            // Remember new chunk.
+            m_chunks.push_back(std::move(add));
+        }
+    }
+
+public:
+    /** Initialize a LinearizationChunking object for a given linearization. */
+    explicit LinearizationChunking(const DepGraph<SetType>& depgraph LIFETIMEBOUND, Span<const ClusterIndex> lin LIFETIMEBOUND) noexcept :
+        m_depgraph(depgraph), m_linearization(lin)
+    {
+        // Mark everything in lin as todo still.
+        for (auto i : m_linearization) m_todo.Set(i);
+        // Compute the initial chunking.
+        m_chunks.reserve(depgraph.TxCount());
+        BuildChunks();
+    }
+
+    /** Determine how many chunks remain in the linearization. */
+    ClusterIndex NumChunksLeft() const noexcept { return m_chunks.size(); }
+
+    /** Access a chunk. Chunk 0 is the highest-feerate prefix of what remains. */
+    const SetInfo<SetType>& GetChunk(ClusterIndex n) const noexcept
+    {
+        Assume(n < m_chunks.size());
+        return m_chunks[n];
+    }
+
+    /** Remove some subset of transactions from the linearization. */
+    void MarkDone(SetType subset) noexcept
+    {
+        Assume(subset.Any());
+        Assume(subset.IsSubsetOf(m_todo));
+        m_todo -= subset;
+        // Rechunk what remains of m_linearization.
+        m_chunks.clear();
+        BuildChunks();
+    }
+
+    /** Find the shortest intersection between subset and the prefixes of remaining chunks
+     *  of the linearization that has a feerate not below subset's.
+     *
+     * This is a crucial operation in guaranteeing improvements to linearizations. If subset has
+     * a feerate not below GetChunk(0)'s, then moving Intersect(subset) to the front of (what
+     * remains of) the linearization is guaranteed not to make it worse at any point.
+     *
+     * See https://delvingbitcoin.org/t/introduction-to-cluster-linearization/1032 for background.
+     */
+    SetInfo<SetType> Intersect(const SetInfo<SetType>& subset) const noexcept
+    {
+        Assume(subset.transactions.IsSubsetOf(m_todo));
+        SetInfo<SetType> accumulator;
+        // Iterate over all chunks of the remaining linearization.
+        for (ClusterIndex i = 0; i < NumChunksLeft(); ++i) {
+            // Find what (if any) intersection the chunk has with subset.
+            const SetType to_add = GetChunk(i).transactions & subset.transactions;
+            if (to_add.Any()) {
+                // If adding that to accumulator makes us hit all of subset, we are done as no
+                // shorter intersection with higher/equal feerate exists.
+                accumulator.transactions |= to_add;
+                if (accumulator.transactions == subset.transactions) break;
+                // Otherwise update the accumulator feerate.
+                accumulator.feerate += m_depgraph.FeeRate(to_add);
+                // If that does result in something better, or something with the same feerate but
+                // smaller, return that. Even if a longer, higher-feerate intersection exists, it
+                // does not hurt to return the shorter one (the remainder of the longer intersection
+                // will generally be found in the next call to Intersect, but even if not, it is not
+                // required for the improvement guarantee this function makes).
+                if (!(accumulator.feerate << subset.feerate)) return accumulator;
+            }
+        }
+        return subset;
+    }
+};
+
 /** Class encapsulating the state needed to find the best remaining ancestor set.
  *
  * It is initialized for an entire DepGraph, and parts of the graph can be dropped by calling
diff --git a/src/test/fuzz/cluster_linearize.cpp b/src/test/fuzz/cluster_linearize.cpp
index 6157291364b..5c8b9f59055 100644
--- a/src/test/fuzz/cluster_linearize.cpp
+++ b/src/test/fuzz/cluster_linearize.cpp
@@ -484,6 +484,123 @@ FUZZ_TARGET(clusterlin_search_finder)
     assert(anc_finder.AllDone());
 }
 
+FUZZ_TARGET(clusterlin_linearization_chunking)
+{
+    // Verify the behavior of LinearizationChunking.
+
+    // Retrieve a depgraph from the fuzz input.
+    SpanReader reader(buffer);
+    DepGraph<TestBitSet> depgraph;
+    try {
+        reader >> Using<DepGraphFormatter>(depgraph);
+    } catch (const std::ios_base::failure&) {}
+
+    // Retrieve a topologically-valid subset of depgraph.
+    auto todo = TestBitSet::Fill(depgraph.TxCount());
+    auto subset = SetInfo(depgraph, ReadTopologicalSet(depgraph, todo, reader));
+
+    // Retrieve a valid linearization for depgraph.
+    auto linearization = ReadLinearization(depgraph, reader);
+
+    // Construct a LinearizationChunking object, initially for the whole linearization.
+    LinearizationChunking chunking(depgraph, linearization);
+
+    // Incrementally remove transactions from the chunking object, and check various properties at
+    // every step.
+    while (todo.Any()) {
+        assert(chunking.NumChunksLeft() > 0);
+
+        // Construct linearization with just todo.
+        std::vector<ClusterIndex> linearization_left;
+        for (auto i : linearization) {
+            if (todo[i]) linearization_left.push_back(i);
+        }
+
+        // Compute the chunking for linearization_left.
+        auto chunking_left = ChunkLinearization(depgraph, linearization_left);
+
+        // Verify that it matches the feerates of the chunks of chunking.
+        assert(chunking.NumChunksLeft() == chunking_left.size());
+        for (ClusterIndex i = 0; i < chunking.NumChunksLeft(); ++i) {
+            assert(chunking.GetChunk(i).feerate == chunking_left[i]);
+        }
+
+        // Check consistency of chunking.
+        TestBitSet combined;
+        for (ClusterIndex i = 0; i < chunking.NumChunksLeft(); ++i) {
+            const auto& chunk_info = chunking.GetChunk(i);
+            // Chunks must be non-empty.
+            assert(chunk_info.transactions.Any());
+            // Chunk feerates must be monotonically non-increasing.
+            if (i > 0) assert(!(chunk_info.feerate >> chunking.GetChunk(i - 1).feerate));
+            // Chunks must be a subset of what is left of the linearization.
+            assert(chunk_info.transactions.IsSubsetOf(todo));
+            // Chunks' claimed feerates must match their transactions' aggregate feerate.
+            assert(depgraph.FeeRate(chunk_info.transactions) == chunk_info.feerate);
+            // Chunks must be the highest-feerate remaining prefix.
+            SetInfo<TestBitSet> accumulator, best;
+            for (auto j : linearization) {
+                if (todo[j] && !combined[j]) {
+                    accumulator |= SetInfo(depgraph, j);
+                    if (best.feerate.IsEmpty() || accumulator.feerate > best.feerate) {
+                        best = accumulator;
+                    }
+                }
+            }
+            assert(best.transactions == chunk_info.transactions);
+            assert(best.feerate == chunk_info.feerate);
+            // Chunks cannot overlap.
+            assert(!chunk_info.transactions.Overlaps(combined));
+            combined |= chunk_info.transactions;
+            // Chunks must be topological.
+            for (auto idx : chunk_info.transactions) {
+                assert((depgraph.Ancestors(idx) & todo).IsSubsetOf(combined));
+            }
+        }
+        assert(combined == todo);
+
+        // Verify the expected properties of LinearizationChunking::Intersect:
+        auto intersect = chunking.Intersect(subset);
+        // - Intersecting again doesn't change the result.
+        assert(chunking.Intersect(intersect) == intersect);
+        // - The intersection is topological.
+        TestBitSet intersect_anc;
+        for (auto idx : intersect.transactions) {
+            intersect_anc |= (depgraph.Ancestors(idx) & todo);
+        }
+        assert(intersect.transactions == intersect_anc);
+        // - The claimed intersection feerate matches its transactions.
+        assert(intersect.feerate == depgraph.FeeRate(intersect.transactions));
+        // - The intersection may only be empty if its input is empty.
+        assert(intersect.transactions.Any() == subset.transactions.Any());
+        // - The intersection feerate must be at least as high as the input.
+        assert(intersect.feerate >= subset.feerate);
+        // - No non-empty intersection between the intersection and a prefix of the chunks of the
+        //   remainder of the linearization may be better than the intersection.
+        TestBitSet prefix;
+        for (ClusterIndex i = 0; i < chunking.NumChunksLeft(); ++i) {
+            prefix |= chunking.GetChunk(i).transactions;
+            auto reintersect = SetInfo(depgraph, prefix & intersect.transactions);
+            if (!reintersect.feerate.IsEmpty()) {
+                assert(reintersect.feerate <= intersect.feerate);
+            }
+        }
+
+        // Find a subset to remove from linearization.
+        auto done = ReadTopologicalSet(depgraph, todo, reader);
+        if (done.None()) {
+            // We need to remove a non-empty subset, so fall back to the unlinearized ancestors of
+            // the first transaction in todo if done is empty.
+            done = depgraph.Ancestors(todo.First()) & todo;
+        }
+        todo -= done;
+        chunking.MarkDone(done);
+        subset = SetInfo(depgraph, subset.transactions - done);
+    }
+
+    assert(chunking.NumChunksLeft() == 0);
+}
+
 FUZZ_TARGET(clusterlin_linearize)
 {
     // Verify the behavior of Linearize().

From 28549791b3802fc078128f552c6f53ac3de893a6 Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Thu, 9 May 2024 09:02:18 -0400
Subject: [PATCH 11/12] clusterlin: permit passing in existing linearization to
 Linearize

This implements the LIMO algorithm for linearizing by improving an existing
linearization. See
https://delvingbitcoin.org/t/limo-combining-the-best-parts-of-linearization-search-and-merging
for details.
---
 src/bench/cluster_linearize.cpp     |  6 ++++--
 src/cluster_linearize.h             | 29 +++++++++++++++++++++++++----
 src/test/fuzz/cluster_linearize.cpp | 27 ++++++++++++++++++++++++---
 3 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/src/bench/cluster_linearize.cpp b/src/bench/cluster_linearize.cpp
index bfab5c729bd..888684fe932 100644
--- a/src/bench/cluster_linearize.cpp
+++ b/src/bench/cluster_linearize.cpp
@@ -109,7 +109,7 @@ void BenchLinearizePerIterWorstCase(ClusterIndex ntx, benchmark::Bench& bench)
     });
 }
 
-/** Benchmark for linearization of a trivial linear graph using just ancestor sort.
+/** Benchmark for linearization improvement of a trivial linear graph using just ancestor sort.
  *
  * Its goal is measuring how much time linearization may take without any search iterations.
  *
@@ -124,8 +124,10 @@ void BenchLinearizeNoItersWorstCase(ClusterIndex ntx, benchmark::Bench& bench)
 {
     const auto depgraph = MakeLinearGraph<SetType>(ntx);
     uint64_t rng_seed = 0;
+    std::vector<ClusterIndex> old_lin(ntx);
+    for (ClusterIndex i = 0; i < ntx; ++i) old_lin[i] = i;
     bench.run([&] {
-        Linearize(depgraph, /*max_iterations=*/0, rng_seed++);
+        Linearize(depgraph, /*max_iterations=*/0, rng_seed++, old_lin);
     });
 }
 
diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
index 4238295630c..07d28a9aa51 100644
--- a/src/cluster_linearize.h
+++ b/src/cluster_linearize.h
@@ -663,23 +663,27 @@ public:
     }
 };
 
-/** Find a linearization for a cluster.
+/** Find or improve a linearization for a cluster.
  *
  * @param[in] depgraph            Dependency graph of the cluster to be linearized.
  * @param[in] max_iterations      Upper bound on the number of optimization steps that will be done.
  * @param[in] rng_seed            A random number seed to control search order. This prevents peers
  *                                from predicting exactly which clusters would be hard for us to
  *                                linearize.
+ * @param[in] old_linearization   An existing linearization for the cluster (which must be
+ *                                topologically valid), or empty.
  * @return                        A pair of:
- *                                - The resulting linearization.
+ *                                - The resulting linearization. It is guaranteed to be at least as
+ *                                  good (in the feerate diagram sense) as old_linearization.
  *                                - A boolean indicating whether the result is guaranteed to be
  *                                  optimal.
  *
  * Complexity: O(N * min(max_iterations + N, 2^N)) where N=depgraph.TxCount().
  */
 template<typename SetType>
-std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& depgraph, uint64_t max_iterations, uint64_t rng_seed) noexcept
+std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& depgraph, uint64_t max_iterations, uint64_t rng_seed, Span<const ClusterIndex> old_linearization = {}) noexcept
 {
+    Assume(old_linearization.empty() || old_linearization.size() == depgraph.TxCount());
     if (depgraph.TxCount() == 0) return {{}, true};
 
     uint64_t iterations_left = max_iterations;
@@ -690,9 +694,17 @@ std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& de
     linearization.reserve(depgraph.TxCount());
     bool optimal = true;
 
+    /** Chunking of what remains of the old linearization. */
+    LinearizationChunking old_chunking(depgraph, old_linearization);
+
     while (true) {
-        // Initialize best as the best remaining ancestor set.
+        // Find the highest-feerate prefix of the remainder of old_linearization.
+        SetInfo<SetType> best_prefix;
+        if (old_chunking.NumChunksLeft()) best_prefix = old_chunking.GetChunk(0);
+
+        // Then initialize best to be either the best remaining ancestor set, or the first chunk.
         auto best = anc_finder.FindCandidateSet();
+        if (!best_prefix.feerate.IsEmpty() && best_prefix.feerate >= best.feerate) best = best_prefix;
 
         // Invoke bounded search to update best, with up to half of our remaining iterations as
         // limit.
@@ -703,6 +715,12 @@ std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& de
 
         if (iterations_done_now == max_iterations_now) {
             optimal = false;
+            // If the search result is not (guaranteed to be) optimal, run intersections to make
+            // sure we don't pick something that makes us unable to reach further diagram points
+            // of the old linearization.
+            if (old_chunking.NumChunksLeft() > 0) {
+                best = old_chunking.Intersect(best);
+            }
         }
 
         // Add to output in topological order.
@@ -712,6 +730,9 @@ std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& de
         anc_finder.MarkDone(best.transactions);
         if (anc_finder.AllDone()) break;
         src_finder.MarkDone(best.transactions);
+        if (old_chunking.NumChunksLeft() > 0) {
+            old_chunking.MarkDone(best.transactions);
+        }
     }
 
     return {std::move(linearization), optimal};
diff --git a/src/test/fuzz/cluster_linearize.cpp b/src/test/fuzz/cluster_linearize.cpp
index 5c8b9f59055..031cb045593 100644
--- a/src/test/fuzz/cluster_linearize.cpp
+++ b/src/test/fuzz/cluster_linearize.cpp
@@ -143,8 +143,9 @@ public:
 
 /** A simple linearization algorithm.
  *
- * This matches Linearize() in interface and behavior, though with fewer optimizations, and using
- * just SimpleCandidateFinder rather than AncestorCandidateFinder and SearchCandidateFinder.
+ * This matches Linearize() in interface and behavior, though with fewer optimizations, lacking
+ * the ability to pass in an existing linearization, and using just SimpleCandidateFinder rather
+ * than AncestorCandidateFinder and SearchCandidateFinder.
  */
 template<typename SetType>
 std::pair<std::vector<ClusterIndex>, bool> SimpleLinearize(const DepGraph<SetType>& depgraph, uint64_t max_iterations)
@@ -614,12 +615,32 @@ FUZZ_TARGET(clusterlin_linearize)
         reader >> VARINT(iter_count) >> Using<DepGraphFormatter>(depgraph) >> rng_seed;
     } catch (const std::ios_base::failure&) {}
 
+    // Optionally construct an old linearization for it.
+    std::vector<ClusterIndex> old_linearization;
+    {
+        uint8_t have_old_linearization{0};
+        try {
+            reader >> have_old_linearization;
+        } catch(const std::ios_base::failure&) {}
+        if (have_old_linearization & 1) {
+            old_linearization = ReadLinearization(depgraph, reader);
+            SanityCheck(depgraph, old_linearization);
+        }
+    }
+
     // Invoke Linearize().
     iter_count &= 0x7ffff;
-    auto [linearization, optimal] = Linearize(depgraph, iter_count, rng_seed);
+    auto [linearization, optimal] = Linearize(depgraph, iter_count, rng_seed, old_linearization);
     SanityCheck(depgraph, linearization);
     auto chunking = ChunkLinearization(depgraph, linearization);
 
+    // Linearization must always be at least as good as the old one, if provided.
+    if (!old_linearization.empty()) {
+        auto old_chunking = ChunkLinearization(depgraph, old_linearization);
+        auto cmp = CompareChunks(chunking, old_chunking);
+        assert(cmp >= 0);
+    }
+
     // If the iteration count is sufficiently high, an optimal linearization must be found.
     // Each linearization step can use up to 2^k iterations, with steps k=1..n. That sum is
     // 2 * (2^n - 1)

From 647fa37cdbadbeebba147ca6b24e138559cffaaf Mon Sep 17 00:00:00 2001
From: Pieter Wuille <pieter@wuille.net>
Date: Thu, 11 Jul 2024 17:08:23 -0400
Subject: [PATCH 12/12] bench: add cluster linearization improvement benchmark

---
 src/bench/cluster_linearize.cpp | 80 +++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 14 deletions(-)

diff --git a/src/bench/cluster_linearize.cpp b/src/bench/cluster_linearize.cpp
index 888684fe932..9987d376a53 100644
--- a/src/bench/cluster_linearize.cpp
+++ b/src/bench/cluster_linearize.cpp
@@ -26,6 +26,21 @@ DepGraph<SetType> MakeLinearGraph(ClusterIndex ntx)
     return depgraph;
 }
 
+/** Construct a wide graph (one root, with N-1 children that are otherwise unrelated, with
+ *  increasing feerates). These graphs are pessimal for the LIMO step in Linearize, because
+ *  rechunking is needed after every candidate (the last transaction gets picked every time).
+ */
+template<typename SetType>
+DepGraph<SetType> MakeWideGraph(ClusterIndex ntx)
+{
+    DepGraph<SetType> depgraph;
+    for (ClusterIndex i = 0; i < ntx; ++i) {
+        depgraph.AddTransaction({int32_t(i) + 1, 1});
+        if (i > 0) depgraph.AddDependency(0, i);
+    }
+    return depgraph;
+}
+
 // Construct a difficult graph. These need at least sqrt(2^(n-1)) iterations in the best
 // known algorithms (purely empirically determined).
 template<typename SetType>
@@ -114,13 +129,16 @@ void BenchLinearizePerIterWorstCase(ClusterIndex ntx, benchmark::Bench& bench)
  * Its goal is measuring how much time linearization may take without any search iterations.
  *
  * If P is the resulting time of BenchLinearizePerIterWorstCase, and N is the resulting time of
- * BenchLinearizeNoItersWorstCase, then an invocation of Linearize with max_iterations=m should
+ * BenchLinearizeNoItersWorstCase*, then an invocation of Linearize with max_iterations=m should
  * take no more than roughly N+m*P time. This may however be an overestimate, as the worst cases
  * do not coincide (the ones that are worst for linearization without any search happen to be ones
  * that do not need many search iterations).
+ *
+ * This benchmark exercises a worst case for AncestorCandidateFinder, but for which improvement is
+ * cheap.
  */
 template<typename SetType>
-void BenchLinearizeNoItersWorstCase(ClusterIndex ntx, benchmark::Bench& bench)
+void BenchLinearizeNoItersWorstCaseAnc(ClusterIndex ntx, benchmark::Bench& bench)
 {
     const auto depgraph = MakeLinearGraph<SetType>(ntx);
     uint64_t rng_seed = 0;
@@ -131,6 +149,26 @@ void BenchLinearizeNoItersWorstCase(ClusterIndex ntx, benchmark::Bench& bench)
     });
 }
 
+/** Benchmark for linearization improvement of a trivial wide graph using just ancestor sort.
+ *
+ * Its goal is measuring how much time improving a linearization may take without any search
+ * iterations, similar to the previous function.
+ *
+ * This benchmark exercises a worst case for improving an existing linearization, but for which
+ * AncestorCandidateFinder is cheap.
+ */
+template<typename SetType>
+void BenchLinearizeNoItersWorstCaseLIMO(ClusterIndex ntx, benchmark::Bench& bench)
+{
+    const auto depgraph = MakeWideGraph<SetType>(ntx);
+    uint64_t rng_seed = 0;
+    std::vector<ClusterIndex> old_lin(ntx);
+    for (ClusterIndex i = 0; i < ntx; ++i) old_lin[i] = i;
+    bench.run([&] {
+        Linearize(depgraph, /*max_iterations=*/0, rng_seed++, old_lin);
+    });
+}
+
 } // namespace
 
 static void LinearizePerIter16TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<16>>(16, bench); }
@@ -140,12 +178,19 @@ static void LinearizePerIter64TxWorstCase(benchmark::Bench& bench) { BenchLinear
 static void LinearizePerIter75TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<75>>(75, bench); }
 static void LinearizePerIter99TxWorstCase(benchmark::Bench& bench) { BenchLinearizePerIterWorstCase<BitSet<99>>(99, bench); }
 
-static void LinearizeNoIters16TxWorstCase(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCase<BitSet<16>>(16, bench); }
-static void LinearizeNoIters32TxWorstCase(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCase<BitSet<32>>(32, bench); }
-static void LinearizeNoIters48TxWorstCase(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCase<BitSet<48>>(48, bench); }
-static void LinearizeNoIters64TxWorstCase(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCase<BitSet<64>>(64, bench); }
-static void LinearizeNoIters75TxWorstCase(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCase<BitSet<75>>(75, bench); }
-static void LinearizeNoIters99TxWorstCase(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCase<BitSet<99>>(99, bench); }
+static void LinearizeNoIters16TxWorstCaseAnc(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseAnc<BitSet<16>>(16, bench); }
+static void LinearizeNoIters32TxWorstCaseAnc(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseAnc<BitSet<32>>(32, bench); }
+static void LinearizeNoIters48TxWorstCaseAnc(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseAnc<BitSet<48>>(48, bench); }
+static void LinearizeNoIters64TxWorstCaseAnc(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseAnc<BitSet<64>>(64, bench); }
+static void LinearizeNoIters75TxWorstCaseAnc(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseAnc<BitSet<75>>(75, bench); }
+static void LinearizeNoIters99TxWorstCaseAnc(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseAnc<BitSet<99>>(99, bench); }
+
+static void LinearizeNoIters16TxWorstCaseLIMO(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseLIMO<BitSet<16>>(16, bench); }
+static void LinearizeNoIters32TxWorstCaseLIMO(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseLIMO<BitSet<32>>(32, bench); }
+static void LinearizeNoIters48TxWorstCaseLIMO(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseLIMO<BitSet<48>>(48, bench); }
+static void LinearizeNoIters64TxWorstCaseLIMO(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseLIMO<BitSet<64>>(64, bench); }
+static void LinearizeNoIters75TxWorstCaseLIMO(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseLIMO<BitSet<75>>(75, bench); }
+static void LinearizeNoIters99TxWorstCaseLIMO(benchmark::Bench& bench) { BenchLinearizeNoItersWorstCaseLIMO<BitSet<99>>(99, bench); }
 
 BENCHMARK(LinearizePerIter16TxWorstCase, benchmark::PriorityLevel::HIGH);
 BENCHMARK(LinearizePerIter32TxWorstCase, benchmark::PriorityLevel::HIGH);
@@ -154,9 +199,16 @@ BENCHMARK(LinearizePerIter64TxWorstCase, benchmark::PriorityLevel::HIGH);
 BENCHMARK(LinearizePerIter75TxWorstCase, benchmark::PriorityLevel::HIGH);
 BENCHMARK(LinearizePerIter99TxWorstCase, benchmark::PriorityLevel::HIGH);
 
-BENCHMARK(LinearizeNoIters16TxWorstCase, benchmark::PriorityLevel::HIGH);
-BENCHMARK(LinearizeNoIters32TxWorstCase, benchmark::PriorityLevel::HIGH);
-BENCHMARK(LinearizeNoIters48TxWorstCase, benchmark::PriorityLevel::HIGH);
-BENCHMARK(LinearizeNoIters64TxWorstCase, benchmark::PriorityLevel::HIGH);
-BENCHMARK(LinearizeNoIters75TxWorstCase, benchmark::PriorityLevel::HIGH);
-BENCHMARK(LinearizeNoIters99TxWorstCase, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters16TxWorstCaseAnc, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters32TxWorstCaseAnc, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters48TxWorstCaseAnc, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters64TxWorstCaseAnc, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters75TxWorstCaseAnc, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters99TxWorstCaseAnc, benchmark::PriorityLevel::HIGH);
+
+BENCHMARK(LinearizeNoIters16TxWorstCaseLIMO, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters32TxWorstCaseLIMO, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters48TxWorstCaseLIMO, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters64TxWorstCaseLIMO, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters75TxWorstCaseLIMO, benchmark::PriorityLevel::HIGH);
+BENCHMARK(LinearizeNoIters99TxWorstCaseLIMO, benchmark::PriorityLevel::HIGH);