clusterlin: replace cluster linearization with SFL (feature)

This replaces the existing LIMO linearization algorithm (which internally uses ancestor set finding and candidate set finding) with the much more performant spanning-forest linearization (SFL) algorithm. This removes the old candidate-set search algorithm, along with several of its tests and benchmarks, and the utility code they needed. The worst-case time per unit of cost is similar to that of the previous algorithm, so ACCEPTABLE_ITERS is unchanged.
2026-01-19 23:03:45 +01:00 · 2025-10-23 19:15:21 -04:00
parent 6a8fa821b8
commit 3efc94d656
7 changed files with 81 additions and 994 deletions
--- a/src/test/fuzz/cluster_linearize.cpp
+++ b/src/test/fuzz/cluster_linearize.cpp
@@ -24,29 +24,22 @@
 *          possibly by comparison with other implementations (at the end of the line ->).
 *   <<---: The right side is implemented using the left side.
 *
- *   +-----------------------+
- *   | SearchCandidateFinder | <<---------------------\
- *   +-----------------------+                        |
- *     |                                            +-----------+       +---------------------+
- *     |                                            | Linearize |       | SpanningForestState |
- *     |                                            +-----------+       +---------------------+
- *     |        +-------------------------+           |  |                              |
- *     |        | AncestorCandidateFinder | <<--------/  |                              |
- *     |        +-------------------------+              |                              |
- *     |          |                     ^                |        ^^  PRODUCTION CODE   |
- *     |          |                     |                |        ||                    |
+ *   +---------------------+                        +-----------+
+ *   | SpanningForestState | <<-------------------- | Linearize |
+ *   +---------------------+                        +-----------+
+ *               |                                       |
+ *               |                                       |        ^^  PRODUCTION CODE
+ *               |                                       |        ||
 *  ==============================================================================================
- *     |          |                     |                |        ||                    |
- *     | clusterlin_ancestor_finder*    |                |        vv  TEST CODE         |
- *     |                                |                |                              |
- *     |-clusterlin_search_finder*      |                |-clusterlin_linearize*        |
- *     |                                |                |                              |
- *     v                                |                v              clusterlin_sfl--|
- *   +-----------------------+          |           +-----------------+                 |
- *   | SimpleCandidateFinder | <<-------------------| SimpleLinearize |<----------------/
- *   +-----------------------+          |           +-----------------+
- *                  |                   |                |
- *                  +-------------------/                |
+ *               |                                       |        ||
+ *               |-clusterlin_sfl*                       |        vv  TEST CODE
+ *               |                                       |
+ *               \------------------------------------\  |-clusterlin_linearize*
+ *                                                    |  |
+ *                                                    v  v
+ *   +-----------------------+                      +-----------------+
+ *   | SimpleCandidateFinder | <<-------------------| SimpleLinearize |
+ *   +-----------------------+                      +-----------------+
 *                  |                                    |
 *                  |-clusterlin_simple_finder*          |-clusterlin_simple_linearize*
 *                  v                                    v
@@ -78,11 +71,8 @@ using namespace cluster_linearize;

 namespace {

-/** A simple finder class for candidate sets.
- *
- * This class matches SearchCandidateFinder in interface and behavior, though with fewer
- * optimizations.
- */
+/** A simple finder class for candidate sets (topologically-valid subsets with high feerate), only
+ *  used by SimpleLinearize below. */
 template<typename SetType>
 class SimpleCandidateFinder
 {
@@ -153,7 +143,8 @@ public:
 /** A very simple finder class for optimal candidate sets, which tries every subset.
 *
 * It is even simpler than SimpleCandidateFinder, and exists just to help test the correctness of
- * SimpleCandidateFinder, which is then used to test the correctness of SearchCandidateFinder.
+ * SimpleCandidateFinder, so that it can be used in SimpleLinearize, which is then used to test the
+ * correctness of Linearize.
 */
 template<typename SetType>
 class ExhaustiveCandidateFinder
@@ -204,8 +195,8 @@ public:
 /** A simple linearization algorithm.
 *
 * This matches Linearize() in interface and behavior, though with fewer optimizations, lacking
- * the ability to pass in an existing linearization, and using just SimpleCandidateFinder rather
- * than AncestorCandidateFinder and SearchCandidateFinder.
+ * the ability to pass in an existing linearization, and linearizing by repeatedly finding the
+ * highest-feerate topological subset of what remains, using SimpleCandidateFinder.
 */
 template<typename SetType>
 std::pair<std::vector<DepGraphIndex>, bool> SimpleLinearize(const DepGraph<SetType>& depgraph, uint64_t max_iterations)
@@ -766,68 +757,17 @@ FUZZ_TARGET(clusterlin_chunking)
    assert(todo.None());
 }

-FUZZ_TARGET(clusterlin_ancestor_finder)
-{
-    // Verify that AncestorCandidateFinder works as expected.
-
-    // Retrieve a depgraph from the fuzz input.
-    SpanReader reader(buffer);
-    DepGraph<TestBitSet> depgraph;
-    try {
-        reader >> Using<DepGraphFormatter>(depgraph);
-    } catch (const std::ios_base::failure&) {}
-
-    AncestorCandidateFinder anc_finder(depgraph);
-    auto todo = depgraph.Positions();
-    while (todo.Any()) {
-        // Call the ancestor finder's FindCandidateSet for what remains of the graph.
-        assert(!anc_finder.AllDone());
-        assert(todo.Count() == anc_finder.NumRemaining());
-        auto best_anc = anc_finder.FindCandidateSet();
-        // Sanity check the result.
-        assert(best_anc.transactions.Any());
-        assert(best_anc.transactions.IsSubsetOf(todo));
-        assert(depgraph.FeeRate(best_anc.transactions) == best_anc.feerate);
-        assert(depgraph.IsConnected(best_anc.transactions));
-        // Check that it is topologically valid.
-        for (auto i : best_anc.transactions) {
-            assert((depgraph.Ancestors(i) & todo).IsSubsetOf(best_anc.transactions));
-        }
-
-        // Compute all remaining ancestor sets.
-        std::optional<SetInfo<TestBitSet>> real_best_anc;
-        for (auto i : todo) {
-            SetInfo info(depgraph, todo & depgraph.Ancestors(i));
-            if (!real_best_anc.has_value() || info.feerate > real_best_anc->feerate) {
-                real_best_anc = info;
-            }
-        }
-        // The set returned by anc_finder must equal the real best ancestor sets.
-        assert(real_best_anc.has_value());
-        assert(*real_best_anc == best_anc);
-
-        // Find a non-empty topologically valid subset of transactions to remove from the graph.
-        // Using an empty set would mean the next iteration is identical to the current one, and
-        // could cause an infinite loop.
-        auto del_set = ReadTopologicalSet(depgraph, todo, reader, /*non_empty=*/true);
-        todo -= del_set;
-        anc_finder.MarkDone(del_set);
-    }
-    assert(anc_finder.AllDone());
-    assert(anc_finder.NumRemaining() == 0);
-}
-
 static constexpr auto MAX_SIMPLE_ITERATIONS = 300000;

 FUZZ_TARGET(clusterlin_simple_finder)
 {
    // Verify that SimpleCandidateFinder works as expected by sanity checking the results
    // and comparing them (if claimed to be optimal) against the sets found by
-    // ExhaustiveCandidateFinder and AncestorCandidateFinder.
+    // ExhaustiveCandidateFinder.
    //
    // Note that SimpleCandidateFinder is only used in tests; the purpose of this fuzz test is to
-    // establish confidence in SimpleCandidateFinder, so that it can be used to test
-    // SearchCandidateFinder below.
+    // establish confidence in SimpleCandidateFinder, so that it can be used in SimpleLinearize,
+    // which is then used to test Linearize below.

    // Retrieve a depgraph from the fuzz input.
    SpanReader reader(buffer);
@@ -836,18 +776,15 @@ FUZZ_TARGET(clusterlin_simple_finder)
        reader >> Using<DepGraphFormatter>(depgraph);
    } catch (const std::ios_base::failure&) {}

-    // Instantiate the SimpleCandidateFinder to be tested, and the ExhaustiveCandidateFinder and
-    // AncestorCandidateFinder it is being tested against.
+    // Instantiate the SimpleCandidateFinder to be tested, and the ExhaustiveCandidateFinder it is
+    // being tested against.
    SimpleCandidateFinder smp_finder(depgraph);
    ExhaustiveCandidateFinder exh_finder(depgraph);
-    AncestorCandidateFinder anc_finder(depgraph);

    auto todo = depgraph.Positions();
    while (todo.Any()) {
        assert(!smp_finder.AllDone());
        assert(!exh_finder.AllDone());
-        assert(!anc_finder.AllDone());
-        assert(anc_finder.NumRemaining() == todo.Count());

        // Call SimpleCandidateFinder.
        auto [found, iterations_done] = smp_finder.FindCandidateSet(MAX_SIMPLE_ITERATIONS);
@@ -874,10 +811,6 @@ FUZZ_TARGET(clusterlin_simple_finder)

        // Perform further quality checks only if SimpleCandidateFinder claims an optimal result.
        if (optimal) {
-            // Compare with AncestorCandidateFinder.
-            auto anc = anc_finder.FindCandidateSet();
-            assert(anc.feerate <= found.feerate);
-
            if (todo.Count() <= 12) {
                // Compare with ExhaustiveCandidateFinder. This quickly gets computationally
                // expensive for large clusters (O(2^n)), so only do it for sufficiently small ones.
@@ -898,119 +831,10 @@ FUZZ_TARGET(clusterlin_simple_finder)
        todo -= del_set;
        smp_finder.MarkDone(del_set);
        exh_finder.MarkDone(del_set);
-        anc_finder.MarkDone(del_set);
    }

    assert(smp_finder.AllDone());
    assert(exh_finder.AllDone());
-    assert(anc_finder.AllDone());
-    assert(anc_finder.NumRemaining() == 0);
-}
-
-FUZZ_TARGET(clusterlin_search_finder)
-{
-    // Verify that SearchCandidateFinder works as expected by sanity checking the results
-    // and comparing with the results from SimpleCandidateFinder and AncestorCandidateFinder,
-    // if the result is claimed to be optimal.
-
-    // Retrieve an RNG seed, a depgraph, and whether to make it connected, from the fuzz input.
-    SpanReader reader(buffer);
-    DepGraph<TestBitSet> depgraph;
-    uint64_t rng_seed{0};
-    uint8_t make_connected{1};
-    try {
-        reader >> Using<DepGraphFormatter>(depgraph) >> rng_seed >> make_connected;
-    } catch (const std::ios_base::failure&) {}
-    // The most complicated graphs are connected ones (other ones just split up). Optionally force
-    // the graph to be connected.
-    if (make_connected) MakeConnected(depgraph);
-
-    // Instantiate the candidate finders.
-    SearchCandidateFinder src_finder(depgraph, rng_seed);
-    SimpleCandidateFinder smp_finder(depgraph);
-    AncestorCandidateFinder anc_finder(depgraph);
-
-    auto todo = depgraph.Positions();
-    while (todo.Any()) {
-        assert(!src_finder.AllDone());
-        assert(!smp_finder.AllDone());
-        assert(!anc_finder.AllDone());
-        assert(anc_finder.NumRemaining() == todo.Count());
-
-        // For each iteration, read an iteration count limit from the fuzz input.
-        uint64_t max_iterations = 1;
-        try {
-            reader >> VARINT(max_iterations);
-        } catch (const std::ios_base::failure&) {}
-        max_iterations &= 0xfffff;
-
-        // Read an initial subset from the fuzz input (allowed to be empty).
-        auto init_set = ReadTopologicalSet(depgraph, todo, reader, /*non_empty=*/false);
-        SetInfo init_best(depgraph, init_set);
-
-        // Call the search finder's FindCandidateSet for what remains of the graph.
-        auto [found, iterations_done] = src_finder.FindCandidateSet(max_iterations, init_best);
-        bool optimal = iterations_done < max_iterations;
-
-        // Sanity check the result.
-        assert(iterations_done <= max_iterations);
-        assert(found.transactions.Any());
-        assert(found.transactions.IsSubsetOf(todo));
-        assert(depgraph.FeeRate(found.transactions) == found.feerate);
-        if (!init_best.feerate.IsEmpty()) assert(found.feerate >= init_best.feerate);
-        // Check that it is topologically valid.
-        for (auto i : found.transactions) {
-            assert(found.transactions.IsSupersetOf(depgraph.Ancestors(i) & todo));
-        }
-
-        // At most 2^(N-1) iterations can be required: the maximum number of non-empty topological
-        // subsets a (connected) cluster with N transactions can have. Even when the cluster is no
-        // longer connected after removing certain transactions, this holds, because the connected
-        // components are searched separately.
-        assert(iterations_done <= (uint64_t{1} << (todo.Count() - 1)));
-        // Additionally, test that no more than sqrt(2^N)+1 iterations are required. This is just
-        // an empirical bound that seems to hold, without proof. Still, add a test for it so we
-        // can learn about counterexamples if they exist.
-        if (iterations_done >= 1 && todo.Count() <= 63) {
-            Assume((iterations_done - 1) * (iterations_done - 1) <= uint64_t{1} << todo.Count());
-        }
-
-        // Perform quality checks only if SearchCandidateFinder claims an optimal result.
-        if (optimal) {
-            // Optimal sets are always connected.
-            assert(depgraph.IsConnected(found.transactions));
-
-            // Compare with SimpleCandidateFinder.
-            auto [simple, simple_iters] = smp_finder.FindCandidateSet(MAX_SIMPLE_ITERATIONS);
-            assert(found.feerate >= simple.feerate);
-            if (simple_iters < MAX_SIMPLE_ITERATIONS) {
-                assert(found.feerate == simple.feerate);
-            }
-
-            // Compare with AncestorCandidateFinder;
-            auto anc = anc_finder.FindCandidateSet();
-            assert(found.feerate >= anc.feerate);
-
-            // Compare with a non-empty topological set read from the fuzz input (comparing with an
-            // empty set is not interesting).
-            auto read_topo = ReadTopologicalSet(depgraph, todo, reader, /*non_empty=*/true);
-            assert(found.feerate >= depgraph.FeeRate(read_topo));
-        }
-
-        // Find a non-empty topologically valid subset of transactions to remove from the graph.
-        // Using an empty set would mean the next iteration is identical to the current one, and
-        // could cause an infinite loop.
-        auto del_set = ReadTopologicalSet(depgraph, todo, reader, /*non_empty=*/true);
-        todo -= del_set;
-        src_finder.MarkDone(del_set);
-        smp_finder.MarkDone(del_set);
-        anc_finder.MarkDone(del_set);
-    }
-
-    assert(src_finder.AllDone());
-    assert(smp_finder.AllDone());
-    assert(anc_finder.AllDone());
-    assert(anc_finder.NumRemaining() == 0);
 }

 FUZZ_TARGET(clusterlin_linearization_chunking)
@@ -1250,6 +1074,10 @@ FUZZ_TARGET(clusterlin_sfl)
    }
    test_fn(/*is_optimal=*/true);

+    // Verify that optimality is reached within an expected amount of work. This protects against
+    // hypothetical bugs that hugely increase the amount of work needed to reach optimality.
+    assert(sfl.GetCost() <= MaxOptimalLinearizationIters(depgraph.TxCount()));
+
    // The result must be as good as SimpleLinearize.
    auto [simple_linearization, simple_optimal] = SimpleLinearize(depgraph, MAX_SIMPLE_ITERATIONS / 10);
    auto simple_diagram = ChunkLinearization(depgraph, simple_linearization);
@@ -1301,7 +1129,6 @@ FUZZ_TARGET(clusterlin_linearize)
    // Invoke Linearize().
    iter_count &= 0x7ffff;
    auto [linearization, optimal, cost] = Linearize(depgraph, iter_count, rng_seed, old_linearization);
-    assert(cost <= iter_count);
    SanityCheck(depgraph, linearization);
    auto chunking = ChunkLinearization(depgraph, linearization);

@@ -1313,7 +1140,7 @@ FUZZ_TARGET(clusterlin_linearize)
    }

    // If the iteration count is sufficiently high, an optimal linearization must be found.
-    if (iter_count >= MaxOptimalLinearizationIters(depgraph.TxCount())) {
+    if (iter_count > MaxOptimalLinearizationIters(depgraph.TxCount())) {
        assert(optimal);
    }

@@ -1328,9 +1155,13 @@ FUZZ_TARGET(clusterlin_linearize)
        // If SimpleLinearize finds the optimal result too, they must be equal (if not,
        // SimpleLinearize is broken).
        if (simple_optimal) assert(cmp == 0);
-        // If simple_chunking is diagram-optimal, it cannot have more chunks than chunking (as
-        // chunking is claimed to be optimal, which implies minimal chunks).
-        if (cmp == 0) assert(chunking.size() >= simple_chunking.size());
+
+        // Temporarily disabled, as Linearize() currently does not guarantee minimal chunks, even
+        // when it reports an optimal result. This will be re-introduced in a later commit.
+        //
+        // // If simple_chunking is diagram-optimal, it cannot have more chunks than chunking (as
+        // // chunking is claimed to be optimal, which implies minimal chunks).
+        // if (cmp == 0) assert(chunking.size() >= simple_chunking.size());

        // Compare with a linearization read from the fuzz input.
        auto read = ReadLinearization(depgraph, reader);