routing+routerrpc: improve prob. estimation for untried connections

This commit changes mission control to partially base the estimated probability for untried connections on historical results obtained in previous payment attempts. This incentivizes routing nodes to keep all of their channels in good shape.
2025-09-02 03:54:26 +02:00 · 2019-09-04 17:40:14 +02:00
parent fab13900e2
commit 1fac41deed
9 changed files with 410 additions and 97 deletions
--- a/routing/missioncontrol.go
+++ b/routing/missioncontrol.go
@@ -1,7 +1,6 @@
 package routing

 import (
-	"math"
 	"sync"
 	"time"

@@ -47,6 +46,10 @@ const (
 	// prevSuccessProbability is the assumed probability for node pairs that
 	// successfully relayed the previous attempt.
 	prevSuccessProbability = 0.95
+
+	// DefaultAprioriWeight is the default a priori weight. See
+	// MissionControlConfig for further explanation.
+	DefaultAprioriWeight = 0.5
 )

 // NodeResults contains previous results from a node to its peers.
@@ -68,9 +71,6 @@ type MissionControl struct {
 	// particular node.
 	lastPairResult map[route.Vertex]NodeResults

-	// lastNodeFailure tracks the last node level failure per node.
-	lastNodeFailure map[route.Vertex]time.Time
-
 	// lastSecondChance tracks the last time a second chance was granted for
 	// a directed node pair.
 	lastSecondChance map[DirectedNodePair]time.Time
@@ -83,6 +83,10 @@ type MissionControl struct {

 	store *missionControlStore

+	// estimator is the probability estimator that is used with the payment
+	// results that mission control collects.
+	estimator *probabilityEstimator
+
 	sync.Mutex

 	// TODO(roasbeef): further counters, if vertex continually unavailable,
@@ -105,6 +109,15 @@ type MissionControlConfig struct {
 	// MaxMcHistory defines the maximum number of payment results that are
 	// held on disk.
 	MaxMcHistory int
+
+	// AprioriWeight is a value in the range [0, 1] that defines to what
+	// extent historical results should be extrapolated to untried
+	// connections. Setting it to one will completely ignore historical
+	// results and always assume the configured a priori probability for
+	// untried connections. A value of zero will ignore the a priori
+	// probability completely and only base the probability on historical
+	// results, unless there are none available.
+	AprioriWeight float64
 }

 // timedPairResult describes a timestamped pair result.
@@ -157,21 +170,29 @@ func NewMissionControl(db *bbolt.DB, cfg *MissionControlConfig) (
 	*MissionControl, error) {

 	log.Debugf("Instantiating mission control with config: "+
-		"PenaltyHalfLife=%v, AprioriHopProbability=%v",
-		cfg.PenaltyHalfLife, cfg.AprioriHopProbability)
+		"PenaltyHalfLife=%v, AprioriHopProbability=%v, "+
+		"AprioriWeight=%v", cfg.PenaltyHalfLife,
+		cfg.AprioriHopProbability, cfg.AprioriWeight)

 	store, err := newMissionControlStore(db, cfg.MaxMcHistory)
 	if err != nil {
 		return nil, err
 	}

+	estimator := &probabilityEstimator{
+		aprioriHopProbability:  cfg.AprioriHopProbability,
+		aprioriWeight:          cfg.AprioriWeight,
+		penaltyHalfLife:        cfg.PenaltyHalfLife,
+		prevSuccessProbability: prevSuccessProbability,
+	}
+
 	mc := &MissionControl{
 		lastPairResult:   make(map[route.Vertex]NodeResults),
-		lastNodeFailure:  make(map[route.Vertex]time.Time),
 		lastSecondChance: make(map[DirectedNodePair]time.Time),
 		now:              time.Now,
 		cfg:              cfg,
 		store:            store,
+		estimator:        estimator,
 	}

 	if err := mc.init(); err != nil {
@@ -213,7 +234,6 @@ func (m *MissionControl) ResetHistory() error {
 	}

 	m.lastPairResult = make(map[route.Vertex]NodeResults)
-	m.lastNodeFailure = make(map[route.Vertex]time.Time)
 	m.lastSecondChance = make(map[DirectedNodePair]time.Time)

 	log.Debugf("Mission control history cleared")
@@ -229,56 +249,15 @@ func (m *MissionControl) GetProbability(fromNode, toNode route.Vertex,
 	m.Lock()
 	defer m.Unlock()

-	return m.getPairProbability(fromNode, toNode, amt)
-}
+	now := m.now()
+	results := m.lastPairResult[fromNode]

-// getProbAfterFail returns a probability estimate based on a last failure time.
-func (m *MissionControl) getProbAfterFail(lastFailure time.Time) float64 {
-	if lastFailure.IsZero() {
-		return m.cfg.AprioriHopProbability
-	}
-
-	timeSinceLastFailure := m.now().Sub(lastFailure)
-
-	// Calculate success probability based on the weight of the last
-	// failure. When the failure is fresh, its weight is 1 and we'll return
-	// probability 0. Over time the probability recovers to the a priori
-	// probability.
-	weight := m.getWeight(timeSinceLastFailure)
-	probability := m.cfg.AprioriHopProbability * (1 - weight)
-
-	return probability
-}
-
-// getWeight calculates a weight in the range [0, 1] that should be assigned to
-// a payment result. Weight follows an exponential curve that starts at 1 when
-// the result is fresh and asymptotically approaches zero over time. The rate at
-// which this happens is controlled by the penaltyHalfLife parameter.
-func (m *MissionControl) getWeight(age time.Duration) float64 {
-	exp := -age.Hours() / m.cfg.PenaltyHalfLife.Hours()
-	return math.Pow(2, exp)
-}
-
-// getLastPairResult gets the last recorded result for a node pair.
-func (m *MissionControl) getLastPairResult(fromNode,
-	toNode route.Vertex) *timedPairResult {
-
-	nodePairs, ok := m.lastPairResult[fromNode]
-	if !ok {
-		return nil
-	}
-
-	lastResult, ok := nodePairs[toNode]
-	if !ok {
-		return nil
-	}
-
-	return &lastResult
+	return m.estimator.getPairProbability(now, results, toNode, amt)
 }

 // setLastPairResult stores a result for a node pair.
 func (m *MissionControl) setLastPairResult(fromNode,
-	toNode route.Vertex, result *timedPairResult) {
+	toNode route.Vertex, result timedPairResult) {

 	nodePairs, ok := m.lastPairResult[fromNode]
 	if !ok {
@@ -286,43 +265,24 @@ func (m *MissionControl) setLastPairResult(fromNode,
 		m.lastPairResult[fromNode] = nodePairs
 	}

-	nodePairs[toNode] = *result
+	nodePairs[toNode] = result
 }

-// getPairProbability estimates the probability of successfully
-// traversing from fromNode to toNode based on historical payment outcomes.
-func (m *MissionControl) getPairProbability(fromNode,
-	toNode route.Vertex, amt lnwire.MilliSatoshi) float64 {
+// setAllFail stores a fail result for all known connections of the given node.
+func (m *MissionControl) setAllFail(fromNode route.Vertex,
+	timestamp time.Time) {

-	// Start by getting the last node level failure. A node failure is
-	// considered a failure that would have affected every edge. Therefore
-	// we insert a node level failure into the history of every channel. If
-	// there is none, lastFail will be zero.
-	lastFail := m.lastNodeFailure[fromNode]
-
-	// Retrieve the last pair outcome.
-	lastPairResult := m.getLastPairResult(fromNode, toNode)
-
-	// Only look at the last pair outcome if it happened after the last node
-	// level failure. Otherwise the node level failure is the most recent
-	// and used as the basis for calculation of the probability.
-	if lastPairResult != nil && lastPairResult.timestamp.After(lastFail) {
-		if lastPairResult.success {
-			return prevSuccessProbability
-		}
-
-		// Take into account a minimum penalize amount. For balance
-		// errors, a failure may be reported with such a minimum to
-		// prevent too aggresive penalization. We only take into account
-		// a previous failure if the amount that we currently get the
-		// probability for is greater or equal than the minPenalizeAmt
-		// of the previous failure.
-		if amt >= lastPairResult.minPenalizeAmt {
-			lastFail = lastPairResult.timestamp
-		}
+	nodePairs, ok := m.lastPairResult[fromNode]
+	if !ok {
+		return
 	}

-	return m.getProbAfterFail(lastFail)
+	for connection := range nodePairs {
+		nodePairs[connection] = timedPairResult{
+			timestamp:  timestamp,
+			pairResult: failPairResult(0),
+		}
+	}
 }

 // requestSecondChance checks whether the node fromNode can have a second chance
@@ -363,8 +323,7 @@ func (m *MissionControl) GetHistorySnapshot() *MissionControlSnapshot {
 	defer m.Unlock()

 	log.Debugf("Requesting history snapshot from mission control: "+
-		"node_failure_count=%v, pair_result_count=%v",
-		len(m.lastNodeFailure), len(m.lastPairResult))
+		"pair_result_count=%v", len(m.lastPairResult))

 	pairs := make([]MissionControlPairSnapshot, 0, len(m.lastPairResult))

@@ -475,11 +434,28 @@ func (m *MissionControl) applyPaymentResult(
 		}
 	}

+	// If there is a node-level failure, record a failure for every tried
+	// connection of that node. A node-level failure can be considered as a
+	// failure that would have occurred with any of the node's channels.
+	//
+	// Ideally we'd also record the failure for the untried connections of
+	// the node. Unfortunately this would require access to the graph and
+	// adding this dependency and db calls does not outweigh the benefits.
+	//
+	// Untried connections will fall back to the node probability. After the
+	// call to setAllFail below, the node probability will be equal to
+	// the probability of the tried channels except that the a priori
+	// probability is mixed in too. This effect is controlled by the
+	// aprioriWeight parameter. If that parameter isn't set to an extreme
+	// and there are a few known connections, there shouldn't be much of a
+	// difference. The largest difference occurs when aprioriWeight is 1. In
+	// that case, a node-level failure would not be applied to untried
+	// channels.
 	if i.nodeFailure != nil {
 		log.Debugf("Reporting node failure to Mission Control: "+
 			"node=%v", *i.nodeFailure)

-		m.lastNodeFailure[*i.nodeFailure] = result.timeReply
+		m.setAllFail(*i.nodeFailure, result.timeReply)
 	}

 	for pair, pairResult := range i.pairResults {
@@ -492,7 +468,7 @@ func (m *MissionControl) applyPaymentResult(
 				pair, pairResult.minPenalizeAmt)
 		}

-		m.setLastPairResult(pair.From, pair.To, &timedPairResult{
+		m.setLastPairResult(pair.From, pair.To, timedPairResult{
 			timestamp:  result.timeReply,
 			pairResult: pairResult,
 		})