routing: move second chance logic into mission control

If a node returns a channel policy related failure, it may get a second
chance, because our graph may not be up to date. Previously this logic
was contained in the payment session.

This commit moves that logic into global mission control and thereby
removes the last mission control state that was kept at the payment
level.

Because mission control is not aware of the relation between payment
attempts and payments, the second chance logic is no longer based on
tracking second chances given per payment.

Instead, a time-based approach is used. If a node reports a policy
failure that prevents forwarding to its peer, it will get a second
chance, but only if its previous second chance was long enough ago.

Also, those second chances no longer depend on whether an associated
channel update is valid. The node gets the second chance regardless, to
prevent creating a dependency between mission control and the graph.
Such a dependency would interfere with (future) replay of history,
because the graph may not be the same anymore at that point.
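
For illustration, the time-based approach above could be tracked with
nothing more than a per-node timestamp of the last second chance handed
out. Below is a minimal, self-contained sketch of such tracking; all
identifiers in it (secondChanceTracker, minSecondChanceInterval,
requestSecondChance) and the one-minute interval are assumptions made
for illustration, not names taken from this commit.

// Hypothetical sketch of time-based second chance tracking in mission
// control. All identifiers are illustrative assumptions.
package mcsketch

import (
	"sync"
	"time"
)

// minSecondChanceInterval is the assumed minimum time that must pass
// before a node that reported a policy failure is granted another
// second chance.
const minSecondChanceInterval = time.Minute

// secondChanceTracker records, per reporting node, when the last second
// chance was handed out.
type secondChanceTracker struct {
	mu               sync.Mutex
	lastSecondChance map[[33]byte]time.Time
	now              func() time.Time
}

func newSecondChanceTracker() *secondChanceTracker {
	return &secondChanceTracker{
		lastSecondChance: make(map[[33]byte]time.Time),
		now:              time.Now,
	}
}

// requestSecondChance reports a policy failure for the given node and
// returns true if the payment may retry. The channel update carried by
// the failure is deliberately not validated here, so that mission
// control stays independent of the graph and its history can be
// replayed later.
func (t *secondChanceTracker) requestSecondChance(node [33]byte) bool {
	t.mu.Lock()
	defer t.mu.Unlock()

	now := t.now()

	// If the previous second chance was handed out too recently, deny
	// this one and let the failure count as a regular, penalizing
	// edge failure.
	if last, ok := t.lastSecondChance[node]; ok &&
		now.Sub(last) < minSecondChanceInterval {

		return false
	}

	// Record this second chance and allow the retry.
	t.lastSecondChance[node] = now
	return true
}

On a policy failure, the error handling path would call
requestSecondChance with the reporting node's key and only report a
penalizing edge failure to mission control when it returns false.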
Author: Joost Jager
Date:   2019-06-26 08:39:34 +02:00
Parent: d31efddf1b
Commit: dc13da5abb

7 changed files with 213 additions and 113 deletions


@@ -178,6 +178,9 @@ type MissionController interface {
 	ReportEdgeFailure(failedEdge edge,
 		minPenalizeAmt lnwire.MilliSatoshi)
 
+	// ReportEdgePolicyFailure reports a policy related failure.
+	ReportEdgePolicyFailure(failedEdge edge)
+
 	// ReportVertexFailure reports a node level failure.
 	ReportVertexFailure(v route.Vertex)
@@ -1826,6 +1829,47 @@ func (r *ChannelRouter) sendPayment(
 }
 
+// tryApplyChannelUpdate tries to apply a channel update present in the failure
+// message if any.
+func (r *ChannelRouter) tryApplyChannelUpdate(rt *route.Route,
+	errorSourceIdx int, failure lnwire.FailureMessage) error {
+
+	// It makes no sense to apply our own channel updates.
+	if errorSourceIdx == 0 {
+		log.Errorf("Channel update of ourselves received")
+
+		return nil
+	}
+
+	// Extract channel update if the error contains one.
+	update := r.extractChannelUpdate(failure)
+	if update == nil {
+		return nil
+	}
+
+	// Parse pubkey to allow validation of the channel update. This should
+	// always succeed, otherwise there is something wrong in our
+	// implementation. Therefore return an error.
+	errVertex := rt.Hops[errorSourceIdx-1].PubKeyBytes
+	errSource, err := btcec.ParsePubKey(
+		errVertex[:], btcec.S256(),
+	)
+	if err != nil {
+		log.Errorf("Cannot parse pubkey: idx=%v, pubkey=%v",
+			errorSourceIdx, errVertex)
+
+		return err
+	}
+
+	// Apply channel update.
+	if !r.applyChannelUpdate(update, errSource) {
+		log.Debugf("Invalid channel update received: node=%x",
+			errVertex)
+	}
+
+	return nil
+}
+
 // processSendError analyzes the error for the payment attempt received from the
 // switch and updates mission control and/or channel policies. Depending on the
 // error type, this error is either the final outcome of the payment or we need
@@ -1851,32 +1895,28 @@ func (r *ChannelRouter) processSendError(paySession PaymentSession,
 		return true, channeldb.FailureReasonError
 	}
 
-	var (
-		failureSourceIdx = fErr.FailureSourceIdx
-
-		failureVertex route.Vertex
-		failureSource *btcec.PublicKey
-		err           error
-	)
+	failureMessage := fErr.FailureMessage
+	failureSourceIdx := fErr.FailureSourceIdx
+
+	// Apply channel update if the error contains one. For unknown
+	// failures, failureMessage is nil.
+	if failureMessage != nil {
+		err := r.tryApplyChannelUpdate(
+			rt, failureSourceIdx, failureMessage,
+		)
+		if err != nil {
+			return true, channeldb.FailureReasonError
+		}
+	}
+
+	var failureVertex route.Vertex
 
 	// For any non-self failure, look up the source pub key in the hops
 	// slice. Otherwise return the self node pubkey.
 	if failureSourceIdx > 0 {
 		failureVertex = rt.Hops[failureSourceIdx-1].PubKeyBytes
-		failureSource, err = btcec.ParsePubKey(failureVertex[:], btcec.S256())
-		if err != nil {
-			log.Errorf("Cannot parse pubkey %v: %v",
-				failureVertex, err)
-			return true, channeldb.FailureReasonError
-		}
 	} else {
 		failureVertex = r.selfNode.PubKeyBytes
-		failureSource, err = r.selfNode.PubKey()
-		if err != nil {
-			log.Errorf("Cannot parse self pubkey: %v", err)
-			return true, channeldb.FailureReasonError
-		}
 	}
 
 	log.Tracef("Node %x (index %v) reported failure when sending htlc",
 		failureVertex, failureSourceIdx)
@@ -1885,41 +1925,7 @@ func (r *ChannelRouter) processSendError(paySession PaymentSession,
 	// update with id may not be available.
 	failedEdge, failedAmt := getFailedEdge(rt, failureSourceIdx)
 
-	// processChannelUpdateAndRetry is a closure that
-	// handles a failure message containing a channel
-	// update. This function always tries to apply the
-	// channel update and passes on the result to the
-	// payment session to adjust its view on the reliability
-	// of the network.
-	//
-	// As channel id, the locally determined channel id is
-	// used. It does not rely on the channel id that is part
-	// of the channel update message, because the remote
-	// node may lie to us or the update may be corrupt.
-	processChannelUpdateAndRetry := func(
-		update *lnwire.ChannelUpdate,
-		pubKey *btcec.PublicKey) {
-
-		// Try to apply the channel update.
-		updateOk := r.applyChannelUpdate(update, pubKey)
-
-		// If the update could not be applied, prune the
-		// edge. There is no reason to continue trying
-		// this channel.
-		//
-		// TODO: Could even prune the node completely?
-		// Or is there a valid reason for the channel
-		// update to fail?
-		if !updateOk {
-			paySession.ReportEdgeFailure(
-				failedEdge, 0,
-			)
-		}
-
-		paySession.ReportEdgePolicyFailure(failedEdge)
-	}
-
-	switch onionErr := fErr.FailureMessage.(type) {
+	switch fErr.FailureMessage.(type) {
 
 	// If the end destination didn't know the payment
 	// hash or we sent the wrong payment amount to the
@@ -1975,7 +1981,6 @@ func (r *ChannelRouter) processSendError(paySession PaymentSession,
 	// that sent us this error, as it doesn't now what the
 	// correct block height is.
 	case *lnwire.FailExpiryTooSoon:
-		r.applyChannelUpdate(&onionErr.Update, failureSource)
 		paySession.ReportVertexFailure(failureVertex)
 		return false, 0
@@ -1996,34 +2001,27 @@ func (r *ChannelRouter) processSendError(paySession PaymentSession,
 	// amount, we'll apply the new minimum amount and retry
 	// routing.
 	case *lnwire.FailAmountBelowMinimum:
-		processChannelUpdateAndRetry(
-			&onionErr.Update, failureSource,
-		)
+		paySession.ReportEdgePolicyFailure(failedEdge)
 		return false, 0
 
 	// If we get a failure due to a fee, we'll apply the
 	// new fee update, and retry our attempt using the
 	// newly updated fees.
 	case *lnwire.FailFeeInsufficient:
-		processChannelUpdateAndRetry(
-			&onionErr.Update, failureSource,
-		)
+		paySession.ReportEdgePolicyFailure(failedEdge)
 		return false, 0
 
 	// If we get the failure for an intermediate node that
 	// disagrees with our time lock values, then we'll
 	// apply the new delta value and try it once more.
 	case *lnwire.FailIncorrectCltvExpiry:
-		processChannelUpdateAndRetry(
-			&onionErr.Update, failureSource,
-		)
+		paySession.ReportEdgePolicyFailure(failedEdge)
 		return false, 0
 
 	// The outgoing channel that this node was meant to
 	// forward one is currently disabled, so we'll apply
 	// the update and continue.
 	case *lnwire.FailChannelDisabled:
-		r.applyChannelUpdate(&onionErr.Update, failureSource)
 		paySession.ReportEdgeFailure(failedEdge, 0)
 		return false, 0
@@ -2031,7 +2029,6 @@ func (r *ChannelRouter) processSendError(paySession PaymentSession,
 	// sufficient capacity, so we'll prune this edge for
 	// now, and continue onwards with our path finding.
 	case *lnwire.FailTemporaryChannelFailure:
-		r.applyChannelUpdate(onionErr.Update, failureSource)
 		paySession.ReportEdgeFailure(failedEdge, failedAmt)
 		return false, 0
@@ -2103,6 +2100,29 @@ func (r *ChannelRouter) processSendError(paySession PaymentSession,
 	}
 }
 
+// extractChannelUpdate examines the error and extracts the channel update.
+func (r *ChannelRouter) extractChannelUpdate(
+	failure lnwire.FailureMessage) *lnwire.ChannelUpdate {
+
+	var update *lnwire.ChannelUpdate
+	switch onionErr := failure.(type) {
+	case *lnwire.FailExpiryTooSoon:
+		update = &onionErr.Update
+	case *lnwire.FailAmountBelowMinimum:
+		update = &onionErr.Update
+	case *lnwire.FailFeeInsufficient:
+		update = &onionErr.Update
+	case *lnwire.FailIncorrectCltvExpiry:
+		update = &onionErr.Update
+	case *lnwire.FailChannelDisabled:
+		update = &onionErr.Update
+	case *lnwire.FailTemporaryChannelFailure:
+		update = onionErr.Update
+	}
+
+	return update
+}
+
 // getFailedEdge tries to locate the failing channel given a route and the
 // pubkey of the node that sent the failure. It will assume that the failure is
 // associated with the outgoing channel of the failing node. As a second result,
@@ -2147,11 +2167,6 @@ func getFailedEdge(route *route.Route, failureSource int) (edge,
 // database. It returns a bool indicating whether the updates was successful.
 func (r *ChannelRouter) applyChannelUpdate(msg *lnwire.ChannelUpdate,
 	pubKey *btcec.PublicKey) bool {
 
-	// If we get passed a nil channel update (as it's optional with some
-	// onion errors), then we'll exit early with a success result.
-	if msg == nil {
-		return true
-	}
-
 	ch, _, _, err := r.GetChannelByID(msg.ShortChannelID)
 	if err != nil {