peer+lnd: add new CLI option to control if we D/C on slow pongs

In this commit, we add a new CLI option to control if we D/C on slow pongs or not. Due to the existence of head-of-the-line blocking at various levels of abstraction (app buffer, slow processing, TCP kernel buffers, etc), if there's a flurry of gossip messages (eg: 1K channel updates), then even with a reasonable processing latency, a peer may still not read our ping in time. To give users another option, we add a flag that allows users to disable this behavior. The default remains.
2025-11-10 06:07:16 +01:00 · 2025-05-09 16:14:43 -07:00
parent b0cba7dd08
commit c493f0c0a9
6 changed files with 179 additions and 82 deletions
--- a/peer/ping_manager.go
+++ b/peer/ping_manager.go
@@ -7,6 +7,7 @@ import (
 	"sync/atomic"
 	"time"

+	"github.com/lightningnetwork/lnd/fn/v2"
 	"github.com/lightningnetwork/lnd/lnwire"
 )

@@ -36,7 +37,8 @@ type PingManagerConfig struct {
 	// OnPongFailure is a closure that is responsible for executing the
 	// logic when a Pong message is either late or does not match our
 	// expectations for that Pong
-	OnPongFailure func(error)
+	OnPongFailure func(failureReason error, timeWaitedForPong time.Duration,
+		lastKnownRTT time.Duration)
 }

 // PingManager is a structure that is designed to manage the internal state
@@ -108,6 +110,26 @@ func (m *PingManager) Start() error {
 	return err
 }

+// getLastRTT safely retrieves the last known RTT, returning 0 if none exists.
+func (m *PingManager) getLastRTT() time.Duration {
+	rttPtr := m.pingTime.Load()
+	if rttPtr == nil {
+		return 0
+	}
+
+	return *rttPtr
+}
+
+// pendingPingWait calculates the time waited since the last ping was sent. If
+// no ping time is reported, None is returned. defaultDuration.
+func (m *PingManager) pendingPingWait() fn.Option[time.Duration] {
+	if m.pingLastSend != nil {
+		return fn.Some(time.Since(*m.pingLastSend))
+	}
+
+	return fn.None[time.Duration]()
+}
+
 // pingHandler is the main goroutine responsible for enforcing the ping/pong
 // protocol.
 func (m *PingManager) pingHandler() {
@@ -119,6 +141,10 @@ func (m *PingManager) pingHandler() {
 		<-m.pingTimeout.C
 	}

+	// Because we don't know if the OnPingFailure callback actually
+	// disconnects a peer (dependent on user config), we should never return
+	// from this loop unless the ping manager is stopped explicitly (which
+	// happens on disconnect).
 	for {
 		select {
 		case <-m.pingTicker.C:
@@ -127,12 +153,20 @@ func (m *PingManager) pingHandler() {
 			// awaiting a pong response.  This should never occur,
 			// but if it does, it implies a timeout.
 			if m.outstandingPongSize >= 0 {
-				e := errors.New("impossible: new ping" +
-					"in unclean state",
+				// Ping was outstanding, meaning it timed out by
+				// the arrival of the next ping interval.
+				timeWaited := m.pendingPingWait().UnwrapOr(
+					m.cfg.IntervalDuration,
 				)
-				m.cfg.OnPongFailure(e)
+				lastRTT := m.getLastRTT()

-				return
+				m.cfg.OnPongFailure(
+					errors.New("ping timed "+
+						"out by next interval"),
+					timeWaited, lastRTT,
+				)
+
+				m.resetPingState()
 			}

 			pongSize := m.cfg.NewPongSize()
@@ -143,52 +177,66 @@ func (m *PingManager) pingHandler() {

 			// Set up our bookkeeping for the new Ping.
 			if err := m.setPingState(pongSize); err != nil {
-				m.cfg.OnPongFailure(err)
+				// This is an internal error related to timer
+				// reset. Pass it to OnPongFailure as it's
+				// critical. Current and last RTT are not
+				// directly applicable here.
+				m.cfg.OnPongFailure(err, 0, 0)

-				return
+				m.resetPingState()
+
+				continue
 			}

 			m.cfg.SendPing(ping)

 		case <-m.pingTimeout.C:
-			m.resetPingState()
+			timeWaited := m.pendingPingWait().UnwrapOr(
+				m.cfg.TimeoutDuration,
+			)
+			lastRTT := m.getLastRTT()

-			e := errors.New("timeout while waiting for " +
-				"pong response",
+			m.cfg.OnPongFailure(
+				errors.New("timeout while waiting for "+
+					"pong response"),
+				timeWaited, lastRTT,
 			)

-			m.cfg.OnPongFailure(e)
-
-			return
+			m.resetPingState()

 		case pong := <-m.pongChan:
 			pongSize := int32(len(pong.PongBytes))

-			// Save off values we are about to override when we
-			// call resetPingState.
+			// Save off values we are about to override when we call
+			// resetPingState.
 			expected := m.outstandingPongSize
-			lastPing := m.pingLastSend
+			lastPingTime := m.pingLastSend

-			m.resetPingState()
+			// This is an unexpected pong, we'll continue.
+			if lastPingTime == nil {
+				continue
+			}

-			// If the pong we receive doesn't match the ping we
-			// sent out, then we fail out.
+			actualRTT := time.Since(*lastPingTime)
+
+			// If the pong we receive doesn't match the ping we sent
+			// out, then we fail out.
 			if pongSize != expected {
-				e := errors.New("pong response does " +
-					"not match expected size",
-				)
+				e := fmt.Errorf("pong response does not match "+
+					"expected size. Expected: %d, Got: %d",
+					expected, pongSize)

-				m.cfg.OnPongFailure(e)
+				lastRTT := m.getLastRTT()
+				m.cfg.OnPongFailure(e, actualRTT, lastRTT)

-				return
+				m.resetPingState()
+
+				continue
 			}

-			// Compute RTT of ping and save that for future
-			// querying.
-			if lastPing != nil {
-				rtt := time.Since(*lastPing)
-				m.pingTime.Store(&rtt)
-			}
+			// Pong is good, update RTT and reset state.
+			m.pingTime.Store(&actualRTT)
+			m.resetPingState()

 		case <-m.quit:
 			return
@@ -231,6 +279,7 @@ func (m *PingManager) setPingState(pongSize uint16) error {
 func (m *PingManager) resetPingState() {
 	m.pingLastSend = nil
 	m.outstandingPongSize = -1
+
 	if !m.pingTimeout.Stop() {
 		select {
 		case <-m.pingTimeout.C: