From 9fecfed3b5ba41dd0666fbbbf46d9eba52306129 Mon Sep 17 00:00:00 2001 From: yyforyongyu Date: Thu, 16 Jan 2025 22:49:25 +0800 Subject: [PATCH] discovery: fix race access to syncer's state This commit fixes the following race, 1. syncer(state=syncingChans) sends QueryChannelRange 2. remote peer replies ReplyChannelRange 3. ProcessQueryMsg fails to process the remote peer's msg as its state is neither waitingQueryChanReply nor waitingQueryRangeReply. 4. syncer marks its new state waitingQueryChanReply, but too late. The historical sync will now fail, and the syncer will be stuck at this state. What's worse is it cannot forward channel announcements to other connected peers now as it will skip the broadcasting during initial graph sync. This is now fixed to make sure the following two steps are atomic, 1. syncer(state=syncingChans) sends QueryChannelRange 2. syncer marks its new state waitingQueryChanReply. --- discovery/gossiper.go | 8 ++++++-- discovery/syncer.go | 16 ++++++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/discovery/gossiper.go b/discovery/gossiper.go index ef5c55bd6..23874fed2 100644 --- a/discovery/gossiper.go +++ b/discovery/gossiper.go @@ -832,9 +832,13 @@ func (d *AuthenticatedGossiper) ProcessRemoteAnnouncement(msg lnwire.Message, // If we've found the message target, then we'll dispatch the // message directly to it. - syncer.ProcessQueryMsg(m, peer.QuitSignal()) + err := syncer.ProcessQueryMsg(m, peer.QuitSignal()) + if err != nil { + log.Errorf("Process query msg from peer %x got %v", + peer.PubKey(), err) + } - errChan <- nil + errChan <- err return errChan // If a peer is updating its current update horizon, then we'll dispatch diff --git a/discovery/syncer.go b/discovery/syncer.go index e98281d1c..79794271a 100644 --- a/discovery/syncer.go +++ b/discovery/syncer.go @@ -486,6 +486,15 @@ func (g *GossipSyncer) handleSyncingChans() { return } + // Acquire a lock so the following state transition is atomic. + // + // NOTE: We must lock the following steps as it's possible we get an + // immediate response (ReplyChannelRange) after sending the query msg. + // The response is handled in ProcessQueryMsg, which requires the + // current state to be waitingQueryRangeReply. + g.Lock() + defer g.Unlock() + err = g.cfg.sendToPeer(queryRangeMsg) if err != nil { log.Errorf("Unable to send chan range query: %v", err) @@ -1517,12 +1526,15 @@ func (g *GossipSyncer) ProcessQueryMsg(msg lnwire.Message, peerQuit <-chan struc // Reply messages should only be expected in states where we're waiting // for a reply. case *lnwire.ReplyChannelRange, *lnwire.ReplyShortChanIDsEnd: + g.Lock() syncState := g.syncState() + g.Unlock() + if syncState != waitingQueryRangeReply && syncState != waitingQueryChanReply { - return fmt.Errorf("received unexpected query reply "+ - "message %T", msg) + return fmt.Errorf("unexpected msg %T received in "+ + "state %v", msg, syncState) } msgChan = g.gossipMsgs