watchtower: replace taskpipeline with disk overflow queue

2025-07-12 14:12:27 +02:00 · 2023-03-22 14:30:16 +02:00
parent e91fe50878
commit 56cd825695
4 changed files with 72 additions and 280 deletions
--- a/server.go
+++ b/server.go
@ -1568,17 +1568,18 @@ func newServer(cfg *Config, listenAddrs []net.Addr,
 				return s.channelNotifier.
 					SubscribeChannelEvents()
 			},
-			Signer:         cc.Wallet.Cfg.Signer,
-			NewAddress:     newSweepPkScriptGen(cc.Wallet),
-			SecretKeyRing:  s.cc.KeyRing,
-			Dial:           cfg.net.Dial,
-			AuthDial:       authDial,
-			DB:             dbs.TowerClientDB,
-			Policy:         policy,
-			ChainHash:      *s.cfg.ActiveNetParams.GenesisHash,
-			MinBackoff:     10 * time.Second,
-			MaxBackoff:     5 * time.Minute,
-			ForceQuitDelay: wtclient.DefaultForceQuitDelay,
+			Signer:             cc.Wallet.Cfg.Signer,
+			NewAddress:         newSweepPkScriptGen(cc.Wallet),
+			SecretKeyRing:      s.cc.KeyRing,
+			Dial:               cfg.net.Dial,
+			AuthDial:           authDial,
+			DB:                 dbs.TowerClientDB,
+			Policy:             policy,
+			ChainHash:          *s.cfg.ActiveNetParams.GenesisHash,
+			MinBackoff:         10 * time.Second,
+			MaxBackoff:         5 * time.Minute,
+			ForceQuitDelay:     wtclient.DefaultForceQuitDelay,
+			MaxTasksInMemQueue: wtclient.DefaultMaxTasksInMemQueue,
 		})
 		if err != nil {
 			return nil, err
@ -1601,17 +1602,18 @@ func newServer(cfg *Config, listenAddrs []net.Addr,
 				return s.channelNotifier.
 					SubscribeChannelEvents()
 			},
-			Signer:         cc.Wallet.Cfg.Signer,
-			NewAddress:     newSweepPkScriptGen(cc.Wallet),
-			SecretKeyRing:  s.cc.KeyRing,
-			Dial:           cfg.net.Dial,
-			AuthDial:       authDial,
-			DB:             dbs.TowerClientDB,
-			Policy:         anchorPolicy,
-			ChainHash:      *s.cfg.ActiveNetParams.GenesisHash,
-			MinBackoff:     10 * time.Second,
-			MaxBackoff:     5 * time.Minute,
-			ForceQuitDelay: wtclient.DefaultForceQuitDelay,
+			Signer:             cc.Wallet.Cfg.Signer,
+			NewAddress:         newSweepPkScriptGen(cc.Wallet),
+			SecretKeyRing:      s.cc.KeyRing,
+			Dial:               cfg.net.Dial,
+			AuthDial:           authDial,
+			DB:                 dbs.TowerClientDB,
+			Policy:             anchorPolicy,
+			ChainHash:          *s.cfg.ActiveNetParams.GenesisHash,
+			MinBackoff:         10 * time.Second,
+			MaxBackoff:         5 * time.Minute,
+			ForceQuitDelay:     wtclient.DefaultForceQuitDelay,
+			MaxTasksInMemQueue: wtclient.DefaultMaxTasksInMemQueue,
 		})
 		if err != nil {
 			return nil, err
--- a/watchtower/wtclient/client.go
+++ b/watchtower/wtclient/client.go
@ -244,6 +244,10 @@ type Config struct {
 	// number of blocks to delay closing a session after its last channel
 	// has been closed.
 	SessionCloseRange uint32
+
+	// MaxTasksInMemQueue is the maximum number of backup tasks that should
+	// be kept in-memory. Any more tasks will overflow to disk.
+	MaxTasksInMemQueue uint64
 }

 // BreachRetributionBuilder is a function that can be used to construct a
@ -297,7 +301,7 @@ type TowerClient struct {

 	log btclog.Logger

-	pipeline *taskPipeline
+	pipeline *DiskOverflowQueue[*wtdb.BackupID]

 	negotiator        SessionNegotiator
 	candidateTowers   TowerCandidateIterator
@ -359,10 +363,21 @@ func New(config *Config) (*TowerClient, error) {
 		return nil, err
 	}

+	var (
+		policy  = cfg.Policy.BlobType.String()
+		queueDB = cfg.DB.GetDBQueue([]byte(policy))
+	)
+	queue, err := NewDiskOverflowQueue[*wtdb.BackupID](
+		queueDB, cfg.MaxTasksInMemQueue, plog,
+	)
+	if err != nil {
+		return nil, err
+	}
+
 	c := &TowerClient{
 		cfg:                  cfg,
 		log:                  plog,
-		pipeline:             newTaskPipeline(plog),
+		pipeline:             queue,
 		chanCommitHeights:    make(map[lnwire.ChannelID]uint64),
 		activeSessions:       make(sessionQueueSet),
 		summaries:            chanSummaries,
@ -675,6 +690,7 @@ func (c *TowerClient) Start() error {

 // Stop idempotently initiates a graceful shutdown of the watchtower client.
 func (c *TowerClient) Stop() error {
+	var returnErr error
 	c.stopped.Do(func() {
 		c.log.Debugf("Stopping watchtower client")

@ -697,7 +713,10 @@ func (c *TowerClient) Stop() error {
 		// updates from being accepted. In practice, the links should be
 		// shutdown before the client has been stopped, so all updates
 		// would have been added prior.
-		c.pipeline.Stop()
+		err := c.pipeline.Stop()
+		if err != nil {
+			returnErr = err
+		}

 		// 3. Once the backup queue has shutdown, wait for the main
 		// dispatcher to exit. The backup queue will signal it's
@ -728,7 +747,8 @@ func (c *TowerClient) Stop() error {

 		c.log.Debugf("Client successfully stopped, stats: %s", c.stats)
 	})
-	return nil
+
+	return returnErr
 }

 // ForceQuit idempotently initiates an unclean shutdown of the watchtower
@ -741,7 +761,10 @@ func (c *TowerClient) ForceQuit() {
 		// updates from being accepted. In practice, the links should be
 		// shutdown before the client has been stopped, so all updates
 		// would have been added prior.
-		c.pipeline.ForceQuit()
+		err := c.pipeline.Stop()
+		if err != nil {
+			c.log.Errorf("could not stop backup queue: %v", err)
+		}

 		// 2. Once the backup queue has shutdown, wait for the main
 		// dispatcher to exit. The backup queue will signal it's
@ -840,7 +863,7 @@ func (c *TowerClient) BackupState(chanID *lnwire.ChannelID,
 		CommitHeight: stateNum,
 	}

-	return c.pipeline.QueueBackupTask(id)
+	return c.pipeline.QueueBackupID(id)
 }

 // nextSessionQueue attempts to fetch an active session from our set of
@ -1327,7 +1350,7 @@ func (c *TowerClient) backupDispatcher() {

 			// Process each backup task serially from the queue of
 			// revoked states.
-			case task, ok := <-c.pipeline.NewBackupTasks():
+			case task, ok := <-c.pipeline.NextBackupID():
 				// All backups in the pipeline have been
 				// processed, it is now safe to exit.
 				if !ok {
@ -1639,8 +1662,6 @@ func (c *TowerClient) AddTower(addr *lnwire.NetAddress) error {
 	}:
 	case <-c.pipeline.quit:
 		return ErrClientExiting
-	case <-c.pipeline.forceQuit:
-		return ErrClientExiting
 	}

 	select {
@ -1648,8 +1669,6 @@ func (c *TowerClient) AddTower(addr *lnwire.NetAddress) error {
 		return err
 	case <-c.pipeline.quit:
 		return ErrClientExiting
-	case <-c.pipeline.forceQuit:
-		return ErrClientExiting
 	}
 }

@ -1706,8 +1725,6 @@ func (c *TowerClient) RemoveTower(pubKey *btcec.PublicKey,
 	}:
 	case <-c.pipeline.quit:
 		return ErrClientExiting
-	case <-c.pipeline.forceQuit:
-		return ErrClientExiting
 	}

 	select {
@ -1715,8 +1732,6 @@ func (c *TowerClient) RemoveTower(pubKey *btcec.PublicKey,
 		return err
 	case <-c.pipeline.quit:
 		return ErrClientExiting
-	case <-c.pipeline.forceQuit:
-		return ErrClientExiting
 	}
 }

--- a/watchtower/wtclient/client_test.go
+++ b/watchtower/wtclient/client_test.go
@ -509,12 +509,13 @@ func newHarness(t *testing.T, cfg harnessCfg) *testHarness {
 		NewAddress: func() ([]byte, error) {
 			return addrScript, nil
 		},
-		ReadTimeout:       timeout,
-		WriteTimeout:      timeout,
-		MinBackoff:        time.Millisecond,
-		MaxBackoff:        time.Second,
-		ForceQuitDelay:    10 * time.Second,
-		SessionCloseRange: 1,
+		ReadTimeout:        timeout,
+		WriteTimeout:       timeout,
+		MinBackoff:         time.Millisecond,
+		MaxBackoff:         time.Second,
+		ForceQuitDelay:     10 * time.Second,
+		SessionCloseRange:  1,
+		MaxTasksInMemQueue: 2,
 	}

 	h.clientCfg.BuildBreachRetribution = func(id lnwire.ChannelID,
@ -1094,10 +1095,6 @@ var clientTests = []clientTest{
 			hints := h.advanceChannelN(chanID, numUpdates)
 			h.backupStates(chanID, 0, numUpdates, nil)

-			// Stop the client in the background, to assert the
-			// pipeline is always flushed before it exits.
-			go h.client.Stop()
-
 			// Wait for all the updates to be populated in the
 			// server's database.
 			h.waitServerUpdates(hints, time.Second)
@ -1238,10 +1235,6 @@ var clientTests = []clientTest{
 			// Now, queue the retributions for backup.
 			h.backupStates(chanID, 0, numUpdates, nil)

-			// Stop the client in the background, to assert the
-			// pipeline is always flushed before it exits.
-			go h.client.Stop()
-
 			// Give the client time to saturate a large number of
 			// session queues for which the server has not acked the
 			// state updates that it has received.
@ -1346,9 +1339,6 @@ var clientTests = []clientTest{
 				h.backupStates(id, 0, numUpdates, nil)
 			}

-			// Test reliable flush under multi-client scenario.
-			go h.client.Stop()
-
 			// Wait for all the updates to be populated in the
 			// server's database.
 			h.waitServerUpdates(hints, 10*time.Second)
@ -1395,9 +1385,6 @@ var clientTests = []clientTest{
 			// identical one.
 			h.startClient()

-			// Now, queue the retributions for backup.
-			h.backupStates(chanID, 0, numUpdates, nil)
-
 			// Wait for all the updates to be populated in the
 			// server's database.
 			h.waitServerUpdates(hints, waitTime)
@ -1449,9 +1436,6 @@ var clientTests = []clientTest{
 			h.clientCfg.Policy.SweepFeeRate *= 2
 			h.startClient()

-			// Now, queue the retributions for backup.
-			h.backupStates(chanID, 0, numUpdates, nil)
-
 			// Wait for all the updates to be populated in the
 			// server's database.
 			h.waitServerUpdates(hints, waitTime)
@ -2090,9 +2074,9 @@ var clientTests = []clientTest{
 	},
 	{
 		// This test demonstrates that if there is no active session,
-		// the updates are queued in memory and then lost on restart.
-		// This behaviour will be fixed in upcoming commits.
-		name: "lose updates in task pipeline on restart",
+		// the updates are persisted to disk on restart and reliably
+		// sent.
+		name: "in-mem updates not lost on restart",
 		cfg: harnessCfg{
 			localBalance:  localBalance,
 			remoteBalance: remoteBalance,
@ -2112,36 +2096,25 @@ var clientTests = []clientTest{
 				numUpdates = 5
 			)

-			// Advance the channel to create all states.
+			// Try back up the first few states of the client's
+			// channel. Since the server has not yet started, the
+			// client should have no active session yet and so these
+			// updates will just be kept in an in-memory queue.
 			hints := h.advanceChannelN(chanID, numUpdates)
-			firstBatch := hints[:numUpdates/2]
-			secondBatch := hints[numUpdates/2 : numUpdates]

-			// Attempt to back up the first batch of states of the
-			// client's channel. Since the server has not yet
-			// started, the client should have no active session
-			// yet and so these updates will just be kept in an
-			// in-memory queue.
 			h.backupStates(chanID, 0, numUpdates/2, nil)

 			// Restart the Client (force quit). And also now start
-			// the server. The client should now be able to create
-			// a session with the server.
+			// the server.
 			h.client.ForceQuit()
 			h.startServer()
 			h.startClient()

-			// Attempt to now back up the second batch of states.
+			// Back up a few more states.
 			h.backupStates(chanID, numUpdates/2, numUpdates, nil)

-			// Assert that the server does receive the updates.
-			h.waitServerUpdates(secondBatch, waitTime)
-
-			// Assert that the server definitely still has not
-			// received the initial set of updates.
-			matches, err := h.serverDB.QueryMatches(firstBatch)
-			require.NoError(h.t, err)
-			require.Empty(h.t, matches)
+			// Assert that the server does receive ALL the updates.
+			h.waitServerUpdates(hints[0:numUpdates], waitTime)
 		},
 	},
 }
--- a/watchtower/wtclient/task_pipeline.go
+++ b/watchtower/wtclient/task_pipeline.go
@ -1,198 +0,0 @@
-package wtclient
-
-import (
-	"container/list"
-	"sync"
-	"time"
-
-	"github.com/btcsuite/btclog"
-	"github.com/lightningnetwork/lnd/watchtower/wtdb"
-)
-
-// taskPipeline implements a reliable, in-order queue that ensures its queue
-// fully drained before exiting. Stopping the taskPipeline prevents the pipeline
-// from accepting any further tasks, and will cause the pipeline to exit after
-// all updates have been delivered to the downstream receiver. If this process
-// hangs and is unable to make progress, users can optionally call ForceQuit to
-// abandon the reliable draining of the queue in order to permit shutdown.
-type taskPipeline struct {
-	started sync.Once
-	stopped sync.Once
-	forced  sync.Once
-
-	log btclog.Logger
-
-	queueMtx  sync.Mutex
-	queueCond *sync.Cond
-	queue     *list.List
-
-	newBackupTasks chan *wtdb.BackupID
-
-	quit      chan struct{}
-	forceQuit chan struct{}
-	shutdown  chan struct{}
-}
-
-// newTaskPipeline initializes a new taskPipeline.
-func newTaskPipeline(log btclog.Logger) *taskPipeline {
-	rq := &taskPipeline{
-		log:            log,
-		queue:          list.New(),
-		newBackupTasks: make(chan *wtdb.BackupID),
-		quit:           make(chan struct{}),
-		forceQuit:      make(chan struct{}),
-		shutdown:       make(chan struct{}),
-	}
-	rq.queueCond = sync.NewCond(&rq.queueMtx)
-
-	return rq
-}
-
-// Start spins up the taskPipeline, making it eligible to begin receiving backup
-// tasks and deliver them to the receiver of NewBackupTasks.
-func (q *taskPipeline) Start() {
-	q.started.Do(func() {
-		go q.queueManager()
-	})
-}
-
-// Stop begins a graceful shutdown of the taskPipeline. This method returns once
-// all backupTasks have been delivered via NewBackupTasks, or a ForceQuit causes
-// the delivery of pending tasks to be interrupted.
-func (q *taskPipeline) Stop() {
-	q.stopped.Do(func() {
-		q.log.Debugf("Stopping task pipeline")
-
-		close(q.quit)
-		q.signalUntilShutdown()
-
-		// Skip log if we also force quit.
-		select {
-		case <-q.forceQuit:
-		default:
-			q.log.Debugf("Task pipeline stopped successfully")
-		}
-	})
-}
-
-// ForceQuit signals the taskPipeline to immediately exit, dropping any
-// backupTasks that have not been delivered via NewBackupTasks.
-func (q *taskPipeline) ForceQuit() {
-	q.forced.Do(func() {
-		q.log.Infof("Force quitting task pipeline")
-
-		close(q.forceQuit)
-		q.signalUntilShutdown()
-
-		q.log.Infof("Task pipeline unclean shutdown complete")
-	})
-}
-
-// NewBackupTasks returns a read-only channel for enqueue backupTasks. The
-// channel will be closed after a call to Stop and all pending tasks have been
-// delivered, or if a call to ForceQuit is called before the pending entries
-// have been drained.
-func (q *taskPipeline) NewBackupTasks() <-chan *wtdb.BackupID {
-	return q.newBackupTasks
-}
-
-// QueueBackupTask enqueues a backupTask for reliable delivery to the consumer
-// of NewBackupTasks. If the taskPipeline is shutting down, ErrClientExiting is
-// returned. Otherwise, if QueueBackupTask returns nil it is guaranteed to be
-// delivered via NewBackupTasks unless ForceQuit is called before completion.
-func (q *taskPipeline) QueueBackupTask(task *wtdb.BackupID) error {
-	q.queueCond.L.Lock()
-	select {
-
-	// Reject new tasks after quit has been signaled.
-	case <-q.quit:
-		q.queueCond.L.Unlock()
-		return ErrClientExiting
-
-	// Reject new tasks after force quit has been signaled.
-	case <-q.forceQuit:
-		q.queueCond.L.Unlock()
-		return ErrClientExiting
-
-	default:
-	}
-
-	// Queue the new task and signal the queue's condition variable to wake
-	// up the queueManager for processing.
-	q.queue.PushBack(task)
-	q.queueCond.L.Unlock()
-
-	q.queueCond.Signal()
-
-	return nil
-}
-
-// queueManager processes all incoming backup requests that get added via
-// QueueBackupTask. The manager will exit
-//
-// NOTE: This method MUST be run as a goroutine.
-func (q *taskPipeline) queueManager() {
-	defer close(q.shutdown)
-	defer close(q.newBackupTasks)
-
-	for {
-		q.queueCond.L.Lock()
-		for q.queue.Front() == nil {
-			q.queueCond.Wait()
-
-			select {
-			case <-q.quit:
-				// Exit only after the queue has been fully
-				// drained.
-				if q.queue.Len() == 0 {
-					q.queueCond.L.Unlock()
-					q.log.Debugf("Revoked state pipeline " +
-						"flushed.")
-
-					return
-				}
-
-			case <-q.forceQuit:
-				q.queueCond.L.Unlock()
-				q.log.Debugf("Revoked state pipeline force " +
-					"quit.")
-
-				return
-
-			default:
-			}
-		}
-
-		// Pop the first element from the queue.
-		e := q.queue.Front()
-
-		//nolint:forcetypeassert
-		task := q.queue.Remove(e).(*wtdb.BackupID)
-		q.queueCond.L.Unlock()
-
-		select {
-
-		// Backup task submitted to dispatcher. We don't select on quit
-		// to ensure that we still drain tasks while shutting down.
-		case q.newBackupTasks <- task:
-
-		// Force quit, return immediately to allow the client to exit.
-		case <-q.forceQuit:
-			q.log.Debugf("Revoked state pipeline force quit.")
-			return
-		}
-	}
-}
-
-// signalUntilShutdown strobes the queue's condition variable to ensure the
-// queueManager reliably unblocks to check for the exit condition.
-func (q *taskPipeline) signalUntilShutdown() {
-	for {
-		select {
-		case <-time.After(time.Millisecond):
-			q.queueCond.Signal()
-		case <-q.shutdown:
-			return
-		}
-	}
-}