mirror of
https://github.com/lightningnetwork/lnd.git
synced 2025-04-08 20:28:04 +02:00
wtclient: replay pending tasks on sessionQueue stop
This commit does a few things:
- First, it gives the sessionQueue access to the TowerClient task pipeline so that it can replay backup tasks onto the pipeline on Stop.
- Given that the above is done, the ForceQuit functionality of the sessionQueue and TowerClient can be removed.
- The bug demonstrated in a prior commit is now fixed due to the above changes.
This commit is contained in:
parent
449d6b5500
commit
552ef4bf81
@ -252,9 +252,8 @@ type TowerClient interface {
|
||||
|
||||
// BackupState initiates a request to back up a particular revoked
|
||||
// state. If the method returns nil, the backup is guaranteed to be
|
||||
// successful unless the tower is unavailable and client is force quit,
|
||||
// or the justice transaction would create dust outputs when trying to
|
||||
// abide by the negotiated policy.
|
||||
// successful unless the justice transaction would create dust outputs
|
||||
// when trying to abide by the negotiated policy.
|
||||
BackupState(chanID *lnwire.ChannelID, stateNum uint64) error
|
||||
}
|
||||
|
||||
|
@ -1569,7 +1569,6 @@ func newServer(cfg *Config, listenAddrs []net.Addr,
|
||||
ChainHash: *s.cfg.ActiveNetParams.GenesisHash,
|
||||
MinBackoff: 10 * time.Second,
|
||||
MaxBackoff: 5 * time.Minute,
|
||||
ForceQuitDelay: wtclient.DefaultForceQuitDelay,
|
||||
MaxTasksInMemQueue: cfg.WtClient.MaxTasksInMemQueue,
|
||||
})
|
||||
if err != nil {
|
||||
@ -1603,7 +1602,6 @@ func newServer(cfg *Config, listenAddrs []net.Addr,
|
||||
ChainHash: *s.cfg.ActiveNetParams.GenesisHash,
|
||||
MinBackoff: 10 * time.Second,
|
||||
MaxBackoff: 5 * time.Minute,
|
||||
ForceQuitDelay: wtclient.DefaultForceQuitDelay,
|
||||
MaxTasksInMemQueue: cfg.WtClient.MaxTasksInMemQueue,
|
||||
})
|
||||
if err != nil {
|
||||
|
@ -42,11 +42,6 @@ const (
|
||||
// metrics about the client's operation.
|
||||
DefaultStatInterval = time.Minute
|
||||
|
||||
// DefaultForceQuitDelay specifies the default duration after which the
|
||||
// client should abandon any pending updates or session negotiations
|
||||
// before terminating.
|
||||
DefaultForceQuitDelay = 10 * time.Second
|
||||
|
||||
// DefaultSessionCloseRange is the range over which we will generate a
|
||||
// random number of blocks to delay closing a session after its last
|
||||
// channel has been closed.
|
||||
@ -138,9 +133,8 @@ type Client interface {
|
||||
|
||||
// BackupState initiates a request to back up a particular revoked
|
||||
// state. If the method returns nil, the backup is guaranteed to be
|
||||
// successful unless the client is force quit, or the justice
|
||||
// transaction would create dust outputs when trying to abide by the
|
||||
// negotiated policy.
|
||||
// successful unless the justice transaction would create dust outputs
|
||||
// when trying to abide by the negotiated policy.
|
||||
BackupState(chanID *lnwire.ChannelID, stateNum uint64) error
|
||||
|
||||
// Start initializes the watchtower client, allowing it to process requests
|
||||
@ -151,10 +145,6 @@ type Client interface {
|
||||
// so, it will attempt to flush the pipeline and deliver any queued
|
||||
// states to the tower before exiting.
|
||||
Stop() error
|
||||
|
||||
// ForceQuit will forcibly shutdown the watchtower client. Calling this
|
||||
// may lead to queued states being dropped.
|
||||
ForceQuit()
|
||||
}
|
||||
|
||||
// Config provides the TowerClient with access to the resources it requires to
|
||||
@ -213,13 +203,6 @@ type Config struct {
|
||||
// the tower must be watching to monitor for breaches.
|
||||
ChainHash chainhash.Hash
|
||||
|
||||
// ForceQuitDelay is the duration after attempting to shutdown that the
|
||||
// client will automatically abort any pending backups if an unclean
|
||||
// shutdown is detected. If the value is less than or equal to zero, a
|
||||
// call to Stop may block indefinitely. The client can always be
|
||||
// ForceQuit externally irrespective of the chosen parameter.
|
||||
ForceQuitDelay time.Duration
|
||||
|
||||
// ReadTimeout is the duration we will wait during a read before
|
||||
// breaking out of a blocking read. If the value is less than or equal
|
||||
// to zero, the default will be used instead.
|
||||
@ -295,7 +278,6 @@ type staleTowerMsg struct {
|
||||
type TowerClient struct {
|
||||
started sync.Once
|
||||
stopped sync.Once
|
||||
forced sync.Once
|
||||
|
||||
cfg *Config
|
||||
|
||||
@ -323,9 +305,8 @@ type TowerClient struct {
|
||||
newTowers chan *newTowerMsg
|
||||
staleTowers chan *staleTowerMsg
|
||||
|
||||
wg sync.WaitGroup
|
||||
quit chan struct{}
|
||||
forceQuit chan struct{}
|
||||
wg sync.WaitGroup
|
||||
quit chan struct{}
|
||||
}
|
||||
|
||||
// Compile-time constraint to ensure *TowerClient implements the Client
|
||||
@ -385,7 +366,6 @@ func New(config *Config) (*TowerClient, error) {
|
||||
stats: new(ClientStats),
|
||||
newTowers: make(chan *newTowerMsg),
|
||||
staleTowers: make(chan *staleTowerMsg),
|
||||
forceQuit: make(chan struct{}),
|
||||
quit: make(chan struct{}),
|
||||
}
|
||||
|
||||
@ -697,58 +677,44 @@ func (c *TowerClient) Stop() error {
|
||||
c.stopped.Do(func() {
|
||||
c.log.Debugf("Stopping watchtower client")
|
||||
|
||||
// 1. To ensure we don't hang forever on shutdown due to
|
||||
// unintended failures, we'll delay a call to force quit the
|
||||
// pipeline if a ForceQuitDelay is specified. This will have no
|
||||
// effect if the pipeline shuts down cleanly before the delay
|
||||
// fires.
|
||||
//
|
||||
// For full safety, this can be set to 0 and wait out
|
||||
// indefinitely. However for mobile clients which may have a
|
||||
// limited amount of time to exit before the background process
|
||||
// is killed, this offers a way to ensure the process
|
||||
// terminates.
|
||||
if c.cfg.ForceQuitDelay > 0 {
|
||||
time.AfterFunc(c.cfg.ForceQuitDelay, c.ForceQuit)
|
||||
}
|
||||
|
||||
// 2. Shutdown the backup queue, which will prevent any further
|
||||
// updates from being accepted. In practice, the links should be
|
||||
// shutdown before the client has been stopped, so all updates
|
||||
// would have been added prior.
|
||||
err := c.pipeline.Stop()
|
||||
// 1. Stop the session negotiator.
|
||||
err := c.negotiator.Stop()
|
||||
if err != nil {
|
||||
returnErr = err
|
||||
}
|
||||
|
||||
// 3. Once the backup queue has shutdown, wait for the main
|
||||
// dispatcher to exit. The backup queue will signal its
|
||||
// completion to the dispatcher, which releases the wait group
|
||||
// after all tasks have been assigned to session queues.
|
||||
// 2. Stop the backup dispatcher and any other goroutines.
|
||||
close(c.quit)
|
||||
c.wg.Wait()
|
||||
|
||||
// 4. Since all valid tasks have been assigned to session
|
||||
// queues, we no longer need to negotiate sessions.
|
||||
err = c.negotiator.Stop()
|
||||
if err != nil {
|
||||
returnErr = err
|
||||
// 3. If there was a left over 'prevTask' from the backup
|
||||
// dispatcher, replay that onto the pipeline.
|
||||
if c.prevTask != nil {
|
||||
err = c.pipeline.QueueBackupID(c.prevTask)
|
||||
if err != nil {
|
||||
returnErr = err
|
||||
}
|
||||
}
|
||||
|
||||
c.log.Debugf("Waiting for active session queues to finish "+
|
||||
"draining, stats: %s", c.stats)
|
||||
|
||||
// 5. Shutdown all active session queues in parallel. These will
|
||||
// exit once all updates have been acked by the watchtower.
|
||||
// 4. Shutdown all active session queues in parallel. These will
|
||||
// exit once all unhandled updates have been replayed to the
|
||||
// task pipeline.
|
||||
c.activeSessions.ApplyAndWait(func(s *sessionQueue) func() {
|
||||
return s.Stop
|
||||
return func() {
|
||||
err := s.Stop()
|
||||
if err != nil {
|
||||
c.log.Errorf("could not stop session "+
|
||||
"queue: %s: %v", s.ID(), err)
|
||||
|
||||
returnErr = err
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
// Skip log if force quitting.
|
||||
select {
|
||||
case <-c.forceQuit:
|
||||
return
|
||||
default:
|
||||
// 5. Shutdown the backup queue, which will prevent any further
|
||||
// updates from being accepted.
|
||||
if err = c.pipeline.Stop(); err != nil {
|
||||
returnErr = err
|
||||
}
|
||||
|
||||
c.log.Debugf("Client successfully stopped, stats: %s", c.stats)
|
||||
@ -757,43 +723,6 @@ func (c *TowerClient) Stop() error {
|
||||
return returnErr
|
||||
}
|
||||
|
||||
// ForceQuit idempotently initiates an unclean shutdown of the watchtower
|
||||
// client. This should only be executed if Stop is unable to exit cleanly.
|
||||
func (c *TowerClient) ForceQuit() {
|
||||
c.forced.Do(func() {
|
||||
c.log.Infof("Force quitting watchtower client")
|
||||
|
||||
// 1. Shutdown the backup queue, which will prevent any further
|
||||
// updates from being accepted. In practice, the links should be
|
||||
// shutdown before the client has been stopped, so all updates
|
||||
// would have been added prior.
|
||||
err := c.pipeline.Stop()
|
||||
if err != nil {
|
||||
c.log.Errorf("could not stop backup queue: %v", err)
|
||||
}
|
||||
|
||||
// 2. Once the backup queue has shutdown, wait for the main
|
||||
// dispatcher to exit. The backup queue will signal its
|
||||
// completion to the dispatcher, which releases the wait group
|
||||
// after all tasks have been assigned to session queues.
|
||||
close(c.forceQuit)
|
||||
c.wg.Wait()
|
||||
|
||||
// 3. Since all valid tasks have been assigned to session
|
||||
// queues, we no longer need to negotiate sessions.
|
||||
c.negotiator.Stop()
|
||||
|
||||
// 4. Force quit all active session queues in parallel. These
|
||||
// will exit once all updates have been acked by the watchtower.
|
||||
c.activeSessions.ApplyAndWait(func(s *sessionQueue) func() {
|
||||
return s.ForceQuit
|
||||
})
|
||||
|
||||
c.log.Infof("Watchtower client unclean shutdown complete, "+
|
||||
"stats: %s", c.stats)
|
||||
})
|
||||
}
|
||||
|
||||
// RegisterChannel persistently initializes any channel-dependent parameters
|
||||
// within the client. This should be called during link startup to ensure that
|
||||
// the client is able to support the link during operation.
|
||||
@ -832,7 +761,6 @@ func (c *TowerClient) RegisterChannel(chanID lnwire.ChannelID) error {
|
||||
|
||||
// BackupState initiates a request to back up a particular revoked state. If the
|
||||
// method returns nil, the backup is guaranteed to be successful unless the:
|
||||
// - client is force quit,
|
||||
// - justice transaction would create dust outputs when trying to abide by the
|
||||
// negotiated policy, or
|
||||
// - breached outputs contain too little value to sweep at the target sweep
|
||||
@ -955,9 +883,6 @@ func (c *TowerClient) handleChannelCloses(chanSub subscribe.Subscription) {
|
||||
err)
|
||||
}
|
||||
|
||||
case <-c.forceQuit:
|
||||
return
|
||||
|
||||
case <-c.quit:
|
||||
return
|
||||
}
|
||||
@ -1085,9 +1010,6 @@ func (c *TowerClient) handleClosableSessions(
|
||||
}
|
||||
}
|
||||
|
||||
case <-c.forceQuit:
|
||||
return
|
||||
|
||||
case <-c.quit:
|
||||
return
|
||||
}
|
||||
@ -1246,8 +1168,7 @@ func (c *TowerClient) deleteSessionFromTower(sess *wtdb.ClientSession) error {
|
||||
|
||||
// backupDispatcher processes events coming from the taskPipeline and is
|
||||
// responsible for detecting when the client needs to renegotiate a session to
|
||||
// fulfill continuing demand. The event loop exits after all tasks have been
|
||||
// received from the upstream taskPipeline, or the taskPipeline is force quit.
|
||||
// fulfill continuing demand. The event loop exits if the TowerClient is quit.
|
||||
//
|
||||
// NOTE: This method MUST be run as a goroutine.
|
||||
func (c *TowerClient) backupDispatcher() {
|
||||
@ -1297,7 +1218,7 @@ func (c *TowerClient) backupDispatcher() {
|
||||
case msg := <-c.staleTowers:
|
||||
msg.errChan <- c.handleStaleTower(msg)
|
||||
|
||||
case <-c.forceQuit:
|
||||
case <-c.quit:
|
||||
return
|
||||
}
|
||||
|
||||
@ -1381,6 +1302,9 @@ func (c *TowerClient) backupDispatcher() {
|
||||
// of its corresponding candidate sessions as inactive.
|
||||
case msg := <-c.staleTowers:
|
||||
msg.errChan <- c.handleStaleTower(msg)
|
||||
|
||||
case <-c.quit:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1422,7 +1346,7 @@ func (c *TowerClient) processTask(task *wtdb.BackupID) {
|
||||
// sessionQueue will be removed if accepting the task left the sessionQueue in
|
||||
// an exhausted state.
|
||||
func (c *TowerClient) taskAccepted(task *wtdb.BackupID,
|
||||
newStatus reserveStatus) {
|
||||
newStatus sessionQueueStatus) {
|
||||
|
||||
c.log.Infof("Queued %v successfully for session %v", task,
|
||||
c.sessionQueue.ID())
|
||||
@ -1436,11 +1360,11 @@ func (c *TowerClient) taskAccepted(task *wtdb.BackupID,
|
||||
switch newStatus {
|
||||
|
||||
// The sessionQueue still has capacity after accepting this task.
|
||||
case reserveAvailable:
|
||||
case sessionQueueAvailable:
|
||||
|
||||
// The sessionQueue is full after accepting this task, so we will need
|
||||
// to request a new one before proceeding.
|
||||
case reserveExhausted:
|
||||
case sessionQueueExhausted:
|
||||
c.stats.sessionExhausted()
|
||||
|
||||
c.log.Debugf("Session %s exhausted", c.sessionQueue.ID())
|
||||
@ -1456,16 +1380,17 @@ func (c *TowerClient) taskAccepted(task *wtdb.BackupID,
|
||||
// the state it was in *before* the task was rejected. The client's prevTask
|
||||
// will cache the task if the sessionQueue was exhausted beforehand, and nil
|
||||
// the sessionQueue to find a new session. If the sessionQueue was not
|
||||
// exhausted, the client marks the task as ineligible, as this implies we
|
||||
// couldn't construct a valid justice transaction given the session's policy.
|
||||
// exhausted and not shutting down, the client marks the task as ineligible, as
|
||||
// this implies we couldn't construct a valid justice transaction given the
|
||||
// session's policy.
|
||||
func (c *TowerClient) taskRejected(task *wtdb.BackupID,
|
||||
curStatus reserveStatus) {
|
||||
curStatus sessionQueueStatus) {
|
||||
|
||||
switch curStatus {
|
||||
|
||||
// The sessionQueue has available capacity but the task was rejected,
|
||||
// this indicates that the task was ineligible for backup.
|
||||
case reserveAvailable:
|
||||
case sessionQueueAvailable:
|
||||
c.stats.taskIneligible()
|
||||
|
||||
c.log.Infof("Ignoring ineligible %v", task)
|
||||
@ -1491,7 +1416,7 @@ func (c *TowerClient) taskRejected(task *wtdb.BackupID,
|
||||
|
||||
// The sessionQueue rejected the task because it is full, we will stash
|
||||
// this task and try to add it to the next available sessionQueue.
|
||||
case reserveExhausted:
|
||||
case sessionQueueExhausted:
|
||||
c.stats.sessionExhausted()
|
||||
|
||||
c.log.Debugf("Session %v exhausted, %v queued for next session",
|
||||
@ -1501,6 +1426,18 @@ func (c *TowerClient) taskRejected(task *wtdb.BackupID,
|
||||
// once a new session queue is available.
|
||||
c.sessionQueue = nil
|
||||
c.prevTask = task
|
||||
|
||||
// The sessionQueue rejected the task because it is shutting down. We
|
||||
// will stash this task and try to add it to the next available
|
||||
// sessionQueue.
|
||||
case sessionQueueShuttingDown:
|
||||
c.log.Debugf("Session %v is shutting down, %v queued for "+
|
||||
"next session", c.sessionQueue.ID(), task)
|
||||
|
||||
// Cache the task that we pulled off, so that we can process it
|
||||
// once a new session queue is available.
|
||||
c.sessionQueue = nil
|
||||
c.prevTask = task
|
||||
}
|
||||
}
|
||||
|
||||
@ -1600,6 +1537,7 @@ func (c *TowerClient) newSessionQueue(s *ClientSession,
|
||||
MaxBackoff: c.cfg.MaxBackoff,
|
||||
Log: c.log,
|
||||
BuildBreachRetribution: c.cfg.BuildBreachRetribution,
|
||||
TaskPipeline: c.pipeline,
|
||||
}, updates)
|
||||
}
|
||||
|
||||
@ -1790,6 +1728,14 @@ func (c *TowerClient) handleStaleTower(msg *staleTowerMsg) error {
|
||||
}
|
||||
for sessionID := range sessions {
|
||||
delete(c.candidateSessions, sessionID)
|
||||
|
||||
// Shutdown the session so that any pending updates are
|
||||
// replayed back onto the main task pipeline.
|
||||
err = c.activeSessions.StopAndRemove(sessionID)
|
||||
if err != nil {
|
||||
c.log.Errorf("could not stop session %s: %w", sessionID,
|
||||
err)
|
||||
}
|
||||
}
|
||||
|
||||
// If our active session queue corresponds to the stale tower, we'll
|
||||
|
@ -488,7 +488,6 @@ func newHarness(t *testing.T, cfg harnessCfg) *testHarness {
|
||||
WriteTimeout: timeout,
|
||||
MinBackoff: time.Millisecond,
|
||||
MaxBackoff: time.Second,
|
||||
ForceQuitDelay: 10 * time.Second,
|
||||
SessionCloseRange: 1,
|
||||
MaxTasksInMemQueue: 2,
|
||||
}
|
||||
@ -508,7 +507,9 @@ func newHarness(t *testing.T, cfg harnessCfg) *testHarness {
|
||||
}
|
||||
|
||||
h.startClient()
|
||||
t.Cleanup(h.client.ForceQuit)
|
||||
t.Cleanup(func() {
|
||||
require.NoError(t, h.client.Stop())
|
||||
})
|
||||
|
||||
h.makeChannel(0, h.cfg.localBalance, h.cfg.remoteBalance)
|
||||
if !cfg.noRegisterChan0 {
|
||||
@ -952,27 +953,6 @@ func (s *serverHarness) restart(op func(cfg *wtserver.Config)) {
|
||||
op(s.cfg)
|
||||
}
|
||||
|
||||
// assertUpdatesNotFound asserts that a set of hints are not found in the
|
||||
// server's DB.
|
||||
func (s *serverHarness) assertUpdatesNotFound(hints []blob.BreachHint) {
|
||||
s.t.Helper()
|
||||
|
||||
hintSet := make(map[blob.BreachHint]struct{})
|
||||
for _, hint := range hints {
|
||||
hintSet[hint] = struct{}{}
|
||||
}
|
||||
|
||||
time.Sleep(time.Second)
|
||||
|
||||
matches, err := s.db.QueryMatches(hints)
|
||||
require.NoError(s.t, err, "unable to query for hints")
|
||||
|
||||
for _, match := range matches {
|
||||
_, ok := hintSet[match.Hint]
|
||||
require.False(s.t, ok, "breach hint was found in server DB")
|
||||
}
|
||||
}
|
||||
|
||||
// waitForUpdates blocks until the breach hints provided all appear in the
|
||||
// watchtower's database or the timeout expires. This is used to test that the
|
||||
// client in fact sends the updates to the server, even if it is offline.
|
||||
@ -1238,12 +1218,9 @@ var clientTests = []clientTest{
|
||||
h.backupState(chanID, numSent, nil)
|
||||
numSent++
|
||||
|
||||
// Force quit the client to abort the state updates it
|
||||
// has queued. The sleep ensures that the session queues
|
||||
// have enough time to commit the state updates before
|
||||
// the client is killed.
|
||||
time.Sleep(time.Second)
|
||||
h.client.ForceQuit()
|
||||
// Stop the client to abort the state updates it has
|
||||
// queued.
|
||||
require.NoError(h.t, h.client.Stop())
|
||||
|
||||
// Restart the server and allow it to ack the updates
|
||||
// after the client retransmits the unacked update.
|
||||
@ -1437,8 +1414,8 @@ var clientTests = []clientTest{
|
||||
// server should have no updates.
|
||||
h.server.waitForUpdates(nil, waitTime)
|
||||
|
||||
// Force quit the client since it has queued backups.
|
||||
h.client.ForceQuit()
|
||||
// Stop the client since it has queued backups.
|
||||
require.NoError(h.t, h.client.Stop())
|
||||
|
||||
// Restart the server and allow it to ack session
|
||||
// creation.
|
||||
@ -1489,8 +1466,8 @@ var clientTests = []clientTest{
|
||||
// server should have no updates.
|
||||
h.server.waitForUpdates(nil, waitTime)
|
||||
|
||||
// Force quit the client since it has queued backups.
|
||||
h.client.ForceQuit()
|
||||
// Stop the client since it has queued backups.
|
||||
require.NoError(h.t, h.client.Stop())
|
||||
|
||||
// Restart the server and allow it to ack session
|
||||
// creation.
|
||||
@ -1672,56 +1649,6 @@ var clientTests = []clientTest{
|
||||
h.server.waitForUpdates(hints[numUpdates/2:], waitTime)
|
||||
},
|
||||
},
|
||||
{
|
||||
// Asserts that the client's force quit delay will properly
|
||||
// shutdown the client if it is unable to completely drain the
|
||||
// task pipeline.
|
||||
name: "force unclean shutdown",
|
||||
cfg: harnessCfg{
|
||||
localBalance: localBalance,
|
||||
remoteBalance: remoteBalance,
|
||||
policy: wtpolicy.Policy{
|
||||
TxPolicy: defaultTxPolicy,
|
||||
MaxUpdates: 5,
|
||||
},
|
||||
},
|
||||
fn: func(h *testHarness) {
|
||||
const (
|
||||
chanID = 0
|
||||
numUpdates = 6
|
||||
maxUpdates = 5
|
||||
)
|
||||
|
||||
// Advance the channel to create all states.
|
||||
hints := h.advanceChannelN(chanID, numUpdates)
|
||||
|
||||
// Back up 4 of the 5 states for the negotiated session.
|
||||
h.backupStates(chanID, 0, maxUpdates-1, nil)
|
||||
h.server.waitForUpdates(hints[:maxUpdates-1], waitTime)
|
||||
|
||||
// Now, restart the tower and prevent it from acking any
|
||||
// new sessions. We do this here as once the last slot
|
||||
// is exhausted the client will attempt to renegotiate.
|
||||
h.server.restart(func(cfg *wtserver.Config) {
|
||||
cfg.NoAckCreateSession = true
|
||||
})
|
||||
|
||||
// Back up the remaining two states. Once the first is
|
||||
// processed, the session will be exhausted but the
|
||||
// client won't be able to renegotiate a session for
|
||||
// the final state. We'll only wait for the first five
|
||||
// states to arrive at the tower.
|
||||
h.backupStates(chanID, maxUpdates-1, numUpdates, nil)
|
||||
h.server.waitForUpdates(hints[:maxUpdates], waitTime)
|
||||
|
||||
// Finally, stop the client which will continue to
|
||||
// attempt session negotiation since it has one more
|
||||
// state to process. After the force quit delay
|
||||
// expires, the client should force quit itself and
|
||||
// allow the test to complete.
|
||||
h.server.stop()
|
||||
},
|
||||
},
|
||||
{
|
||||
// Assert that if a client changes the address for a server and
|
||||
// then tries to back up updates then the client will switch to
|
||||
@ -1937,7 +1864,7 @@ var clientTests = []clientTest{
|
||||
require.False(h.t, h.isSessionClosable(sessionIDs[0]))
|
||||
|
||||
// Restart the client.
|
||||
h.client.ForceQuit()
|
||||
require.NoError(h.t, h.client.Stop())
|
||||
h.startClient()
|
||||
|
||||
// The session should now have been marked as closable.
|
||||
@ -2176,9 +2103,8 @@ var clientTests = []clientTest{
|
||||
|
||||
h.backupStates(chanID, 0, numUpdates/2, nil)
|
||||
|
||||
// Restart the Client (force quit). And also now start
|
||||
// the server.
|
||||
h.client.ForceQuit()
|
||||
// Restart the Client. And also now start the server.
|
||||
require.NoError(h.t, h.client.Stop())
|
||||
h.server.start()
|
||||
h.startClient()
|
||||
|
||||
@ -2237,8 +2163,7 @@ var clientTests = []clientTest{
|
||||
{
|
||||
// Show that if a client switches to a new tower _after_ backup
|
||||
// tasks have been bound to the session with the first old tower
|
||||
// then these updates are _not_ replayed onto the new tower.
|
||||
// This is a bug that will be fixed in a future commit.
|
||||
// then these updates are replayed onto the new tower.
|
||||
name: "switch to new tower after tasks are bound",
|
||||
cfg: harnessCfg{
|
||||
localBalance: localBalance,
|
||||
@ -2290,18 +2215,11 @@ var clientTests = []clientTest{
|
||||
// Back up the final task.
|
||||
h.backupStates(chanID, numUpdates-1, numUpdates, nil)
|
||||
|
||||
// Show that only the latest backup is backed up to the
|
||||
// server and that the ones backed up while no tower was
|
||||
// online were _not_ backed up to either server. This is
|
||||
// a bug that will be fixed in a future commit.
|
||||
// Show that all the backups (the ones added while no
|
||||
// towers were online and the one added after adding the
|
||||
// second tower) are backed up to the second tower.
|
||||
server2.waitForUpdates(
|
||||
hints[numUpdates-1:], time.Second,
|
||||
)
|
||||
server2.assertUpdatesNotFound(
|
||||
hints[numUpdates/2 : numUpdates-1],
|
||||
)
|
||||
h.server.assertUpdatesNotFound(
|
||||
hints[numUpdates/2 : numUpdates-1],
|
||||
hints[numUpdates/2:numUpdates], waitTime,
|
||||
)
|
||||
},
|
||||
},
|
||||
|
@ -16,17 +16,21 @@ import (
|
||||
"github.com/lightningnetwork/lnd/watchtower/wtwire"
|
||||
)
|
||||
|
||||
// reserveStatus is an enum that signals how full a particular session is.
|
||||
type reserveStatus uint8
|
||||
// sessionQueueStatus is an enum that signals how full a particular session is.
|
||||
type sessionQueueStatus uint8
|
||||
|
||||
const (
|
||||
// reserveAvailable indicates that the session has space for at least
|
||||
// one more backup.
|
||||
reserveAvailable reserveStatus = iota
|
||||
// sessionQueueAvailable indicates that the session has space for at
|
||||
// least one more backup.
|
||||
sessionQueueAvailable sessionQueueStatus = iota
|
||||
|
||||
// reserveExhausted indicates that all slots in the session have been
|
||||
// allocated.
|
||||
reserveExhausted
|
||||
// sessionQueueExhausted indicates that all slots in the session have
|
||||
// been allocated.
|
||||
sessionQueueExhausted
|
||||
|
||||
// sessionQueueShuttingDown indicates that the session queue is
|
||||
// shutting down and so is no longer accepting any more backups.
|
||||
sessionQueueShuttingDown
|
||||
)
|
||||
|
||||
// sessionQueueConfig bundles the resources required by the sessionQueue to
|
||||
@ -62,6 +66,10 @@ type sessionQueueConfig struct {
|
||||
// certain revoked commitment height.
|
||||
BuildBreachRetribution BreachRetributionBuilder
|
||||
|
||||
// TaskPipeline is a pipeline which the sessionQueue should use to send
|
||||
// any unhandled tasks on shutdown of the queue.
|
||||
TaskPipeline *DiskOverflowQueue[*wtdb.BackupID]
|
||||
|
||||
// DB provides access to the client's stable storage.
|
||||
DB DB
|
||||
|
||||
@ -85,10 +93,8 @@ type sessionQueueConfig struct {
|
||||
|
||||
// sessionQueue implements a reliable queue that will encrypt and send accepted
|
||||
// backups to the watchtower specified in the config's ClientSession. Calling
|
||||
// Quit will attempt to perform a clean shutdown by receiving an ACK from the
|
||||
// tower for all pending backups before exiting. The clean shutdown can be
|
||||
// aborted by using ForceQuit, which will attempt to shut down the queue
|
||||
// immediately.
|
||||
// Stop will attempt to perform a clean shutdown replaying any un-committed
|
||||
// pending updates to the TowerClient's main task pipeline.
|
||||
type sessionQueue struct {
|
||||
started sync.Once
|
||||
stopped sync.Once
|
||||
@ -109,9 +115,8 @@ type sessionQueue struct {
|
||||
|
||||
retryBackoff time.Duration
|
||||
|
||||
quit chan struct{}
|
||||
forceQuit chan struct{}
|
||||
shutdown chan struct{}
|
||||
quit chan struct{}
|
||||
wg sync.WaitGroup
|
||||
}
|
||||
|
||||
// newSessionQueue initializes a fresh sessionQueue.
|
||||
@ -133,8 +138,6 @@ func newSessionQueue(cfg *sessionQueueConfig,
|
||||
seqNum: cfg.ClientSession.SeqNum,
|
||||
retryBackoff: cfg.MinBackoff,
|
||||
quit: make(chan struct{}),
|
||||
forceQuit: make(chan struct{}),
|
||||
shutdown: make(chan struct{}),
|
||||
}
|
||||
sq.queueCond = sync.NewCond(&sq.queueMtx)
|
||||
|
||||
@ -151,41 +154,77 @@ func newSessionQueue(cfg *sessionQueueConfig,
|
||||
// backups.
|
||||
func (q *sessionQueue) Start() {
|
||||
q.started.Do(func() {
|
||||
q.wg.Add(1)
|
||||
go q.sessionManager()
|
||||
})
|
||||
}
|
||||
|
||||
// Stop idempotently stops the sessionQueue by initiating a clean shutdown that
|
||||
// will clear all pending tasks in the queue before returning to the caller.
|
||||
func (q *sessionQueue) Stop() {
|
||||
func (q *sessionQueue) Stop() error {
|
||||
var returnErr error
|
||||
q.stopped.Do(func() {
|
||||
q.log.Debugf("SessionQueue(%s) stopping ...", q.ID())
|
||||
|
||||
close(q.quit)
|
||||
q.signalUntilShutdown()
|
||||
|
||||
// Skip log if we also force quit.
|
||||
select {
|
||||
case <-q.forceQuit:
|
||||
shutdown := make(chan struct{})
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
case <-time.After(time.Millisecond):
|
||||
q.queueCond.Signal()
|
||||
case <-shutdown:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
q.wg.Wait()
|
||||
close(shutdown)
|
||||
|
||||
// Now, for any task in the pending queue that we have not yet
|
||||
// created a CommittedUpdate for, re-add the task to the main
|
||||
// task pipeline.
|
||||
updates, err := q.cfg.DB.FetchSessionCommittedUpdates(q.ID())
|
||||
if err != nil {
|
||||
returnErr = err
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
unAckedUpdates := make(map[wtdb.BackupID]bool)
|
||||
for _, update := range updates {
|
||||
unAckedUpdates[update.BackupID] = true
|
||||
}
|
||||
|
||||
// Push any task that was on the pending queue that there is
|
||||
// not yet a committed update for back to the main task
|
||||
// pipeline.
|
||||
q.queueCond.L.Lock()
|
||||
for q.pendingQueue.Len() > 0 {
|
||||
next := q.pendingQueue.Front()
|
||||
q.pendingQueue.Remove(next)
|
||||
|
||||
//nolint:forcetypeassert
|
||||
task := next.Value.(*backupTask)
|
||||
|
||||
if unAckedUpdates[task.id] {
|
||||
continue
|
||||
}
|
||||
|
||||
err := q.cfg.TaskPipeline.QueueBackupID(&task.id)
|
||||
if err != nil {
|
||||
log.Errorf("could not re-queue backup task: "+
|
||||
"%v", err)
|
||||
continue
|
||||
}
|
||||
}
|
||||
q.queueCond.L.Unlock()
|
||||
|
||||
q.log.Debugf("SessionQueue(%s) stopped", q.ID())
|
||||
})
|
||||
}
|
||||
|
||||
// ForceQuit idempotently aborts any clean shutdown in progress and returns to
|
||||
// the caller after all lingering goroutines have spun down.
|
||||
func (q *sessionQueue) ForceQuit() {
|
||||
q.forced.Do(func() {
|
||||
q.log.Infof("SessionQueue(%s) force quitting...", q.ID())
|
||||
|
||||
close(q.forceQuit)
|
||||
q.signalUntilShutdown()
|
||||
|
||||
q.log.Infof("SessionQueue(%s) force quit", q.ID())
|
||||
})
|
||||
return returnErr
|
||||
}
|
||||
|
||||
// ID returns the wtdb.SessionID for the queue, which can be used to uniquely
|
||||
@ -196,10 +235,28 @@ func (q *sessionQueue) ID() *wtdb.SessionID {
|
||||
|
||||
// AcceptTask attempts to queue a backupTask for delivery to the sessionQueue's
|
||||
// tower. The task will only be accepted if the queue is not already
|
||||
// exhausted and the task is successfully bound to the ClientSession.
|
||||
func (q *sessionQueue) AcceptTask(task *backupTask) (reserveStatus, bool) {
|
||||
// exhausted or shutting down and the task is successfully bound to the
|
||||
// ClientSession.
|
||||
func (q *sessionQueue) AcceptTask(task *backupTask) (sessionQueueStatus, bool) {
|
||||
// Exit early if the queue has started shutting down.
|
||||
select {
|
||||
case <-q.quit:
|
||||
return sessionQueueShuttingDown, false
|
||||
default:
|
||||
}
|
||||
|
||||
q.queueCond.L.Lock()
|
||||
|
||||
// There is a chance that sessionQueue started shutting down between
|
||||
// the last quit channel check and waiting for the lock. So check one
|
||||
// more time here.
|
||||
select {
|
||||
case <-q.quit:
|
||||
q.queueCond.L.Unlock()
|
||||
return sessionQueueShuttingDown, false
|
||||
default:
|
||||
}
|
||||
|
||||
numPending := uint32(q.pendingQueue.Len())
|
||||
maxUpdates := q.cfg.ClientSession.Policy.MaxUpdates
|
||||
q.log.Debugf("SessionQueue(%s) deciding to accept %v seqnum=%d "+
|
||||
@ -207,14 +264,14 @@ func (q *sessionQueue) AcceptTask(task *backupTask) (reserveStatus, bool) {
|
||||
q.ID(), task.id, q.seqNum, numPending, maxUpdates)
|
||||
|
||||
// Examine the current reserve status of the session queue.
|
||||
curStatus := q.reserveStatus()
|
||||
curStatus := q.status()
|
||||
|
||||
switch curStatus {
|
||||
|
||||
// The session queue is exhausted, and cannot accept the task because it
|
||||
// is full. Reject the task such that it can be tried against a
|
||||
// different session.
|
||||
case reserveExhausted:
|
||||
case sessionQueueExhausted:
|
||||
q.queueCond.L.Unlock()
|
||||
return curStatus, false
|
||||
|
||||
@ -224,7 +281,7 @@ func (q *sessionQueue) AcceptTask(task *backupTask) (reserveStatus, bool) {
|
||||
// tried again.
|
||||
//
|
||||
// TODO(conner): queue backups and retry with different session params.
|
||||
case reserveAvailable:
|
||||
case sessionQueueAvailable:
|
||||
err := task.bindSession(
|
||||
&q.cfg.ClientSession.ClientSessionBody,
|
||||
q.cfg.BuildBreachRetribution,
|
||||
@ -244,7 +301,7 @@ func (q *sessionQueue) AcceptTask(task *backupTask) (reserveStatus, bool) {
|
||||
// Finally, compute the session's *new* reserve status. This will be
|
||||
// used by the client to determine if it can continue using this session
|
||||
// queue, or if it should negotiate a new one.
|
||||
newStatus := q.reserveStatus()
|
||||
newStatus := q.status()
|
||||
q.queueCond.L.Unlock()
|
||||
|
||||
q.queueCond.Signal()
|
||||
@ -255,7 +312,7 @@ func (q *sessionQueue) AcceptTask(task *backupTask) (reserveStatus, bool) {
|
||||
// sessionManager is the primary event loop for the sessionQueue, and is
|
||||
// responsible for encrypting and sending accepted tasks to the tower.
|
||||
func (q *sessionQueue) sessionManager() {
|
||||
defer close(q.shutdown)
|
||||
defer q.wg.Done()
|
||||
|
||||
for {
|
||||
q.queueCond.L.Lock()
|
||||
@ -266,12 +323,6 @@ func (q *sessionQueue) sessionManager() {
|
||||
|
||||
select {
|
||||
case <-q.quit:
|
||||
if q.commitQueue.Len() == 0 &&
|
||||
q.pendingQueue.Len() == 0 {
|
||||
q.queueCond.L.Unlock()
|
||||
return
|
||||
}
|
||||
case <-q.forceQuit:
|
||||
q.queueCond.L.Unlock()
|
||||
return
|
||||
default:
|
||||
@ -279,12 +330,9 @@ func (q *sessionQueue) sessionManager() {
|
||||
}
|
||||
q.queueCond.L.Unlock()
|
||||
|
||||
// Exit immediately if a force quit has been requested. If
|
||||
// either of the queues still has state updates to send to the
|
||||
// tower, we may never exit in the above case if we are unable
|
||||
// to reach the tower for some reason.
|
||||
// Exit immediately if the sessionQueue has been stopped.
|
||||
select {
|
||||
case <-q.forceQuit:
|
||||
case <-q.quit:
|
||||
return
|
||||
default:
|
||||
}
|
||||
@ -333,7 +381,7 @@ func (q *sessionQueue) drainBackups() {
|
||||
q.increaseBackoff()
|
||||
select {
|
||||
case <-time.After(q.retryBackoff):
|
||||
case <-q.forceQuit:
|
||||
case <-q.quit:
|
||||
}
|
||||
return
|
||||
}
|
||||
@ -366,7 +414,7 @@ func (q *sessionQueue) drainBackups() {
|
||||
q.increaseBackoff()
|
||||
select {
|
||||
case <-time.After(q.retryBackoff):
|
||||
case <-q.forceQuit:
|
||||
case <-q.quit:
|
||||
}
|
||||
return
|
||||
}
|
||||
@ -388,7 +436,7 @@ func (q *sessionQueue) drainBackups() {
|
||||
// when we will do so.
|
||||
select {
|
||||
case <-time.After(time.Millisecond):
|
||||
case <-q.forceQuit:
|
||||
case <-q.quit:
|
||||
return
|
||||
}
|
||||
}
|
||||
@ -635,21 +683,21 @@ func (q *sessionQueue) sendStateUpdate(conn wtserver.Peer,
|
||||
return nil
|
||||
}
|
||||
|
||||
// reserveStatus returns a reserveStatus indicating whether the sessionQueue can
|
||||
// accept another task. reserveAvailable is returned when a task can be
|
||||
// accepted, and reserveExhausted is returned if the all slots in the session
|
||||
// have been allocated.
|
||||
// status returns a sessionQueueStatus indicating whether the sessionQueue can
|
||||
// accept another task. sessionQueueAvailable is returned when a task can be
|
||||
// accepted, and sessionQueueExhausted is returned if all slots in the
|
||||
// session have been allocated.
|
||||
//
|
||||
// NOTE: This method MUST be called with queueCond's exclusive lock held.
|
||||
func (q *sessionQueue) reserveStatus() reserveStatus {
|
||||
func (q *sessionQueue) status() sessionQueueStatus {
|
||||
numPending := uint32(q.pendingQueue.Len())
|
||||
maxUpdates := uint32(q.cfg.ClientSession.Policy.MaxUpdates)
|
||||
|
||||
if uint32(q.seqNum)+numPending < maxUpdates {
|
||||
return reserveAvailable
|
||||
return sessionQueueAvailable
|
||||
}
|
||||
|
||||
return reserveExhausted
|
||||
return sessionQueueExhausted
|
||||
|
||||
}
|
||||
|
||||
@ -667,19 +715,6 @@ func (q *sessionQueue) increaseBackoff() {
|
||||
}
|
||||
}
|
||||
|
||||
// signalUntilShutdown strobes the sessionQueue's condition variable until the
|
||||
// main event loop exits.
|
||||
func (q *sessionQueue) signalUntilShutdown() {
|
||||
for {
|
||||
select {
|
||||
case <-time.After(time.Millisecond):
|
||||
q.queueCond.Signal()
|
||||
case <-q.shutdown:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sessionQueueSet maintains a mapping of SessionIDs to their corresponding
|
||||
// sessionQueue.
|
||||
type sessionQueueSet struct {
|
||||
@ -706,18 +741,18 @@ func (s *sessionQueueSet) AddAndStart(sessionQueue *sessionQueue) {
|
||||
|
||||
// StopAndRemove stops the given session queue and removes it from the
|
||||
// sessionQueueSet.
|
||||
func (s *sessionQueueSet) StopAndRemove(id wtdb.SessionID) {
|
||||
func (s *sessionQueueSet) StopAndRemove(id wtdb.SessionID) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
queue, ok := s.queues[id]
|
||||
if !ok {
|
||||
return
|
||||
return nil
|
||||
}
|
||||
|
||||
queue.Stop()
|
||||
|
||||
delete(s.queues, id)
|
||||
|
||||
return queue.Stop()
|
||||
}
|
||||
|
||||
// Get fetches and returns the sessionQueue with the given ID.
|
||||
|
Loading…
x
Reference in New Issue
Block a user