wtclient: replay pending tasks on sessionQueue stop

This commit does a few things:
- First, it gives the sessionQueue access to the TowerClient task
  pipeline so that it can replay backup tasks onto the pipeline on Stop.
- Given that the above is done, the ForceQuit functionality of the
  sessionQueue and TowerClient can be removed.
- The bug demonstrated in a prior commit is now fixed due to the above
  changes.
Elle Mouton 2023-05-25 10:48:10 +02:00
parent 449d6b5500
commit 552ef4bf81
5 changed files with 200 additions and 304 deletions
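
The core of the change is that a stopping sessionQueue now replays any accepted backup task that has not yet been committed to the tower back onto the client's main task pipeline, instead of dropping it during an unclean shutdown. Below is a minimal, self-contained Go sketch of that replay-on-stop pattern; the type and method names are illustrative stand-ins, not lnd's actual wtclient/wtdb types.

package main

import (
	"fmt"
	"sync"
)

// BackupID stands in for wtdb.BackupID: it identifies one revoked state.
type BackupID struct {
	Chan  string
	State uint64
}

// pipeline is a stand-in for the client's on-disk overflow task pipeline.
type pipeline struct {
	mu    sync.Mutex
	tasks []BackupID
}

func (p *pipeline) QueueBackupID(id BackupID) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.tasks = append(p.tasks, id)
}

// sessionQueue is a toy version of wtclient's sessionQueue. It holds tasks
// that have been accepted but possibly not yet committed to the tower.
type sessionQueue struct {
	pipeline  *pipeline
	pending   []BackupID        // accepted, not yet sent
	committed map[BackupID]bool // already turned into committed updates
}

// Stop replays every pending task that has no committed update back onto the
// main pipeline, so a later session (possibly with another tower) can pick it
// up. This mirrors the behaviour added in this commit, without the real
// condition-variable and database plumbing.
func (q *sessionQueue) Stop() {
	for _, task := range q.pending {
		if q.committed[task] {
			// Already persisted against this session; it will be
			// retransmitted to that tower via the existing path.
			continue
		}
		q.pipeline.QueueBackupID(task)
	}
	q.pending = nil
}

func main() {
	p := &pipeline{}
	q := &sessionQueue{
		pipeline:  p,
		pending:   []BackupID{{"chan0", 7}, {"chan0", 8}},
		committed: map[BackupID]bool{{"chan0", 7}: true},
	}
	q.Stop()
	fmt.Println("replayed:", p.tasks) // only {chan0 8} goes back on the pipeline
}

Tasks that already have a committed update are skipped because those updates are already persisted for the session and will be delivered to its tower; only uncommitted work needs to re-enter the pipeline.
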

View File

@ -252,9 +252,8 @@ type TowerClient interface {
// BackupState initiates a request to back up a particular revoked
// state. If the method returns nil, the backup is guaranteed to be
// successful unless the tower is unavailable and client is force quit,
// or the justice transaction would create dust outputs when trying to
// abide by the negotiated policy.
// successful unless the justice transaction would create dust outputs
// when trying to abide by the negotiated policy.
BackupState(chanID *lnwire.ChannelID, stateNum uint64) error
}

View File

@ -1569,7 +1569,6 @@ func newServer(cfg *Config, listenAddrs []net.Addr,
ChainHash: *s.cfg.ActiveNetParams.GenesisHash,
MinBackoff: 10 * time.Second,
MaxBackoff: 5 * time.Minute,
ForceQuitDelay: wtclient.DefaultForceQuitDelay,
MaxTasksInMemQueue: cfg.WtClient.MaxTasksInMemQueue,
})
if err != nil {
@ -1603,7 +1602,6 @@ func newServer(cfg *Config, listenAddrs []net.Addr,
ChainHash: *s.cfg.ActiveNetParams.GenesisHash,
MinBackoff: 10 * time.Second,
MaxBackoff: 5 * time.Minute,
ForceQuitDelay: wtclient.DefaultForceQuitDelay,
MaxTasksInMemQueue: cfg.WtClient.MaxTasksInMemQueue,
})
if err != nil {

View File

@ -42,11 +42,6 @@ const (
// metrics about the client's operation.
DefaultStatInterval = time.Minute
// DefaultForceQuitDelay specifies the default duration after which the
// client should abandon any pending updates or session negotiations
// before terminating.
DefaultForceQuitDelay = 10 * time.Second
// DefaultSessionCloseRange is the range over which we will generate a
// random number of blocks to delay closing a session after its last
// channel has been closed.
@ -138,9 +133,8 @@ type Client interface {
// BackupState initiates a request to back up a particular revoked
// state. If the method returns nil, the backup is guaranteed to be
// successful unless the client is force quit, or the justice
// transaction would create dust outputs when trying to abide by the
// negotiated policy.
// successful unless the justice transaction would create dust outputs
// when trying to abide by the negotiated policy.
BackupState(chanID *lnwire.ChannelID, stateNum uint64) error
// Start initializes the watchtower client, allowing it to process requests
@ -151,10 +145,6 @@ type Client interface {
// so, it will attempt to flush the pipeline and deliver any queued
// states to the tower before exiting.
Stop() error
// ForceQuit will forcibly shutdown the watchtower client. Calling this
// may lead to queued states being dropped.
ForceQuit()
}
// Config provides the TowerClient with access to the resources it requires to
@ -213,13 +203,6 @@ type Config struct {
// the tower must be watching to monitor for breaches.
ChainHash chainhash.Hash
// ForceQuitDelay is the duration after attempting to shutdown that the
// client will automatically abort any pending backups if an unclean
// shutdown is detected. If the value is less than or equal to zero, a
// call to Stop may block indefinitely. The client can always be
// ForceQuit externally irrespective of the chosen parameter.
ForceQuitDelay time.Duration
// ReadTimeout is the duration we will wait during a read before
// breaking out of a blocking read. If the value is less than or equal
// to zero, the default will be used instead.
@ -295,7 +278,6 @@ type staleTowerMsg struct {
type TowerClient struct {
started sync.Once
stopped sync.Once
forced sync.Once
cfg *Config
@ -323,9 +305,8 @@ type TowerClient struct {
newTowers chan *newTowerMsg
staleTowers chan *staleTowerMsg
wg sync.WaitGroup
quit chan struct{}
forceQuit chan struct{}
wg sync.WaitGroup
quit chan struct{}
}
// Compile-time constraint to ensure *TowerClient implements the Client
@ -385,7 +366,6 @@ func New(config *Config) (*TowerClient, error) {
stats: new(ClientStats),
newTowers: make(chan *newTowerMsg),
staleTowers: make(chan *staleTowerMsg),
forceQuit: make(chan struct{}),
quit: make(chan struct{}),
}
@ -697,58 +677,44 @@ func (c *TowerClient) Stop() error {
c.stopped.Do(func() {
c.log.Debugf("Stopping watchtower client")
// 1. To ensure we don't hang forever on shutdown due to
// unintended failures, we'll delay a call to force quit the
// pipeline if a ForceQuitDelay is specified. This will have no
// effect if the pipeline shuts down cleanly before the delay
// fires.
//
// For full safety, this can be set to 0 and wait out
// indefinitely. However for mobile clients which may have a
// limited amount of time to exit before the background process
// is killed, this offers a way to ensure the process
// terminates.
if c.cfg.ForceQuitDelay > 0 {
time.AfterFunc(c.cfg.ForceQuitDelay, c.ForceQuit)
}
// 2. Shutdown the backup queue, which will prevent any further
// updates from being accepted. In practice, the links should be
// shutdown before the client has been stopped, so all updates
// would have been added prior.
err := c.pipeline.Stop()
// 1. Stop the session negotiator.
err := c.negotiator.Stop()
if err != nil {
returnErr = err
}
// 3. Once the backup queue has shutdown, wait for the main
// dispatcher to exit. The backup queue will signal it's
// completion to the dispatcher, which releases the wait group
// after all tasks have been assigned to session queues.
// 2. Stop the backup dispatcher and any other goroutines.
close(c.quit)
c.wg.Wait()
// 4. Since all valid tasks have been assigned to session
// queues, we no longer need to negotiate sessions.
err = c.negotiator.Stop()
if err != nil {
returnErr = err
// 3. If there was a left over 'prevTask' from the backup
// dispatcher, replay that onto the pipeline.
if c.prevTask != nil {
err = c.pipeline.QueueBackupID(c.prevTask)
if err != nil {
returnErr = err
}
}
c.log.Debugf("Waiting for active session queues to finish "+
"draining, stats: %s", c.stats)
// 5. Shutdown all active session queues in parallel. These will
// exit once all updates have been acked by the watchtower.
// 4. Shutdown all active session queues in parallel. These will
// exit once all unhandled updates have been replayed to the
// task pipeline.
c.activeSessions.ApplyAndWait(func(s *sessionQueue) func() {
return s.Stop
return func() {
err := s.Stop()
if err != nil {
c.log.Errorf("could not stop session "+
"queue: %s: %v", s.ID(), err)
returnErr = err
}
}
})
// Skip log if force quitting.
select {
case <-c.forceQuit:
return
default:
// 5. Shutdown the backup queue, which will prevent any further
// updates from being accepted.
if err = c.pipeline.Stop(); err != nil {
returnErr = err
}
c.log.Debugf("Client successfully stopped, stats: %s", c.stats)
@ -757,43 +723,6 @@ func (c *TowerClient) Stop() error {
return returnErr
}
// ForceQuit idempotently initiates an unclean shutdown of the watchtower
// client. This should only be executed if Stop is unable to exit cleanly.
func (c *TowerClient) ForceQuit() {
c.forced.Do(func() {
c.log.Infof("Force quitting watchtower client")
// 1. Shutdown the backup queue, which will prevent any further
// updates from being accepted. In practice, the links should be
// shutdown before the client has been stopped, so all updates
// would have been added prior.
err := c.pipeline.Stop()
if err != nil {
c.log.Errorf("could not stop backup queue: %v", err)
}
// 2. Once the backup queue has shutdown, wait for the main
// dispatcher to exit. The backup queue will signal it's
// completion to the dispatcher, which releases the wait group
// after all tasks have been assigned to session queues.
close(c.forceQuit)
c.wg.Wait()
// 3. Since all valid tasks have been assigned to session
// queues, we no longer need to negotiate sessions.
c.negotiator.Stop()
// 4. Force quit all active session queues in parallel. These
// will exit once all updates have been acked by the watchtower.
c.activeSessions.ApplyAndWait(func(s *sessionQueue) func() {
return s.ForceQuit
})
c.log.Infof("Watchtower client unclean shutdown complete, "+
"stats: %s", c.stats)
})
}
// RegisterChannel persistently initializes any channel-dependent parameters
// within the client. This should be called during link startup to ensure that
// the client is able to support the link during operation.
@ -832,7 +761,6 @@ func (c *TowerClient) RegisterChannel(chanID lnwire.ChannelID) error {
// BackupState initiates a request to back up a particular revoked state. If the
// method returns nil, the backup is guaranteed to be successful unless the:
// - client is force quit,
// - justice transaction would create dust outputs when trying to abide by the
// negotiated policy, or
// - breached outputs contain too little value to sweep at the target sweep
@ -955,9 +883,6 @@ func (c *TowerClient) handleChannelCloses(chanSub subscribe.Subscription) {
err)
}
case <-c.forceQuit:
return
case <-c.quit:
return
}
@ -1085,9 +1010,6 @@ func (c *TowerClient) handleClosableSessions(
}
}
case <-c.forceQuit:
return
case <-c.quit:
return
}
@ -1246,8 +1168,7 @@ func (c *TowerClient) deleteSessionFromTower(sess *wtdb.ClientSession) error {
// backupDispatcher processes events coming from the taskPipeline and is
// responsible for detecting when the client needs to renegotiate a session to
// fulfill continuing demand. The event loop exits after all tasks have been
// received from the upstream taskPipeline, or the taskPipeline is force quit.
// fulfill continuing demand. The event loop exits if the TowerClient is quit.
//
// NOTE: This method MUST be run as a goroutine.
func (c *TowerClient) backupDispatcher() {
@ -1297,7 +1218,7 @@ func (c *TowerClient) backupDispatcher() {
case msg := <-c.staleTowers:
msg.errChan <- c.handleStaleTower(msg)
case <-c.forceQuit:
case <-c.quit:
return
}
@ -1381,6 +1302,9 @@ func (c *TowerClient) backupDispatcher() {
// of its corresponding candidate sessions as inactive.
case msg := <-c.staleTowers:
msg.errChan <- c.handleStaleTower(msg)
case <-c.quit:
return
}
}
}
@ -1422,7 +1346,7 @@ func (c *TowerClient) processTask(task *wtdb.BackupID) {
// sessionQueue will be removed if accepting the task left the sessionQueue in
// an exhausted state.
func (c *TowerClient) taskAccepted(task *wtdb.BackupID,
newStatus reserveStatus) {
newStatus sessionQueueStatus) {
c.log.Infof("Queued %v successfully for session %v", task,
c.sessionQueue.ID())
@ -1436,11 +1360,11 @@ func (c *TowerClient) taskAccepted(task *wtdb.BackupID,
switch newStatus {
// The sessionQueue still has capacity after accepting this task.
case reserveAvailable:
case sessionQueueAvailable:
// The sessionQueue is full after accepting this task, so we will need
// to request a new one before proceeding.
case reserveExhausted:
case sessionQueueExhausted:
c.stats.sessionExhausted()
c.log.Debugf("Session %s exhausted", c.sessionQueue.ID())
@ -1456,16 +1380,17 @@ func (c *TowerClient) taskAccepted(task *wtdb.BackupID,
// the state it was in *before* the task was rejected. The client's prevTask
// will cache the task if the sessionQueue was exhausted beforehand, and nil
// the sessionQueue to find a new session. If the sessionQueue was not
// exhausted, the client marks the task as ineligible, as this implies we
// couldn't construct a valid justice transaction given the session's policy.
// exhausted and not shutting down, the client marks the task as ineligible, as
// this implies we couldn't construct a valid justice transaction given the
// session's policy.
func (c *TowerClient) taskRejected(task *wtdb.BackupID,
curStatus reserveStatus) {
curStatus sessionQueueStatus) {
switch curStatus {
// The sessionQueue has available capacity but the task was rejected,
// this indicates that the task was ineligible for backup.
case reserveAvailable:
case sessionQueueAvailable:
c.stats.taskIneligible()
c.log.Infof("Ignoring ineligible %v", task)
@ -1491,7 +1416,7 @@ func (c *TowerClient) taskRejected(task *wtdb.BackupID,
// The sessionQueue rejected the task because it is full, we will stash
// this task and try to add it to the next available sessionQueue.
case reserveExhausted:
case sessionQueueExhausted:
c.stats.sessionExhausted()
c.log.Debugf("Session %v exhausted, %v queued for next session",
@ -1501,6 +1426,18 @@ func (c *TowerClient) taskRejected(task *wtdb.BackupID,
// once a new session queue is available.
c.sessionQueue = nil
c.prevTask = task
// The sessionQueue rejected the task because it is shutting down. We
// will stash this task and try to add it to the next available
// sessionQueue.
case sessionQueueShuttingDown:
c.log.Debugf("Session %v is shutting down, %v queued for "+
"next session", c.sessionQueue.ID(), task)
// Cache the task that we pulled off, so that we can process it
// once a new session queue is available.
c.sessionQueue = nil
c.prevTask = task
}
}
@ -1600,6 +1537,7 @@ func (c *TowerClient) newSessionQueue(s *ClientSession,
MaxBackoff: c.cfg.MaxBackoff,
Log: c.log,
BuildBreachRetribution: c.cfg.BuildBreachRetribution,
TaskPipeline: c.pipeline,
}, updates)
}
@ -1790,6 +1728,14 @@ func (c *TowerClient) handleStaleTower(msg *staleTowerMsg) error {
}
for sessionID := range sessions {
delete(c.candidateSessions, sessionID)
// Shutdown the session so that any pending updates are
// replayed back onto the main task pipeline.
err = c.activeSessions.StopAndRemove(sessionID)
if err != nil {
c.log.Errorf("could not stop session %s: %w", sessionID,
err)
}
}
// If our active session queue corresponds to the stale tower, we'll

View File

@ -488,7 +488,6 @@ func newHarness(t *testing.T, cfg harnessCfg) *testHarness {
WriteTimeout: timeout,
MinBackoff: time.Millisecond,
MaxBackoff: time.Second,
ForceQuitDelay: 10 * time.Second,
SessionCloseRange: 1,
MaxTasksInMemQueue: 2,
}
@ -508,7 +507,9 @@ func newHarness(t *testing.T, cfg harnessCfg) *testHarness {
}
h.startClient()
t.Cleanup(h.client.ForceQuit)
t.Cleanup(func() {
require.NoError(t, h.client.Stop())
})
h.makeChannel(0, h.cfg.localBalance, h.cfg.remoteBalance)
if !cfg.noRegisterChan0 {
@ -952,27 +953,6 @@ func (s *serverHarness) restart(op func(cfg *wtserver.Config)) {
op(s.cfg)
}
// assertUpdatesNotFound asserts that a set of hints are not found in the
// server's DB.
func (s *serverHarness) assertUpdatesNotFound(hints []blob.BreachHint) {
s.t.Helper()
hintSet := make(map[blob.BreachHint]struct{})
for _, hint := range hints {
hintSet[hint] = struct{}{}
}
time.Sleep(time.Second)
matches, err := s.db.QueryMatches(hints)
require.NoError(s.t, err, "unable to query for hints")
for _, match := range matches {
_, ok := hintSet[match.Hint]
require.False(s.t, ok, "breach hint was found in server DB")
}
}
// waitForUpdates blocks until the breach hints provided all appear in the
// watchtower's database or the timeout expires. This is used to test that the
// client in fact sends the updates to the server, even if it is offline.
@ -1238,12 +1218,9 @@ var clientTests = []clientTest{
h.backupState(chanID, numSent, nil)
numSent++
// Force quit the client to abort the state updates it
// has queued. The sleep ensures that the session queues
// have enough time to commit the state updates before
// the client is killed.
time.Sleep(time.Second)
h.client.ForceQuit()
// Stop the client to abort the state updates it has
// queued.
require.NoError(h.t, h.client.Stop())
// Restart the server and allow it to ack the updates
// after the client retransmits the unacked update.
@ -1437,8 +1414,8 @@ var clientTests = []clientTest{
// server should have no updates.
h.server.waitForUpdates(nil, waitTime)
// Force quit the client since it has queued backups.
h.client.ForceQuit()
// Stop the client since it has queued backups.
require.NoError(h.t, h.client.Stop())
// Restart the server and allow it to ack session
// creation.
@ -1489,8 +1466,8 @@ var clientTests = []clientTest{
// server should have no updates.
h.server.waitForUpdates(nil, waitTime)
// Force quit the client since it has queued backups.
h.client.ForceQuit()
// Stop the client since it has queued backups.
require.NoError(h.t, h.client.Stop())
// Restart the server and allow it to ack session
// creation.
@ -1672,56 +1649,6 @@ var clientTests = []clientTest{
h.server.waitForUpdates(hints[numUpdates/2:], waitTime)
},
},
{
// Asserts that the client's force quite delay will properly
// shutdown the client if it is unable to completely drain the
// task pipeline.
name: "force unclean shutdown",
cfg: harnessCfg{
localBalance: localBalance,
remoteBalance: remoteBalance,
policy: wtpolicy.Policy{
TxPolicy: defaultTxPolicy,
MaxUpdates: 5,
},
},
fn: func(h *testHarness) {
const (
chanID = 0
numUpdates = 6
maxUpdates = 5
)
// Advance the channel to create all states.
hints := h.advanceChannelN(chanID, numUpdates)
// Back up 4 of the 5 states for the negotiated session.
h.backupStates(chanID, 0, maxUpdates-1, nil)
h.server.waitForUpdates(hints[:maxUpdates-1], waitTime)
// Now, restart the tower and prevent it from acking any
// new sessions. We do this here as once the last slot
// is exhausted the client will attempt to renegotiate.
h.server.restart(func(cfg *wtserver.Config) {
cfg.NoAckCreateSession = true
})
// Back up the remaining two states. Once the first is
// processed, the session will be exhausted but the
// client won't be able to renegotiate a session for
// the final state. We'll only wait for the first five
// states to arrive at the tower.
h.backupStates(chanID, maxUpdates-1, numUpdates, nil)
h.server.waitForUpdates(hints[:maxUpdates], waitTime)
// Finally, stop the client which will continue to
// attempt session negotiation since it has one more
// state to process. After the force quite delay
// expires, the client should force quite itself and
// allow the test to complete.
h.server.stop()
},
},
{
// Assert that if a client changes the address for a server and
// then tries to back up updates then the client will switch to
@ -1937,7 +1864,7 @@ var clientTests = []clientTest{
require.False(h.t, h.isSessionClosable(sessionIDs[0]))
// Restart the client.
h.client.ForceQuit()
require.NoError(h.t, h.client.Stop())
h.startClient()
// The session should now have been marked as closable.
@ -2176,9 +2103,8 @@ var clientTests = []clientTest{
h.backupStates(chanID, 0, numUpdates/2, nil)
// Restart the Client (force quit). And also now start
// the server.
h.client.ForceQuit()
// Restart the Client. And also now start the server.
require.NoError(h.t, h.client.Stop())
h.server.start()
h.startClient()
@ -2237,8 +2163,7 @@ var clientTests = []clientTest{
{
// Show that if a client switches to a new tower _after_ backup
// tasks have been bound to the session with the first old tower
// then these updates are _not_ replayed onto the new tower.
// This is a bug that will be fixed in a future commit.
// then these updates are replayed onto the new tower.
name: "switch to new tower after tasks are bound",
cfg: harnessCfg{
localBalance: localBalance,
@ -2290,18 +2215,11 @@ var clientTests = []clientTest{
// Back up the final task.
h.backupStates(chanID, numUpdates-1, numUpdates, nil)
// Show that only the latest backup is backed up to the
// server and that the ones backed up while no tower was
// online were _not_ backed up to either server. This is
// a bug that will be fixed in a future commit.
// Show that all the backups (the ones added while no
// towers were online and the one added after adding the
// second tower) are backed up to the second tower.
server2.waitForUpdates(
hints[numUpdates-1:], time.Second,
)
server2.assertUpdatesNotFound(
hints[numUpdates/2 : numUpdates-1],
)
h.server.assertUpdatesNotFound(
hints[numUpdates/2 : numUpdates-1],
hints[numUpdates/2:numUpdates], waitTime,
)
},
},

View File

@ -16,17 +16,21 @@ import (
"github.com/lightningnetwork/lnd/watchtower/wtwire"
)
// reserveStatus is an enum that signals how full a particular session is.
type reserveStatus uint8
// sessionQueueStatus is an enum that signals how full a particular session is.
type sessionQueueStatus uint8
const (
// reserveAvailable indicates that the session has space for at least
// one more backup.
reserveAvailable reserveStatus = iota
// sessionQueueAvailable indicates that the session has space for at
// least one more backup.
sessionQueueAvailable sessionQueueStatus = iota
// reserveExhausted indicates that all slots in the session have been
// allocated.
reserveExhausted
// sessionQueueExhausted indicates that all slots in the session have
// been allocated.
sessionQueueExhausted
// sessionQueueShuttingDown indicates that the session queue is
// shutting down and so is no longer accepting any more backups.
sessionQueueShuttingDown
)
// sessionQueueConfig bundles the resources required by the sessionQueue to
@ -62,6 +66,10 @@ type sessionQueueConfig struct {
// certain revoked commitment height.
BuildBreachRetribution BreachRetributionBuilder
// TaskPipeline is a pipeline which the sessionQueue should use to send
// any unhandled tasks on shutdown of the queue.
TaskPipeline *DiskOverflowQueue[*wtdb.BackupID]
// DB provides access to the client's stable storage.
DB DB
@ -85,10 +93,8 @@ type sessionQueueConfig struct {
// sessionQueue implements a reliable queue that will encrypt and send accepted
// backups to the watchtower specified in the config's ClientSession. Calling
// Quit will attempt to perform a clean shutdown by receiving an ACK from the
// tower for all pending backups before exiting. The clean shutdown can be
// aborted by using ForceQuit, which will attempt to shut down the queue
// immediately.
// Stop will attempt to perform a clean shutdown replaying any un-committed
// pending updates to the TowerClient's main task pipeline.
type sessionQueue struct {
started sync.Once
stopped sync.Once
@ -109,9 +115,8 @@ type sessionQueue struct {
retryBackoff time.Duration
quit chan struct{}
forceQuit chan struct{}
shutdown chan struct{}
quit chan struct{}
wg sync.WaitGroup
}
// newSessionQueue initializes a fresh sessionQueue.
@ -133,8 +138,6 @@ func newSessionQueue(cfg *sessionQueueConfig,
seqNum: cfg.ClientSession.SeqNum,
retryBackoff: cfg.MinBackoff,
quit: make(chan struct{}),
forceQuit: make(chan struct{}),
shutdown: make(chan struct{}),
}
sq.queueCond = sync.NewCond(&sq.queueMtx)
@ -151,41 +154,77 @@ func newSessionQueue(cfg *sessionQueueConfig,
// backups.
func (q *sessionQueue) Start() {
q.started.Do(func() {
q.wg.Add(1)
go q.sessionManager()
})
}
// Stop idempotently stops the sessionQueue by initiating a clean shutdown that
// will clear all pending tasks in the queue before returning to the caller.
func (q *sessionQueue) Stop() {
func (q *sessionQueue) Stop() error {
var returnErr error
q.stopped.Do(func() {
q.log.Debugf("SessionQueue(%s) stopping ...", q.ID())
close(q.quit)
q.signalUntilShutdown()
// Skip log if we also force quit.
select {
case <-q.forceQuit:
shutdown := make(chan struct{})
go func() {
for {
select {
case <-time.After(time.Millisecond):
q.queueCond.Signal()
case <-shutdown:
return
}
}
}()
q.wg.Wait()
close(shutdown)
// Now, for any task in the pending queue that we have not yet
// created a CommittedUpdate for, re-add the task to the main
// task pipeline.
updates, err := q.cfg.DB.FetchSessionCommittedUpdates(q.ID())
if err != nil {
returnErr = err
return
default:
}
unAckedUpdates := make(map[wtdb.BackupID]bool)
for _, update := range updates {
unAckedUpdates[update.BackupID] = true
}
// Push any task that was on the pending queue that there is
// not yet a committed update for back to the main task
// pipeline.
q.queueCond.L.Lock()
for q.pendingQueue.Len() > 0 {
next := q.pendingQueue.Front()
q.pendingQueue.Remove(next)
//nolint:forcetypeassert
task := next.Value.(*backupTask)
if unAckedUpdates[task.id] {
continue
}
err := q.cfg.TaskPipeline.QueueBackupID(&task.id)
if err != nil {
log.Errorf("could not re-queue backup task: "+
"%v", err)
continue
}
}
q.queueCond.L.Unlock()
q.log.Debugf("SessionQueue(%s) stopped", q.ID())
})
}
// ForceQuit idempotently aborts any clean shutdown in progress and returns to
// he caller after all lingering goroutines have spun down.
func (q *sessionQueue) ForceQuit() {
q.forced.Do(func() {
q.log.Infof("SessionQueue(%s) force quitting...", q.ID())
close(q.forceQuit)
q.signalUntilShutdown()
q.log.Infof("SessionQueue(%s) force quit", q.ID())
})
return returnErr
}
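
The Stop implementation above has to wake the sessionManager goroutine, which can be parked waiting on queueCond, before wg.Wait can return; hence the helper goroutine that keeps signalling the condition variable until the manager exits. A condensed, self-contained sketch of that shutdown idiom follows; the names are illustrative only, not lnd's.

package main

import (
	"fmt"
	"sync"
	"time"
)

type worker struct {
	cond *sync.Cond
	quit chan struct{}
	wg   sync.WaitGroup
}

func (w *worker) run() {
	defer w.wg.Done()
	w.cond.L.Lock()
	for {
		select {
		case <-w.quit:
			w.cond.L.Unlock()
			return
		default:
		}
		// Nothing to do: park until someone signals the cond var.
		w.cond.Wait()
	}
}

func (w *worker) stop() {
	close(w.quit)

	// The worker may be blocked in Wait(), so keep signalling until it
	// has exited and released the wait group.
	done := make(chan struct{})
	go func() {
		for {
			select {
			case <-time.After(time.Millisecond):
				w.cond.Signal()
			case <-done:
				return
			}
		}
	}()
	w.wg.Wait()
	close(done)
}

func main() {
	w := &worker{
		cond: sync.NewCond(&sync.Mutex{}),
		quit: make(chan struct{}),
	}
	w.wg.Add(1)
	go w.run()

	time.Sleep(10 * time.Millisecond)
	w.stop()
	fmt.Println("worker stopped cleanly")
}
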
// ID returns the wtdb.SessionID for the queue, which can be used to uniquely
@ -196,10 +235,28 @@ func (q *sessionQueue) ID() *wtdb.SessionID {
// AcceptTask attempts to queue a backupTask for delivery to the sessionQueue's
// tower. The session will only be accepted if the queue is not already
// exhausted and the task is successfully bound to the ClientSession.
func (q *sessionQueue) AcceptTask(task *backupTask) (reserveStatus, bool) {
// exhausted or shutting down and the task is successfully bound to the
// ClientSession.
func (q *sessionQueue) AcceptTask(task *backupTask) (sessionQueueStatus, bool) {
// Exit early if the queue has started shutting down.
select {
case <-q.quit:
return sessionQueueShuttingDown, false
default:
}
q.queueCond.L.Lock()
// There is a chance that sessionQueue started shutting down between
// the last quit channel check and waiting for the lock. So check one
// more time here.
select {
case <-q.quit:
q.queueCond.L.Unlock()
return sessionQueueShuttingDown, false
default:
}
numPending := uint32(q.pendingQueue.Len())
maxUpdates := q.cfg.ClientSession.Policy.MaxUpdates
q.log.Debugf("SessionQueue(%s) deciding to accept %v seqnum=%d "+
@ -207,14 +264,14 @@ func (q *sessionQueue) AcceptTask(task *backupTask) (reserveStatus, bool) {
q.ID(), task.id, q.seqNum, numPending, maxUpdates)
// Examine the current reserve status of the session queue.
curStatus := q.reserveStatus()
curStatus := q.status()
switch curStatus {
// The session queue is exhausted, and cannot accept the task because it
// is full. Reject the task such that it can be tried against a
// different session.
case reserveExhausted:
case sessionQueueExhausted:
q.queueCond.L.Unlock()
return curStatus, false
@ -224,7 +281,7 @@ func (q *sessionQueue) AcceptTask(task *backupTask) (reserveStatus, bool) {
// tried again.
//
// TODO(conner): queue backups and retry with different session params.
case reserveAvailable:
case sessionQueueAvailable:
err := task.bindSession(
&q.cfg.ClientSession.ClientSessionBody,
q.cfg.BuildBreachRetribution,
@ -244,7 +301,7 @@ func (q *sessionQueue) AcceptTask(task *backupTask) (reserveStatus, bool) {
// Finally, compute the session's *new* reserve status. This will be
// used by the client to determine if it can continue using this session
// queue, or if it should negotiate a new one.
newStatus := q.reserveStatus()
newStatus := q.status()
q.queueCond.L.Unlock()
q.queueCond.Signal()
@ -255,7 +312,7 @@ func (q *sessionQueue) AcceptTask(task *backupTask) (reserveStatus, bool) {
// sessionManager is the primary event loop for the sessionQueue, and is
// responsible for encrypting and sending accepted tasks to the tower.
func (q *sessionQueue) sessionManager() {
defer close(q.shutdown)
defer q.wg.Done()
for {
q.queueCond.L.Lock()
@ -266,12 +323,6 @@ func (q *sessionQueue) sessionManager() {
select {
case <-q.quit:
if q.commitQueue.Len() == 0 &&
q.pendingQueue.Len() == 0 {
q.queueCond.L.Unlock()
return
}
case <-q.forceQuit:
q.queueCond.L.Unlock()
return
default:
@ -279,12 +330,9 @@ func (q *sessionQueue) sessionManager() {
}
q.queueCond.L.Unlock()
// Exit immediately if a force quit has been requested. If
// either of the queues still has state updates to send to the
// tower, we may never exit in the above case if we are unable
// to reach the tower for some reason.
// Exit immediately if the sessionQueue has been stopped.
select {
case <-q.forceQuit:
case <-q.quit:
return
default:
}
@ -333,7 +381,7 @@ func (q *sessionQueue) drainBackups() {
q.increaseBackoff()
select {
case <-time.After(q.retryBackoff):
case <-q.forceQuit:
case <-q.quit:
}
return
}
@ -366,7 +414,7 @@ func (q *sessionQueue) drainBackups() {
q.increaseBackoff()
select {
case <-time.After(q.retryBackoff):
case <-q.forceQuit:
case <-q.quit:
}
return
}
@ -388,7 +436,7 @@ func (q *sessionQueue) drainBackups() {
// when we will do so.
select {
case <-time.After(time.Millisecond):
case <-q.forceQuit:
case <-q.quit:
return
}
}
@ -635,21 +683,21 @@ func (q *sessionQueue) sendStateUpdate(conn wtserver.Peer,
return nil
}
// reserveStatus returns a reserveStatus indicating whether the sessionQueue can
// accept another task. reserveAvailable is returned when a task can be
// accepted, and reserveExhausted is returned if the all slots in the session
// have been allocated.
// status returns a sessionQueueStatus indicating whether the sessionQueue can
// accept another task. sessionQueueAvailable is returned when a task can be
// accepted, and sessionQueueExhausted is returned if all the slots in the
// session have been allocated.
//
// NOTE: This method MUST be called with queueCond's exclusive lock held.
func (q *sessionQueue) reserveStatus() reserveStatus {
func (q *sessionQueue) status() sessionQueueStatus {
numPending := uint32(q.pendingQueue.Len())
maxUpdates := uint32(q.cfg.ClientSession.Policy.MaxUpdates)
if uint32(q.seqNum)+numPending < maxUpdates {
return reserveAvailable
return sessionQueueAvailable
}
return reserveExhausted
return sessionQueueExhausted
}
@ -667,19 +715,6 @@ func (q *sessionQueue) increaseBackoff() {
}
}
// signalUntilShutdown strobes the sessionQueue's condition variable until the
// main event loop exits.
func (q *sessionQueue) signalUntilShutdown() {
for {
select {
case <-time.After(time.Millisecond):
q.queueCond.Signal()
case <-q.shutdown:
return
}
}
}
// sessionQueueSet maintains a mapping of SessionIDs to their corresponding
// sessionQueue.
type sessionQueueSet struct {
@ -706,18 +741,18 @@ func (s *sessionQueueSet) AddAndStart(sessionQueue *sessionQueue) {
// StopAndRemove stops the given session queue and removes it from the
// sessionQueueSet.
func (s *sessionQueueSet) StopAndRemove(id wtdb.SessionID) {
func (s *sessionQueueSet) StopAndRemove(id wtdb.SessionID) error {
s.mu.Lock()
defer s.mu.Unlock()
queue, ok := s.queues[id]
if !ok {
return
return nil
}
queue.Stop()
delete(s.queues, id)
return queue.Stop()
}
// Get fetches and returns the sessionQueue with the given ID.