htlcswitch: add new LinkFailureDisconnect action

In this commit, we add a new LinkFailureDisconnect action that'll be
used if we detect that the remote party hasn't sent a revoke and ack
when it actually should.

Before this commit, we would log our action, tear down the link, but
then not actually force a connection recycle, as we assumed that if the
TCP connection was actually stale, then the read/write timeout would
expire.

In practice this doesn't always seem to be the case, so we make a strong
action here to actually force a disconnection in hopes that either side
will reconnect and keep the good times rollin' 🕺.
This commit is contained in:
Olaoluwa Osuntokun
2023-05-19 17:18:00 -07:00
parent cb5fc71659
commit 4d633f04e3
5 changed files with 40 additions and 12 deletions

View File

@@ -1,10 +1,10 @@
# Release Notes # Release Notes
## Mempool ## Mempool Optimizations
* Optimized [mempool * Optimized [mempool
management](https://github.com/lightningnetwork/lnd/pull/7681) to lower the CPU management](https://github.com/lightningnetwork/lnd/pull/7681) to lower the
usage. CPU usage.
## Misc ## Misc
@@ -12,7 +12,16 @@ usage.
all macaroon DB root keys on `ChangePassword`/`GenerateNewRootKey` all macaroon DB root keys on `ChangePassword`/`GenerateNewRootKey`
respectively. respectively.
## Channel Link Bug Fix
* If we detect the remote link is inactive, [we'll now tear down the
connection](https://github.com/lightningnetwork/lnd/pull/7711) in addition to
stopping the link's statemachine. If we're persistently connected with the
peer, then this'll force a reconnect, which may restart things and help avoid
certain force close scenarios.
# Contributors (Alphabetical Order) # Contributors (Alphabetical Order)
* Elle Mouton * Elle Mouton
* Olaoluwa Osuntokun
* Yong Yu * Yong Yu

View File

@@ -1037,7 +1037,7 @@ func (l *channelLink) htlcManager() {
l.fail( l.fail(
LinkFailureError{ LinkFailureError{
code: ErrSyncError, code: ErrSyncError,
FailureAction: LinkFailureForceClose, // nolint:lll FailureAction: LinkFailureForceClose, //nolint:lll
}, },
"unable to synchronize channel "+ "unable to synchronize channel "+
"states: %v", err, "states: %v", err,
@@ -1239,8 +1239,13 @@ func (l *channelLink) htlcManager() {
} }
case <-l.cfg.PendingCommitTicker.Ticks(): case <-l.cfg.PendingCommitTicker.Ticks():
l.fail(LinkFailureError{code: ErrRemoteUnresponsive}, l.fail(
"unable to complete dance") LinkFailureError{
code: ErrRemoteUnresponsive,
FailureAction: LinkFailureDisconnect,
},
"unable to complete dance",
)
return return
// A message from the switch was just received. This indicates // A message from the switch was just received. This indicates

View File

@@ -5457,7 +5457,8 @@ func TestChannelLinkFail(t *testing.T) {
// If we expect the link to force close the channel in this // If we expect the link to force close the channel in this
// case, check that it happens. If not, make sure it does not // case, check that it happens. If not, make sure it does not
// happen. // happen.
isForceCloseErr := (linkErr.FailureAction == LinkFailureForceClose) isForceCloseErr := (linkErr.FailureAction ==
LinkFailureForceClose)
require.True( require.True(
t, test.shouldForceClose == isForceCloseErr, test.name, t, test.shouldForceClose == isForceCloseErr, test.name,
) )
@@ -6343,11 +6344,12 @@ func TestPendingCommitTicker(t *testing.T) {
// Assert that we get the expected link failure from Alice. // Assert that we get the expected link failure from Alice.
select { select {
case linkErr := <-linkErrs: case linkErr := <-linkErrs:
if linkErr.code != ErrRemoteUnresponsive { require.Equal(
t.Fatalf("error code mismatch, "+ t, linkErr.code, ErrRemoteUnresponsive,
"want: ErrRemoteUnresponsive, got: %v", fmt.Sprintf("error code mismatch, want: "+
linkErr.code) "ErrRemoteUnresponsive, got: %v", linkErr.code),
} )
require.Equal(t, linkErr.FailureAction, LinkFailureDisconnect)
case <-time.After(time.Second): case <-time.After(time.Second):
t.Fatalf("did not receive failure") t.Fatalf("did not receive failure")

View File

@@ -64,6 +64,11 @@ const (
// LinkFailureForceClose indicates that the channel should be force // LinkFailureForceClose indicates that the channel should be force
// closed. // closed.
LinkFailureForceClose LinkFailureForceClose
// LinkFailureDisconnect indicates that we should disconnect in an
// attempt to recycle the connection. This can be useful if we think a
// TCP connection or state machine is stalled.
LinkFailureDisconnect
) )
// LinkFailureError encapsulates an error that will make us fail the current // LinkFailureError encapsulates an error that will make us fail the current

View File

@@ -3142,6 +3142,13 @@ func (p *Brontide) handleLinkFailure(failure linkFailureReport) {
"remote peer: %v", err) "remote peer: %v", err)
} }
} }
// If the failure action is disconnect, then we'll execute that now. If
// we had to send an error above, it was a sync call, so we expect the
// message to be flushed on the wire by now.
if failure.linkErr.FailureAction == htlcswitch.LinkFailureDisconnect {
p.Disconnect(fmt.Errorf("link requested disconnect"))
}
} }
// tryLinkShutdown attempts to fetch a target link from the switch, calls // tryLinkShutdown attempts to fetch a target link from the switch, calls