diff --git a/itest/list_on_test.go b/itest/list_on_test.go index adedfd541..f02036e7c 100644 --- a/itest/list_on_test.go +++ b/itest/list_on_test.go @@ -233,6 +233,10 @@ var allTestCases = []*lntest.TestCase{ Name: "etcd failover", TestFunc: testEtcdFailover, }, + { + Name: "leader health check", + TestFunc: testLeaderHealthCheck, + }, { Name: "hold invoice force close", TestFunc: testHoldInvoiceForceClose, diff --git a/itest/lnd_etcd_failover_test.go b/itest/lnd_etcd_failover_test.go index c2a345889..53b1dd6b3 100644 --- a/itest/lnd_etcd_failover_test.go +++ b/itest/lnd_etcd_failover_test.go @@ -4,6 +4,11 @@ package itest import ( + "context" + "fmt" + "io" + "net" + "sync" "testing" "time" @@ -137,3 +142,200 @@ func testEtcdFailoverCase(ht *lntest.HarnessTest, kill bool) { // process. ht.Shutdown(carol2) } + +// Proxy is a simple TCP proxy that forwards all traffic between a local and a +// remote address. We use it to simulate a network partition in the leader +// health check test. +type Proxy struct { + listenAddr string + targetAddr string + cancel context.CancelFunc + wg sync.WaitGroup + stopped chan struct{} +} + +// NewProxy creates a new Proxy instance with a provided context. +func NewProxy(listenAddr, targetAddr string) *Proxy { + return &Proxy{ + listenAddr: listenAddr, + targetAddr: targetAddr, + stopped: make(chan struct{}), + } +} + +// Start starts the proxy. It listens on the listen address and forwards all +// traffic to the target address. +func (p *Proxy) Start(ctx context.Context, t *testing.T) { + listener, err := net.Listen("tcp", p.listenAddr) + require.NoError(t, err, "Failed to listen on %s", p.listenAddr) + t.Logf("Proxy is listening on %s", p.listenAddr) + + proxyCtx, cancel := context.WithCancel(ctx) + p.cancel = cancel + + p.wg.Add(1) + go func() { + defer func() { + close(p.stopped) + p.wg.Done() + }() + + for { + select { + case <-proxyCtx.Done(): + listener.Close() + return + default: + } + + conn, err := listener.Accept() + if err != nil { + if proxyCtx.Err() != nil { + // Context is done, exit the loop + return + } + t.Logf("Proxy failed to accept connection: %v", + err) + + continue + } + + p.wg.Add(1) + go p.handleConnection(proxyCtx, t, conn) + } + }() +} + +// handleConnection handles an accepted connection and forwards all traffic +// between the listener and target. +func (p *Proxy) handleConnection(ctx context.Context, t *testing.T, + conn net.Conn) { + + targetConn, err := net.Dial("tcp", p.targetAddr) + require.NoError(t, err, "Failed to connect to target %s", p.targetAddr) + + defer func() { + conn.Close() + targetConn.Close() + p.wg.Done() + }() + + done := make(chan struct{}) + + p.wg.Add(2) + go func() { + defer p.wg.Done() + // Ignore the copy error due to the connection being closed. + _, _ = io.Copy(targetConn, conn) + }() + + go func() { + defer p.wg.Done() + // Ignore the copy error due to the connection being closed. + _, _ = io.Copy(conn, targetConn) + close(done) + }() + + select { + case <-ctx.Done(): + case <-done: + } +} + +// Stop stops the proxy and waits for all connections to be closed and all +// goroutines to be stopped. +func (p *Proxy) Stop(t *testing.T) { + require.NotNil(t, p.cancel, "Proxy is not started") + + p.cancel() + p.wg.Wait() + <-p.stopped + + t.Log("Proxy stopped", time.Now()) +} + +// testLeaderHealthCheck tests that a node is properly shut down when the leader +// health check fails. +func testLeaderHealthCheck(ht *lntest.HarnessTest) { + clientPort := port.NextAvailablePort() + + // Let's start a test etcd instance that we'll redirect through a proxy. + etcdCfg, cleanup, err := kvdb.StartEtcdTestBackend( + ht.T.TempDir(), uint16(clientPort), + uint16(port.NextAvailablePort()), "", + ) + require.NoError(ht, err, "Failed to start etcd instance") + + // Make leader election session TTL 5 sec to make the test run fast. + const leaderSessionTTL = 5 + + // Create an election observer that we will use to monitor the leader + // election. + observer, err := cluster.MakeLeaderElector( + ht.Context(), cluster.EtcdLeaderElector, "observer", + lncfg.DefaultEtcdElectionPrefix, leaderSessionTTL, etcdCfg, + ) + require.NoError(ht, err, "Cannot start election observer") + + // Start a proxy that will forward all traffic to the etcd instance. + clientAddr := fmt.Sprintf("localhost:%d", clientPort) + proxyAddr := fmt.Sprintf("localhost:%d", port.NextAvailablePort()) + + ctx, cancel := context.WithCancel(ht.Context()) + defer cancel() + + proxy := NewProxy(proxyAddr, clientAddr) + proxy.Start(ctx, ht.T) + + // Copy the etcd config so that we can modify the host to point to the + // proxy. + proxyEtcdCfg := *etcdCfg + // With the proxy in place, we can now configure the etcd client to + // connect to the proxy instead of the etcd instance. + proxyEtcdCfg.Host = "http://" + proxyAddr + + defer cleanup() + + // Start Carol-1 with cluster support and connect to etcd through the + // proxy. + password := []byte("the quick brown fox jumps the lazy dog") + stateless := false + cluster := true + + carol, _, _ := ht.NewNodeWithSeedEtcd( + "Carol-1", &proxyEtcdCfg, password, stateless, cluster, + leaderSessionTTL, + ) + + // Make sure Carol-1 is indeed the leader. + assertLeader(ht, observer, "Carol-1") + + // At this point Carol-1 is the elected leader, while Carol-2 will wait + // to become the leader when Carol-1 releases the lease. Note that for + // Carol-2 we don't use the proxy as we want to simulate a network + // partition only for Carol-1. + carol2 := ht.NewNodeEtcd( + "Carol-2", etcdCfg, password, cluster, leaderSessionTTL, + ) + + // Stop the proxy so that we simulate a network partition which + // consequently will make the leader health check fail and force Carol + // to shut down. + proxy.Stop(ht.T) + + // Wait for Carol-1 to stop. If the health check wouldn't properly work + // this call would timeout and trigger a test failure. + require.NoError(ht.T, carol.WaitForProcessExit()) + + // Now that Carol-1 is shut down we should fail over to Carol-2. + failoverTimeout := time.Duration(2*leaderSessionTTL) * time.Second + + // Make sure that Carol-2 becomes the leader (reported by Carol-2). + err = carol2.WaitUntilLeader(failoverTimeout) + + require.NoError(ht, err, "Waiting for Carol-2 to become the leader "+ + "failed") + + // Make sure Carol-2 is indeed the leader (repoted by the observer). + assertLeader(ht, observer, "Carol-2") +} diff --git a/itest/lnd_no_etcd_dummy_failover_test.go b/itest/lnd_no_etcd_dummy_failover_test.go index 2d731febc..468eca3f6 100644 --- a/itest/lnd_no_etcd_dummy_failover_test.go +++ b/itest/lnd_no_etcd_dummy_failover_test.go @@ -8,3 +8,7 @@ import "github.com/lightningnetwork/lnd/lntest" // testEtcdFailover is an empty itest when LND is not compiled with etcd // support. func testEtcdFailover(ht *lntest.HarnessTest) {} + +// testLeaderHealthCheck is an empty itest when LND is not compiled with etcd +// support. +func testLeaderHealthCheck(ht *lntest.HarnessTest) {}