From 31b66962d83aeac12ffba3a69c4597bc236ee204 Mon Sep 17 00:00:00 2001
From: yyforyongyu <yong2452@gmail.com>
Date: Thu, 5 Dec 2024 10:25:23 +0800
Subject: [PATCH] lntest: properly handle shutdown error

This commit removes the panic used in checking the shutdown log.
Instead, the error is returned and asserted in `shutdownAllNodes` so
it's easier to check which node failed in which test. We also catch all
the errors returned from `StopDaemon` call to properly access the
shutdown behavior.
---
 lntest/harness.go              | 32 ++++++++-------
 lntest/harness_node_manager.go |  5 ++-
 lntest/node/harness_node.go    | 72 +++++++++++++++++-----------------
 3 files changed, 58 insertions(+), 51 deletions(-)

diff --git a/lntest/harness.go b/lntest/harness.go
index 54f183d39..f32f67773 100644
--- a/lntest/harness.go
+++ b/lntest/harness.go
@@ -332,7 +332,7 @@ func (h *HarnessTest) Subtest(t *testing.T) *HarnessTest {
 		// Don't bother run the cleanups if the test is failed.
 		if st.Failed() {
 			st.Log("test failed, skipped cleanup")
-			st.shutdownAllNodes()
+			st.shutdownNodesNoAssert()
 			return
 		}
 
@@ -402,16 +402,20 @@ func (h *HarnessTest) checkAndLimitBlocksMined(startHeight int32) {
 		"50 blocks in one test")
 }
 
+// shutdownNodesNoAssert will shutdown all running nodes without assertions.
+// This is used when the test has already failed, we don't want to log more
+// errors but focusing on the original error.
+func (h *HarnessTest) shutdownNodesNoAssert() {
+	for _, node := range h.manager.activeNodes {
+		_ = h.manager.shutdownNode(node)
+	}
+}
+
 // shutdownAllNodes will shutdown all running nodes.
 func (h *HarnessTest) shutdownAllNodes() {
+	var err error
 	for _, node := range h.manager.activeNodes {
-		// The process may not be in a state to always shutdown
-		// immediately, so we'll retry up to a hard limit to ensure we
-		// eventually shutdown.
-		err := wait.NoError(func() error {
-			return h.manager.shutdownNode(node)
-		}, DefaultTimeout)
-
+		err = h.manager.shutdownNode(node)
 		if err == nil {
 			continue
 		}
@@ -421,6 +425,8 @@ func (h *HarnessTest) shutdownAllNodes() {
 		// processes.
 		h.Logf("unable to shutdown %s, got err: %v", node.Name(), err)
 	}
+
+	require.NoError(h, err, "failed to shutdown all nodes")
 }
 
 // cleanupStandbyNode is a function should be called with defer whenever a
@@ -516,12 +522,7 @@ func (h *HarnessTest) NewNodeWithCoins(name string,
 
 // Shutdown shuts down the given node and asserts that no errors occur.
 func (h *HarnessTest) Shutdown(node *node.HarnessNode) {
-	// The process may not be in a state to always shutdown immediately, so
-	// we'll retry up to a hard limit to ensure we eventually shutdown.
-	err := wait.NoError(func() error {
-		return h.manager.shutdownNode(node)
-	}, DefaultTimeout)
-
+	err := h.manager.shutdownNode(node)
 	require.NoErrorf(h, err, "unable to shutdown %v in %v", node.Name(),
 		h.manager.currentTestCase)
 }
@@ -764,9 +765,10 @@ func (h *HarnessTest) NewNodeRemoteSigner(name string, extraArgs []string,
 
 // KillNode kills the node and waits for the node process to stop.
 func (h *HarnessTest) KillNode(hn *node.HarnessNode) {
+	delete(h.manager.activeNodes, hn.Cfg.NodeID)
+
 	h.Logf("Manually killing the node %s", hn.Name())
 	require.NoErrorf(h, hn.KillAndWait(), "%s: kill got error", hn.Name())
-	delete(h.manager.activeNodes, hn.Cfg.NodeID)
 }
 
 // SetFeeEstimate sets a fee rate to be returned from fee estimator.
diff --git a/lntest/harness_node_manager.go b/lntest/harness_node_manager.go
index 2040acec7..48afa6b35 100644
--- a/lntest/harness_node_manager.go
+++ b/lntest/harness_node_manager.go
@@ -112,11 +112,14 @@ func (nm *nodeManager) registerNode(node *node.HarnessNode) {
 // ShutdownNode stops an active lnd process and returns when the process has
 // exited and any temporary directories have been cleaned up.
 func (nm *nodeManager) shutdownNode(node *node.HarnessNode) error {
+	// Remove the node from the active nodes map even if the shutdown
+	// fails as the shutdown cannot be retried in that case.
+	delete(nm.activeNodes, node.Cfg.NodeID)
+
 	if err := node.Shutdown(); err != nil {
 		return err
 	}
 
-	delete(nm.activeNodes, node.Cfg.NodeID)
 	return nil
 }
 
diff --git a/lntest/node/harness_node.go b/lntest/node/harness_node.go
index d1353c25c..0ac74f877 100644
--- a/lntest/node/harness_node.go
+++ b/lntest/node/harness_node.go
@@ -636,12 +636,11 @@ func (hn *HarnessNode) cleanup() error {
 // waitForProcessExit Launch a new goroutine which that bubbles up any
 // potential fatal process errors to the goroutine running the tests.
 func (hn *HarnessNode) WaitForProcessExit() error {
-	var err error
+	var errReturned error
 
 	errChan := make(chan error, 1)
 	go func() {
-		err = hn.cmd.Wait()
-		errChan <- err
+		errChan <- hn.cmd.Wait()
 	}()
 
 	select {
@@ -656,24 +655,36 @@ func (hn *HarnessNode) WaitForProcessExit() error {
 			return nil
 		}
 
+		// The process may have already been killed in the test, in
+		// that case we will skip the error and continue processing
+		// the logs.
+		if strings.Contains(err.Error(), "signal: killed") {
+			break
+		}
+
 		// Otherwise, we print the error, break the select and save
 		// logs.
 		hn.printErrf("wait process exit got err: %v", err)
-
-		break
+		errReturned = err
 
 	case <-time.After(wait.DefaultTimeout):
 		hn.printErrf("timeout waiting for process to exit")
 	}
 
 	// Make sure log file is closed and renamed if necessary.
-	finalizeLogfile(hn)
+	filename := finalizeLogfile(hn)
 
-	// Rename the etcd.log file if the node was running on embedded
-	// etcd.
+	// Assert the node has shut down from the log file.
+	err1 := assertNodeShutdown(filename)
+	if err1 != nil {
+		return fmt.Errorf("[%s]: assert shutdown failed in log[%s]: %w",
+			hn.Name(), filename, err1)
+	}
+
+	// Rename the etcd.log file if the node was running on embedded etcd.
 	finalizeEtcdLog(hn)
 
-	return err
+	return errReturned
 }
 
 // Stop attempts to stop the active lnd process.
@@ -700,23 +711,21 @@ func (hn *HarnessNode) Stop() error {
 
 		err := wait.NoError(func() error {
 			_, err := hn.RPC.LN.StopDaemon(ctxt, &req)
-
-			switch {
-			case err == nil:
-				return nil
-
-			// Try again if a recovery/rescan is in progress.
-			case strings.Contains(
-				err.Error(), "recovery in progress",
-			):
-				return err
-
-			default:
+			if err == nil {
 				return nil
 			}
+
+			// If the connection is already closed, we can exit
+			// early as the node has already been shut down in the
+			// test, e.g., in etcd leader health check test.
+			if strings.Contains(err.Error(), "connection refused") {
+				return nil
+			}
+
+			return err
 		}, wait.DefaultTimeout)
 		if err != nil {
-			return err
+			return fmt.Errorf("shutdown timeout: %w", err)
 		}
 
 		// Wait for goroutines to be finished.
@@ -724,6 +733,7 @@ func (hn *HarnessNode) Stop() error {
 		go func() {
 			hn.Watcher.wg.Wait()
 			close(done)
+			hn.Watcher = nil
 		}()
 
 		// If the goroutines fail to finish before timeout, we'll print
@@ -966,31 +976,23 @@ func getFinalizedLogFilePrefix(hn *HarnessNode) string {
 
 // finalizeLogfile makes sure the log file cleanup function is initialized,
 // even if no log file is created.
-func finalizeLogfile(hn *HarnessNode) {
+func finalizeLogfile(hn *HarnessNode) string {
 	// Exit early if there's no log file.
 	if hn.logFile == nil {
-		return
+		return ""
 	}
 
 	hn.logFile.Close()
 
 	// If logoutput flag is not set, return early.
 	if !*logOutput {
-		return
+		return ""
 	}
 
-	newFileName := fmt.Sprintf("%v.log",
-		getFinalizedLogFilePrefix(hn),
-	)
+	newFileName := fmt.Sprintf("%v.log", getFinalizedLogFilePrefix(hn))
 	renameFile(hn.filename, newFileName)
 
-	// Assert the node has shut down from the log file.
-	err := assertNodeShutdown(newFileName)
-	if err != nil {
-		err := fmt.Errorf("[%s]: assert shutdown failed in log[%s]: %w",
-			hn.Name(), newFileName, err)
-		panic(err)
-	}
+	return newFileName
 }
 
 // assertNodeShutdown asserts that the node has shut down properly by checking