From 31b66962d83aeac12ffba3a69c4597bc236ee204 Mon Sep 17 00:00:00 2001 From: yyforyongyu Date: Thu, 5 Dec 2024 10:25:23 +0800 Subject: [PATCH] lntest: properly handle shutdown error This commit removes the panic used in checking the shutdown log. Instead, the error is returned and asserted in `shutdownAllNodes` so it's easier to check which node failed in which test. We also catch all the errors returned from `StopDaemon` call to properly access the shutdown behavior. --- lntest/harness.go | 32 ++++++++------- lntest/harness_node_manager.go | 5 ++- lntest/node/harness_node.go | 72 +++++++++++++++++----------------- 3 files changed, 58 insertions(+), 51 deletions(-) diff --git a/lntest/harness.go b/lntest/harness.go index 54f183d39..f32f67773 100644 --- a/lntest/harness.go +++ b/lntest/harness.go @@ -332,7 +332,7 @@ func (h *HarnessTest) Subtest(t *testing.T) *HarnessTest { // Don't bother run the cleanups if the test is failed. if st.Failed() { st.Log("test failed, skipped cleanup") - st.shutdownAllNodes() + st.shutdownNodesNoAssert() return } @@ -402,16 +402,20 @@ func (h *HarnessTest) checkAndLimitBlocksMined(startHeight int32) { "50 blocks in one test") } +// shutdownNodesNoAssert will shutdown all running nodes without assertions. +// This is used when the test has already failed, we don't want to log more +// errors but focusing on the original error. +func (h *HarnessTest) shutdownNodesNoAssert() { + for _, node := range h.manager.activeNodes { + _ = h.manager.shutdownNode(node) + } +} + // shutdownAllNodes will shutdown all running nodes. func (h *HarnessTest) shutdownAllNodes() { + var err error for _, node := range h.manager.activeNodes { - // The process may not be in a state to always shutdown - // immediately, so we'll retry up to a hard limit to ensure we - // eventually shutdown. - err := wait.NoError(func() error { - return h.manager.shutdownNode(node) - }, DefaultTimeout) - + err = h.manager.shutdownNode(node) if err == nil { continue } @@ -421,6 +425,8 @@ func (h *HarnessTest) shutdownAllNodes() { // processes. h.Logf("unable to shutdown %s, got err: %v", node.Name(), err) } + + require.NoError(h, err, "failed to shutdown all nodes") } // cleanupStandbyNode is a function should be called with defer whenever a @@ -516,12 +522,7 @@ func (h *HarnessTest) NewNodeWithCoins(name string, // Shutdown shuts down the given node and asserts that no errors occur. func (h *HarnessTest) Shutdown(node *node.HarnessNode) { - // The process may not be in a state to always shutdown immediately, so - // we'll retry up to a hard limit to ensure we eventually shutdown. - err := wait.NoError(func() error { - return h.manager.shutdownNode(node) - }, DefaultTimeout) - + err := h.manager.shutdownNode(node) require.NoErrorf(h, err, "unable to shutdown %v in %v", node.Name(), h.manager.currentTestCase) } @@ -764,9 +765,10 @@ func (h *HarnessTest) NewNodeRemoteSigner(name string, extraArgs []string, // KillNode kills the node and waits for the node process to stop. func (h *HarnessTest) KillNode(hn *node.HarnessNode) { + delete(h.manager.activeNodes, hn.Cfg.NodeID) + h.Logf("Manually killing the node %s", hn.Name()) require.NoErrorf(h, hn.KillAndWait(), "%s: kill got error", hn.Name()) - delete(h.manager.activeNodes, hn.Cfg.NodeID) } // SetFeeEstimate sets a fee rate to be returned from fee estimator. diff --git a/lntest/harness_node_manager.go b/lntest/harness_node_manager.go index 2040acec7..48afa6b35 100644 --- a/lntest/harness_node_manager.go +++ b/lntest/harness_node_manager.go @@ -112,11 +112,14 @@ func (nm *nodeManager) registerNode(node *node.HarnessNode) { // ShutdownNode stops an active lnd process and returns when the process has // exited and any temporary directories have been cleaned up. func (nm *nodeManager) shutdownNode(node *node.HarnessNode) error { + // Remove the node from the active nodes map even if the shutdown + // fails as the shutdown cannot be retried in that case. + delete(nm.activeNodes, node.Cfg.NodeID) + if err := node.Shutdown(); err != nil { return err } - delete(nm.activeNodes, node.Cfg.NodeID) return nil } diff --git a/lntest/node/harness_node.go b/lntest/node/harness_node.go index d1353c25c..0ac74f877 100644 --- a/lntest/node/harness_node.go +++ b/lntest/node/harness_node.go @@ -636,12 +636,11 @@ func (hn *HarnessNode) cleanup() error { // waitForProcessExit Launch a new goroutine which that bubbles up any // potential fatal process errors to the goroutine running the tests. func (hn *HarnessNode) WaitForProcessExit() error { - var err error + var errReturned error errChan := make(chan error, 1) go func() { - err = hn.cmd.Wait() - errChan <- err + errChan <- hn.cmd.Wait() }() select { @@ -656,24 +655,36 @@ func (hn *HarnessNode) WaitForProcessExit() error { return nil } + // The process may have already been killed in the test, in + // that case we will skip the error and continue processing + // the logs. + if strings.Contains(err.Error(), "signal: killed") { + break + } + // Otherwise, we print the error, break the select and save // logs. hn.printErrf("wait process exit got err: %v", err) - - break + errReturned = err case <-time.After(wait.DefaultTimeout): hn.printErrf("timeout waiting for process to exit") } // Make sure log file is closed and renamed if necessary. - finalizeLogfile(hn) + filename := finalizeLogfile(hn) - // Rename the etcd.log file if the node was running on embedded - // etcd. + // Assert the node has shut down from the log file. + err1 := assertNodeShutdown(filename) + if err1 != nil { + return fmt.Errorf("[%s]: assert shutdown failed in log[%s]: %w", + hn.Name(), filename, err1) + } + + // Rename the etcd.log file if the node was running on embedded etcd. finalizeEtcdLog(hn) - return err + return errReturned } // Stop attempts to stop the active lnd process. @@ -700,23 +711,21 @@ func (hn *HarnessNode) Stop() error { err := wait.NoError(func() error { _, err := hn.RPC.LN.StopDaemon(ctxt, &req) - - switch { - case err == nil: - return nil - - // Try again if a recovery/rescan is in progress. - case strings.Contains( - err.Error(), "recovery in progress", - ): - return err - - default: + if err == nil { return nil } + + // If the connection is already closed, we can exit + // early as the node has already been shut down in the + // test, e.g., in etcd leader health check test. + if strings.Contains(err.Error(), "connection refused") { + return nil + } + + return err }, wait.DefaultTimeout) if err != nil { - return err + return fmt.Errorf("shutdown timeout: %w", err) } // Wait for goroutines to be finished. @@ -724,6 +733,7 @@ func (hn *HarnessNode) Stop() error { go func() { hn.Watcher.wg.Wait() close(done) + hn.Watcher = nil }() // If the goroutines fail to finish before timeout, we'll print @@ -966,31 +976,23 @@ func getFinalizedLogFilePrefix(hn *HarnessNode) string { // finalizeLogfile makes sure the log file cleanup function is initialized, // even if no log file is created. -func finalizeLogfile(hn *HarnessNode) { +func finalizeLogfile(hn *HarnessNode) string { // Exit early if there's no log file. if hn.logFile == nil { - return + return "" } hn.logFile.Close() // If logoutput flag is not set, return early. if !*logOutput { - return + return "" } - newFileName := fmt.Sprintf("%v.log", - getFinalizedLogFilePrefix(hn), - ) + newFileName := fmt.Sprintf("%v.log", getFinalizedLogFilePrefix(hn)) renameFile(hn.filename, newFileName) - // Assert the node has shut down from the log file. - err := assertNodeShutdown(newFileName) - if err != nil { - err := fmt.Errorf("[%s]: assert shutdown failed in log[%s]: %w", - hn.Name(), newFileName, err) - panic(err) - } + return newFileName } // assertNodeShutdown asserts that the node has shut down properly by checking