multi: add leader check to the healthcheck monitor
This commit extends our healthcheck with an optional leader check. This ensures that, given a network partition or other cluster-wide failure, we act as soon as possible to avoid a split-brain situation where a new leader is elected while we still hold onto our etcd client.
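For context, the leader check described above amounts to a health check that fails as soon as this node no longer holds the leadership lease, so the monitor can force a shutdown instead of operating with a stale etcd client. Below is a minimal, self-contained sketch of that idea; the isLeader callback and the polling loop are hypothetical stand-ins, not lnd's actual healthcheck or cluster API.

	package main

	import (
		"context"
		"errors"
		"fmt"
		"time"
	)

	// leaderCheck fails once this node no longer holds cluster leadership.
	// isLeader is a stand-in for whatever the leader elector exposes.
	func leaderCheck(ctx context.Context,
		isLeader func(context.Context) (bool, error)) error {

		leader, err := isLeader(ctx)
		if err != nil {
			return fmt.Errorf("unable to query leadership: %w", err)
		}
		if !leader {
			return errors.New("node lost cluster leadership")
		}
		return nil
	}

	func main() {
		// Simulated elector state: we start as the leader and lose the
		// lease after three seconds.
		lostAt := time.Now().Add(3 * time.Second)
		isLeader := func(_ context.Context) (bool, error) {
			return time.Now().Before(lostAt), nil
		}

		// Poll the check the way a healthcheck monitor would. On
		// failure the real monitor would trigger a clean shutdown to
		// avoid split-brain.
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()

		for i := 0; i < 5; i++ {
			<-ticker.C
			if err := leaderCheck(context.Background(), isLeader); err != nil {
				fmt.Println("leader check failed, shutting down:", err)
				return
			}
			fmt.Println("leader check passed")
		}
	}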
 lnd.go | 26 +++++++++++++++++++++++---
@@ -24,6 +24,7 @@ import (
 	"github.com/lightningnetwork/lnd/build"
 	"github.com/lightningnetwork/lnd/chanacceptor"
 	"github.com/lightningnetwork/lnd/channeldb"
+	"github.com/lightningnetwork/lnd/cluster"
 	"github.com/lightningnetwork/lnd/keychain"
 	"github.com/lightningnetwork/lnd/lncfg"
 	"github.com/lightningnetwork/lnd/lnrpc"
@@ -56,6 +57,14 @@ const (
 	// admin macaroon unless the administrator explicitly allowed it. Thus
 	// there's no harm allowing group read.
 	adminMacaroonFilePermissions = 0640
+
+	// leaderResignTimeout is the timeout used when resigning from the
+	// leader role. This is kept short so LND can shut down quickly in case
+	// of a system failure or network partition making the cluster
+	// unresponsive. The cluster itself should ensure that the leader is not
+	// elected again until the previous leader has resigned or the leader
+	// election timeout has passed.
+	leaderResignTimeout = 5 * time.Second
 )

 // AdminAuthOptions returns a list of DialOptions that can be used to
@@ -381,6 +390,7 @@ func Main(cfg *Config, lisCfg ListenerCfg, implCfg *ImplementationCfg,
 	// blocked until this instance is elected as the current leader or
 	// shutting down.
 	elected := false
+	var leaderElector cluster.LeaderElector
 	if cfg.Cluster.EnableLeaderElection {
 		electionCtx, cancelElection := context.WithCancel(ctx)

@@ -392,7 +402,7 @@ func Main(cfg *Config, lisCfg ListenerCfg, implCfg *ImplementationCfg,
 		ltndLog.Infof("Using %v leader elector",
 			cfg.Cluster.LeaderElector)

-		leaderElector, err := cfg.Cluster.MakeLeaderElector(
+		leaderElector, err = cfg.Cluster.MakeLeaderElector(
 			electionCtx, cfg.DB,
 		)
 		if err != nil {
@@ -407,7 +417,17 @@ func Main(cfg *Config, lisCfg ListenerCfg, implCfg *ImplementationCfg,
 			ltndLog.Infof("Attempting to resign from leader role "+
 				"(%v)", cfg.Cluster.ID)

-			if err := leaderElector.Resign(); err != nil {
+			// Ensure that we don't block the shutdown process if
+			// the leader resigning process takes too long. The
+			// cluster will ensure that the leader is not elected
+			// again until the previous leader has resigned or the
+			// leader election timeout has passed.
+			timeoutCtx, cancel := context.WithTimeout(
+				ctx, leaderResignTimeout,
+			)
+			defer cancel()
+
+			if err := leaderElector.Resign(timeoutCtx); err != nil {
 				ltndLog.Errorf("Leader elector failed to "+
 					"resign: %v", err)
 			}
@@ -579,7 +599,7 @@ func Main(cfg *Config, lisCfg ListenerCfg, implCfg *ImplementationCfg,
 	server, err := newServer(
 		cfg, cfg.Listeners, dbs, activeChainControl, &idKeyDesc,
 		activeChainControl.Cfg.WalletUnlockParams.ChansToRestore,
-		multiAcceptor, torController, tlsManager,
+		multiAcceptor, torController, tlsManager, leaderElector,
 	)
 	if err != nil {
 		return mkErr("unable to create server: %v", err)
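The resign change above follows a common Go shutdown pattern: bound a potentially blocking call with context.WithTimeout so an unresponsive cluster cannot stall process exit. A self-contained sketch of that pattern, with resign as a hypothetical stand-in for the elector's Resign method (not lnd's actual cluster code):

	package main

	import (
		"context"
		"fmt"
		"time"
	)

	// resign stands in for a leader elector's Resign call: it blocks until
	// the lease is released or the context expires. Here we pretend the
	// cluster is unresponsive so the release never completes.
	func resign(ctx context.Context) error {
		select {
		case <-time.After(10 * time.Second):
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	func main() {
		const leaderResignTimeout = 5 * time.Second

		// Bound the resign call so shutdown never blocks longer than
		// the resign timeout, mirroring the pattern in the diff above.
		ctx, cancel := context.WithTimeout(
			context.Background(), leaderResignTimeout,
		)
		defer cancel()

		if err := resign(ctx); err != nil {
			fmt.Println("leader elector failed to resign:", err)
		}
		fmt.Println("shutdown continues")
	}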