From 0e5bd3eaa84c46d19f7334f9741aecfed6fd12b5 Mon Sep 17 00:00:00 2001
From: secwall
Date: Fri, 17 May 2024 16:14:46 +0500
Subject: [PATCH] Join lone cluster nodes back

---
 internal/app/repair.go                        | 17 +++++++++++
 internal/config/config.go                     |  4 +++
 internal/redis/node.go                        | 23 ++++++++++++++
 .../05_cluster_replication_fix.feature        | 30 +++++++++++++++++++
 4 files changed, 74 insertions(+)

diff --git a/internal/app/repair.go b/internal/app/repair.go
index dae3751..99ad99f 100644
--- a/internal/app/repair.go
+++ b/internal/app/repair.go
@@ -88,6 +88,23 @@ func (app *App) repairReplica(node *redis.Node, masterState, state *HostState, m
 			app.logger.Error(fmt.Sprintf("Unable to make %s replica of %s", node.FQDN(), master), "error", err)
 		}
 	case modeCluster:
+		alone, err := node.IsClusterNodeAlone(app.ctx)
+		if err != nil {
+			app.logger.Error(fmt.Sprintf("Unable to check if %s is alone", node.FQDN()), "error", err)
+			return
+		}
+		if alone {
+			masterIP, err := masterNode.GetIP()
+			if err != nil {
+				app.logger.Error(fmt.Sprintf("Unable to get ip of %s", master), "error", err)
+				return
+			}
+			err = node.ClusterMeet(app.ctx, masterIP, app.config.Redis.Port, app.config.Redis.ClusterBusPort)
+			if err != nil {
+				app.logger.Error(fmt.Sprintf("Unable to make %s meet with master %s at %s:%d:%d", node.FQDN(), master, masterIP, app.config.Redis.Port, app.config.Redis.ClusterBusPort), "error", err)
+				return
+			}
+		}
 		masterID, err := masterNode.ClusterGetID(app.ctx)
 		if err != nil {
 			app.logger.Error(fmt.Sprintf("Unable to get cluster id of %s", master), "error", err.Error())
diff --git a/internal/config/config.go b/internal/config/config.go
index 93ccaa2..f88606b 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -16,6 +16,7 @@ import (
 // RedisConfig contains redis connection info and params
 type RedisConfig struct {
 	Port           int    `yaml:"port"`
+	ClusterBusPort int    `yaml:"cluster_bus_port"`
 	UseTLS         bool   `yaml:"use_tls"`
 	TLSCAPath      string `yaml:"tls_ca_path"`
 	AuthUser       string `yaml:"auth_user"`
@@ -46,6 +47,7 @@ type RedisRenamesConfig struct {
 	ClusterFailover  string `yaml:"cluster_failover"`
 	ClusterMyID      string `yaml:"cluster_myid"`
 	ClusterReplicate string `yaml:"cluster_replicate"`
+	ClusterMeet      string `yaml:"cluster_meet"`
 	Config           string `yaml:"config"`
 	ReplicaOf        string `yaml:"replicaof"`
 }
@@ -90,6 +92,7 @@ type Config struct {
 func DefaultRedisConfig() RedisConfig {
 	return RedisConfig{
 		Port:           6379,
+		ClusterBusPort: 16379,
 		UseTLS:         false,
 		TLSCAPath:      "",
 		AuthUser:       "",
@@ -122,6 +125,7 @@ func DefaultRedisRenamesConfig() RedisRenamesConfig {
 		ClusterFailover:  "FAILOVER",
 		ClusterMyID:      "MYID",
 		ClusterReplicate: "REPLICATE",
+		ClusterMeet:      "MEET",
 		Config:           "CONFIG",
 		ReplicaOf:        "REPLICAOF",
 	}
diff --git a/internal/redis/node.go b/internal/redis/node.go
index bd30083..ba5c71e 100644
--- a/internal/redis/node.go
+++ b/internal/redis/node.go
@@ -519,3 +519,26 @@ func (n *Node) ClusterPromoteTakeover(ctx context.Context) error {
 	cmd := n.conn.Do(ctx, n.config.Renames.Cluster, n.config.Renames.ClusterFailover, "TAKEOVER")
 	return cmd.Err()
 }
+
+// IsClusterNodeAlone reports whether the CLUSTER NODES reply of this node has exactly one non-blank line, i.e. the node lists only itself.
+func (n *Node) IsClusterNodeAlone(ctx context.Context) (bool, error) {
+	cmd := n.conn.ClusterNodes(ctx)
+	err := cmd.Err()
+	if err != nil {
+		return false, err
+	}
+	lines := strings.Split(cmd.Val(), "\n")
+	var count int
+	for _, line := range lines {
+		if len(strings.TrimSpace(line)) > 0 {
+			count++
+		}
+	}
+	return count == 1, nil
+}
+
+// ClusterMeet sends the (possibly renamed) CLUSTER MEET command with addr, port and clusterBusPort so that this node joins the cluster of the node at addr.
+func (n *Node) ClusterMeet(ctx context.Context, addr string, port, clusterBusPort int) error {
+	cmd := n.conn.Do(ctx, n.config.Renames.Cluster, n.config.Renames.ClusterMeet, addr, strconv.Itoa(port), strconv.Itoa(clusterBusPort))
+	return cmd.Err()
+}
diff --git a/tests/features/05_cluster_replication_fix.feature b/tests/features/05_cluster_replication_fix.feature
index 6b17ef4..6c1dadf 100644
--- a/tests/features/05_cluster_replication_fix.feature
+++ b/tests/features/05_cluster_replication_fix.feature
@@ -154,3 +154,33 @@ Feature: Cluster mode broken replication fix
         When I break replication on host "redis3"
         Then redis host "redis3" should become replica of "redis1" within "15" seconds
         And replication on redis host "redis3" should run fine within "60" seconds
+
+    Scenario: Cluster lone node is joined in cluster back
+        Given clustered shard is up and running
+        Then redis host "redis1" should be master
+        And redis host "redis2" should become replica of "redis1" within "15" seconds
+        And replication on redis host "redis2" should run fine within "15" seconds
+        And redis host "redis3" should become replica of "redis1" within "15" seconds
+        And replication on redis host "redis3" should run fine within "15" seconds
+        And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
+        """
+        ["redis1","redis2","redis3"]
+        """
+        When I run command on host "redis3"
+        """
+        rm -f /etc/redis/cluster.conf
+        """
+        And I run command on host "redis3"
+        """
+        supervisorctl signal KILL redis
+        """
+        And I run command on host "redis3"
+        """
+        supervisorctl start redis
+        """
+        Then redis host "redis3" should become replica of "redis1" within "15" seconds
+        And replication on redis host "redis3" should run fine within "15" seconds
+        And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
+        """
+        ["redis1","redis2","redis3"]
+        """