Skip to content

Commit

Permalink
Join lone cluster nodes back
Browse files Browse the repository at this point in the history
  • Loading branch information
secwall committed May 17, 2024
1 parent c2560dd commit ef38688
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 0 deletions.
17 changes: 17 additions & 0 deletions internal/app/repair.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,23 @@ func (app *App) repairReplica(node *redis.Node, masterState, state *HostState, m
app.logger.Error(fmt.Sprintf("Unable to make %s replica of %s", node.FQDN(), master), "error", err)
}
case modeCluster:
alone, err := node.IsClusterNodeAlone(app.ctx)
if err != nil {
app.logger.Error(fmt.Sprintf("Unable to check if %s is alone", node.FQDN()), "error", err)
return
}
if alone {
masterIP, err := masterNode.GetIP()
if err != nil {
app.logger.Error(fmt.Sprintf("Unable to make %s replica of %s", node.FQDN(), master), "error", err)
return
}
err = node.ClusterMeet(app.ctx, masterIP, app.config.Redis.Port, app.config.Redis.ClusterBusPort)
if err != nil {
app.logger.Error(fmt.Sprintf("Unable to make %s meet with master %s at %s:%d:%d", node.FQDN(), master, masterIP, app.config.Redis.Port, app.config.Redis.ClusterBusPort), "error", err)
return
}
}
masterID, err := masterNode.ClusterGetID(app.ctx)
if err != nil {
app.logger.Error(fmt.Sprintf("Unable to get cluster id of %s", master), "error", err.Error())
Expand Down
4 changes: 4 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
// RedisConfig contains redis connection info and params
type RedisConfig struct {
Port int `yaml:"port"`
ClusterBusPort int `yaml:"cluster_bus_port"`
UseTLS bool `yaml:"use_tls"`
TLSCAPath string `yaml:"tls_ca_path"`
AuthUser string `yaml:"auth_user"`
Expand Down Expand Up @@ -46,6 +47,7 @@ type RedisRenamesConfig struct {
ClusterFailover string `yaml:"cluster_failover"`
ClusterMyID string `yaml:"cluster_myid"`
ClusterReplicate string `yaml:"cluster_replicate"`
ClusterMeet string `yaml:"cluster_meet"`
Config string `yaml:"config"`
ReplicaOf string `yaml:"replicaof"`
}
Expand Down Expand Up @@ -90,6 +92,7 @@ type Config struct {
func DefaultRedisConfig() RedisConfig {
return RedisConfig{
Port: 6379,
ClusterBusPort: 16379,
UseTLS: false,
TLSCAPath: "",
AuthUser: "",
Expand Down Expand Up @@ -122,6 +125,7 @@ func DefaultRedisRenamesConfig() RedisRenamesConfig {
ClusterFailover: "FAILOVER",
ClusterMyID: "MYID",
ClusterReplicate: "REPLICATE",
ClusterMeet: "MEET",
Config: "CONFIG",
ReplicaOf: "REPLICAOF",
}
Expand Down
17 changes: 17 additions & 0 deletions internal/redis/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -519,3 +519,20 @@ func (n *Node) ClusterPromoteTakeover(ctx context.Context) error {
cmd := n.conn.Do(ctx, n.config.Renames.Cluster, n.config.Renames.ClusterFailover, "TAKEOVER")
return cmd.Err()
}

// IsClusterNodeAlone checks if node sees only itself.
//
// It issues CLUSTER NODES on the node's connection and counts the non-blank
// lines of the reply; the node is reported as alone when exactly one such
// line is present. Redis terminates every line of that reply, including the
// last one, with a newline, so splitting on "\n" always produces a trailing
// empty element that must not be counted as a node.
//
// If the command fails, false is returned together with the error.
func (n *Node) IsClusterNodeAlone(ctx context.Context) (bool, error) {
	cmd := n.conn.ClusterNodes(ctx)
	if err := cmd.Err(); err != nil {
		return false, err
	}
	known := 0
	for _, line := range strings.Split(cmd.Val(), "\n") {
		// Skip blank entries (trailing newline, stray "\r", etc.).
		if strings.TrimSpace(line) != "" {
			known++
		}
	}
	return known == 1, nil
}

// ClusterMeet makes the node join the cluster by sending the (possibly
// renamed) CLUSTER MEET command with the given address, client port and
// cluster bus port. Both ports are passed to the server as decimal strings.
// The error reported by the command, if any, is returned unchanged.
func (n *Node) ClusterMeet(ctx context.Context, addr string, port, clusterBusPort int) error {
	portArg := strconv.Itoa(port)
	busPortArg := strconv.Itoa(clusterBusPort)
	return n.conn.Do(
		ctx,
		n.config.Renames.Cluster,
		n.config.Renames.ClusterMeet,
		addr,
		portArg,
		busPortArg,
	).Err()
}
30 changes: 30 additions & 0 deletions tests/features/05_cluster_replication_fix.feature
Original file line number Diff line number Diff line change
Expand Up @@ -154,3 +154,33 @@ Feature: Cluster mode broken replication fix
When I break replication on host "redis3"
Then redis host "redis3" should become replica of "redis1" within "15" seconds
And replication on redis host "redis3" should run fine within "60" seconds

# Verifies that a replica which lost its cluster membership is joined back
# to the cluster and resumes replicating from the master.
Scenario: Cluster lone node is joined in cluster back
# Precondition: a healthy 3-node shard with redis1 as master and all nodes active.
Given clustered shard is up and running
Then redis host "redis1" should be master
And redis host "redis2" should become replica of "redis1" within "15" seconds
And replication on redis host "redis2" should run fine within "15" seconds
And redis host "redis3" should become replica of "redis1" within "15" seconds
And replication on redis host "redis3" should run fine within "15" seconds
And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
"""
["redis1","redis2","redis3"]
"""
# Remove the cluster state file on redis3, then kill and restart redis there.
# NOTE(review): presumably redis3 then starts with a fresh cluster config and
# knows only itself (the "lone node" case) - confirm against the redis setup.
When I run command on host "redis3"
"""
rm -f /etc/redis/cluster.conf
"""
And I run command on host "redis3"
"""
supervisorctl signal KILL redis
"""
And I run command on host "redis3"
"""
supervisorctl start redis
"""
# Expected outcome: redis3 is a replica of redis1 again and all nodes are active.
Then redis host "redis3" should become replica of "redis1" within "15" seconds
And replication on redis host "redis3" should run fine within "15" seconds
And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
"""
["redis1","redis2","redis3"]
"""

0 comments on commit ef38688

Please sign in to comment.