Skip to content

Commit

Permalink
In cluster mode prefer node with slots in 2 masters case
Browse files Browse the repository at this point in the history
  • Loading branch information
secwall committed May 20, 2024
1 parent c3be087 commit 7e0b2f2
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 0 deletions.
16 changes: 16 additions & 0 deletions internal/app/master.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,22 @@ func (app *App) getMasterHost(shardState map[string]*HostState) (string, error)
}
}
if len(masters) > 1 {
if app.mode == modeCluster {
mastersWithSlots := make([]string, 0)
for _, master := range masters {
node := app.shard.Get(master)
hasSlots, err := node.HasClusterSlots(app.ctx)
if err != nil {
return "", fmt.Errorf("unable to check slots on %s", master)
}
if hasSlots {
mastersWithSlots = append(mastersWithSlots, master)
}
}
if len(mastersWithSlots) == 1 {
return mastersWithSlots[0], nil
}
}
return "", fmt.Errorf("got more than 1 master: %s", masters)
}
if len(masters) == 0 {
Expand Down
20 changes: 20 additions & 0 deletions internal/redis/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -542,3 +542,23 @@ func (n *Node) ClusterMeet(ctx context.Context, addr string, port, clusterBusPor
cmd := n.conn.Do(ctx, n.config.Renames.Cluster, n.config.Renames.ClusterMeet, addr, strconv.Itoa(port), strconv.Itoa(clusterBusPort))
return cmd.Err()
}

// HasClusterSlots reports whether this node currently owns at least one hash slot.
//
// It runs CLUSTER NODES on the node's connection, finds the entry whose flags
// contain "myself" and inspects the fields that follow the eight fixed ones
// (id, address, flags, master, ping-sent, pong-recv, config-epoch, link-state).
// Bracketed entries such as "[93-<-id]" or "[77->-id]" describe slots that are
// being imported or migrated; they are not counted as owned slots.
//
// It returns (false, nil) when the output contains no "myself" entry, and a
// non-nil error only when the CLUSTER NODES command itself fails.
func (n *Node) HasClusterSlots(ctx context.Context) (bool, error) {
	cmd := n.conn.ClusterNodes(ctx)
	if err := cmd.Err(); err != nil {
		return false, err
	}

	// Number of fixed, always-present fields that precede the slot list.
	const fixedFields = 8

	for _, line := range strings.Split(cmd.Val(), "\n") {
		// Fields (unlike Split on " ") ignores trailing spaces and "\r", so
		// stray whitespace can not be mistaken for an extra slot field.
		fields := strings.Fields(line)
		if len(fields) < 3 || !strings.Contains(fields[2], "myself") {
			continue
		}
		if len(fields) <= fixedFields {
			return false, nil
		}
		for _, slot := range fields[fixedFields:] {
			// Skip importing/migrating markers: they do not mean the slot
			// is assigned to this node.
			if !strings.HasPrefix(slot, "[") {
				return true, nil
			}
		}
		return false, nil
	}
	return false, nil
}
79 changes: 79 additions & 0 deletions tests/features/05_cluster_replication_fix.feature
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ Feature: Cluster mode broken replication fix
rm -f /etc/redis/cluster.conf
"""
And I run command on host "redis3"
"""
sed -i -e 's/offline yes/offline no/' /etc/redis/redis.conf
"""
And I run command on host "redis3"
"""
supervisorctl signal KILL redis
"""
Expand All @@ -184,3 +188,78 @@ Feature: Cluster mode broken replication fix
"""
["redis1","redis2","redis3"]
"""

# Scenario outline: rdsync is paused on every host, redis3 is restarted without
# its cluster.conf, and the zookeeper master record is pointed at redis3.
# After rdsync is resumed, redis3 is expected to end up as a replica of redis1
# and the key written to redis1 must still be readable.
Scenario: Cluster splitbrain is fixed in favor of node with slots
Given clustered shard is up and running
Then redis host "redis1" should be master
And redis host "redis2" should become replica of "redis1" within "15" seconds
And replication on redis host "redis2" should run fine within "15" seconds
And redis host "redis3" should become replica of "redis1" within "15" seconds
And replication on redis host "redis3" should run fine within "15" seconds
And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
"""
["redis1","redis2","redis3"]
"""
# Send STOP to rdsync on all three hosts (presumably so that it does not
# react while the shard state is being changed by hand - confirm).
When I run command on host "redis1"
"""
supervisorctl signal STOP rdsync
"""
And I run command on host "redis2"
"""
supervisorctl signal STOP rdsync
"""
And I run command on host "redis3"
"""
supervisorctl signal STOP rdsync
"""
# On redis3: remove cluster.conf, switch "offline yes" to "offline no" in
# redis.conf, then kill and start redis.
# NOTE(review): presumably redis3 comes back as a master with no slots
# assigned, which produces the two-masters situation - confirm.
And I run command on host "redis3"
"""
rm -f /etc/redis/cluster.conf
"""
And I run command on host "redis3"
"""
sed -i -e 's/offline yes/offline no/' /etc/redis/redis.conf
"""
And I run command on host "redis3"
"""
supervisorctl signal KILL redis
"""
And I run command on host "redis3"
"""
supervisorctl start redis
"""
Then redis host "redis3" should become available within "60" seconds
# Write a key on redis1; it is read back at the end of the scenario.
When I run command on redis host "redis1"
"""
SET very-important-key foo
"""
# Point the /test/master zookeeper node at redis3.
And I set zookeeper node "/test/master" to
"""
"redis3"
"""
# Send CONT to rdsync on all three hosts to resume it.
And I run command on host "redis1"
"""
supervisorctl signal CONT rdsync
"""
And I run command on host "redis2"
"""
supervisorctl signal CONT rdsync
"""
And I run command on host "redis3"
"""
supervisorctl signal CONT rdsync
"""
# Expected outcome: redis3 becomes a replica of redis1, all three hosts are
# listed as active, and the key written to redis1 is still there.
Then redis host "redis3" should become replica of "redis1" within "60" seconds
And replication on redis host "redis3" should run fine within "15" seconds
And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
"""
["redis1","redis2","redis3"]
"""
When I run command on redis host "redis1"
"""
GET very-important-key
"""
Then redis cmd result should match regexp
"""
.*foo.*
"""

0 comments on commit 7e0b2f2

Please sign in to comment.