diff --git a/internal/app/master.go b/internal/app/master.go index 36db14d..43b71b6 100644 --- a/internal/app/master.go +++ b/internal/app/master.go @@ -54,6 +54,22 @@ func (app *App) getMasterHost(shardState map[string]*HostState) (string, error) } } if len(masters) > 1 { + if app.mode == modeCluster { + mastersWithSlots := make([]string, 0) + for _, master := range masters { + node := app.shard.Get(master) + hasSlots, err := node.HasClusterSlots(app.ctx) + if err != nil { + return "", fmt.Errorf("unable to check slots on %s", master) + } + if hasSlots { + mastersWithSlots = append(mastersWithSlots, master) + } + } + if len(mastersWithSlots) == 1 { + return mastersWithSlots[0], nil + } + } return "", fmt.Errorf("got more than 1 master: %s", masters) } if len(masters) == 0 { diff --git a/internal/redis/node.go b/internal/redis/node.go index ba5c71e..3281ff3 100644 --- a/internal/redis/node.go +++ b/internal/redis/node.go @@ -542,3 +542,23 @@ func (n *Node) ClusterMeet(ctx context.Context, addr string, port, clusterBusPor cmd := n.conn.Do(ctx, n.config.Renames.Cluster, n.config.Renames.ClusterMeet, addr, strconv.Itoa(port), strconv.Itoa(clusterBusPort)) return cmd.Err() } + +// HasClusterSlots checks if node has any slot assigned +func (n *Node) HasClusterSlots(ctx context.Context) (bool, error) { + cmd := n.conn.ClusterNodes(ctx) + err := cmd.Err() + if err != nil { + return false, err + } + lines := strings.Split(cmd.Val(), "\n") + for _, line := range lines { + splitted := strings.Split(line, " ") + if len(splitted) < 3 { + continue + } + if strings.Contains(splitted[2], "myself") { + return len(splitted) > 8, nil + } + } + return false, nil +} diff --git a/tests/features/05_cluster_replication_fix.feature b/tests/features/05_cluster_replication_fix.feature index 6c1dadf..ca9b04e 100644 --- a/tests/features/05_cluster_replication_fix.feature +++ b/tests/features/05_cluster_replication_fix.feature @@ -155,7 +155,41 @@ Feature: Cluster mode broken replication fix Then redis host "redis3" should become replica of "redis1" within "15" seconds And replication on redis host "redis3" should run fine within "60" seconds - Scenario: Cluster lone node is joined in cluster back + #Scenario: Cluster lone node is joined in cluster back + #Given clustered shard is up and running + #Then redis host "redis1" should be master + #And redis host "redis2" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis2" should run fine within "15" seconds + #And redis host "redis3" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis3" should run fine within "15" seconds + #And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds + #""" + #["redis1","redis2","redis3"] + #""" + #When I run command on host "redis3" + #""" + #rm -f /etc/redis/cluster.conf + #""" + #And I run command on host "redis3" + #""" + #sed -i -e 's/offline yes/offline no/' /etc/redis/redis.conf + #""" + #And I run command on host "redis3" + #""" + #supervisorctl signal KILL redis + #""" + #And I run command on host "redis3" + #""" + #supervisorctl start redis + #""" + #Then redis host "redis3" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis3" should run fine within "15" seconds + #And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds + #""" + #["redis1","redis2","redis3"] + #""" + + Scenario: Cluster splitbrain is fixed in favor of node with slots Given clustered shard is up and running Then redis host "redis1" should be master And redis host "redis2" should become replica of "redis1" within "15" seconds @@ -166,11 +200,27 @@ Feature: Cluster mode broken replication fix """ ["redis1","redis2","redis3"] """ - When I run command on host "redis3" + When I run command on host "redis1" + """ + supervisorctl signal STOP rdsync + """ + And I run command on host "redis2" + """ + supervisorctl signal STOP rdsync + """ + And I run command on host "redis3" + """ + supervisorctl signal STOP rdsync + """ + And I run command on host "redis3" """ rm -f /etc/redis/cluster.conf """ And I run command on host "redis3" + """ + sed -i -e 's/offline yes/offline no/' /etc/redis/redis.conf + """ + And I run command on host "redis3" """ supervisorctl signal KILL redis """ @@ -178,9 +228,38 @@ Feature: Cluster mode broken replication fix """ supervisorctl start redis """ - Then redis host "redis3" should become replica of "redis1" within "15" seconds + Then redis host "redis3" should become available within "60" seconds + When I run command on redis host "redis1" + """ + SET very-important-key foo + """ + And I set zookeeper node "/test/master" to + """ + "redis3" + """ + And I run command on host "redis1" + """ + supervisorctl signal CONT rdsync + """ + And I run command on host "redis2" + """ + supervisorctl signal CONT rdsync + """ + And I run command on host "redis3" + """ + supervisorctl signal CONT rdsync + """ + Then redis host "redis3" should become replica of "redis1" within "60" seconds And replication on redis host "redis3" should run fine within "15" seconds And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds """ ["redis1","redis2","redis3"] """ + When I run command on redis host "redis1" + """ + GET very-important-key + """ + Then redis cmd result should match regexp + """ + .*foo.* + """