From d07e3c6c701490e106ea517a3356d4ae572c3e94 Mon Sep 17 00:00:00 2001
From: moridin26
Date: Tue, 23 Jul 2024 15:09:49 +0300
Subject: [PATCH] [WIP] add failoverQuorum semisync check, refactor failoverQuorum (#115)

* add failoverQuorum semisync check, refactor failoverQuorum

* lint fix

* naming fix

---------

Co-authored-by: suetin
---
 internal/app/app.go             | 58 +++++++--------------------------
 internal/mysql/switch_helper.go | 46 ++++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 48 deletions(-)

diff --git a/internal/app/app.go b/internal/app/app.go
index 8b87edaf..8bf4dbdb 100644
--- a/internal/app/app.go
+++ b/internal/app/app.go
@@ -161,26 +161,6 @@ func (app *App) removeMaintenanceFile() {
 	}
 }
 
-// Dynamically calculated version of RplSemiSyncMasterWaitForSlaveCount.
-// This variable can be lower than hard-configured RplSemiSyncMasterWaitForSlaveCount
-// when some semi-sync replicas are dead.
-func (app *App) getRequiredWaitSlaveCount(activeNodes []string) int {
-	wsc := len(activeNodes) / 2
-	if wsc > app.config.RplSemiSyncMasterWaitForSlaveCount {
-		wsc = app.config.RplSemiSyncMasterWaitForSlaveCount
-	}
-	return wsc
-}
-
-// Number of HA nodes to be alive to failover/switchover
-func (app *App) getFailoverQuorum(activeNodes []string) int {
-	fq := len(activeNodes) - app.getRequiredWaitSlaveCount(activeNodes)
-	if fq < 1 {
-		fq = 1
-	}
-	return fq
-}
-
 // separate goroutine performing health checks
 func (app *App) healthChecker(ctx context.Context) {
 	ticker := time.NewTicker(app.config.HealthCheckInterval)
@@ -795,19 +775,13 @@ func (app *App) approveFailover(clusterState, clusterStateDcs map[string]*NodeSt
 	app.logger.Infof("approve failover: active nodes are %v", activeNodes)
 	// number of active slaves that we can use to perform switchover
 	permissibleSlaves := countAliveHASlavesWithinNodes(activeNodes, clusterState)
-	if app.config.SemiSync {
-		failoverQuorum := app.getFailoverQuorum(activeNodes)
-		if permissibleSlaves < failoverQuorum {
-			return fmt.Errorf("no quorum, have %d replics while %d is required", permissibleSlaves, failoverQuorum)
-		}
-	} else {
-		if permissibleSlaves == 0 {
-			return fmt.Errorf("no alive active replica found")
-		}
+	err := app.switchHelper.CheckFailoverQuorum(activeNodes, permissibleSlaves)
+	if err != nil {
+		return err
 	}
 
 	var lastSwitchover Switchover
-	err := app.dcs.Get(pathLastSwitch, &lastSwitchover)
+	err = app.dcs.Get(pathLastSwitch, &lastSwitchover)
 	if err != dcs.ErrNotFound {
 		if err != nil {
 			return err
@@ -865,15 +839,8 @@ func (app *App) approveSwitchover(switchover *Switchover, activeNodes []string,
 	if switchover.RunCount > 0 {
 		return nil
 	}
-	if app.config.SemiSync {
-		// number of active slaves that we can use to perform switchover
-		permissibleSlaves := countAliveHASlavesWithinNodes(activeNodes, clusterState)
-		failoverQuorum := app.getFailoverQuorum(activeNodes)
-		if permissibleSlaves < failoverQuorum {
-			return fmt.Errorf("no quorum, have %d replics while %d is required", permissibleSlaves, failoverQuorum)
-		}
-	}
-	return nil
+	permissibleSlaves := countAliveHASlavesWithinNodes(activeNodes, clusterState)
+	return app.switchHelper.CheckFailoverQuorum(activeNodes, permissibleSlaves)
 }
 
 /*
@@ -1048,7 +1015,7 @@ func (app *App) updateActiveNodes(clusterState, clusterStateDcs map[string]*Node
 	if masterState.SemiSyncState != nil && masterState.SemiSyncState.MasterEnabled {
 		oldWaitSlaveCount = masterState.SemiSyncState.WaitSlaveCount
 	}
-	waitSlaveCount := app.getRequiredWaitSlaveCount(activeNodes)
+	waitSlaveCount := app.switchHelper.GetRequiredWaitSlaveCount(activeNodes)
 	app.logger.Infof("update active nodes: active nodes are: %v, wait_slave_count %d", activeNodes, waitSlaveCount)
 
 	if len(becomeActive) > 0 {
@@ -1202,9 +1169,7 @@ func (app *App) performSwitchover(clusterState map[string]*NodeState, activeNode
 		return fmt.Errorf("switchover: failed to ping hosts: %v with dubious errors", dubious)
 	}
 
-	// calc failoverQuorum before filtering out old master
-	failoverQuorum := app.getFailoverQuorum(activeNodes)
-	oldActiveNodes := activeNodes
+	activeNodesWithOldMaster := activeNodes
 
 	// filter out old master as may hang and timeout in different ways
 	if switchover.Cause == CauseAuto && switchover.From == oldMaster {
@@ -1273,8 +1238,9 @@ func (app *App) performSwitchover(clusterState map[string]*NodeState, activeNode
 			frozenActiveNodes = append(frozenActiveNodes, host)
 		}
 	}
-	if len(frozenActiveNodes) < failoverQuorum {
-		return fmt.Errorf("no failoverQuorum: has %d frozen active nodes, while %d is required", len(frozenActiveNodes), failoverQuorum)
+	err := app.switchHelper.CheckFailoverQuorum(activeNodesWithOldMaster, len(frozenActiveNodes))
+	if err != nil {
+		return err
 	}
 
 	// setting server read-only may take a while so we need to ensure we are still a manager
@@ -1415,7 +1381,7 @@ func (app *App) performSwitchover(clusterState map[string]*NodeState, activeNode
 
 	// adjust semi-sync before finishing switchover
 	clusterState = app.getClusterStateFromDB()
-	err = app.updateActiveNodes(clusterState, clusterState, oldActiveNodes, newMaster)
+	err = app.updateActiveNodes(clusterState, clusterState, activeNodesWithOldMaster, newMaster)
 	if err != nil || app.emulateError("update_active_nodes") {
 		app.logger.Warnf("switchover: failed to update active nodes after switchover: %v", err)
 	}
diff --git a/internal/mysql/switch_helper.go b/internal/mysql/switch_helper.go
index b880fc53..11d53881 100644
--- a/internal/mysql/switch_helper.go
+++ b/internal/mysql/switch_helper.go
@@ -1,6 +1,7 @@
 package mysql
 
 import (
+	"fmt"
 	"time"
 
 	"github.com/yandex/mysync/internal/config"
@@ -8,10 +9,15 @@ import (
 
 type ISwitchHelper interface {
 	GetPriorityChoiceMaxLag() time.Duration
+	GetRequiredWaitSlaveCount([]string) int
+	GetFailoverQuorum([]string) int
+	CheckFailoverQuorum([]string, int) error
 }
 
 type SwitchHelper struct {
-	priorityChoiceMaxLag time.Duration
+	priorityChoiceMaxLag               time.Duration
+	rplSemiSyncMasterWaitForSlaveCount int
+	SemiSync                           bool
 }
 
 func NewSwitchHelper(config *config.Config) ISwitchHelper {
@@ -22,10 +28,46 @@ func NewSwitchHelper(config *config.Config) ISwitchHelper {
 		}
 	}
 	return &SwitchHelper{
-		priorityChoiceMaxLag: priorityChoiceMaxLag,
+		priorityChoiceMaxLag:               priorityChoiceMaxLag,
+		rplSemiSyncMasterWaitForSlaveCount: config.RplSemiSyncMasterWaitForSlaveCount,
+		SemiSync:                           config.SemiSync,
 	}
 }
 
 func (sh *SwitchHelper) GetPriorityChoiceMaxLag() time.Duration {
 	return sh.priorityChoiceMaxLag
 }
+
+// GetRequiredWaitSlaveCount Dynamically calculated version of RplSemiSyncMasterWaitForSlaveCount.
+// This variable can be lower than hard-configured RplSemiSyncMasterWaitForSlaveCount
+// when some semi-sync replicas are dead.
+func (sh *SwitchHelper) GetRequiredWaitSlaveCount(activeNodes []string) int {
+	wsc := len(activeNodes) / 2
+	if wsc > sh.rplSemiSyncMasterWaitForSlaveCount {
+		wsc = sh.rplSemiSyncMasterWaitForSlaveCount
+	}
+	return wsc
+}
+
+// GetFailoverQuorum Number of HA nodes to be alive to failover/switchover
+func (sh *SwitchHelper) GetFailoverQuorum(activeNodes []string) int {
+	fq := len(activeNodes) - sh.GetRequiredWaitSlaveCount(activeNodes)
+	if fq < 1 {
+		fq = 1
+	}
+	return fq
+}
+
+func (sh *SwitchHelper) CheckFailoverQuorum(activeNodes []string, permissibleSlaves int) error {
+	if sh.SemiSync {
+		failoverQuorum := sh.GetFailoverQuorum(activeNodes)
+		if permissibleSlaves < failoverQuorum {
+			return fmt.Errorf("no quorum, have %d replics while %d is required", permissibleSlaves, failoverQuorum)
+		}
+	} else {
+		if permissibleSlaves == 0 {
+			return fmt.Errorf("no alive active replica found")
+		}
+	}
+	return nil
+}
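
The sketch below is a minimal, illustrative table-driven check of the quorum arithmetic this patch moves into SwitchHelper (wait_slave_count = min(len(activeNodes)/2, RplSemiSyncMasterWaitForSlaveCount); failover quorum = len(activeNodes) - wait_slave_count, but at least 1). It is not part of the patch: the test name, node names, and the wait-for-slave-count value of 1 are assumptions, and it presumes it would sit next to switch_helper.go in package mysql so the unexported fields are reachable.

package mysql

import "testing"

// Illustrative only: sanity-checks CheckFailoverQuorum for small clusters.
// Values are made up; not taken from any real cluster configuration.
func TestCheckFailoverQuorumSketch(t *testing.T) {
	sh := &SwitchHelper{
		rplSemiSyncMasterWaitForSlaveCount: 1,
		SemiSync:                           true,
	}
	cases := []struct {
		activeNodes       []string
		permissibleSlaves int
		wantErr           bool
	}{
		// 3 HA nodes: wait_slave_count = min(3/2, 1) = 1, failover quorum = 3-1 = 2
		{[]string{"m", "r1", "r2"}, 2, false},
		{[]string{"m", "r1", "r2"}, 1, true},
		// 2 HA nodes: wait_slave_count = 1, failover quorum = 2-1 = 1
		{[]string{"m", "r1"}, 1, false},
		{[]string{"m", "r1"}, 0, true},
	}
	for _, c := range cases {
		err := sh.CheckFailoverQuorum(c.activeNodes, c.permissibleSlaves)
		if (err != nil) != c.wantErr {
			t.Errorf("CheckFailoverQuorum(%v, %d): err = %v, wantErr = %v",
				c.activeNodes, c.permissibleSlaves, err, c.wantErr)
		}
	}
}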