[WIP] add failoverQuorum semisync check, refactor failoverQuorum (#115)
* add failoverQuorum semisync check, refactor failoverQuorum

* lint fix

* naming fix

---------

Co-authored-by: suetin <[email protected]>
moridin26 and suetin authored Jul 23, 2024
1 parent a5a64da commit d07e3c6
Showing 2 changed files with 56 additions and 48 deletions.
internal/app/app.go: 58 changes (12 additions, 46 deletions)
@@ -161,26 +161,6 @@ func (app *App) removeMaintenanceFile() {
}
}

- // Dynamically calculated version of RplSemiSyncMasterWaitForSlaveCount.
- // This variable can be lower than hard-configured RplSemiSyncMasterWaitForSlaveCount
- // when some semi-sync replicas are dead.
- func (app *App) getRequiredWaitSlaveCount(activeNodes []string) int {
- wsc := len(activeNodes) / 2
- if wsc > app.config.RplSemiSyncMasterWaitForSlaveCount {
- wsc = app.config.RplSemiSyncMasterWaitForSlaveCount
- }
- return wsc
- }
-
- // Number of HA nodes to be alive to failover/switchover
- func (app *App) getFailoverQuorum(activeNodes []string) int {
- fq := len(activeNodes) - app.getRequiredWaitSlaveCount(activeNodes)
- if fq < 1 {
- fq = 1
- }
- return fq
- }

// separate goroutine performing health checks
func (app *App) healthChecker(ctx context.Context) {
ticker := time.NewTicker(app.config.HealthCheckInterval)
@@ -795,19 +775,13 @@ func (app *App) approveFailover(clusterState, clusterStateDcs map[string]*NodeSt
app.logger.Infof("approve failover: active nodes are %v", activeNodes)
// number of active slaves that we can use to perform switchover
permissibleSlaves := countAliveHASlavesWithinNodes(activeNodes, clusterState)
- if app.config.SemiSync {
- failoverQuorum := app.getFailoverQuorum(activeNodes)
- if permissibleSlaves < failoverQuorum {
- return fmt.Errorf("no quorum, have %d replics while %d is required", permissibleSlaves, failoverQuorum)
- }
- } else {
- if permissibleSlaves == 0 {
- return fmt.Errorf("no alive active replica found")
- }
+ err := app.switchHelper.CheckFailoverQuorum(activeNodes, permissibleSlaves)
+ if err != nil {
+ return err
}

var lastSwitchover Switchover
- err := app.dcs.Get(pathLastSwitch, &lastSwitchover)
+ err = app.dcs.Get(pathLastSwitch, &lastSwitchover)
if err != dcs.ErrNotFound {
if err != nil {
return err
@@ -865,15 +839,8 @@ func (app *App) approveSwitchover(switchover *Switchover, activeNodes []string,
if switchover.RunCount > 0 {
return nil
}
- if app.config.SemiSync {
- // number of active slaves that we can use to perform switchover
- permissibleSlaves := countAliveHASlavesWithinNodes(activeNodes, clusterState)
- failoverQuorum := app.getFailoverQuorum(activeNodes)
- if permissibleSlaves < failoverQuorum {
- return fmt.Errorf("no quorum, have %d replics while %d is required", permissibleSlaves, failoverQuorum)
- }
- }
- return nil
+ permissibleSlaves := countAliveHASlavesWithinNodes(activeNodes, clusterState)
+ return app.switchHelper.CheckFailoverQuorum(activeNodes, permissibleSlaves)
}

/*
@@ -1048,7 +1015,7 @@ func (app *App) updateActiveNodes(clusterState, clusterStateDcs map[string]*Node
if masterState.SemiSyncState != nil && masterState.SemiSyncState.MasterEnabled {
oldWaitSlaveCount = masterState.SemiSyncState.WaitSlaveCount
}
- waitSlaveCount := app.getRequiredWaitSlaveCount(activeNodes)
+ waitSlaveCount := app.switchHelper.GetRequiredWaitSlaveCount(activeNodes)

app.logger.Infof("update active nodes: active nodes are: %v, wait_slave_count %d", activeNodes, waitSlaveCount)
if len(becomeActive) > 0 {
@@ -1202,9 +1169,7 @@ func (app *App) performSwitchover(clusterState map[string]*NodeState, activeNode
return fmt.Errorf("switchover: failed to ping hosts: %v with dubious errors", dubious)
}

- // calc failoverQuorum before filtering out old master
- failoverQuorum := app.getFailoverQuorum(activeNodes)
- oldActiveNodes := activeNodes
+ activeNodesWithOldMaster := activeNodes

// filter out old master as may hang and timeout in different ways
if switchover.Cause == CauseAuto && switchover.From == oldMaster {
@@ -1273,8 +1238,9 @@ func (app *App) performSwitchover(clusterState map[string]*NodeState, activeNode
frozenActiveNodes = append(frozenActiveNodes, host)
}
}
- if len(frozenActiveNodes) < failoverQuorum {
- return fmt.Errorf("no failoverQuorum: has %d frozen active nodes, while %d is required", len(frozenActiveNodes), failoverQuorum)
+ err := app.switchHelper.CheckFailoverQuorum(activeNodesWithOldMaster, len(frozenActiveNodes))
+ if err != nil {
+ return err
}

// setting server read-only may take a while so we need to ensure we are still a manager
@@ -1415,7 +1381,7 @@ func (app *App) performSwitchover(clusterState map[string]*NodeState, activeNode

// adjust semi-sync before finishing switchover
clusterState = app.getClusterStateFromDB()
- err = app.updateActiveNodes(clusterState, clusterState, oldActiveNodes, newMaster)
+ err = app.updateActiveNodes(clusterState, clusterState, activeNodesWithOldMaster, newMaster)
if err != nil || app.emulateError("update_active_nodes") {
app.logger.Warnf("switchover: failed to update active nodes after switchover: %v", err)
}
internal/mysql/switch_helper.go: 46 changes (44 additions, 2 deletions)
@@ -1,17 +1,23 @@
package mysql

import (
+ "fmt"
"time"

"github.com/yandex/mysync/internal/config"
)

type ISwitchHelper interface {
GetPriorityChoiceMaxLag() time.Duration
+ GetRequiredWaitSlaveCount([]string) int
+ GetFailoverQuorum([]string) int
+ CheckFailoverQuorum([]string, int) error
}

type SwitchHelper struct {
- priorityChoiceMaxLag time.Duration
+ priorityChoiceMaxLag               time.Duration
+ rplSemiSyncMasterWaitForSlaveCount int
+ SemiSync                           bool
}

func NewSwitchHelper(config *config.Config) ISwitchHelper {
@@ -22,10 +28,46 @@ func NewSwitchHelper(config *config.Config) ISwitchHelper {
}
}
return &SwitchHelper{
- priorityChoiceMaxLag: priorityChoiceMaxLag,
+ priorityChoiceMaxLag:               priorityChoiceMaxLag,
+ rplSemiSyncMasterWaitForSlaveCount: config.RplSemiSyncMasterWaitForSlaveCount,
+ SemiSync:                           config.SemiSync,
}
}

func (sh *SwitchHelper) GetPriorityChoiceMaxLag() time.Duration {
return sh.priorityChoiceMaxLag
}

+ // GetRequiredWaitSlaveCount Dynamically calculated version of RplSemiSyncMasterWaitForSlaveCount.
+ // This variable can be lower than hard-configured RplSemiSyncMasterWaitForSlaveCount
+ // when some semi-sync replicas are dead.
+ func (sh *SwitchHelper) GetRequiredWaitSlaveCount(activeNodes []string) int {
+ wsc := len(activeNodes) / 2
+ if wsc > sh.rplSemiSyncMasterWaitForSlaveCount {
+ wsc = sh.rplSemiSyncMasterWaitForSlaveCount
+ }
+ return wsc
+ }
+
+ // GetFailoverQuorum Number of HA nodes to be alive to failover/switchover
+ func (sh *SwitchHelper) GetFailoverQuorum(activeNodes []string) int {
+ fq := len(activeNodes) - sh.GetRequiredWaitSlaveCount(activeNodes)
+ if fq < 1 {
+ fq = 1
+ }
+ return fq
+ }
+
+ func (sh *SwitchHelper) CheckFailoverQuorum(activeNodes []string, permissibleSlaves int) error {
+ if sh.SemiSync {
+ failoverQuorum := sh.GetFailoverQuorum(activeNodes)
+ if permissibleSlaves < failoverQuorum {
+ return fmt.Errorf("no quorum, have %d replics while %d is required", permissibleSlaves, failoverQuorum)
+ }
+ } else {
+ if permissibleSlaves == 0 {
+ return fmt.Errorf("no alive active replica found")
+ }
+ }
+ return nil
+ }
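
For context, here is a small standalone sketch (not part of this commit, and deliberately independent of the mysync internal packages) that mirrors the quorum arithmetic now owned by SwitchHelper, so the relationship between the number of active nodes, the semi-sync wait slave count and the failover quorum is easy to see. The helper names below and the configured wait count of 1 are assumptions for illustration only.

package main

import "fmt"

// requiredWaitSlaveCount mirrors SwitchHelper.GetRequiredWaitSlaveCount:
// half of the active nodes, capped by the configured
// rpl_semi_sync_master_wait_for_slave_count.
func requiredWaitSlaveCount(activeNodes, configuredWaitSlaveCount int) int {
	wsc := activeNodes / 2
	if wsc > configuredWaitSlaveCount {
		wsc = configuredWaitSlaveCount
	}
	return wsc
}

// failoverQuorum mirrors SwitchHelper.GetFailoverQuorum: the number of HA
// nodes that must be alive to allow a failover/switchover, never less than 1.
func failoverQuorum(activeNodes, configuredWaitSlaveCount int) int {
	fq := activeNodes - requiredWaitSlaveCount(activeNodes, configuredWaitSlaveCount)
	if fq < 1 {
		fq = 1
	}
	return fq
}

func main() {
	const configuredWaitSlaveCount = 1 // assumed rpl_semi_sync_master_wait_for_slave_count
	for _, active := range []int{1, 2, 3, 5, 7} {
		fmt.Printf("active=%d wait_slave_count=%d failover_quorum=%d\n",
			active,
			requiredWaitSlaveCount(active, configuredWaitSlaveCount),
			failoverQuorum(active, configuredWaitSlaveCount))
	}
}

With semi-sync enabled, CheckFailoverQuorum rejects the operation when the number of permissible (alive, active HA) replicas is below that failover quorum; with semi-sync disabled it only requires at least one alive active replica.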
