Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add failoverQuorum semisync check, refactor failoverQuorum #115

Merged
merged 3 commits into from
Jul 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 12 additions & 46 deletions internal/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,26 +161,6 @@ func (app *App) removeMaintenanceFile() {
}
}

// getRequiredWaitSlaveCount returns the wait-slave count to actually apply:
// half of the currently active nodes, capped at the hard-configured
// RplSemiSyncMasterWaitForSlaveCount. The result can therefore drop below
// the configured value when some semi-sync replicas are dead.
func (app *App) getRequiredWaitSlaveCount(activeNodes []string) int {
	halfActive := len(activeNodes) / 2
	configured := app.config.RplSemiSyncMasterWaitForSlaveCount
	if halfActive <= configured {
		return halfActive
	}
	return configured
}

// getFailoverQuorum returns how many HA nodes must be alive for a
// failover/switchover to proceed: the active nodes minus those reserved
// for semi-sync acknowledgement, but never less than one.
func (app *App) getFailoverQuorum(activeNodes []string) int {
	quorum := len(activeNodes) - app.getRequiredWaitSlaveCount(activeNodes)
	if quorum >= 1 {
		return quorum
	}
	return 1
}

// separate goroutine performing health checks
func (app *App) healthChecker(ctx context.Context) {
ticker := time.NewTicker(app.config.HealthCheckInterval)
Expand Down Expand Up @@ -795,19 +775,13 @@ func (app *App) approveFailover(clusterState, clusterStateDcs map[string]*NodeSt
app.logger.Infof("approve failover: active nodes are %v", activeNodes)
// number of active slaves that we can use to perform switchover
permissibleSlaves := countAliveHASlavesWithinNodes(activeNodes, clusterState)
if app.config.SemiSync {
failoverQuorum := app.getFailoverQuorum(activeNodes)
if permissibleSlaves < failoverQuorum {
return fmt.Errorf("no quorum, have %d replics while %d is required", permissibleSlaves, failoverQuorum)
}
} else {
if permissibleSlaves == 0 {
return fmt.Errorf("no alive active replica found")
}
err := app.switchHelper.CheckFailoverQuorum(activeNodes, permissibleSlaves)
if err != nil {
return err
}

var lastSwitchover Switchover
err := app.dcs.Get(pathLastSwitch, &lastSwitchover)
err = app.dcs.Get(pathLastSwitch, &lastSwitchover)
if err != dcs.ErrNotFound {
if err != nil {
return err
Expand Down Expand Up @@ -865,15 +839,8 @@ func (app *App) approveSwitchover(switchover *Switchover, activeNodes []string,
if switchover.RunCount > 0 {
return nil
}
if app.config.SemiSync {
// number of active slaves that we can use to perform switchover
permissibleSlaves := countAliveHASlavesWithinNodes(activeNodes, clusterState)
failoverQuorum := app.getFailoverQuorum(activeNodes)
if permissibleSlaves < failoverQuorum {
return fmt.Errorf("no quorum, have %d replics while %d is required", permissibleSlaves, failoverQuorum)
}
}
return nil
permissibleSlaves := countAliveHASlavesWithinNodes(activeNodes, clusterState)
return app.switchHelper.CheckFailoverQuorum(activeNodes, permissibleSlaves)
}

/*
Expand Down Expand Up @@ -1048,7 +1015,7 @@ func (app *App) updateActiveNodes(clusterState, clusterStateDcs map[string]*Node
if masterState.SemiSyncState != nil && masterState.SemiSyncState.MasterEnabled {
oldWaitSlaveCount = masterState.SemiSyncState.WaitSlaveCount
}
waitSlaveCount := app.getRequiredWaitSlaveCount(activeNodes)
waitSlaveCount := app.switchHelper.GetRequiredWaitSlaveCount(activeNodes)

app.logger.Infof("update active nodes: active nodes are: %v, wait_slave_count %d", activeNodes, waitSlaveCount)
if len(becomeActive) > 0 {
Expand Down Expand Up @@ -1202,9 +1169,7 @@ func (app *App) performSwitchover(clusterState map[string]*NodeState, activeNode
return fmt.Errorf("switchover: failed to ping hosts: %v with dubious errors", dubious)
}

// calc failoverQuorum before filtering out old master
failoverQuorum := app.getFailoverQuorum(activeNodes)
oldActiveNodes := activeNodes
activeNodesWithOldMaster := activeNodes

// filter out old master as may hang and timeout in different ways
if switchover.Cause == CauseAuto && switchover.From == oldMaster {
Expand Down Expand Up @@ -1273,8 +1238,9 @@ func (app *App) performSwitchover(clusterState map[string]*NodeState, activeNode
frozenActiveNodes = append(frozenActiveNodes, host)
}
}
if len(frozenActiveNodes) < failoverQuorum {
return fmt.Errorf("no failoverQuorum: has %d frozen active nodes, while %d is required", len(frozenActiveNodes), failoverQuorum)
err := app.switchHelper.CheckFailoverQuorum(activeNodesWithOldMaster, len(frozenActiveNodes))
if err != nil {
return err
}

// setting server read-only may take a while so we need to ensure we are still a manager
Expand Down Expand Up @@ -1415,7 +1381,7 @@ func (app *App) performSwitchover(clusterState map[string]*NodeState, activeNode

// adjust semi-sync before finishing switchover
clusterState = app.getClusterStateFromDB()
err = app.updateActiveNodes(clusterState, clusterState, oldActiveNodes, newMaster)
err = app.updateActiveNodes(clusterState, clusterState, activeNodesWithOldMaster, newMaster)
if err != nil || app.emulateError("update_active_nodes") {
app.logger.Warnf("switchover: failed to update active nodes after switchover: %v", err)
}
Expand Down
46 changes: 44 additions & 2 deletions internal/mysql/switch_helper.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
package mysql

import (
"fmt"
"time"

"github.com/yandex/mysync/internal/config"
)

// ISwitchHelper groups the switchover/failover quorum calculations so they
// can be shared between app components (and mocked in tests).
type ISwitchHelper interface {
	// GetPriorityChoiceMaxLag returns the configured priority-choice max lag.
	GetPriorityChoiceMaxLag() time.Duration
	// GetRequiredWaitSlaveCount returns the semi-sync wait slave count to
	// apply for the given set of active nodes.
	GetRequiredWaitSlaveCount([]string) int
	// GetFailoverQuorum returns the number of HA nodes that must be alive
	// to allow a failover/switchover.
	GetFailoverQuorum([]string) int
	// CheckFailoverQuorum reports (as a non-nil error) when the given count
	// of usable replicas is insufficient for the given active nodes.
	CheckFailoverQuorum([]string, int) error
}

type SwitchHelper struct {
priorityChoiceMaxLag time.Duration
priorityChoiceMaxLag time.Duration
rplSemiSyncMasterWaitForSlaveCount int
SemiSync bool
}

func NewSwitchHelper(config *config.Config) ISwitchHelper {
Expand All @@ -22,10 +28,46 @@ func NewSwitchHelper(config *config.Config) ISwitchHelper {
}
}
return &SwitchHelper{
priorityChoiceMaxLag: priorityChoiceMaxLag,
priorityChoiceMaxLag: priorityChoiceMaxLag,
rplSemiSyncMasterWaitForSlaveCount: config.RplSemiSyncMasterWaitForSlaveCount,
SemiSync: config.SemiSync,
}
}

// GetPriorityChoiceMaxLag returns the priority-choice max lag captured from
// config. NOTE(review): presumably the maximum replication lag tolerated when
// selecting a switchover target by priority — confirm against callers.
func (sh *SwitchHelper) GetPriorityChoiceMaxLag() time.Duration {
	return sh.priorityChoiceMaxLag
}

// GetRequiredWaitSlaveCount returns the effective, dynamically calculated
// version of RplSemiSyncMasterWaitForSlaveCount: half of the active nodes,
// capped at the hard-configured value. It can thus be lower than the
// configured count when some semi-sync replicas are dead.
func (sh *SwitchHelper) GetRequiredWaitSlaveCount(activeNodes []string) int {
	required := len(activeNodes) / 2
	if configured := sh.rplSemiSyncMasterWaitForSlaveCount; required > configured {
		return configured
	}
	return required
}

// GetFailoverQuorum returns the number of HA nodes that must be alive for a
// failover/switchover: active nodes minus the semi-sync wait slave count,
// never less than one.
func (sh *SwitchHelper) GetFailoverQuorum(activeNodes []string) int {
	quorum := len(activeNodes) - sh.GetRequiredWaitSlaveCount(activeNodes)
	if quorum >= 1 {
		return quorum
	}
	return 1
}

// CheckFailoverQuorum returns nil when permissibleSlaves (the number of
// alive replicas usable for switchover) is sufficient for the given active
// nodes, and a descriptive error otherwise.
//
// With semi-sync enabled the threshold is GetFailoverQuorum(activeNodes);
// without semi-sync a single alive replica is enough.
//
// Fix: corrected the typo "replics" -> "replicas" in the error message.
func (sh *SwitchHelper) CheckFailoverQuorum(activeNodes []string, permissibleSlaves int) error {
	if !sh.SemiSync {
		if permissibleSlaves == 0 {
			return fmt.Errorf("no alive active replica found")
		}
		return nil
	}
	failoverQuorum := sh.GetFailoverQuorum(activeNodes)
	if permissibleSlaves < failoverQuorum {
		return fmt.Errorf("no quorum, have %d replicas while %d is required", permissibleSlaves, failoverQuorum)
	}
	return nil
}
Loading