From 5f26de52ca7ede3cabc2715fd8cc20cfb0897438 Mon Sep 17 00:00:00 2001
From: Firsov Kirill <37708382+Fizic@users.noreply.github.com>
Date: Wed, 20 Mar 2024 17:19:23 +0300
Subject: [PATCH] fix: checking for split brain (#87)

MySQL replicas rarely could commit before master
---
 internal/app/app.go       |  8 ++++++-
 internal/app/util.go      | 25 ++++++++++++++++++++
 internal/app/util_test.go | 49 +++++++++++++++++++++++++++++++++++++++
 internal/mysql/data.go    |  5 +++++
 internal/mysql/node.go    | 21 ++++++++++++++++-
 internal/mysql/queries.go |  2 ++
 6 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/internal/app/app.go b/internal/app/app.go
index 66f29f17..16462cd2 100644
--- a/internal/app/app.go
+++ b/internal/app/app.go
@@ -827,6 +827,12 @@ func (app *App) calcActiveNodes(clusterState, clusterStateDcs map[string]*NodeSt
 		app.logger.Warnf("failed to get master status %v", err)
 		return nil, err
 	}
+	muuid, err := masterNode.UUID()
+	if err != nil {
+		app.logger.Warnf("failed to get master uuid %v", err)
+		return nil, err
+	}
+
 	for host, node := range clusterState {
 		if host == master {
 			activeNodes = append(activeNodes, master)
@@ -869,7 +875,7 @@ func (app *App) calcActiveNodes(clusterState, clusterStateDcs map[string]*NodeSt
 			continue
 		}
 		sgtids := gtids.ParseGtidSet(sstatus.ExecutedGtidSet)
-		if !(sstatus.ReplicationState == mysql.ReplicationRunning && isGTIDLessOrEqual(sgtids, mgtids)) {
+		if sstatus.ReplicationState != mysql.ReplicationRunning || isSplitBrained(sgtids, mgtids, muuid) {
 			app.logger.Errorf("calc active nodes: %s is not replicating or splitbrained, deleting from active...", host)
 			continue
 		}
diff --git a/internal/app/util.go b/internal/app/util.go
index 4cb79076..3e878b98 100644
--- a/internal/app/util.go
+++ b/internal/app/util.go
@@ -4,6 +4,8 @@ import (
 	"fmt"
 	"time"
 
+	gomysql "github.com/go-mysql-org/go-mysql/mysql"
+	"github.com/google/uuid"
 	"github.com/yandex/mysync/internal/log"
 	"github.com/yandex/mysync/internal/mysql"
 	"github.com/yandex/mysync/internal/mysql/gtids"
@@ -233,6 +235,29 @@ func isGTIDLessOrEqual(slaveGtidSet, masterGtidSet gtids.GTIDSet) bool {
 	return masterGtidSet.Contain(slaveGtidSet) || masterGtidSet.Equal(slaveGtidSet)
 }
 
+// isSplitBrained reports whether the replica holds GTIDs the master lacks.
+// GTIDs under masterUUID are exempt: a replica may commit them first.
+func isSplitBrained(slaveGtidSet, masterGtidSet gtids.GTIDSet, masterUUID uuid.UUID) bool {
+	slaveSets, slaveOK := slaveGtidSet.(*gomysql.MysqlGTIDSet)
+	masterSets, masterOK := masterGtidSet.(*gomysql.MysqlGTIDSet)
+	if !slaveOK || !masterOK {
+		// Not MySQL GTID sets: fall back to the strict subset check
+		// instead of panicking on a failed type assertion.
+		return !isGTIDLessOrEqual(slaveGtidSet, masterGtidSet)
+	}
+	for _, slaveSet := range slaveSets.Sets {
+		masterSet, ok := masterSets.Sets[slaveSet.SID.String()]
+		if !ok {
+			return true // the master has never seen this server UUID
+		}
+		// Only intervals of the master's own UUID may be ahead on the replica.
+		if !masterSet.Contain(slaveSet) && masterSet.SID != masterUUID {
+			return true
+		}
+	}
+	return false
+}
+
 func validatePriority(priority *int64) error {
 	if priority == nil || *priority >= 0 {
 		return nil
diff --git a/internal/app/util_test.go b/internal/app/util_test.go
index 8a71e8cb..efc21df0 100644
--- a/internal/app/util_test.go
+++ b/internal/app/util_test.go
@@ -327,3 +327,52 @@ func getLogger() *log.Logger {
 	}
 	return l
 }
+
+func TestIsSplitBrained(t *testing.T) {
+	masterGTID := mustGTIDSet("6DBC0B04-4B09-43DC-86CC-9AF852DED919:1-100," +
+		"09978591-5754-4710-BF67-062880ABE1B4:1-100," +
+		"AA6890C8-69F8-4BC4-B3A5-5D3FEA8C28CF:1-100")
+	masterUUID := masterGTID.(*gomysql.MysqlGTIDSet).Sets["6dbc0b04-4b09-43dc-86cc-9af852ded919"].SID
+
+	// identical GTID sets on replica and master: not a split brain
+	slaveGTID := mustGTIDSet("6DBC0B04-4B09-43DC-86CC-9AF852DED919:1-100," +
+		"09978591-5754-4710-BF67-062880ABE1B4:1-100," +
+		"AA6890C8-69F8-4BC4-B3A5-5D3FEA8C28CF:1-100")
+	ok := isSplitBrained(slaveGTID, masterGTID, masterUUID)
+	require.False(t, ok)
+
+	// the replica is lagging behind the master (one transaction short): not a split brain
+	slaveGTID = mustGTIDSet("6DBC0B04-4B09-43DC-86CC-9AF852DED919:1-99," +
+		"09978591-5754-4710-BF67-062880ABE1B4:1-100," +
+		"AA6890C8-69F8-4BC4-B3A5-5D3FEA8C28CF:1-100")
+	ok = isSplitBrained(slaveGTID, masterGTID, masterUUID)
+	require.False(t, ok)
+
+	// the replica is lagging behind the new master: it lacks a whole UUID set the master has
+	slaveGTID = mustGTIDSet("6DBC0B04-4B09-43DC-86CC-9AF852DED919:1-100," +
+		"09978591-5754-4710-BF67-062880ABE1B4:1-100")
+	ok = isSplitBrained(slaveGTID, masterGTID, masterUUID)
+	require.False(t, ok)
+
+	// the replica is ahead only under the master's own UUID (committed before the master): allowed
+	slaveGTID = mustGTIDSet("6DBC0B04-4B09-43DC-86CC-9AF852DED919:1-101," +
+		"09978591-5754-4710-BF67-062880ABE1B4:1-100," +
+		"AA6890C8-69F8-4BC4-B3A5-5D3FEA8C28CF:1-100")
+	ok = isSplitBrained(slaveGTID, masterGTID, masterUUID)
+	require.False(t, ok)
+
+	// the replica is ahead under a UUID that is not the master's: split brain
+	slaveGTID = mustGTIDSet("6DBC0B04-4B09-43DC-86CC-9AF852DED919:1-100," +
+		"09978591-5754-4710-BF67-062880ABE1B4:1-100," +
+		"AA6890C8-69F8-4BC4-B3A5-5D3FEA8C28CF:1-101")
+	ok = isSplitBrained(slaveGTID, masterGTID, masterUUID)
+	require.True(t, ok)
+
+	// the replica has a set under a UUID the master has never seen: split brain
+	slaveGTID = mustGTIDSet("6DBC0B04-4B09-43DC-86CC-9AF852DED919:1-101," +
+		"09978591-5754-4710-BF67-062880ABE1B4:1-100," +
+		"AA6890C8-69F8-4BC4-B3A5-5D3FEA8C28CF:1-100," +
+		"BB6890C8-69F8-4BC4-B3A5-5D3FEA8C28CF:1-100")
+	ok = isSplitBrained(slaveGTID, masterGTID, masterUUID)
+	require.True(t, ok)
+}
diff --git a/internal/mysql/data.go b/internal/mysql/data.go
index fedc605c..8f480073 100644
--- a/internal/mysql/data.go
+++ b/internal/mysql/data.go
@@ -28,6 +28,11 @@ type readOnlyResult struct {
 	SuperReadOnly int `db:"SuperReadOnly"`
 }
 
+// ServerUUIDResult is the row scanned from the get_uuid query (SELECT @@server_uuid).
+type ServerUUIDResult struct {
+	ServerUUID string `db:"server_uuid"`
+}
+
 // CascadeNodeConfiguration is a dcs node configuration for cascade mysql replica
 type CascadeNodeConfiguration struct {
 	// StreamFrom - is a host to stream from. Can be changed from CLI.
diff --git a/internal/mysql/node.go b/internal/mysql/node.go
index 5edb5e3f..7f4cea0e 100644
--- a/internal/mysql/node.go
+++ b/internal/mysql/node.go
@@ -18,9 +18,9 @@ import (
 	"time"
 
 	"github.com/go-sql-driver/mysql"
+	"github.com/google/uuid"
 	"github.com/jmoiron/sqlx"
 	"github.com/shirou/gopsutil/v3/process"
-
 	"github.com/yandex/mysync/internal/config"
 	"github.com/yandex/mysync/internal/log"
 	"github.com/yandex/mysync/internal/mysql/gtids"
@@ -34,6 +34,7 @@ type Node struct {
 	db *sqlx.DB
 	version *Version
 	host string
+	uuid uuid.UUID
 }
 
 var (
@@ -602,6 +603,24 @@ func (n *Node) GetBinlogs() ([]Binlog, error) {
 	return binlogs, err
 }
 
+// UUID returns the node's server_uuid, fetching it from MySQL on first use and caching it.
+func (n *Node) UUID() (uuid.UUID, error) {
+	// uuid.Nil means "not cached yet"; UUID.ID() would inspect only the first 4 bytes.
+	if n.uuid != uuid.Nil {
+		return n.uuid, nil
+	}
+	var r ServerUUIDResult
+	if err := n.queryRow(queryGetUUID, nil, &r); err != nil {
+		return uuid.Nil, err
+	}
+	v, err := uuid.Parse(r.ServerUUID)
+	if err != nil {
+		return uuid.Nil, err
+	}
+	n.uuid = v
+	return v, nil
+}
+
 // IsReadOnly returns (true, true) if MySQL Node in (read-only, super-read-only) mode
 func (n *Node) IsReadOnly() (bool, bool, error) {
 	var ror readOnlyResult
diff --git a/internal/mysql/queries.go b/internal/mysql/queries.go
index 4e0288cd..f5c596ba 100644
--- a/internal/mysql/queries.go
+++ b/internal/mysql/queries.go
@@ -6,6 +6,7 @@ const (
 	queryReplicaStatus = "replica_status"
 	queryGetVersion = "get_version"
 	queryGTIDExecuted = "gtid_executed"
+	queryGetUUID = "get_uuid"
 	queryShowBinaryLogs = "binary_logs"
 	queryReplicationLag = "replication_lag"
 	querySlaveHosts = "slave_hosts"
@@ -53,6 +54,7 @@ var DefaultQueries = map[string]string{
 	queryReplicaStatus: `SHOW REPLICA STATUS FOR CHANNEL :channel`,
 	queryGetVersion: `SELECT sys.version_major() AS MajorVersion, sys.version_minor() AS MinorVersion, sys.version_patch() AS PatchVersion`,
 	queryGTIDExecuted: `SELECT @@GLOBAL.gtid_executed as Executed_Gtid_Set`,
+	queryGetUUID: `SELECT @@server_uuid as server_uuid`,
 	queryShowBinaryLogs: `SHOW BINARY LOGS`,
 	querySlaveHosts: `SHOW SLAVE HOSTS`,
 	queryReplicationLag: ``,