Skip to content

Commit

Permalink
Remove APIEnableRulesBackup config and use Ring.ReplicationFactor ins… (
Browse files Browse the repository at this point in the history
#5901)

* Remove APIEnableRulesBackup config and use Ring.ReplicationFactor instead

Signed-off-by: Emmanuel Lodovice <[email protected]>

* Update changelog

Signed-off-by: Emmanuel Lodovice <[email protected]>

* Update change log

Signed-off-by: Emmanuel Lodovice <[email protected]>

* Consider AZ awareness disabled if only 1 zone is present in list rules

Signed-off-by: Emmanuel Lodovice <[email protected]>

---------

Signed-off-by: Emmanuel Lodovice <[email protected]>
  • Loading branch information
emanlodovice authored Apr 26, 2024
1 parent 55f996c commit 1b005d3
Show file tree
Hide file tree
Showing 9 changed files with 111 additions and 76 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Changelog

## master / unreleased
* [CHANGE] Ruler: Remove `experimental.ruler.api-enable-rules-backup` flag and use `ruler.ring.replication-factor` to check if rules backup is enabled

## 1.17.0 in progress

Expand Down
9 changes: 0 additions & 9 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -4271,15 +4271,6 @@ ring:
# CLI flag: -experimental.ruler.enable-api
[enable_api: <boolean> | default = false]
# EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers
# with default state (state before any evaluation) and send this copy in list
# API requests as backup in case the ruler who owns the rule fails to send its
# rules. This allows the rules API to handle ruler outage by returning rules
# with default state. Ring replication-factor needs to be set to 2 or more for
# this to be useful.
# CLI flag: -experimental.ruler.api-enable-rules-backup
[api_enable_rules_backup: <boolean> | default = false]
# EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API
# response. If there are duplicate rules the rule with the latest evaluation
# timestamp will be kept.
Expand Down
9 changes: 4 additions & 5 deletions integration/ruler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ func TestRulerAPIShardingWithAPIRulesBackupEnabled(t *testing.T) {
testRulerAPIWithSharding(t, true)
}

func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
func testRulerAPIWithSharding(t *testing.T, enableRulesBackup bool) {
const numRulesGroups = 100

random := rand.New(rand.NewSource(time.Now().UnixNano()))
Expand Down Expand Up @@ -459,9 +459,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
// Enable the bucket index so we can skip the initial bucket scan.
"-blocks-storage.bucket-store.bucket-index.enabled": "true",
}
if enableAPIRulesBackup {
if enableRulesBackup {
overrides["-ruler.ring.replication-factor"] = "3"
overrides["-experimental.ruler.api-enable-rules-backup"] = "true"
}
rulerFlags := mergeFlags(
BlocksStorageFlags(),
Expand Down Expand Up @@ -556,8 +555,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
},
}
// For each test case, fetch the rules with configured filters, and ensure the results match.
if enableAPIRulesBackup {
err := ruler2.Kill() // if api-enable-rules-backup is enabled the APIs should be able to handle a ruler going down
if enableRulesBackup {
err := ruler2.Kill() // if rules backup is enabled the APIs should be able to handle a ruler going down
require.NoError(t, err)
}
for name, tc := range testCases {
Expand Down
2 changes: 1 addition & 1 deletion pkg/ruler/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, eva
registry: reg,
logger: logger,
}
if cfg.APIEnableRulesBackup {
if cfg.RulesBackupEnabled() {
m.rulesBackupManager = newRulesBackupManager(cfg, logger, reg)
}
return m, nil
Expand Down
4 changes: 3 additions & 1 deletion pkg/ruler/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,9 @@ func TestBackupRules(t *testing.T) {
1 * time.Millisecond,
}
ruleManagerFactory := RuleManagerFactory(nil, waitDurations)
m, err := NewDefaultMultiTenantManager(Config{RulePath: dir, APIEnableRulesBackup: true}, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger())
config := Config{RulePath: dir}
config.Ring.ReplicationFactor = 3
m, err := NewDefaultMultiTenantManager(config, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger())
require.NoError(t, err)

const user1 = "testUser"
Expand Down
30 changes: 17 additions & 13 deletions pkg/ruler/ruler.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,8 @@ type Config struct {
Ring RingConfig `yaml:"ring"`
FlushCheckPeriod time.Duration `yaml:"flush_period"`

EnableAPI bool `yaml:"enable_api"`
APIEnableRulesBackup bool `yaml:"api_enable_rules_backup"`
APIDeduplicateRules bool `yaml:"api_deduplicate_rules"`
EnableAPI bool `yaml:"enable_api"`
APIDeduplicateRules bool `yaml:"api_deduplicate_rules"`

EnabledTenants flagext.StringSliceCSV `yaml:"enabled_tenants"`
DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"`
Expand Down Expand Up @@ -200,7 +199,6 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.")
f.StringVar(&cfg.RulePath, "ruler.rule-path", "/rules", "file path to store temporary rule files for the prometheus rule managers")
f.BoolVar(&cfg.EnableAPI, "experimental.ruler.enable-api", false, "Enable the ruler api")
f.BoolVar(&cfg.APIEnableRulesBackup, "experimental.ruler.api-enable-rules-backup", false, "EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers with default state (state before any evaluation) and send this copy in list API requests as backup in case the ruler who owns the rule fails to send its rules. This allows the rules API to handle ruler outage by returning rules with default state. Ring replication-factor needs to be set to 2 or more for this to be useful.")
f.BoolVar(&cfg.APIDeduplicateRules, "experimental.ruler.api-deduplicate-rules", false, "EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API response. If there are duplicate rules the rule with the latest evaluation timestamp will be kept.")
f.DurationVar(&cfg.OutageTolerance, "ruler.for-outage-tolerance", time.Hour, `Max time to tolerate outage for restoring "for" state of alert.`)
f.DurationVar(&cfg.ForGracePeriod, "ruler.for-grace-period", 10*time.Minute, `Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period.`)
Expand All @@ -217,6 +215,12 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
cfg.RingCheckPeriod = 5 * time.Second
}

func (cfg *Config) RulesBackupEnabled() bool {
// If the replication factor is greater the 1, only the first replica is responsible for evaluating the rule,
// the rest of the replica will store the rule groups as backup only for API HA.
return cfg.Ring.ReplicationFactor > 1
}

// MultiTenantManager is the interface of interaction with a Manager that is tenant aware.
type MultiTenantManager interface {
// SyncRuleGroups is used to sync the Manager with rules from the RuleStore.
Expand Down Expand Up @@ -581,7 +585,7 @@ func (r *Ruler) syncRules(ctx context.Context, reason string) {
// This will also delete local group files for users that are no longer in 'configs' map.
r.manager.SyncRuleGroups(ctx, loadedConfigs)

if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
r.manager.BackUpRuleGroups(ctx, backupConfigs)
}
}
Expand All @@ -604,7 +608,7 @@ func (r *Ruler) loadRuleGroups(ctx context.Context) (map[string]rulespb.RuleGrou
if err != nil {
level.Warn(r.logger).Log("msg", "failed to load some rules owned by this ruler", "count", len(ownedConfigs)-len(loadedOwnedConfigs), "err", err)
}
if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
loadedBackupConfigs, err := r.store.LoadRuleGroups(ctx, backupConfigs)
if err != nil {
level.Warn(r.logger).Log("msg", "failed to load some rules backed up by this ruler", "count", len(backupConfigs)-len(loadedBackupConfigs), "err", err)
Expand Down Expand Up @@ -685,7 +689,7 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp
if len(owned) > 0 {
ownedConfigs[userID] = owned
}
if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
backup := filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
if len(backup) > 0 {
backedUpConfigs[userID] = backup
Expand Down Expand Up @@ -748,7 +752,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp

filterOwned := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
var filterBackup []*rulespb.RuleGroupDesc
if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
filterBackup = filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
}
if len(filterOwned) == 0 && len(filterBackup) == 0 {
Expand Down Expand Up @@ -1121,7 +1125,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
)

zoneByAddress := make(map[string]string)
if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
for _, ruler := range rulers.Instances {
zoneByAddress[ruler.Addr] = ruler.Zone
}
Expand All @@ -1146,9 +1150,9 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
if err != nil {
level.Error(r.logger).Log("msg", "unable to retrieve rules from ruler", "addr", addr, "err", err)
r.rulerGetRulesFailures.WithLabelValues(addr).Inc()
// If APIEnableRulesBackup is enabled and there are enough rulers replicating the rules, we should
// If rules backup is enabled and there are enough rulers replicating the rules, we should
// be able to handle failures.
if r.cfg.APIEnableRulesBackup && len(jobs) >= r.cfg.Ring.ReplicationFactor {
if r.cfg.RulesBackupEnabled() && len(jobs) >= r.cfg.Ring.ReplicationFactor {
mtx.Lock()
failedZones[zoneByAddress[addr]] = struct{}{}
errCount += 1
Expand All @@ -1168,7 +1172,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
return nil
})

if err == nil && (r.cfg.APIEnableRulesBackup || r.cfg.APIDeduplicateRules) {
if err == nil && (r.cfg.RulesBackupEnabled() || r.cfg.APIDeduplicateRules) {
merged = mergeGroupStateDesc(merged)
}

Expand All @@ -1183,7 +1187,7 @@ func (r *Ruler) Rules(ctx context.Context, in *RulesRequest) (*RulesResponse, er
return nil, fmt.Errorf("no user id found in context")
}

groupDescs, err := r.getLocalRules(userID, *in, r.cfg.APIEnableRulesBackup)
groupDescs, err := r.getLocalRules(userID, *in, r.cfg.RulesBackupEnabled())
if err != nil {
return nil, err
}
Expand Down
5 changes: 4 additions & 1 deletion pkg/ruler/ruler_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,10 @@ func GetReplicationSetForListRule(r ring.ReadRing, cfg *RingConfig) (ring.Replic
// to 0, and then we update them whether zone-awareness is enabled or not.
maxErrors := 0
maxUnavailableZones := 0
if cfg.ZoneAwarenessEnabled {
// Because ring's Get method returns a number of ruler equal to the replication factor even if there is only 1 zone
// and ZoneAwarenessEnabled, we can consider that ZoneAwarenessEnabled is disabled if there is only 1 zone since
// rules are still replicated to rulers in the same zone.
if cfg.ZoneAwarenessEnabled && len(ringZones) > 1 {
numReplicatedZones := min(len(ringZones), r.ReplicationFactor())
// Given that quorum is not required, we only need at least one of the zone to be healthy to succeed. But we
// also need to handle case when RF < number of zones.
Expand Down
17 changes: 17 additions & 0 deletions pkg/ruler/ruler_ring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,23 @@ func TestGetReplicationSetForListRule(t *testing.T) {
"z2": {},
},
},
"should succeed on 1 unhealthy instances in RF=3 zone replication enabled but only 1 zone": {
ringInstances: map[string]ring.InstanceDesc{
"instance-1": {Addr: "127.0.0.1", State: ring.ACTIVE, Timestamp: now.Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-1", "z1", 128, true), Zone: "z1"},
"instance-2": {Addr: "127.0.0.2", State: ring.ACTIVE, Timestamp: now.Add(-10 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-2", "z1", 128, true), Zone: "z1"},
"instance-3": {Addr: "127.0.0.3", State: ring.ACTIVE, Timestamp: now.Add(-20 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-3", "z1", 128, true), Zone: "z1"},
"instance-4": {Addr: "127.0.0.4", State: ring.PENDING, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-4", "z1", 128, true), Zone: "z1"},
},
ringHeartbeatTimeout: time.Minute,
ringReplicationFactor: 3,
expectedSet: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3"},
enableAZReplication: true,
expectedFailedZones: map[string]struct{}{
"z1": {},
},
expectedMaxUnavailableZones: 0,
expectedMaxError: 1,
},
}

for testName, testData := range tests {
Expand Down
Loading

0 comments on commit 1b005d3

Please sign in to comment.