Allowing rule backup for rules API HA #5782
@@ -44,6 +44,9 @@ type DefaultMultiTenantManager struct {
 	notifiers                 map[string]*rulerNotifier
 	notifiersDiscoveryMetrics map[string]discovery.DiscovererMetrics
 
+	// rules backup
+	rulesBackupManager *rulesBackupManager
+
 	managersTotal                 prometheus.Gauge
 	lastReloadSuccessful          *prometheus.GaugeVec
 	lastReloadSuccessfulTimestamp *prometheus.GaugeVec
@@ -79,7 +82,7 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, eva
 		os.Exit(1)
 	}
 
-	return &DefaultMultiTenantManager{
+	m := &DefaultMultiTenantManager{
 		cfg:            cfg,
 		notifierCfg:    ncfg,
 		managerFactory: managerFactory,
@@ -112,7 +115,11 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, eva
 		}, []string{"user"}),
 		registry: reg,
 		logger:   logger,
-	}, nil
+	}
+	if cfg.APIEnableRulesBackup {
+		m.rulesBackupManager = newRulesBackupManager(cfg, logger, reg)
+	}
+	return m, nil
 }
 
 func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGroups map[string]rulespb.RuleGroupList) {
@@ -161,8 +168,14 @@ func (r *DefaultMultiTenantManager) deleteRuleCache(user string) {
 	delete(r.ruleCache, user)
 }
 
+func (r *DefaultMultiTenantManager) BackUpRuleGroups(ctx context.Context, ruleGroups map[string]rulespb.RuleGroupList) {
+	if r.rulesBackupManager != nil {
+		r.rulesBackupManager.setRuleGroups(ctx, ruleGroups)
+	}
+}
+
 // syncRulesToManager maps the rule files to disk, detects any changes and will create/update the
-// the users Prometheus Rules Manager.
+// users Prometheus Rules Manager.
 func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user string, groups rulespb.RuleGroupList) {
 	// Map the files to disk and return the file names to be passed to the users manager if they
 	// have been updated

Review comment on BackUpRuleGroups:
I might have missed it, but where do we call this function?

Author reply:
We call it inside syncRules (line 594 in commit f577e38).
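To make the call path described in the reply concrete, here is a rough sketch of how the ruler's periodic sync could hand non-owned groups to BackUpRuleGroups. This is not the PR's actual code: listRuleGroupsToSync and its return shape are assumptions for illustration.

// Hypothetical sketch of the syncRules call site referenced in the reply above.
// listRuleGroupsToSync is an assumed helper returning both the groups this
// ruler evaluates and the groups it only keeps as backup.
func (r *Ruler) syncRules(ctx context.Context, reason string) {
	ownedGroups, backupGroups, err := r.listRuleGroupsToSync(ctx, reason)
	if err != nil {
		level.Error(r.logger).Log("msg", "unable to list rule groups to sync", "err", err)
		return
	}

	// Groups this ruler owns are loaded into the Prometheus managers and evaluated.
	r.manager.SyncRuleGroups(ctx, ownedGroups)

	// Groups this ruler only backs up are stored, not evaluated. The call is a
	// no-op when the backup feature is disabled, since rulesBackupManager is nil.
	r.manager.BackUpRuleGroups(ctx, backupGroups)
}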
@@ -333,6 +346,13 @@ func (r *DefaultMultiTenantManager) GetRules(userID string) []*promRules.Group {
 	return groups
 }
 
+func (r *DefaultMultiTenantManager) GetBackupRules(userID string) rulespb.RuleGroupList {
+	if r.rulesBackupManager != nil {
+		return r.rulesBackupManager.getRuleGroups(userID)
+	}
+	return nil
+}
+
 func (r *DefaultMultiTenantManager) Stop() {
 	r.notifiersMtx.Lock()
 	for _, n := range r.notifiers {
@@ -0,0 +1,34 @@
+package ruler
+
+import (
+	"time"
+
+	promRules "github.com/prometheus/prometheus/rules"
+)
+
+// mergeGroupStateDesc removes duplicates from the provided []*GroupStateDesc by keeping the GroupStateDesc with the
+// latest information. It uses the EvaluationTimestamp of the GroupStateDesc and the EvaluationTimestamp of the
+// ActiveRules in a GroupStateDesc to determine which GroupStateDesc has the latest information.
+func mergeGroupStateDesc(in []*GroupStateDesc) []*GroupStateDesc {
+	states := make(map[string]*GroupStateDesc)
+	rgTime := make(map[string]time.Time)
+	for _, state := range in {
+		latestTs := state.EvaluationTimestamp
+		for _, r := range state.ActiveRules {
+			if latestTs.Before(r.EvaluationTimestamp) {
+				latestTs = r.EvaluationTimestamp
+			}
+		}
+		key := promRules.GroupKey(state.Group.Namespace, state.Group.Name)
+		ts, ok := rgTime[key]
+		if !ok || ts.Before(latestTs) {
+			states[key] = state
+			rgTime[key] = latestTs
+		}
+	}
+	groups := make([]*GroupStateDesc, 0, len(states))
+	for _, state := range states {
+		groups = append(groups, state)
+	}
+	return groups
+}

Review comment on lines +17 to +21:
With this PR, are the replicas still evaluating all the rules as well? If they are not, this loop is not needed on the instances that are not primary, right? And if it is needed, should we consider only the timestamp of the group? The timestamp on a rule is taken when that rule is evaluated, which can lag, so we might discard a GroupStateDesc from a healthy instance in favor of a delayed GroupStateDesc just because one rule took a long time to evaluate.

Author reply:
Only the primary instance will evaluate the rule group, so only the data from the primary instance will have the latest EvaluationTimestamps.
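As a quick illustration of the merge semantics (values are hypothetical; the GroupStateDesc fields follow the usage in the diff above, and RuleStateDesc/rulespb.RuleGroupDesc are the assumed element types):

// Two snapshots of the same group "ns1/g1", e.g. one from the primary ruler
// and one from a replica that only backs the group up.
stale := &GroupStateDesc{
	Group:               &rulespb.RuleGroupDesc{Namespace: "ns1", Name: "g1"},
	EvaluationTimestamp: time.Unix(100, 0),
}
fresh := &GroupStateDesc{
	Group:               &rulespb.RuleGroupDesc{Namespace: "ns1", Name: "g1"},
	EvaluationTimestamp: time.Unix(100, 0),
	// A rule evaluated after the group's own timestamp still counts toward
	// "latest", which is why the inner loop on lines +17 to +21 inspects ActiveRules.
	ActiveRules: []*RuleStateDesc{{EvaluationTimestamp: time.Unix(200, 0)}},
}

merged := mergeGroupStateDesc([]*GroupStateDesc{stale, fresh})
// merged holds a single entry, fresh, because its newest timestamp (200) wins.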
Review comment:
I gather that this config is for transition purposes, for when a user upgrades to a version of Cortex that supports replication. In that case I think we should make the purpose more explicit and call out that this flag is subject to removal. Maybe we should even mark this field as "experimental" to tag along with the "experimental" SLA of Cortex.

Author reply:
It is, but not 100%. During a ring change there is a chance that a rule group is loaded by multiple rulers, just because of the timing of the rules sync and the ring propagation delay. So this flag can also be used to avoid duplicate rule groups in the list during those events.

Author reply:
I think it makes sense for all these new flags to be experimental for now. We don't know whether these flags will still make sense once we have rule evaluation HA. I made them all experimental.
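For context, a minimal sketch of how such a flag could be registered with an experimental note in its help text. Only the APIEnableRulesBackup field appears in the diff above; the flag name and description here are assumptions, not the PR's actual wording.

// Hypothetical registration sketch; the real flag name and help text may differ.
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	f.BoolVar(&cfg.APIEnableRulesBackup, "ruler.api-enable-rules-backup", false,
		"EXPERIMENTAL: Keep a backup copy of rule groups owned by other rulers so the "+
			"rules API can still return them during ring changes or replica failures.")
}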