From cccfd738cf784f6b2c1ee4b039395cbacf8a5617 Mon Sep 17 00:00:00 2001 From: Anand Rajagopal Date: Wed, 25 Sep 2024 16:31:58 -0500 Subject: [PATCH] Make LivenessCheck Timeout Configurable (#6227) --- docs/configuration/config-file-reference.md | 6 ++++++ pkg/ruler/ruler.go | 8 ++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 7b34c88bbb..3146f68c3c 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -4480,6 +4480,12 @@ ring: # Enable high availability # CLI flag: -ruler.enable-ha-evaluation [enable_ha_evaluation: <boolean> | default = false] + +# Timeout duration for non-primary rulers during liveness checks. If the check +# times out, the non-primary ruler will evaluate the rule group. Applicable when +# ruler.enable-ha-evaluation is true. +# CLI flag: -ruler.liveness-check-timeout +[liveness_check_timeout: <duration> | default = 1s] ``` ### `ruler_storage_config` diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 5deac4f9df..efc11eef9f 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -81,8 +81,6 @@ const ( unknownHealthFilter string = "unknown" okHealthFilter string = "ok" errHealthFilter string = "err" - - livenessCheckTimeout = 100 * time.Millisecond ) type DisabledRuleGroupErr struct { @@ -161,7 +159,8 @@ type Config struct { EnableQueryStats bool `yaml:"query_stats_enabled"` DisableRuleGroupLabel bool `yaml:"disable_rule_group_label"` - EnableHAEvaluation bool `yaml:"enable_ha_evaluation"` + EnableHAEvaluation bool `yaml:"enable_ha_evaluation"` + LivenessCheckTimeout time.Duration `yaml:"liveness_check_timeout"` } // Validate config and returns error on failure @@ -238,6 +237,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&cfg.DisableRuleGroupLabel, "ruler.disable-rule-group-label", false, "Disable the rule_group label on exported metrics") 
f.BoolVar(&cfg.EnableHAEvaluation, "ruler.enable-ha-evaluation", false, "Enable high availability") + f.DurationVar(&cfg.LivenessCheckTimeout, "ruler.liveness-check-timeout", 1*time.Second, "Timeout duration for non-primary rulers during liveness checks. If the check times out, the non-primary ruler will evaluate the rule group. Applicable when ruler.enable-ha-evaluation is true.") cfg.RingCheckPeriod = 5 * time.Second } @@ -590,7 +590,7 @@ func (r *Ruler) nonPrimaryInstanceOwnsRuleGroup(g *rulespb.RuleGroupDesc, replic responseChan := make(chan *LivenessCheckResponse, len(jobs)) ctx := user.InjectOrgID(context.Background(), userID) - ctx, cancel := context.WithTimeout(ctx, livenessCheckTimeout) + ctx, cancel := context.WithTimeout(ctx, r.cfg.LivenessCheckTimeout) defer cancel() err := concurrency.ForEach(ctx, jobs, len(jobs), func(ctx context.Context, job interface{}) error {