diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cfa586fb86..be050f42596 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,7 @@ ### Mimirtool +* [ENHANCEMENT] mimirtool analyze: Store the query errors instead of exiting during the analysis. #3052 * [BUGFIX] mimir-tool remote-read: fix returns where some conditions [return nil error even if there is error](https://github.com/grafana/cortex-tools/issues/260). #3053 ### Query-tee diff --git a/pkg/mimirtool/analyze/prometheus.go b/pkg/mimirtool/analyze/prometheus.go index 68d4f4a5fba..80b617cc6ea 100644 --- a/pkg/mimirtool/analyze/prometheus.go +++ b/pkg/mimirtool/analyze/prometheus.go @@ -12,6 +12,8 @@ type MetricsInPrometheus struct { InUseMetricCounts []MetricCount `json:"in_use_metric_counts"` AdditionalMetricCounts []MetricCount `json:"additional_metric_counts"` + + Errors []string `json:"errors"` } type MetricCount struct { diff --git a/pkg/mimirtool/commands/analyse_prometheus.go b/pkg/mimirtool/commands/analyse_prometheus.go index b4c48f4ec9d..b217b91473b 100644 --- a/pkg/mimirtool/commands/analyse_prometheus.go +++ b/pkg/mimirtool/commands/analyse_prometheus.go @@ -8,6 +8,7 @@ package commands import ( "context" "encoding/json" + "fmt" "os" "sort" "time" @@ -98,7 +99,7 @@ func (cmd *PrometheusAnalyzeCommand) run(k *kingpin.ParseContext) error { jobCount map[string]int }{} inUseCardinality := 0 - + var errorMetrics []string for _, metric := range metricsUsed { ctx, cancel := context.WithTimeout(context.Background(), cmd.readTimeout) defer cancel() @@ -106,7 +107,10 @@ func (cmd *PrometheusAnalyzeCommand) run(k *kingpin.ParseContext) error { query := "count by (job) (" + metric + ")" result, _, err := v1api.Query(ctx, query, time.Now()) if err != nil { - return errors.Wrap(err, "error querying "+query) + errStr := fmt.Sprintf("skipped %s analysis because failed to run query %v: %s", metric, query, err.Error()) + log.Warnln(errStr) + errorMetrics = append(errorMetrics, errStr) + continue } vec := 
result.(model.Vector) @@ -155,7 +159,10 @@ func (cmd *PrometheusAnalyzeCommand) run(k *kingpin.ParseContext) error { query := "count by (job) (" + metric + ")" result, _, err := v1api.Query(ctx, query, time.Now()) if err != nil { - return errors.Wrap(err, "error querying "+query) + errStr := fmt.Sprintf("skipped %s analysis because failed to run query %v: %s", metric, query, err.Error()) + log.Warnln(errStr) + errorMetrics = append(errorMetrics, errStr) + continue } vec := result.(model.Vector) @@ -191,6 +198,7 @@ func (cmd *PrometheusAnalyzeCommand) run(k *kingpin.ParseContext) error { output.TotalActiveSeries = inUseCardinality + additionalMetricsCardinality output.InUseActiveSeries = inUseCardinality output.AdditionalActiveSeries = additionalMetricsCardinality + output.Errors = errorMetrics for metric, counts := range inUseMetrics { jobCounts := make([]analyze.JobCount, 0, len(counts.jobCount))