From 344743d20b4abcb0535c6c2199221eef6271136e Mon Sep 17 00:00:00 2001 From: Martin Disibio Date: Fri, 15 Nov 2024 07:16:19 -0500 Subject: [PATCH] TraceQL metrics time range fixes (#4325) * Disconnect job time range filtering from step, so that results in split backend/recent range is accurate * changelog --- CHANGELOG.md | 7 +++-- pkg/traceql/engine_metrics.go | 46 ++++++++++++++++++++++-------- pkg/traceql/engine_metrics_test.go | 17 ++++------- 3 files changed, 44 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 901829d0f2f..b9242a2ab06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,7 +52,7 @@ * [ENHANCEMENT] chore: remove gofakeit dependency [#4274](https://github.com/grafana/tempo/pull/4274) (@javiermolinar) * [ENHANCEMENT] Add a max flush attempts and metric to the metrics generator [#4254](https://github.com/grafana/tempo/pull/4254) (@joe-elliott) * [ENHANCEMENT] Collection of query-frontend changes to reduce allocs. [#4242](https://github.com/grafana/tempo/pull/4242) (@joe-elliott) -* [ENHANCEMENT] Added `insecure-skip-verify` option in tempo-cli to skip SSL certificate validation when connecting to the S3 backend. [#44236](https://github.com/grafana/tempo/pull/4259) (@faridtmammadov) +* [ENHANCEMENT] Added `insecure-skip-verify` option in tempo-cli to skip SSL certificate validation when connecting to the S3 backend. [#4259](https://github.com/grafana/tempo/pull/4259) (@faridtmammadov) * [ENHANCEMENT] Chore: delete spanlogger. [4312](https://github.com/grafana/tempo/pull/4312) (@javiermolinar) * [ENHANCEMENT] Add `invalid_utf8` to reasons spanmetrics will discard spans. [#4293](https://github.com/grafana/tempo/pull/4293) (@zalegrala) * [ENHANCEMENT] Reduce frontend and querier allocations by dropping HTTP headers early in the pipeline. [#4298](https://github.com/grafana/tempo/pull/4298) (@joe-elliott) @@ -60,10 +60,11 @@ * [BUGFIX] Metrics generators: Correctly drop from the ring before stopping ingestion to reduce drops during a rollout. [#4101](https://github.com/grafana/tempo/pull/4101) (@joe-elliott) * [BUGFIX] Correctly handle 400 Bad Request and 404 Not Found in gRPC streaming [#4144](https://github.com/grafana/tempo/pull/4144) (@mapno) * [BUGFIX] Pushes a 0 to classic histogram's counter when the series is new to allow Prometheus to start from a non-null value. [#4140](https://github.com/grafana/tempo/pull/4140) (@mapno) -* [BUGFIX] Fix counter samples being downsampled by backdate to the previous minute the initial sample when the series is new [#44236](https://github.com/grafana/tempo/pull/4236) (@javiermolinar) +* [BUGFIX] Fix counter samples being downsampled by backdate to the previous minute the initial sample when the series is new [#4236](https://github.com/grafana/tempo/pull/4236) (@javiermolinar) +* [BUGFIX] Fix traceql metrics time range handling at the cutoff between recent and backend data [#4257](https://github.com/grafana/tempo/issues/4257) (@mdisibio) * [BUGFIX] Skip computing exemplars for instant queries. [#4204](https://github.com/grafana/tempo/pull/4204) (@javiermolinar) * [BUGFIX] Gave context to orphaned spans related to various maintenance processes. [#4260](https://github.com/grafana/tempo/pull/4260) (@joe-elliott) -* [BUGFIX] Utilize S3Pass and S3User parameters in tempo-cli options, which were previously unused in the code. [#44236](https://github.com/grafana/tempo/pull/4259) (@faridtmammadov) +* [BUGFIX] Utilize S3Pass and S3User parameters in tempo-cli options, which were previously unused in the code. [#4259](https://github.com/grafana/tempo/pull/4259) (@faridtmammadov) # v2.6.1 diff --git a/pkg/traceql/engine_metrics.go b/pkg/traceql/engine_metrics.go index 3aba0a74f6c..f997aac8112 100644 --- a/pkg/traceql/engine_metrics.go +++ b/pkg/traceql/engine_metrics.go @@ -50,6 +50,9 @@ func DefaultQueryRangeStep(start, end uint64) uint64 { // IntervalCount is the number of intervals in the range with step. func IntervalCount(start, end, step uint64) int { + start = alignStart(start, step) + end = alignEnd(end, step) + intervals := (end - start) / step intervals++ return int(intervals) @@ -57,11 +60,15 @@ func IntervalCount(start, end, step uint64) int { // TimestampOf the given interval with the start and step. func TimestampOf(interval, start, step uint64) uint64 { + start = alignStart(start, step) return start + interval*step } // IntervalOf the given timestamp within the range and step. func IntervalOf(ts, start, end, step uint64) int { + start = alignStart(start, step) + end = alignEnd(end, step) + step + if ts < start || ts > end || end == start || step == 0 { // Invalid return -1 @@ -89,10 +96,6 @@ func TrimToOverlap(start1, end1, step, start2, end2 uint64) (uint64, uint64, uin if wasInstant { // Alter step to maintain instant nature step = end1 - start1 - } else { - // Realign after trimming - start1 = (start1 / step) * step - end1 = (end1/step)*step + step } return start1, end1, step @@ -110,9 +113,6 @@ func TrimToBefore(req *tempopb.QueryRangeRequest, before time.Time) { if wasInstant { // Maintain instant nature of the request req.Step = req.End - req.Start - } else { - // Realign after trimming - AlignRequest(req) } } @@ -128,9 +128,6 @@ func TrimToAfter(req *tempopb.QueryRangeRequest, before time.Time) { if wasInstant { // Maintain instant nature of the request req.Step = req.End - req.Start - } else { - // Realign after trimming - AlignRequest(req) } } @@ -148,8 +145,30 @@ func AlignRequest(req *tempopb.QueryRangeRequest) { } // It doesn't really matter but the request fields are expected to be in nanoseconds. - req.Start = req.Start / req.Step * req.Step - req.End = req.End / req.Step * req.Step + req.Start = alignStart(req.Start, req.Step) + req.End = alignEnd(req.End, req.Step) +} + +// Start time is rounded down to next step +func alignStart(start, step uint64) uint64 { + if step == 0 { + return 0 + } + return start - start%step +} + +// End time is rounded up to next step +func alignEnd(end, step uint64) uint64 { + if step == 0 { + return 0 + } + + mod := end % step + if mod == 0 { + return end + } + + return end + (step - mod) } type Label struct { @@ -238,6 +257,9 @@ func (set SeriesSet) ToProtoDiff(req *tempopb.QueryRangeRequest, rangeForLabels continue } + start = alignStart(start, req.Step) + end = alignEnd(end, req.Step) + intervals := IntervalCount(start, end, req.Step) samples := make([]tempopb.Sample, 0, intervals) for i, value := range s.Values { diff --git a/pkg/traceql/engine_metrics_test.go b/pkg/traceql/engine_metrics_test.go index 2fc0e356896..c5a64148196 100644 --- a/pkg/traceql/engine_metrics_test.go +++ b/pkg/traceql/engine_metrics_test.go @@ -41,7 +41,7 @@ func TestStepRangeToIntervals(t *testing.T) { start: 0, end: 10, step: 3, - expected: 4, // 0, 3, 6, 9 + expected: 5, // 0, 3, 6, 9, 12 }, } @@ -60,9 +60,9 @@ func TestTimestampOf(t *testing.T) { }, { interval: 2, - start: 10, + start: 10, // aligned to 9 step: 3, - expected: 16, + expected: 15, // 9, 12, 15 <-- intervals }, } @@ -104,21 +104,16 @@ func TestTrimToOverlap(t *testing.T) { expectedStep time.Duration }{ { - // Inner range of 33 to 38 - // gets rounded at 5m intervals to 30 to 40 + // Fully overlapping "2024-01-01 01:00:00", "2024-01-01 02:00:00", 5 * time.Minute, "2024-01-01 01:33:00", "2024-01-01 01:38:00", - "2024-01-01 01:30:00", "2024-01-01 01:40:00", 5 * time.Minute, + "2024-01-01 01:33:00", "2024-01-01 01:38:00", 5 * time.Minute, }, { // Partially Overlapping - // Overlap between 1:01-2:01 and 1:31-2:31 - // in 5m intervals is only 1:30-2:05 - // Start is pushed back - // and end is pushed out "2024-01-01 01:01:00", "2024-01-01 02:01:00", 5 * time.Minute, "2024-01-01 01:31:00", "2024-01-01 02:31:00", - "2024-01-01 01:30:00", "2024-01-01 02:05:00", 5 * time.Minute, + "2024-01-01 01:31:00", "2024-01-01 02:01:00", 5 * time.Minute, }, { // Instant query