From db1feeaf4fe1025391e7fb463ec5695fdb71fde4 Mon Sep 17 00:00:00 2001 From: Krzysztof Kiewicz Date: Tue, 17 Dec 2024 11:43:14 +0100 Subject: [PATCH] Missing bucket results for `date_histogram` aggregation (#1113) https://github.com/QuesmaOrg/quesma/issues/1044 --- .../bucket_aggregations/date_histogram.go | 1 - quesma/model/bucket_aggregations/histogram.go | 9 +- quesma/queryparser/aggregation_parser.go | 17 --- .../pancake_aggregation_parser_buckets.go | 17 +++ quesma/testdata/aggregation_requests.go | 33 +++-- quesma/testdata/aggregation_requests_2.go | 15 ++ quesma/testdata/dates.go | 132 +++++++++++++++++- .../kibana-visualize/aggregation_requests.go | 23 +-- 8 files changed, 208 insertions(+), 39 deletions(-) diff --git a/quesma/model/bucket_aggregations/date_histogram.go b/quesma/model/bucket_aggregations/date_histogram.go index 27b92e73e..950818431 100644 --- a/quesma/model/bucket_aggregations/date_histogram.go +++ b/quesma/model/bucket_aggregations/date_histogram.go @@ -17,7 +17,6 @@ import ( type DateHistogramIntervalType bool const ( - DefaultMinDocCount = -1 DateHistogramFixedInterval DateHistogramIntervalType = true DateHistogramCalendarInterval DateHistogramIntervalType = false defaultDateTimeType = clickhouse.DateTime64 diff --git a/quesma/model/bucket_aggregations/histogram.go b/quesma/model/bucket_aggregations/histogram.go index cb4cb9445..7a2464d36 100644 --- a/quesma/model/bucket_aggregations/histogram.go +++ b/quesma/model/bucket_aggregations/histogram.go @@ -108,9 +108,14 @@ func (query *HistogramRowsTransformer) Transform(ctx context.Context, rowsFromDB return postprocessedRows } -// we're sure key is float64 +// we're sure key is either float64, or in rare cases nil func (query *HistogramRowsTransformer) getKeyFloat64(row model.QueryResultRow) (float64, bool) { - return row.Cols[len(row.Cols)-2].Value.(float64), true + switch val := row.Cols[len(row.Cols)-2].Value.(type) { + case float64: + return val, true + default: + return -1, false + } } // we don't know the type diff --git a/quesma/queryparser/aggregation_parser.go b/quesma/queryparser/aggregation_parser.go index 99c64f312..c50bf5b9e 100644 --- a/quesma/queryparser/aggregation_parser.go +++ b/quesma/queryparser/aggregation_parser.go @@ -7,7 +7,6 @@ import ( "quesma/clickhouse" "quesma/logger" "quesma/model" - "quesma/model/bucket_aggregations" "regexp" "slices" "strconv" @@ -366,22 +365,6 @@ func (cw *ClickhouseQueryTranslator) parseFieldFromScriptField(queryMap QueryMap return } -func (cw *ClickhouseQueryTranslator) parseMinDocCount(queryMap QueryMap) int { - if minDocCountRaw, exists := queryMap["min_doc_count"]; exists { - if minDocCount, ok := minDocCountRaw.(float64); ok { - asInt := int(minDocCount) - if asInt != 0 && asInt != 1 { - logger.WarnWithCtx(cw.Ctx).Msgf("min_doc_count is not 0 or 1, but %d. Not really supported", asInt) - } - return asInt - } else { - logger.WarnWithCtx(cw.Ctx).Msgf("min_doc_count is not a number, but %T, value: %v. Using default value: %d", - minDocCountRaw, minDocCountRaw, bucket_aggregations.DefaultMinDocCount) - } - } - return bucket_aggregations.DefaultMinDocCount -} - // quoteArray returns a new array with the same elements, but quoted func quoteArray(array []string) []string { quotedArray := make([]string, 0, len(array)) diff --git a/quesma/queryparser/pancake_aggregation_parser_buckets.go b/quesma/queryparser/pancake_aggregation_parser_buckets.go index 2e9493956..33257166b 100644 --- a/quesma/queryparser/pancake_aggregation_parser_buckets.go +++ b/quesma/queryparser/pancake_aggregation_parser_buckets.go @@ -470,6 +470,23 @@ func (cw *ClickhouseQueryTranslator) parseOrder(params QueryMap, fieldExpression return fullOrderBy, nil } +func (cw *ClickhouseQueryTranslator) parseMinDocCount(queryMap QueryMap) int { + const defaultMinDocCount = 0 + if minDocCountRaw, exists := queryMap["min_doc_count"]; exists { + if minDocCount, ok := minDocCountRaw.(float64); ok { + asInt := int(minDocCount) + if asInt != 0 && asInt != 1 { + logger.WarnWithCtx(cw.Ctx).Msgf("min_doc_count is not 0 or 1, but %d. Not really supported", asInt) + } + return asInt + } else { + logger.WarnWithCtx(cw.Ctx).Msgf("min_doc_count is not a number, but %T, value: %v. Using default value: %d", + minDocCountRaw, minDocCountRaw, defaultMinDocCount) + } + } + return defaultMinDocCount +} + // addMissingParameterIfPresent parses 'missing' parameter from 'params'. func (cw *ClickhouseQueryTranslator) addMissingParameterIfPresent(field model.Expr, params QueryMap) (updatedField model.Expr, didWeAddMissing bool) { if params["missing"] == nil { diff --git a/quesma/testdata/aggregation_requests.go b/quesma/testdata/aggregation_requests.go index 2ed685ae1..7ee82b683 100644 --- a/quesma/testdata/aggregation_requests.go +++ b/quesma/testdata/aggregation_requests.go @@ -329,10 +329,6 @@ var AggregationTests = []AggregationTestCase{ "aggs": { "1": { "date_histogram": { - "extended_bounds": { - "max": 1707486436029, - "min": 1706881636029 - }, "field": "timestamp", "fixed_interval": "3h", "time_zone": "Europe/Warsaw" @@ -2631,7 +2627,8 @@ var AggregationTests = []AggregationTestCase{ "series": { "date_histogram": { "field": "@timestamp", - "fixed_interval": "60s" + "fixed_interval": "60s", + "min_doc_count": 12 } } } @@ -2869,10 +2866,6 @@ var AggregationTests = []AggregationTestCase{ }, "date_histogram": { "calendar_interval": "1d", - "extended_bounds": { - "max": 1708969256351, - "min": 1708364456351 - }, "field": "order_date" } } @@ -3106,7 +3099,7 @@ var AggregationTests = []AggregationTestCase{ "fixed_interval": "12h", "extended_bounds": { "min": 1708627654149, - "max": 1709232454149 + "max": 1708782454149 } }, "aggs": { @@ -3215,6 +3208,26 @@ var AggregationTests = []AggregationTestCase{ "doc_count": 83, "key": 1708689600000, "key_as_string": "2024-02-23T12:00:00.000" + }, + { + "1-bucket": { + "1-metric": { + "value": null + } + }, + "doc_count": 0, + "key": 1708732800000, + "key_as_string": "2024-02-24T00:00:00.000" + }, + { + "1-bucket": { + "1-metric": { + "value": null + } + }, + "doc_count": 0, + "key": 1708776000000, + "key_as_string": "2024-02-24T12:00:00.000" } ] } diff --git a/quesma/testdata/aggregation_requests_2.go b/quesma/testdata/aggregation_requests_2.go index b2cae8cba..6530c5618 100644 --- a/quesma/testdata/aggregation_requests_2.go +++ b/quesma/testdata/aggregation_requests_2.go @@ -3377,6 +3377,11 @@ var AggregationTests2 = []AggregationTestCase{ "key": 1706021640000, "key_as_string": "2024-01-23T14:54:00.000" }, + { + "doc_count": 0, + "key": 1706021670000, + "key_as_string": "2024-01-23T14:54:30.000" + }, { "doc_count": 17, "key": 1706021700000, @@ -3632,11 +3637,21 @@ var AggregationTests2 = []AggregationTestCase{ "sum_other_doc_count": 1917 } }, + { + "doc_count": 0, + "key": 1706021670000, + "key_as_string": "2024-01-23T14:54:30.000" + }, { "doc_count": 17, "key": 1706021700000, "key_as_string": "2024-01-23T14:55:00.000" }, + { + "doc_count": 0, + "key": 1706021730000, + "key_as_string": "2024-01-23T14:55:30.000" + }, { "doc_count": 15, "key": 1706021760000, diff --git a/quesma/testdata/dates.go b/quesma/testdata/dates.go index eecfdff9a..48f4cf40a 100644 --- a/quesma/testdata/dates.go +++ b/quesma/testdata/dates.go @@ -323,7 +323,7 @@ var AggregationTestsWithDates = []AggregationTestCase{ ("@timestamp", 'Europe/Warsaw'))*1000) / 10000) AS "aggr__timeseries__key_0" ORDER BY "aggr__timeseries__key_0" ASC`, }, - { // [1] + { // [2] TestName: "extended_bounds post keys (timezone calculations most tricky to get right)", QueryRequestJson: ` { @@ -504,4 +504,134 @@ var AggregationTestsWithDates = []AggregationTestCase{ ("@timestamp", 'Europe/Warsaw'))*1000) / 10000) AS "aggr__timeseries__key_0" ORDER BY "aggr__timeseries__key_0" ASC`, }, + { // [3] + TestName: "empty results, we still should add empty buckets, because of the extended_bounds and min_doc_count defaulting to 0", + QueryRequestJson: ` + { + "_source": { + "excludes": [] + }, + "aggs": { + "0": { + "aggs": { + "1": { + "sum": { + "field": "body_bytes_sent" + } + } + }, + "date_histogram": { + "calendar_interval": "1d", + "extended_bounds": { + "min": 1732327903466, + "max": 1732713503466 + }, + "field": "@timestamp", + "time_zone": "Europe/Warsaw" + } + } + }, + "query": { + "bool": { + "filter": [ + { + "range": { + "@timestamp": { + "format": "strict_date_optional_time", + "gte": "2009-11-27T13:18:23.466Z", + "lte": "2024-11-27T13:18:23.466Z" + } + } + } + ], + "must": [], + "must_not": [], + "should": [] + } + }, + "runtime_mappings": {}, + "script_fields": {}, + "size": 0, + "stored_fields": [ + "*" + ], + "track_total_hits": true + }`, + ExpectedResponse: ` + { + "completion_time_in_millis": 1707486436398, + "expiration_time_in_millis": 1707486496397, + "is_partial": false, + "is_running": false, + "response": { + "_shards": { + "failed": 0, + "skipped": 0, + "successful": 1, + "total": 1 + }, + "aggregations": { + "0": { + "buckets": [ + { + "doc_count": 0, + "key": 1732402800000, + "key_as_string": "2024-11-23T23:00:00.000", + "1": { + "value": null + } + }, + { + "doc_count": 0, + "key": 1732489200000, + "key_as_string": "2024-11-24T23:00:00.000", + "1": { + "value": null + } + }, + { + "doc_count": 0, + "key": 1732575600000, + "key_as_string": "2024-11-25T23:00:00.000", + "1": { + "value": null + } + }, + { + "doc_count": 0, + "key": 1732662000000, + "key_as_string": "2024-11-26T23:00:00.000", + "1": { + "value": null + } + } + ] + } + }, + "hits": { + "hits": [], + "max_score": null, + "total": { + "relation": "eq", + "value": 2200 + } + }, + "timed_out": false, + "took": 1 + }, + "start_time_in_millis": 1707486436397 + }`, + ExpectedPancakeResults: []model.QueryResultRow{}, + ExpectedPancakeSQL: ` + SELECT toInt64((toUnixTimestamp64Milli("@timestamp")+timeZoneOffset(toTimezone( + "@timestamp", 'Europe/Warsaw'))*1000) / 86400000) AS "aggr__0__key_0", + count(*) AS "aggr__0__count", + sumOrNull("body_bytes_sent") AS "metric__0__1_col_0" + FROM __quesma_table_name + WHERE ("@timestamp">=fromUnixTimestamp64Milli(1259327903466) AND "@timestamp"<= + fromUnixTimestamp64Milli(1732713503466)) + GROUP BY toInt64((toUnixTimestamp64Milli("@timestamp")+timeZoneOffset(toTimezone + ("@timestamp", 'Europe/Warsaw'))*1000) / 86400000) AS "aggr__0__key_0" + ORDER BY "aggr__0__key_0" ASC`, + }, } diff --git a/quesma/testdata/kibana-visualize/aggregation_requests.go b/quesma/testdata/kibana-visualize/aggregation_requests.go index 957536693..c2056a5fa 100644 --- a/quesma/testdata/kibana-visualize/aggregation_requests.go +++ b/quesma/testdata/kibana-visualize/aggregation_requests.go @@ -46,10 +46,6 @@ var AggregationTests = []testdata.AggregationTestCase{ } }, "date_histogram": { - "extended_bounds": { - "max": 1716812096627, - "min": 1716811196627 - }, "field": "@timestamp", "fixed_interval": "30s", "time_zone": "Europe/Warsaw" @@ -141,6 +137,15 @@ var AggregationTests = []testdata.AggregationTestCase{ "key": 1716827010000, "key_as_string": "2024-05-27T16:23:30.000" }, + { + "doc_count": 0, + "key": 1716827040000, + "key_as_string": "2024-05-27T16:24:00.000", + "1": { + "buckets": [], + "sum_other_doc_count": 0 + } + }, { "1": { "buckets": [ @@ -267,10 +272,6 @@ var AggregationTests = []testdata.AggregationTestCase{ "aggs": { "1": { "date_histogram": { - "extended_bounds": { - "max": 1716812073493, - "min": 1716811173493 - }, "field": "@timestamp", "fixed_interval": "30s" } @@ -345,6 +346,11 @@ var AggregationTests = []testdata.AggregationTestCase{ "key": 1716834450000, "key_as_string": "2024-05-27T18:27:30.000" }, + { + "doc_count": 0, + "key": 1716834480000, + "key_as_string": "2024-05-27T18:28:00.000" + }, { "doc_count": 2, "key": 1716834510000, @@ -454,6 +460,7 @@ var AggregationTests = []testdata.AggregationTestCase{ "max": 1716834478178, "min": 1716833578178 }, + "min_doc_count": 1, "field": "@timestamp", "fixed_interval": "30s" }