From db1feeaf4fe1025391e7fb463ec5695fdb71fde4 Mon Sep 17 00:00:00 2001
From: Krzysztof Kiewicz <krzysztof@quesma.com>
Date: Tue, 17 Dec 2024 11:43:14 +0100
Subject: [PATCH] Missing bucket results for `date_histogram` aggregation
 (#1113)

https://github.com/QuesmaOrg/quesma/issues/1044
---
 .../bucket_aggregations/date_histogram.go     |   1 -
 quesma/model/bucket_aggregations/histogram.go |   9 +-
 quesma/queryparser/aggregation_parser.go      |  17 ---
 .../pancake_aggregation_parser_buckets.go     |  17 +++
 quesma/testdata/aggregation_requests.go       |  33 +++--
 quesma/testdata/aggregation_requests_2.go     |  15 ++
 quesma/testdata/dates.go                      | 132 +++++++++++++++++-
 .../kibana-visualize/aggregation_requests.go  |  23 +--
 8 files changed, 208 insertions(+), 39 deletions(-)

diff --git a/quesma/model/bucket_aggregations/date_histogram.go b/quesma/model/bucket_aggregations/date_histogram.go
index 27b92e73e..950818431 100644
--- a/quesma/model/bucket_aggregations/date_histogram.go
+++ b/quesma/model/bucket_aggregations/date_histogram.go
@@ -17,7 +17,6 @@ import (
 type DateHistogramIntervalType bool
 
 const (
-	DefaultMinDocCount                                      = -1
 	DateHistogramFixedInterval    DateHistogramIntervalType = true
 	DateHistogramCalendarInterval DateHistogramIntervalType = false
 	defaultDateTimeType                                     = clickhouse.DateTime64
diff --git a/quesma/model/bucket_aggregations/histogram.go b/quesma/model/bucket_aggregations/histogram.go
index cb4cb9445..7a2464d36 100644
--- a/quesma/model/bucket_aggregations/histogram.go
+++ b/quesma/model/bucket_aggregations/histogram.go
@@ -108,9 +108,14 @@ func (query *HistogramRowsTransformer) Transform(ctx context.Context, rowsFromDB
 	return postprocessedRows
 }
 
-// we're sure key is float64
+// we're sure key is either float64, or in rare cases nil
 func (query *HistogramRowsTransformer) getKeyFloat64(row model.QueryResultRow) (float64, bool) {
-	return row.Cols[len(row.Cols)-2].Value.(float64), true
+	switch val := row.Cols[len(row.Cols)-2].Value.(type) {
+	case float64:
+		return val, true
+	default:
+		return -1, false
+	}
 }
 
 // we don't know the type
diff --git a/quesma/queryparser/aggregation_parser.go b/quesma/queryparser/aggregation_parser.go
index 99c64f312..c50bf5b9e 100644
--- a/quesma/queryparser/aggregation_parser.go
+++ b/quesma/queryparser/aggregation_parser.go
@@ -7,7 +7,6 @@ import (
 	"quesma/clickhouse"
 	"quesma/logger"
 	"quesma/model"
-	"quesma/model/bucket_aggregations"
 	"regexp"
 	"slices"
 	"strconv"
@@ -366,22 +365,6 @@ func (cw *ClickhouseQueryTranslator) parseFieldFromScriptField(queryMap QueryMap
 	return
 }
 
-func (cw *ClickhouseQueryTranslator) parseMinDocCount(queryMap QueryMap) int {
-	if minDocCountRaw, exists := queryMap["min_doc_count"]; exists {
-		if minDocCount, ok := minDocCountRaw.(float64); ok {
-			asInt := int(minDocCount)
-			if asInt != 0 && asInt != 1 {
-				logger.WarnWithCtx(cw.Ctx).Msgf("min_doc_count is not 0 or 1, but %d. Not really supported", asInt)
-			}
-			return asInt
-		} else {
-			logger.WarnWithCtx(cw.Ctx).Msgf("min_doc_count is not a number, but %T, value: %v. Using default value: %d",
-				minDocCountRaw, minDocCountRaw, bucket_aggregations.DefaultMinDocCount)
-		}
-	}
-	return bucket_aggregations.DefaultMinDocCount
-}
-
 // quoteArray returns a new array with the same elements, but quoted
 func quoteArray(array []string) []string {
 	quotedArray := make([]string, 0, len(array))
diff --git a/quesma/queryparser/pancake_aggregation_parser_buckets.go b/quesma/queryparser/pancake_aggregation_parser_buckets.go
index 2e9493956..33257166b 100644
--- a/quesma/queryparser/pancake_aggregation_parser_buckets.go
+++ b/quesma/queryparser/pancake_aggregation_parser_buckets.go
@@ -470,6 +470,23 @@ func (cw *ClickhouseQueryTranslator) parseOrder(params QueryMap, fieldExpression
 	return fullOrderBy, nil
 }
 
+func (cw *ClickhouseQueryTranslator) parseMinDocCount(queryMap QueryMap) int {
+	const defaultMinDocCount = 0
+	if minDocCountRaw, exists := queryMap["min_doc_count"]; exists {
+		if minDocCount, ok := minDocCountRaw.(float64); ok {
+			asInt := int(minDocCount)
+			if asInt != 0 && asInt != 1 {
+				logger.WarnWithCtx(cw.Ctx).Msgf("min_doc_count is not 0 or 1, but %d. Not really supported", asInt)
+			}
+			return asInt
+		} else {
+			logger.WarnWithCtx(cw.Ctx).Msgf("min_doc_count is not a number, but %T, value: %v. Using default value: %d",
+				minDocCountRaw, minDocCountRaw, defaultMinDocCount)
+		}
+	}
+	return defaultMinDocCount
+}
+
 // addMissingParameterIfPresent parses 'missing' parameter from 'params'.
 func (cw *ClickhouseQueryTranslator) addMissingParameterIfPresent(field model.Expr, params QueryMap) (updatedField model.Expr, didWeAddMissing bool) {
 	if params["missing"] == nil {
diff --git a/quesma/testdata/aggregation_requests.go b/quesma/testdata/aggregation_requests.go
index 2ed685ae1..7ee82b683 100644
--- a/quesma/testdata/aggregation_requests.go
+++ b/quesma/testdata/aggregation_requests.go
@@ -329,10 +329,6 @@ var AggregationTests = []AggregationTestCase{
 					"aggs": {
 						"1": {
 							"date_histogram": {
-								"extended_bounds": {
-									"max": 1707486436029,
-									"min": 1706881636029
-								},
 								"field": "timestamp",
 								"fixed_interval": "3h",
 								"time_zone": "Europe/Warsaw"
@@ -2631,7 +2627,8 @@ var AggregationTests = []AggregationTestCase{
 						"series": {
 							"date_histogram": {
 								"field": "@timestamp",
-								"fixed_interval": "60s"
+								"fixed_interval": "60s",
+								"min_doc_count": 12
 							}
 						}
 					}
@@ -2869,10 +2866,6 @@ var AggregationTests = []AggregationTestCase{
 					},
 					"date_histogram": {
 						"calendar_interval": "1d",
-						"extended_bounds": {
-							"max": 1708969256351,
-							"min": 1708364456351
-						},
 						"field": "order_date"
 					}
 				}
@@ -3106,7 +3099,7 @@ var AggregationTests = []AggregationTestCase{
 						"fixed_interval": "12h",
 						"extended_bounds": {
 							"min": 1708627654149,
-							"max": 1709232454149
+							"max": 1708782454149
 						}
 					},
 					"aggs": {
@@ -3215,6 +3208,26 @@ var AggregationTests = []AggregationTestCase{
 								"doc_count": 83,
 								"key": 1708689600000,
 								"key_as_string": "2024-02-23T12:00:00.000"
+							},
+							{
+								"1-bucket": {
+									"1-metric": {
+										"value": null
+									}
+								},
+								"doc_count": 0,
+								"key": 1708732800000,
+								"key_as_string": "2024-02-24T00:00:00.000"
+							},
+							{
+								"1-bucket": {
+									"1-metric": {
+										"value": null
+									}
+								},
+								"doc_count": 0,
+								"key": 1708776000000,
+								"key_as_string": "2024-02-24T12:00:00.000"
 							}
 						]
 					}
diff --git a/quesma/testdata/aggregation_requests_2.go b/quesma/testdata/aggregation_requests_2.go
index b2cae8cba..6530c5618 100644
--- a/quesma/testdata/aggregation_requests_2.go
+++ b/quesma/testdata/aggregation_requests_2.go
@@ -3377,6 +3377,11 @@ var AggregationTests2 = []AggregationTestCase{
 									"key": 1706021640000,
 									"key_as_string": "2024-01-23T14:54:00.000"
 								},
+								{
+									"doc_count": 0,
+									"key": 1706021670000,
+									"key_as_string": "2024-01-23T14:54:30.000"
+								},
 								{
 									"doc_count": 17,
 									"key": 1706021700000,
@@ -3632,11 +3637,21 @@ var AggregationTests2 = []AggregationTestCase{
 									"sum_other_doc_count": 1917
 								}
 							},
+							{
+								"doc_count": 0,
+								"key": 1706021670000,
+								"key_as_string": "2024-01-23T14:54:30.000"
+							},
 							{
 								"doc_count": 17,
 								"key": 1706021700000,
 								"key_as_string": "2024-01-23T14:55:00.000"
 							},
+							{
+								"doc_count": 0,
+								"key": 1706021730000,
+								"key_as_string": "2024-01-23T14:55:30.000"
+							},
 							{
 								"doc_count": 15,
 								"key": 1706021760000,
diff --git a/quesma/testdata/dates.go b/quesma/testdata/dates.go
index eecfdff9a..48f4cf40a 100644
--- a/quesma/testdata/dates.go
+++ b/quesma/testdata/dates.go
@@ -323,7 +323,7 @@ var AggregationTestsWithDates = []AggregationTestCase{
 			  ("@timestamp", 'Europe/Warsaw'))*1000) / 10000) AS "aggr__timeseries__key_0"
 			ORDER BY "aggr__timeseries__key_0" ASC`,
 	},
-	{ // [1]
+	{ // [2]
 		TestName: "extended_bounds post keys (timezone calculations most tricky to get right)",
 		QueryRequestJson: `
 		{
@@ -504,4 +504,134 @@ var AggregationTestsWithDates = []AggregationTestCase{
 			  ("@timestamp", 'Europe/Warsaw'))*1000) / 10000) AS "aggr__timeseries__key_0"
 			ORDER BY "aggr__timeseries__key_0" ASC`,
 	},
+	{ // [3]
+		TestName: "empty results, we still should add empty buckets, because of the extended_bounds and min_doc_count defaulting to 0",
+		QueryRequestJson: `
+		{
+			"_source": {
+				"excludes": []
+			},
+			"aggs": {
+				"0": {
+					"aggs": {
+						"1": {
+							"sum": {
+								"field": "body_bytes_sent"
+							}
+						}
+					},
+					"date_histogram": {
+						"calendar_interval": "1d",
+						"extended_bounds": {
+							"min": 1732327903466,
+							"max": 1732713503466
+						},
+						"field": "@timestamp",
+						"time_zone": "Europe/Warsaw"
+					}
+				}
+			},
+			"query": {
+				"bool": {
+					"filter": [
+						{
+							"range": {
+								"@timestamp": {
+									"format": "strict_date_optional_time",
+									"gte": "2009-11-27T13:18:23.466Z",
+									"lte": "2024-11-27T13:18:23.466Z"
+								}
+							}
+						}
+					],
+					"must": [],
+					"must_not": [],
+					"should": []
+				}
+			},
+			"runtime_mappings": {},
+			"script_fields": {},
+			"size": 0,
+			"stored_fields": [
+				"*"
+			],
+			"track_total_hits": true
+		}`,
+		ExpectedResponse: `
+		{
+			"completion_time_in_millis": 1707486436398,
+			"expiration_time_in_millis": 1707486496397,
+			"is_partial": false,
+			"is_running": false,
+			"response": {
+				"_shards": {
+					"failed": 0,
+					"skipped": 0,
+					"successful": 1,
+					"total": 1
+				},
+				"aggregations": {
+					"0": {
+						"buckets": [
+							{
+								"doc_count": 0,
+								"key": 1732402800000,
+								"key_as_string": "2024-11-23T23:00:00.000",
+								"1": {
+									"value": null
+								}
+							},
+							{
+								"doc_count": 0,
+								"key": 1732489200000,
+								"key_as_string": "2024-11-24T23:00:00.000",
+								"1": {
+									"value": null
+								}
+							},
+							{
+								"doc_count": 0,
+								"key": 1732575600000,
+								"key_as_string": "2024-11-25T23:00:00.000",
+								"1": {
+									"value": null
+								}
+							},
+							{
+								"doc_count": 0,
+								"key": 1732662000000,
+								"key_as_string": "2024-11-26T23:00:00.000",
+								"1": {
+									"value": null
+								}
+							}
+						]
+					}
+				},
+				"hits": {
+					"hits": [],
+					"max_score": null,
+					"total": {
+						"relation": "eq",
+						"value": 2200
+					}
+				},
+				"timed_out": false,
+				"took": 1
+			},
+			"start_time_in_millis": 1707486436397
+		}`,
+		ExpectedPancakeResults: []model.QueryResultRow{},
+		ExpectedPancakeSQL: `
+			SELECT toInt64((toUnixTimestamp64Milli("@timestamp")+timeZoneOffset(toTimezone(
+			  "@timestamp", 'Europe/Warsaw'))*1000) / 86400000) AS "aggr__0__key_0",
+			  count(*) AS "aggr__0__count",
+			  sumOrNull("body_bytes_sent") AS "metric__0__1_col_0"
+			FROM __quesma_table_name
+			WHERE ("@timestamp">=fromUnixTimestamp64Milli(1259327903466) AND "@timestamp"<=
+			  fromUnixTimestamp64Milli(1732713503466))
+			GROUP BY toInt64((toUnixTimestamp64Milli("@timestamp")+timeZoneOffset(toTimezone
+			  ("@timestamp", 'Europe/Warsaw'))*1000) / 86400000) AS "aggr__0__key_0"
+			ORDER BY "aggr__0__key_0" ASC`,
+	},
 }
diff --git a/quesma/testdata/kibana-visualize/aggregation_requests.go b/quesma/testdata/kibana-visualize/aggregation_requests.go
index 957536693..c2056a5fa 100644
--- a/quesma/testdata/kibana-visualize/aggregation_requests.go
+++ b/quesma/testdata/kibana-visualize/aggregation_requests.go
@@ -46,10 +46,6 @@ var AggregationTests = []testdata.AggregationTestCase{
 						}
 					},
 					"date_histogram": {
-						"extended_bounds": {
-							"max": 1716812096627,
-							"min": 1716811196627
-						},
 						"field": "@timestamp",
 						"fixed_interval": "30s",
 						"time_zone": "Europe/Warsaw"
@@ -141,6 +137,15 @@ var AggregationTests = []testdata.AggregationTestCase{
 								"key": 1716827010000,
 								"key_as_string": "2024-05-27T16:23:30.000"
 							},
+							{
+								"doc_count": 0,
+								"key": 1716827040000,
+								"key_as_string": "2024-05-27T16:24:00.000",
+								"1": {
+									"buckets": [],
+									"sum_other_doc_count": 0
+								}
+							},
 							{
 								"1": {
 									"buckets": [
@@ -267,10 +272,6 @@ var AggregationTests = []testdata.AggregationTestCase{
 					"aggs": {
 						"1": {
 							"date_histogram": {
-								"extended_bounds": {
-									"max": 1716812073493,
-									"min": 1716811173493
-								},
 								"field": "@timestamp",
 								"fixed_interval": "30s"
 							}
@@ -345,6 +346,11 @@ var AggregationTests = []testdata.AggregationTestCase{
 											"key": 1716834450000,
 											"key_as_string": "2024-05-27T18:27:30.000"
 										},
+										{
+											"doc_count": 0,
+											"key": 1716834480000,
+											"key_as_string": "2024-05-27T18:28:00.000"
+										},
 										{
 											"doc_count": 2,
 											"key": 1716834510000,
@@ -454,6 +460,7 @@ var AggregationTests = []testdata.AggregationTestCase{
 									"max": 1716834478178,
 									"min": 1716833578178
 								},
+								"min_doc_count": 1,
 								"field": "@timestamp",
 								"fixed_interval": "30s"
 							}