From 62e6b307707bd2378fb234152b93c75337443270 Mon Sep 17 00:00:00 2001
From: Krzysztof Kiewicz
Date: Wed, 8 May 2024 21:29:28 +0200
Subject: [PATCH] Respect `keyed` parameter in `percentile_ranks` aggr (#57)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before: the dashboard was broken. We didn't respect `keyed` and returned the
response in an incorrect format. I had already fixed this for 2 other
aggregations, but for some reason didn't add a test for this one, so we had
none. Now we have 1 😆

[screenshots: broken dashboard before the fix]

After:

[screenshot: dashboard after the fix]
---
 .../metrics_aggregations/percentile_ranks.go | 97 ++++++++---
 quesma/queryparser/aggregation_parser.go | 14 +-
 .../aggregation_requests.go | 152 ++++++++++++++++++
 3 files changed, 236 insertions(+), 27 deletions(-)

diff --git a/quesma/model/metrics_aggregations/percentile_ranks.go b/quesma/model/metrics_aggregations/percentile_ranks.go
index 50e87eb6a..6537f63ff 100644
--- a/quesma/model/metrics_aggregations/percentile_ranks.go
+++ b/quesma/model/metrics_aggregations/percentile_ranks.go
@@ -4,15 +4,19 @@ import (
 	"context"
 	"mitmproxy/quesma/logger"
 	"mitmproxy/quesma/model"
+	"strconv"
 	"strings"
 )
 
 type PercentileRanks struct {
 	ctx context.Context
+	// Keyed defines what the response should look like, see:
+	// https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-percentile-rank-aggregation.html#_keyed_response_5
+	Keyed bool
 }
 
-func NewPercentileRanks(ctx context.Context) PercentileRanks {
-	return PercentileRanks{ctx: ctx}
+func NewPercentileRanks(ctx context.Context, keyed bool) PercentileRanks {
+	return PercentileRanks{ctx: ctx, Keyed: keyed}
 }
 
 func (query PercentileRanks) IsBucketAggregation() bool {
@@ -20,33 +24,74 @@ func (query PercentileRanks) IsBucketAggregation() bool {
 }
 
 func (query PercentileRanks) TranslateSqlResponseToJson(rows []model.QueryResultRow, level int) []model.JsonMap {
-	valueMap := make(map[string]float64)
-	for _, percentileRank := range rows[0].Cols[level:] {
-		// percentileRank.ColName looks like this [...]<=X,[...]. We're extracting X.
-		// It always needs to have .Y or .YZ at the end, so 1 or 2 digits after the dot, and dot is mandatory.
-		// Also, can't be .00, needs to be .0
-		beg := strings.Index(percentileRank.ColName, "<=")
-		end := strings.Index(percentileRank.ColName[beg:], ",")
-		cutValue := percentileRank.ColName[beg+2 : beg+end]
-
-		dot := strings.Index(cutValue, ".")
-		if dot == -1 {
-			cutValue += ".0"
-		} else if end-dot >= len(".00") && cutValue[dot:dot+3] == ".00" {
-			cutValue = cutValue[:dot+2]
-		} else {
-			cutValue = cutValue[:dot+3]
+	if len(rows) == 0 {
+		logger.WarnWithCtx(query.ctx).Msg("no rows in percentile ranks response")
+		return make([]model.JsonMap, 0)
+	}
+	// The if/else branches below duplicate a lot of code, but I think it's worth it:
+	// this function may be called many times for a single query, and keeping the two
+	// branches completely separate avoids re-checking Keyed in every loop iteration,
+	// so it should be slightly faster.
+	if query.Keyed {
+		valueMap := make(model.JsonMap)
+		for _, percentileRank := range rows[0].Cols[level:] {
+			// percentileRank.ColName looks like this: [...]<=X,[...]. We're extracting X.
+			// It always needs to have .Y or .YZ at the end, so 1 or 2 digits after the dot, and the dot is mandatory.
+			// Also, it can't be .00, it needs to be .0
+			beg := strings.Index(percentileRank.ColName, "<=")
+			end := strings.Index(percentileRank.ColName[beg:], ",")
+			cutValue := percentileRank.ColName[beg+2 : beg+end]
+
+			dot := strings.Index(cutValue, ".")
+			if dot == -1 {
+				cutValue += ".0"
+			} else if end-dot >= len(".00") && cutValue[dot:dot+3] == ".00" {
+				cutValue = cutValue[:dot+2]
+			} else {
+				cutValue = cutValue[:dot+3]
+			}
+			if value, ok := percentileRank.Value.(float64); ok {
+				valueMap[cutValue] = value
+			} else {
+				logger.WarnWithCtx(query.ctx).Msgf("failed to convert percentile rank value to float64, type: %T, value: %v. Skipping",
+					percentileRank.Value, percentileRank.Value)
+			}
 		}
-		if value, ok := percentileRank.Value.(float64); ok {
-			valueMap[cutValue] = value
-		} else {
-			logger.WarnWithCtx(query.ctx).Msgf("failed to convert percentile rank value to float64, type: %T, value: %v",
-				percentileRank.Value, percentileRank.Value)
+		return []model.JsonMap{{
+			"values": valueMap,
+		}}
+	} else {
+		buckets := make([]model.JsonMap, 0)
+		for _, percentileRank := range rows[0].Cols[level:] {
+			// percentileRank.ColName looks like this: [...]<=X,[...]. We're extracting X.
+			// It always needs to have .Y or .YZ at the end, so 1 or 2 digits after the dot, and the dot is mandatory.
+			// Also, it can't be .00, it needs to be .0
+			beg := strings.Index(percentileRank.ColName, "<=")
+			end := strings.Index(percentileRank.ColName[beg:], ",")
+			cutValue := percentileRank.ColName[beg+2 : beg+end]
+
+			dot := strings.Index(cutValue, ".")
+			if dot == -1 {
+				cutValue += ".0"
+			} else if end-dot >= len(".00") && cutValue[dot:dot+3] == ".00" {
+				cutValue = cutValue[:dot+2]
+			} else {
+				cutValue = cutValue[:dot+3]
+			}
+			// cutValue is normalized above, so it always parses as a float
+			cutValueFloat, _ := strconv.ParseFloat(cutValue, 64)
+			if value, ok := percentileRank.Value.(float64); ok {
+				buckets = append(buckets, model.JsonMap{
+					"key":   cutValueFloat,
+					"value": value,
+				})
+			} else {
+				logger.WarnWithCtx(query.ctx).Msgf("failed to convert percentile rank value to float64, type: %T, value: %v. Skipping",
+					percentileRank.Value, percentileRank.Value)
+			}
 		}
+		return []model.JsonMap{{
+			"values": buckets,
+		}}
 	}
-	return []model.JsonMap{{
-		"values": valueMap,
-	}}
 }
 
 func (query PercentileRanks) String() string {
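
For reference, the cut-value extraction and normalization that both branches above share can be exercised in isolation. A minimal standalone sketch (illustrative only, not part of this patch; the helper name is made up), mirroring the logic line for line:

package main

import (
	"fmt"
	"strings"
)

// normalizeCutValue extracts X from a column name of the form [...]<=X,[...]
// and trims it to 1 or 2 digits after the dot (".00" collapses to ".0").
func normalizeCutValue(colName string) string {
	beg := strings.Index(colName, "<=")
	end := strings.Index(colName[beg:], ",")
	cutValue := colName[beg+2 : beg+end]

	dot := strings.Index(cutValue, ".")
	if dot == -1 {
		cutValue += ".0" // no dot: append one digit after the dot
	} else if end-dot >= len(".00") && cutValue[dot:dot+3] == ".00" {
		cutValue = cutValue[:dot+2] // ".00" is not allowed, cut it to ".0"
	} else {
		cutValue = cutValue[:dot+3] // keep at most 2 digits after the dot
	}
	return cutValue
}

func main() {
	fmt.Println(normalizeCutValue("AvgTicketPrice<=0,"))      // 0.0
	fmt.Println(normalizeCutValue("AvgTicketPrice<=50000,"))  // 50000.0
	fmt.Println(normalizeCutValue("AvgTicketPrice<=12.345,")) // 12.34
}
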
diff --git a/quesma/queryparser/aggregation_parser.go b/quesma/queryparser/aggregation_parser.go
index 4a736c80a..d070fcf60 100644
--- a/quesma/queryparser/aggregation_parser.go
+++ b/quesma/queryparser/aggregation_parser.go
@@ -17,6 +17,8 @@ import (
 	"strings"
 )
 
+const keyedDefaultValuePercentileRanks = true
+
 type filter struct {
 	name string
 	sql  SimpleQuery
@@ -184,7 +186,7 @@ func (b *aggrQueryBuilder) buildMetricsAggregation(metricsAggr metricsAggregatio
 	case "value_count":
 		query.Type = metrics_aggregations.NewValueCount(b.ctx)
 	case "percentile_ranks":
-		query.Type = metrics_aggregations.NewPercentileRanks(b.ctx)
+		query.Type = metrics_aggregations.NewPercentileRanks(b.ctx, metricsAggr.Keyed)
 	}
 	return query
 }
@@ -512,10 +514,20 @@ func (cw *ClickhouseQueryTranslator) tryMetricsAggregation(queryMap QueryMap) (m
 			logger.WarnWithCtx(cw.Ctx).Msgf("cutValue in percentile_ranks is not a number, but %T, value: %v. Skipping.",
 				cutValue, cutValue)
 		}
 	}
+	var keyed bool
+	if keyedRaw, ok := percentileRanks.(QueryMap)["keyed"]; ok {
+		if keyed, ok = keyedRaw.(bool); !ok {
+			logger.WarnWithCtx(cw.Ctx).Msgf("keyed specified for percentile_ranks aggregation is not a boolean. QueryMap: %v", queryMap)
+			keyed = keyedDefaultValuePercentileRanks
+		}
+	} else {
+		keyed = keyedDefaultValuePercentileRanks
+	}
 	return metricsAggregation{
 		AggrType:   "percentile_ranks",
 		FieldNames: fieldNames,
 		FieldType:  metricsAggregationDefaultFieldType, // don't need to check, it's unimportant for this aggregation
+		Keyed:      keyed,
 	}, true
 }
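
The "keyed" parsing above follows a common pattern: an optional flag with a default for both the missing and the malformed case. A runnable sketch under assumed names (a plain map stands in for QueryMap; parseKeyed is made up for illustration):

package main

import "fmt"

// keyedDefault mirrors keyedDefaultValuePercentileRanks from the patch:
// Elasticsearch returns the keyed response format by default.
const keyedDefault = true

// parseKeyed is an illustrative stand-in for the "keyed" handling in
// tryMetricsAggregation: missing or non-boolean values fall back to the default.
func parseKeyed(percentileRanks map[string]interface{}) bool {
	keyedRaw, ok := percentileRanks["keyed"]
	if !ok {
		return keyedDefault // "keyed" not specified
	}
	if keyed, ok := keyedRaw.(bool); ok {
		return keyed
	}
	return keyedDefault // "keyed" present but not a boolean; the real code also logs a warning
}

func main() {
	fmt.Println(parseKeyed(map[string]interface{}{"keyed": false})) // false
	fmt.Println(parseKeyed(map[string]interface{}{}))               // true (default)
	fmt.Println(parseKeyed(map[string]interface{}{"keyed": "no"}))  // true (fallback)
}
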
diff --git a/quesma/testdata/opensearch-visualize/aggregation_requests.go b/quesma/testdata/opensearch-visualize/aggregation_requests.go
index 55251a79c..81a0e47e9 100644
--- a/quesma/testdata/opensearch-visualize/aggregation_requests.go
+++ b/quesma/testdata/opensearch-visualize/aggregation_requests.go
@@ -1124,4 +1124,156 @@ var AggregationTests = []testdata.AggregationTestCase{
 			`ORDER BY ("response")`,
 		},
 	},
+	{ // [7]
+		TestName: "Percentile_ranks keyed=false. Reproduce: Visualize -> Line -> Metrics: Percentile Ranks, Buckets: X-Axis Date Histogram",
+		QueryRequestJson: `
+		{
+			"_source": {
+				"excludes": []
+			},
+			"aggs": {
+				"2": {
+					"aggs": {
+						"1": {
+							"percentile_ranks": {
+								"field": "AvgTicketPrice",
+								"keyed": false,
+								"values": [
+									0,
+									50000
+								]
+							}
+						}
+					},
+					"date_histogram": {
+						"calendar_interval": "1h",
+						"field": "timestamp",
+						"min_doc_count": 1,
+						"time_zone": "Europe/Warsaw"
+					}
+				}
+			},
+			"docvalue_fields": [
+				{
+					"field": "timestamp",
+					"format": "date_time"
+				}
+			],
+			"query": {
+				"bool": {
+					"filter": [],
+					"must": [
+						{
+							"match_all": {}
+						}
+					],
+					"must_not": [],
+					"should": []
+				}
+			},
+			"script_fields": {
+				"hour_of_day": {
+					"script": {
+						"lang": "painless",
+						"source": "doc['timestamp'].value.hourOfDay"
+					}
+				}
+			},
+			"size": 0,
+			"stored_fields": [
+				"*"
+			]
+		}`,
+		ExpectedResponse: `
+		{
+			"_shards": {
+				"failed": 0,
+				"skipped": 0,
+				"successful": 1,
+				"total": 1
+			},
+			"aggregations": {
+				"2": {
+					"buckets": [
+						{
+							"1": {
+								"values": [
+									{
+										"key": 0.0,
+										"value": 0.0
+									},
+									{
+										"key": 50000.0,
+										"value": 100.0
+									}
+								]
+							},
+							"doc_count": 9,
+							"key": 1714860000000,
+							"key_as_string": "2024-05-04T22:00:00.000"
+						},
+						{
+							"1": {
+								"values": [
+									{
+										"key": 0.0,
+										"value": 0.0
+									},
+									{
+										"key": 50000.0,
+										"value": 50.0
+									}
+								]
+							},
+							"doc_count": 12,
+							"key": 1714863600000,
+							"key_as_string": "2024-05-04T23:00:00.000"
+						}
+					]
+				}
+			},
+			"hits": {
+				"hits": [],
+				"max_score": null,
+				"total": {
+					"relation": "eq",
+					"value": 884
+				}
+			},
+			"timed_out": false,
+			"took": 0
+		}`,
+		ExpectedResults: [][]model.QueryResultRow{
+			{{Cols: []model.QueryResultCol{model.NewQueryResultCol("hits", uint64(884))}}},
+			{
+				{Cols: []model.QueryResultCol{
+					model.NewQueryResultCol("key", int64(1714860000000/3600000)),
+					model.NewQueryResultCol("AvgTicketPrice<=0,", 0.0),
+					model.NewQueryResultCol("AvgTicketPrice<=50000,", 100.0)},
+				},
+				{Cols: []model.QueryResultCol{
+					model.NewQueryResultCol("key", int64(1714863600000/3600000)),
+					model.NewQueryResultCol("AvgTicketPrice<=0,", 0.0),
+					model.NewQueryResultCol("AvgTicketPrice<=50000,", 50.0),
+				}},
+			},
+			{
+				{Cols: []model.QueryResultCol{model.NewQueryResultCol("key", int64(1714860000000/3600000)), model.NewQueryResultCol("doc_count", 9)}},
+				{Cols: []model.QueryResultCol{model.NewQueryResultCol("key", int64(1714863600000/3600000)), model.NewQueryResultCol("doc_count", 12)}},
+			},
+		},
+		ExpectedSQLs: []string{
+			`SELECT count() FROM ` + testdata.QuotedTableName + ` `,
+			"SELECT toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000), " +
+				`count(if("AvgTicketPrice"<=0.000000, 1, NULL))/count(*)*100, ` +
+				`count(if("AvgTicketPrice"<=50000.000000, 1, NULL))/count(*)*100 ` +
+				`FROM ` +
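+				// The two count(if(...)) expressions above are the SQL translation of
+				// percentile_ranks: the rank of a cut value is the share of rows with
+				// the field <= that cut, as a percentage, i.e.
+				// count(if("field"<=cut, 1, NULL))/count(*)*100.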
testdata.QuotedTableName + ` ` + + "GROUP BY (toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000)) " + + "ORDER BY (toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000))", + "SELECT toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000), count() " + + `FROM ` + testdata.QuotedTableName + ` ` + + "GROUP BY (toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000)) " + + "ORDER BY (toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000))", + }, + }, }
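
For reference, the two response shapes this patch distinguishes. The keyed=false shape is taken verbatim from the test above; the keyed=true shape is inferred from the keyed branch in percentile_ranks.go and the Elasticsearch docs linked there:

// keyed=true (the default): "values" is an object keyed by cut value
"1": {
	"values": {
		"0.0": 0.0,
		"50000.0": 100.0
	}
}

// keyed=false: "values" is an array of {key, value} buckets
"1": {
	"values": [
		{ "key": 0.0, "value": 0.0 },
		{ "key": 50000.0, "value": 100.0 }
	]
}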