Skip to content

Commit

Permalink
Merge branch 'main' into jacek-request-ui
Browse files Browse the repository at this point in the history
  • Loading branch information
jakozaur authored May 8, 2024
2 parents 5a1876e + 62e6b30 commit aeb14b7
Show file tree
Hide file tree
Showing 3 changed files with 236 additions and 27 deletions.
97 changes: 71 additions & 26 deletions quesma/model/metrics_aggregations/percentile_ranks.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,49 +4,94 @@ import (
"context"
"mitmproxy/quesma/logger"
"mitmproxy/quesma/model"
"strconv"
"strings"
)

// PercentileRanks implements the percentile_ranks metrics aggregation:
// for each requested cut value it reports the percentage of observed values
// that fall at or below that cut value.
type PercentileRanks struct {
	// ctx carries the request context, used for contextual warn logging.
	ctx context.Context
	// Keyed defines what the response should look like: keyed (map of
	// cut value -> rank) vs. non-keyed (array of key/value buckets), see
	// https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-percentile-rank-aggregation.html#_keyed_response_5
	Keyed bool
}

func NewPercentileRanks(ctx context.Context) PercentileRanks {
return PercentileRanks{ctx: ctx}
func NewPercentileRanks(ctx context.Context, keyed bool) PercentileRanks {
return PercentileRanks{ctx: ctx, Keyed: keyed}
}

// IsBucketAggregation reports whether this aggregation produces buckets.
// Percentile ranks is a metrics aggregation, so it never does.
func (query PercentileRanks) IsBucketAggregation() bool {
	return false
}

func (query PercentileRanks) TranslateSqlResponseToJson(rows []model.QueryResultRow, level int) []model.JsonMap {
valueMap := make(map[string]float64)
for _, percentileRank := range rows[0].Cols[level:] {
// percentileRank.ColName looks like this [...]<=X,[...]. We're extracting X.
// It always needs to have .Y or .YZ at the end, so 1 or 2 digits after the dot, and dot is mandatory.
// Also, can't be .00, needs to be .0
beg := strings.Index(percentileRank.ColName, "<=")
end := strings.Index(percentileRank.ColName[beg:], ",")
cutValue := percentileRank.ColName[beg+2 : beg+end]

dot := strings.Index(cutValue, ".")
if dot == -1 {
cutValue += ".0"
} else if end-dot >= len(".00") && cutValue[dot:dot+3] == ".00" {
cutValue = cutValue[:dot+2]
} else {
cutValue = cutValue[:dot+3]
if len(rows) == 0 {
logger.WarnWithCtx(query.ctx).Msg("no rows in percentile ranks response")
return make([]model.JsonMap, 0)
}
// I duplicate a lot of code in this if/else below,
// but I think it's worth it, as this function might get called a lot of times for a single query.
// And because of complete separation in if/else, I guess it might (should) be slightly faster (?)
if query.Keyed {
valueMap := make(model.JsonMap)
for _, percentileRank := range rows[0].Cols[level:] {
// percentileRank.ColName looks like this [...]<=X,[...]. We're extracting X.
// It always needs to have .Y or .YZ at the end, so 1 or 2 digits after the dot, and dot is mandatory.
// Also, can't be .00, needs to be .0
beg := strings.Index(percentileRank.ColName, "<=")
end := strings.Index(percentileRank.ColName[beg:], ",")
cutValue := percentileRank.ColName[beg+2 : beg+end]

dot := strings.Index(cutValue, ".")
if dot == -1 {
cutValue += ".0"
} else if end-dot >= len(".00") && cutValue[dot:dot+3] == ".00" {
cutValue = cutValue[:dot+2]
} else {
cutValue = cutValue[:dot+3]
}
if value, ok := percentileRank.Value.(float64); ok {
valueMap[cutValue] = value
} else {
logger.WarnWithCtx(query.ctx).Msgf("failed to convert percentile rank value to float64, type: %T, value: %v. Skipping",
percentileRank.Value, percentileRank.Value)
}
}
if value, ok := percentileRank.Value.(float64); ok {
valueMap[cutValue] = value
} else {
logger.WarnWithCtx(query.ctx).Msgf("failed to convert percentile rank value to float64, type: %T, value: %v",
percentileRank.Value, percentileRank.Value)
return []model.JsonMap{{
"values": valueMap,
}}
} else {
buckets := make([]model.JsonMap, 0)
for _, percentileRank := range rows[0].Cols[level:] {
// percentileRank.ColName looks like this [...]<=X,[...]. We're extracting X.
// It always needs to have .Y or .YZ at the end, so 1 or 2 digits after the dot, and dot is mandatory.
// Also, can't be .00, needs to be .0
beg := strings.Index(percentileRank.ColName, "<=")
end := strings.Index(percentileRank.ColName[beg:], ",")
cutValue := percentileRank.ColName[beg+2 : beg+end]

dot := strings.Index(cutValue, ".")
if dot == -1 {
cutValue += ".0"
} else if end-dot >= len(".00") && cutValue[dot:dot+3] == ".00" {
cutValue = cutValue[:dot+2]
} else {
cutValue = cutValue[:dot+3]
}
cutValueFloat, _ := strconv.ParseFloat(cutValue, 64)
if value, ok := percentileRank.Value.(float64); ok {
buckets = append(buckets, model.JsonMap{
"key": cutValueFloat,
"value": value,
})
} else {
logger.WarnWithCtx(query.ctx).Msgf("failed to convert percentile rank value to float64, type: %T, value: %v. Skipping",
percentileRank.Value, percentileRank.Value)
}
}
return []model.JsonMap{{
"values": buckets,
}}
}
return []model.JsonMap{{
"values": valueMap,
}}
}

func (query PercentileRanks) String() string {
Expand Down
14 changes: 13 additions & 1 deletion quesma/queryparser/aggregation_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ import (
"strings"
)

const keyedDefaultValuePercentileRanks = true

type filter struct {
name string
sql SimpleQuery
Expand Down Expand Up @@ -184,7 +186,7 @@ func (b *aggrQueryBuilder) buildMetricsAggregation(metricsAggr metricsAggregatio
case "value_count":
query.Type = metrics_aggregations.NewValueCount(b.ctx)
case "percentile_ranks":
query.Type = metrics_aggregations.NewPercentileRanks(b.ctx)
query.Type = metrics_aggregations.NewPercentileRanks(b.ctx, metricsAggr.Keyed)
}
return query
}
Expand Down Expand Up @@ -512,10 +514,20 @@ func (cw *ClickhouseQueryTranslator) tryMetricsAggregation(queryMap QueryMap) (m
logger.WarnWithCtx(cw.Ctx).Msgf("cutValue in percentile_ranks is not a number, but %T, value: %v. Skipping.", cutValue, cutValue)
}
}
var keyed bool
if keyedRaw, ok := percentileRanks.(QueryMap)["keyed"]; ok {
if keyed, ok = keyedRaw.(bool); !ok {
logger.WarnWithCtx(cw.Ctx).Msgf("keyed specified for percentiles aggregation is not a boolean. Querymap: %v", queryMap)
keyed = keyedDefaultValuePercentileRanks
}
} else {
keyed = keyedDefaultValuePercentileRanks
}
return metricsAggregation{
AggrType: "percentile_ranks",
FieldNames: fieldNames,
FieldType: metricsAggregationDefaultFieldType, // don't need to check, it's unimportant for this aggregation
Keyed: keyed,
}, true
}

Expand Down
152 changes: 152 additions & 0 deletions quesma/testdata/opensearch-visualize/aggregation_requests.go
Original file line number Diff line number Diff line change
Expand Up @@ -1124,4 +1124,156 @@ var AggregationTests = []testdata.AggregationTestCase{
`ORDER BY ("response")`,
},
},
{ // [7]
TestName: "Percentile_ranks keyed=false. Reproduce: Visualize -> Line -> Metrics: Percentile Ranks, Buckets: X-Asis Date Histogram",
QueryRequestJson: `
{
"_source": {
"excludes": []
},
"aggs": {
"2": {
"aggs": {
"1": {
"percentile_ranks": {
"field": "AvgTicketPrice",
"keyed": false,
"values": [
0,
50000
]
}
}
},
"date_histogram": {
"calendar_interval": "1h",
"field": "timestamp",
"min_doc_count": 1,
"time_zone": "Europe/Warsaw"
}
}
},
"docvalue_fields": [
{
"field": "timestamp",
"format": "date_time"
}
],
"query": {
"bool": {
"filter": [],
"must": [
{
"match_all": {}
}
],
"must_not": [],
"should": []
}
},
"script_fields": {
"hour_of_day": {
"script": {
"lang": "painless",
"source": "doc['timestamp'].value.hourOfDay"
}
}
},
"size": 0,
"stored_fields": [
"*"
]
}`,
ExpectedResponse: `
{
"_shards": {
"failed": 0,
"skipped": 0,
"successful": 1,
"total": 1
},
"aggregations": {
"2": {
"buckets": [
{
"1": {
"values": [
{
"key": 0.0,
"value": 0.0
},
{
"key": 50000.0,
"value": 100.0
}
]
},
"doc_count": 9,
"key": 1714860000000,
"key_as_string": "2024-05-04T22:00:00.000"
},
{
"1": {
"values": [
{
"key": 0.0,
"value": 0.0
},
{
"key": 50000.0,
"value": 50.0
}
]
},
"doc_count": 12,
"key": 1714863600000,
"key_as_string": "2024-05-04T23:00:00.000"
}
]
}
},
"hits": {
"hits": [],
"max_score": null,
"total": {
"relation": "eq",
"value": 884
}
},
"timed_out": false,
"took": 0
}`,
ExpectedResults: [][]model.QueryResultRow{
{{Cols: []model.QueryResultCol{model.NewQueryResultCol("hits", uint64(884))}}},
{
{Cols: []model.QueryResultCol{
model.NewQueryResultCol("key", int64(1714860000000/3600000)),
model.NewQueryResultCol("AvgTicketPrice<=0,", 0.0),
model.NewQueryResultCol("AvgTicketPrice<=50000,", 100.0)},
},
{Cols: []model.QueryResultCol{
model.NewQueryResultCol("key", int64(1714863600000/3600000)),
model.NewQueryResultCol("AvgTicketPrice<=0,", 0.0),
model.NewQueryResultCol("AvgTicketPrice<=50000,", 50.0),
}},
},
{
{Cols: []model.QueryResultCol{model.NewQueryResultCol("key", int64(1714860000000/3600000)), model.NewQueryResultCol("doc_count", 9)}},
{Cols: []model.QueryResultCol{model.NewQueryResultCol("key", int64(1714863600000/3600000)), model.NewQueryResultCol("doc_count", 12)}},
},
},
ExpectedSQLs: []string{
`SELECT count() FROM ` + testdata.QuotedTableName + ` `,
"SELECT toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000), " +
`count(if("AvgTicketPrice"<=0.000000, 1, NULL))/count(*)*100, ` +
`count(if("AvgTicketPrice"<=50000.000000, 1, NULL))/count(*)*100 ` +
`FROM ` + testdata.QuotedTableName + ` ` +
"GROUP BY (toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000)) " +
"ORDER BY (toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000))",
"SELECT toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000), count() " +
`FROM ` + testdata.QuotedTableName + ` ` +
"GROUP BY (toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000)) " +
"ORDER BY (toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000))",
},
},
}

0 comments on commit aeb14b7

Please sign in to comment.