diff --git a/quesma/model/metrics_aggregations/percentile_ranks.go b/quesma/model/metrics_aggregations/percentile_ranks.go index 50e87eb6a..6537f63ff 100644 --- a/quesma/model/metrics_aggregations/percentile_ranks.go +++ b/quesma/model/metrics_aggregations/percentile_ranks.go @@ -4,15 +4,19 @@ import ( "context" "mitmproxy/quesma/logger" "mitmproxy/quesma/model" + "strconv" "strings" ) type PercentileRanks struct { ctx context.Context + // defines what response should look like + // https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-percentile-rank-aggregation.html#_keyed_response_5 + Keyed bool } -func NewPercentileRanks(ctx context.Context) PercentileRanks { - return PercentileRanks{ctx: ctx} +func NewPercentileRanks(ctx context.Context, keyed bool) PercentileRanks { + return PercentileRanks{ctx: ctx, Keyed: keyed} } func (query PercentileRanks) IsBucketAggregation() bool { @@ -20,33 +24,74 @@ func (query PercentileRanks) IsBucketAggregation() bool { } func (query PercentileRanks) TranslateSqlResponseToJson(rows []model.QueryResultRow, level int) []model.JsonMap { - valueMap := make(map[string]float64) - for _, percentileRank := range rows[0].Cols[level:] { - // percentileRank.ColName looks like this [...]<=X,[...]. We're extracting X. - // It always needs to have .Y or .YZ at the end, so 1 or 2 digits after the dot, and dot is mandatory. - // Also, can't be .00, needs to be .0 - beg := strings.Index(percentileRank.ColName, "<=") - end := strings.Index(percentileRank.ColName[beg:], ",") - cutValue := percentileRank.ColName[beg+2 : beg+end] - - dot := strings.Index(cutValue, ".") - if dot == -1 { - cutValue += ".0" - } else if end-dot >= len(".00") && cutValue[dot:dot+3] == ".00" { - cutValue = cutValue[:dot+2] - } else { - cutValue = cutValue[:dot+3] + if len(rows) == 0 { + logger.WarnWithCtx(query.ctx).Msg("no rows in percentile ranks response") + return make([]model.JsonMap, 0) + } + // I duplicate a lot of code in this if/else below, + // but I think it's worth it, as this function might get called a lot of times for a single query. + // And because of complete separation in if/else, I guess it might (should) be slightly faster (?) + if query.Keyed { + valueMap := make(model.JsonMap) + for _, percentileRank := range rows[0].Cols[level:] { + // percentileRank.ColName looks like this [...]<=X,[...]. We're extracting X. + // It always needs to have .Y or .YZ at the end, so 1 or 2 digits after the dot, and dot is mandatory. + // Also, can't be .00, needs to be .0 + beg := strings.Index(percentileRank.ColName, "<=") + end := strings.Index(percentileRank.ColName[beg:], ",") + cutValue := percentileRank.ColName[beg+2 : beg+end] + + dot := strings.Index(cutValue, ".") + if dot == -1 { + cutValue += ".0" + } else if end-dot >= len(".00") && cutValue[dot:dot+3] == ".00" { + cutValue = cutValue[:dot+2] + } else { + cutValue = cutValue[:dot+3] + } + if value, ok := percentileRank.Value.(float64); ok { + valueMap[cutValue] = value + } else { + logger.WarnWithCtx(query.ctx).Msgf("failed to convert percentile rank value to float64, type: %T, value: %v. Skipping", + percentileRank.Value, percentileRank.Value) + } } - if value, ok := percentileRank.Value.(float64); ok { - valueMap[cutValue] = value - } else { - logger.WarnWithCtx(query.ctx).Msgf("failed to convert percentile rank value to float64, type: %T, value: %v", - percentileRank.Value, percentileRank.Value) + return []model.JsonMap{{ + "values": valueMap, + }} + } else { + buckets := make([]model.JsonMap, 0) + for _, percentileRank := range rows[0].Cols[level:] { + // percentileRank.ColName looks like this [...]<=X,[...]. We're extracting X. + // It always needs to have .Y or .YZ at the end, so 1 or 2 digits after the dot, and dot is mandatory. + // Also, can't be .00, needs to be .0 + beg := strings.Index(percentileRank.ColName, "<=") + end := strings.Index(percentileRank.ColName[beg:], ",") + cutValue := percentileRank.ColName[beg+2 : beg+end] + + dot := strings.Index(cutValue, ".") + if dot == -1 { + cutValue += ".0" + } else if end-dot >= len(".00") && cutValue[dot:dot+3] == ".00" { + cutValue = cutValue[:dot+2] + } else { + cutValue = cutValue[:dot+3] + } + cutValueFloat, _ := strconv.ParseFloat(cutValue, 64) + if value, ok := percentileRank.Value.(float64); ok { + buckets = append(buckets, model.JsonMap{ + "key": cutValueFloat, + "value": value, + }) + } else { + logger.WarnWithCtx(query.ctx).Msgf("failed to convert percentile rank value to float64, type: %T, value: %v. Skipping", + percentileRank.Value, percentileRank.Value) + } } + return []model.JsonMap{{ + "values": buckets, + }} } - return []model.JsonMap{{ - "values": valueMap, - }} } func (query PercentileRanks) String() string { diff --git a/quesma/queryparser/aggregation_parser.go b/quesma/queryparser/aggregation_parser.go index 4a736c80a..d070fcf60 100644 --- a/quesma/queryparser/aggregation_parser.go +++ b/quesma/queryparser/aggregation_parser.go @@ -17,6 +17,8 @@ import ( "strings" ) +const keyedDefaultValuePercentileRanks = true + type filter struct { name string sql SimpleQuery @@ -184,7 +186,7 @@ func (b *aggrQueryBuilder) buildMetricsAggregation(metricsAggr metricsAggregatio case "value_count": query.Type = metrics_aggregations.NewValueCount(b.ctx) case "percentile_ranks": - query.Type = metrics_aggregations.NewPercentileRanks(b.ctx) + query.Type = metrics_aggregations.NewPercentileRanks(b.ctx, metricsAggr.Keyed) } return query } @@ -512,10 +514,20 @@ func (cw *ClickhouseQueryTranslator) tryMetricsAggregation(queryMap QueryMap) (m logger.WarnWithCtx(cw.Ctx).Msgf("cutValue in percentile_ranks is not a number, but %T, value: %v. Skipping.", cutValue, cutValue) } } + var keyed bool + if keyedRaw, ok := percentileRanks.(QueryMap)["keyed"]; ok { + if keyed, ok = keyedRaw.(bool); !ok { + logger.WarnWithCtx(cw.Ctx).Msgf("keyed specified for percentiles aggregation is not a boolean. Querymap: %v", queryMap) + keyed = keyedDefaultValuePercentileRanks + } + } else { + keyed = keyedDefaultValuePercentileRanks + } return metricsAggregation{ AggrType: "percentile_ranks", FieldNames: fieldNames, FieldType: metricsAggregationDefaultFieldType, // don't need to check, it's unimportant for this aggregation + Keyed: keyed, }, true } diff --git a/quesma/testdata/opensearch-visualize/aggregation_requests.go b/quesma/testdata/opensearch-visualize/aggregation_requests.go index 55251a79c..81a0e47e9 100644 --- a/quesma/testdata/opensearch-visualize/aggregation_requests.go +++ b/quesma/testdata/opensearch-visualize/aggregation_requests.go @@ -1124,4 +1124,156 @@ var AggregationTests = []testdata.AggregationTestCase{ `ORDER BY ("response")`, }, }, + { // [7] + TestName: "Percentile_ranks keyed=false. Reproduce: Visualize -> Line -> Metrics: Percentile Ranks, Buckets: X-Asis Date Histogram", + QueryRequestJson: ` + { + "_source": { + "excludes": [] + }, + "aggs": { + "2": { + "aggs": { + "1": { + "percentile_ranks": { + "field": "AvgTicketPrice", + "keyed": false, + "values": [ + 0, + 50000 + ] + } + } + }, + "date_histogram": { + "calendar_interval": "1h", + "field": "timestamp", + "min_doc_count": 1, + "time_zone": "Europe/Warsaw" + } + } + }, + "docvalue_fields": [ + { + "field": "timestamp", + "format": "date_time" + } + ], + "query": { + "bool": { + "filter": [], + "must": [ + { + "match_all": {} + } + ], + "must_not": [], + "should": [] + } + }, + "script_fields": { + "hour_of_day": { + "script": { + "lang": "painless", + "source": "doc['timestamp'].value.hourOfDay" + } + } + }, + "size": 0, + "stored_fields": [ + "*" + ] + }`, + ExpectedResponse: ` + { + "_shards": { + "failed": 0, + "skipped": 0, + "successful": 1, + "total": 1 + }, + "aggregations": { + "2": { + "buckets": [ + { + "1": { + "values": [ + { + "key": 0.0, + "value": 0.0 + }, + { + "key": 50000.0, + "value": 100.0 + } + ] + }, + "doc_count": 9, + "key": 1714860000000, + "key_as_string": "2024-05-04T22:00:00.000" + }, + { + "1": { + "values": [ + { + "key": 0.0, + "value": 0.0 + }, + { + "key": 50000.0, + "value": 50.0 + } + ] + }, + "doc_count": 12, + "key": 1714863600000, + "key_as_string": "2024-05-04T23:00:00.000" + } + ] + } + }, + "hits": { + "hits": [], + "max_score": null, + "total": { + "relation": "eq", + "value": 884 + } + }, + "timed_out": false, + "took": 0 + }`, + ExpectedResults: [][]model.QueryResultRow{ + {{Cols: []model.QueryResultCol{model.NewQueryResultCol("hits", uint64(884))}}}, + { + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("key", int64(1714860000000/3600000)), + model.NewQueryResultCol("AvgTicketPrice<=0,", 0.0), + model.NewQueryResultCol("AvgTicketPrice<=50000,", 100.0)}, + }, + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("key", int64(1714863600000/3600000)), + model.NewQueryResultCol("AvgTicketPrice<=0,", 0.0), + model.NewQueryResultCol("AvgTicketPrice<=50000,", 50.0), + }}, + }, + { + {Cols: []model.QueryResultCol{model.NewQueryResultCol("key", int64(1714860000000/3600000)), model.NewQueryResultCol("doc_count", 9)}}, + {Cols: []model.QueryResultCol{model.NewQueryResultCol("key", int64(1714863600000/3600000)), model.NewQueryResultCol("doc_count", 12)}}, + }, + }, + ExpectedSQLs: []string{ + `SELECT count() FROM ` + testdata.QuotedTableName + ` `, + "SELECT toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000), " + + `count(if("AvgTicketPrice"<=0.000000, 1, NULL))/count(*)*100, ` + + `count(if("AvgTicketPrice"<=50000.000000, 1, NULL))/count(*)*100 ` + + `FROM ` + testdata.QuotedTableName + ` ` + + "GROUP BY (toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000)) " + + "ORDER BY (toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000))", + "SELECT toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000), count() " + + `FROM ` + testdata.QuotedTableName + ` ` + + "GROUP BY (toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000)) " + + "ORDER BY (toInt64(toUnixTimestamp64Milli(`timestamp`)/3600000))", + }, + }, }