bucket_script (pipeline aggr) enhancements (#923)

Before `bucket_script` was only handled in one simplest case (which happened surprisingly often) - when it meant just a simple count. Now we accept the script to be based on more than one parent aggregation + it works for 1 particular example of a script. TODO: After implementing some basic arithmetic parsing, it should work in all cases.
QuesmaOrg · Oct 29, 2024 · 1f434d4 · 1f434d4
1 parent ca8ec4a
commit 1f434d4
Show file tree

Hide file tree

Showing 9 changed files with 460 additions and 29 deletions.
diff --git a/quesma/model/pipeline_aggregations/bucket_script.go b/quesma/model/pipeline_aggregations/bucket_script.go
@@ -4,44 +4,85 @@ package pipeline_aggregations
 
 import (
 	"context"
+	"fmt"
 	"quesma/logger"
 	"quesma/model"
+	"quesma/util"
+	"strings"
 )
 
 type BucketScript struct {
-	ctx context.Context
+	*PipelineAggregation
+	script string
 }
 
-func NewBucketScript(ctx context.Context) BucketScript {
-	return BucketScript{ctx: ctx}
+func NewBucketScript(ctx context.Context, script string) BucketScript {
+	return BucketScript{script: script, PipelineAggregation: newPipelineAggregation(ctx, "_count")}
 }
 
 func (query BucketScript) AggregationType() model.AggregationType {
 	return model.PipelineMetricsAggregation // not sure
 }
 
 func (query BucketScript) TranslateSqlResponseToJson(rows []model.QueryResultRow) model.JsonMap {
-	if len(rows) == 0 {
-		logger.WarnWithCtx(query.ctx).Msg("no rows returned for bucket script aggregation")
-		return model.JsonMap{"value": 0}
-	}
-	var response []model.JsonMap
-	for _, row := range rows {
-		response = append(response, model.JsonMap{"value": row.Cols[0].Value})
-	}
-	return model.JsonMap{
-		"buckets": response,
+	const defaultValue = 0.
+	switch {
+	case query.script == "params.numerator != null && params.denominator != null && params.denominator != 0 ? params.numerator / params.denominator : 0":
+		numerator := query.findFilterValue(rows, "numerator")
+		denominator := query.findFilterValue(rows, "denominator")
+		if denominator == 0 {
+			return model.JsonMap{"value": defaultValue}
+		}
+		return model.JsonMap{"value": numerator / denominator}
+	case len(rows) == 1:
+		for _, row := range rows {
+			return model.JsonMap{"value": util.ExtractNumeric64(row.LastColValue())}
+		}
 	}
+
+	logger.WarnWithCtx(query.ctx).Msgf("unexpected result in bucket_script: %s, len(rows): %d. Returning default.", query.String(), len(rows))
+	return model.JsonMap{"value": defaultValue}
 }
 
-func (query BucketScript) CalculateResultWhenMissing(*model.Query, []model.QueryResultRow) []model.QueryResultRow {
-	return []model.QueryResultRow{}
+func (query BucketScript) CalculateResultWhenMissing(parentRows []model.QueryResultRow) []model.QueryResultRow {
+	resultRows := make([]model.QueryResultRow, 0, len(parentRows))
+	for _, parentRow := range parentRows {
+		resultRow := parentRow.Copy()
+		if len(resultRow.Cols) != 0 {
+			resultRow.Cols[len(resultRow.Cols)-1].Value = util.ExtractNumeric64(parentRow.LastColValue())
+		} else {
+			logger.ErrorWithCtx(query.ctx).Msgf("unexpected empty parent row in bucket_script: %s", query.String())
+		}
+		resultRows = append(resultRows, resultRow)
+	}
+	return resultRows
 }
 
 func (query BucketScript) String() string {
-	return "bucket script"
+	return fmt.Sprintf("bucket_script(isCount: %v, parent: %s, pathToParent: %v, parentBucketAggregation: %v, script: %v)",
+		query.isCount, query.Parent, query.PathToParent, query.parentBucketAggregation, query.script)
 }
 
 func (query BucketScript) PipelineAggregationType() model.PipelineAggregationType {
-	return model.PipelineParentAggregation // not sure, maybe it's sibling.
+	return model.PipelineParentAggregation // not sure, maybe it's sibling. change hasn't changed the result when running some tests.
+}
+
+func (query BucketScript) findFilterValue(rows []model.QueryResultRow, filterName string) float64 {
+	const defaultValue = 0.0
+	for _, row := range rows {
+		for _, col := range row.Cols {
+			colName := col.ColName
+			if !strings.HasSuffix(colName, "_col_0") {
+				continue
+			}
+			colName = strings.TrimSuffix(colName, "_col_0")
+			if strings.HasSuffix(colName, "-"+filterName) {
+				return float64(util.ExtractInt64(col.Value))
+			}
+		}
+	}
+
+	logger.WarnWithCtx(query.ctx).Msgf("could not find filter value for filter: %s, bucket_script: %s, len(rows): %d."+
+		"Returning default", filterName, query.String(), len(rows))
+	return defaultValue
 }
diff --git a/quesma/model/pipeline_aggregations/pipeline_aggregation.go b/quesma/model/pipeline_aggregations/pipeline_aggregation.go
@@ -27,7 +27,7 @@ func newPipelineAggregation(ctx context.Context, bucketsPath string) *PipelineAg
 	const delimiter = ">"
 	if len(bucketsPath) == 0 {
 		logger.WarnWithCtx(ctx).Msgf("invalid bucketsPath: %s. Using empty string as parent.", bucketsPath)
-		return &PipelineAggregation{}
+		return &PipelineAggregation{isCount: true} // count, as it's the simplest case
 	}
 
 	parent := ""
@@ -54,6 +54,10 @@ func (p *PipelineAggregation) IsCount() bool {
 	return p.isCount
 }
 
+func (p *PipelineAggregation) GetParentBucketAggregation() model.QueryType {
+	return p.parentBucketAggregation
+}
+
 func (p *PipelineAggregation) SetParentBucketAggregation(parentBucketAggregation model.QueryType) {
 	p.parentBucketAggregation = parentBucketAggregation
 }

diff --git a/quesma/model/pipeline_query_type.go b/quesma/model/pipeline_query_type.go
@@ -26,5 +26,6 @@ type PipelineQueryType interface {
 	GetPathToParent() []string
 	IsCount() bool
 
+	GetParentBucketAggregation() QueryType
 	SetParentBucketAggregation(parentBucketAggregation QueryType)
 }
diff --git a/quesma/queryparser/aggregation_parser_test.go b/quesma/queryparser/aggregation_parser_test.go
@@ -666,6 +666,7 @@ func allAggregationTests() []testdata.AggregationTestCase {
 	add(kibana_visualize.PipelineAggregationTests, "kibana-visualize/pipeline_agg_req")
 	add(clients.KunkkaTests, "clients/kunkka")
 	add(clients.OpheliaTests, "clients/ophelia")
+	add(clients.CloverTests, "clients/clover")
 
 	return allTests
 }

diff --git a/quesma/queryparser/pancake_pipelines.go b/quesma/queryparser/pancake_pipelines.go
@@ -19,6 +19,10 @@ func (p pancakePipelinesProcessor) selectPipelineRows(pipeline model.PipelineQue
 	bucketAggregation *pancakeModelBucketAggregation) (
 	result []model.QueryResultRow) {
 
+	if bucketAggregation == nil {
+		return rows
+	}
+
 	isCount := pipeline.IsCount()
 	for _, row := range rows {
 		newRow := model.QueryResultRow{Index: row.Index}

diff --git a/quesma/queryparser/pancake_sql_query_generation_test.go b/quesma/queryparser/pancake_sql_query_generation_test.go
@@ -60,6 +60,10 @@ func TestPancakeQueryGeneration(t *testing.T) {
 				t.Skip("Need to implement order by top metrics (talk with Jacek, he has an idea)")
 			}
 
+			if test.TestName == "multiple buckets_path(file:clients/clover,nr:1)" {
+				t.Skip("Unskip after merge of auto_date_histogram")
+			}
+
 			fmt.Println("i:", i, "test:", test.TestName)
 
 			jsonp, err := types.ParseJSON(test.QueryRequestJson)

diff --git a/quesma/queryparser/pipeline_aggregations.go b/quesma/queryparser/pipeline_aggregations.go
@@ -6,6 +6,7 @@ import (
 	"quesma/logger"
 	"quesma/model"
 	"quesma/model/pipeline_aggregations"
+	"strings"
 )
 
 // CAUTION: maybe "return" everywhere isn't corrent, as maybe there can be multiple pipeline aggregations at one level.
@@ -168,26 +169,29 @@ func (cw *ClickhouseQueryTranslator) parseBucketScriptBasic(queryMap QueryMap) (
 	if !ok {
 		return
 	}
-	if bucketsPath != pipeline_aggregations.BucketsPathCount {
+	if !strings.HasSuffix(bucketsPath, pipeline_aggregations.BucketsPathCount) {
 		logger.WarnWithCtx(cw.Ctx).Msgf("buckets_path is not '_count', but %s. Skipping this aggregation", bucketsPath)
 		return
 	}
 
-	// if ["script"]["source"] != "_value", skip the aggregation
 	scriptRaw, exists := bucketScript["script"]
 	if !exists {
 		logger.WarnWithCtx(cw.Ctx).Msg("no script in bucket_script. Skipping this aggregation")
 		return
 	}
+	if script, ok := scriptRaw.(string); ok {
+		return pipeline_aggregations.NewBucketScript(cw.Ctx, script), true
+	}
+
 	script, ok := scriptRaw.(QueryMap)
 	if !ok {
 		logger.WarnWithCtx(cw.Ctx).Msgf("script is not a map, but %T, value: %v. Skipping this aggregation", scriptRaw, scriptRaw)
 		return
 	}
 	if sourceRaw, exists := script["source"]; exists {
 		if source, ok := sourceRaw.(string); ok {
-			if source != "_value" {
-				logger.WarnWithCtx(cw.Ctx).Msgf("source is not '_value', but %s. Skipping this aggregation", source)
+			if source != "_value" && source != "count * 1" {
+				logger.WarnWithCtx(cw.Ctx).Msgf("source is not '_value'/'count * 1', but %s. Skipping this aggregation", source)
 				return
 			}
 		} else {
@@ -200,10 +204,10 @@ func (cw *ClickhouseQueryTranslator) parseBucketScriptBasic(queryMap QueryMap) (
 	}
 
 	// okay, we've checked everything, it's indeed a simple count
-	return pipeline_aggregations.NewBucketScript(cw.Ctx), true
+	return pipeline_aggregations.NewBucketScript(cw.Ctx, ""), true
 }
 
-func (cw *ClickhouseQueryTranslator) parseBucketsPath(shouldBeQueryMap any, aggregationName string) (bucketsPath string, success bool) {
+func (cw *ClickhouseQueryTranslator) parseBucketsPath(shouldBeQueryMap any, aggregationName string) (bucketsPathStr string, success bool) {
 	queryMap, ok := shouldBeQueryMap.(QueryMap)
 	if !ok {
 		logger.WarnWithCtx(cw.Ctx).Msgf("%s is not a map, but %T, value: %v", aggregationName, shouldBeQueryMap, shouldBeQueryMap)
@@ -214,10 +218,27 @@ func (cw *ClickhouseQueryTranslator) parseBucketsPath(shouldBeQueryMap any, aggr
 		logger.WarnWithCtx(cw.Ctx).Msg("no buckets_path in avg_bucket")
 		return
 	}
-	bucketsPath, ok = bucketsPathRaw.(string)
-	if !ok {
-		logger.WarnWithCtx(cw.Ctx).Msgf("buckets_path is not a string, but %T, value: %v", bucketsPathRaw, bucketsPathRaw)
-		return
+
+	switch bucketsPath := bucketsPathRaw.(type) {
+	case string:
+		return bucketsPath, true
+	case QueryMap:
+		// TODO: handle arbitrary nr of keys (and arbitrary scripts, because we also handle only one special case)
+		if len(bucketsPath) == 1 || len(bucketsPath) == 2 {
+			for _, bucketPath := range bucketsPath {
+				if _, ok = bucketPath.(string); !ok {
+					logger.WarnWithCtx(cw.Ctx).Msgf("buckets_path is not a map with string values, but %T. Skipping this aggregation", bucketPath)
+					return
+				}
+				// Kinda weird to return just the first value, but seems working on all cases so far.
+				// After fixing the TODO above, it should also get fixed.
+				return bucketPath.(string), true
+			}
+		} else {
+			logger.WarnWithCtx(cw.Ctx).Msgf("buckets_path is not a map with one or two keys, but %d. Skipping this aggregation", len(bucketsPath))
+		}
 	}
-	return bucketsPath, true
+
+	logger.WarnWithCtx(cw.Ctx).Msgf("buckets_path in wrong format, type: %T, value: %v", bucketsPathRaw, bucketsPathRaw)
+	return
 }