diff --git a/quesma/model/bucket_aggregations/ip_range.go b/quesma/model/bucket_aggregations/ip_range.go index 3d07bb84b..e34bdbd1f 100644 --- a/quesma/model/bucket_aggregations/ip_range.go +++ b/quesma/model/bucket_aggregations/ip_range.go @@ -98,8 +98,8 @@ func (interval IpInterval) ToWhereClause(field model.Expr) model.Expr { isBegin := interval.begin != UnboundedInterval isEnd := interval.end != UnboundedInterval && interval.end != BiggestIpv4 - begin := model.NewInfixExpr(field, ">=", model.NewLiteralSingleQuoted(interval.begin)) - end := model.NewInfixExpr(field, "<", model.NewLiteralSingleQuoted(interval.end)) + begin := model.NewInfixExpr(field, ">=", model.NewLiteralSingleQuoteString(interval.begin)) + end := model.NewInfixExpr(field, "<", model.NewLiteralSingleQuoteString(interval.end)) if isBegin && isEnd { return model.NewInfixExpr(begin, "AND", end) diff --git a/quesma/model/bucket_aggregations/terms.go b/quesma/model/bucket_aggregations/terms.go index e9c7dbfd4..e25cb0c0a 100644 --- a/quesma/model/bucket_aggregations/terms.go +++ b/quesma/model/bucket_aggregations/terms.go @@ -4,19 +4,31 @@ package bucket_aggregations import ( "context" + "fmt" "quesma/logger" "quesma/model" "quesma/util" + "quesma/util/regex" + "reflect" ) type Terms struct { ctx context.Context significant bool // true <=> significant_terms, false <=> terms - OrderByExpr model.Expr + // include is either: + // - a single value: for strings, it may be a regex. + // - an array: the field must exactly match one of the values (never a regex). + // Nil if missing in the request. + include any + // exclude is either: + // - a single value: for strings, it may be a regex. + // - an array: the field must exactly match one of the values (never a regex). + // Nil if missing in the request.
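+ // Exclude is applied in UpdateFieldForIncludeAndExclude below, which wraps the field in if(<not excluded>, field, NULL). Examples from the tests added in this PR: exclude: "K.*" (single string, treated as a regex), exclude: 2025 (single non-string value), exclude: ["Carlsen", "Kasparov"] (array, matched literally).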
+ exclude any } -func NewTerms(ctx context.Context, significant bool, orderByExpr model.Expr) Terms { - return Terms{ctx: ctx, significant: significant, OrderByExpr: orderByExpr} +func NewTerms(ctx context.Context, significant bool, include, exclude any) Terms { + return Terms{ctx: ctx, significant: significant, include: include, exclude: exclude} } func (query Terms) AggregationType() model.AggregationType { @@ -106,3 +118,104 @@ func (query Terms) key(row model.QueryResultRow) any { func (query Terms) parentCount(row model.QueryResultRow) any { return row.Cols[len(row.Cols)-3].Value } + +func (query Terms) UpdateFieldForIncludeAndExclude(field model.Expr) (updatedField model.Expr, didWeUpdateField bool) { + // Everywhere below we use the Clickhouse 'if' function: if(condition, then, else). + // In our case the field becomes: if(<field is not excluded>, field, NULL) + ifOrNull := func(condition model.Expr) model.FunctionExpr { + return model.NewFunction("if", condition, field, model.NullExpr) + } + + hasExclude := query.exclude != nil + excludeArr, excludeIsArray := query.exclude.([]any) + switch { + case hasExclude && excludeIsArray: + if len(excludeArr) == 0 { + return field, false + } + + // The select expr will be: if(field NOT IN tuple(excludeArr[0], excludeArr[1], ...), field, NULL) + exprs := make([]model.Expr, 0, len(excludeArr)) + for _, excludeVal := range excludeArr { + exprs = append(exprs, model.NewLiteralSingleQuoteString(excludeVal)) + } + return ifOrNull(model.NewInfixExpr(field, "NOT IN", model.NewTupleExpr(exprs...))), true + case hasExclude: + switch exclude := query.exclude.(type) { + case string: // hard case, might be a regex + funcName, patternExpr := regex.ToClickhouseExpr(exclude) + return ifOrNull(model.NewInfixExpr(field, "NOT "+funcName, patternExpr)), true + default: // easy case, never a regex + return ifOrNull(model.NewInfixExpr(field, "!=", model.NewLiteral(query.exclude))), true + } + + default: + return field, false // TODO implement similar support for 'include' in next PR + } +} + +// TODO make part of QueryType interface and implement for all aggregations +// TODO add bad requests to tests +// Doing so will ensure our logs capture 100% of what we're interested in (currently we see ~95%) +func CheckParamsTerms(ctx context.Context, paramsRaw any) error { + requiredParams := map[string]string{"field": "string"} + optionalParams := map[string]string{ + "size": "float64|string", // TODO should be int|string, will be fixed + "shard_size": "float64", // TODO should be int, will be fixed + "order": "order", // TODO add order type + "min_doc_count": "float64", // TODO should be int, will be fixed + "shard_min_doc_count": "float64", // TODO should be int, will be fixed + "show_term_doc_count_error": "bool", + "exclude": "not-checking-type-now-complicated", + "include": "not-checking-type-now-complicated", + "collect_mode": "string", + "execution_hint": "string", + "missing": "string", + "value_type": "string", + } + logIfYouSeeThemParams := []string{ + "shard_size", "min_doc_count", "shard_min_doc_count", + "show_term_doc_count_error", "collect_mode", "execution_hint", "value_type", + } + + params, ok := paramsRaw.(model.JsonMap) + if !ok { + return fmt.Errorf("params is not a map, but %+v", paramsRaw) + } + + // check that all required params are present + for paramName, paramType := range requiredParams { + paramVal, exists := params[paramName] + if !exists { + return fmt.Errorf("required parameter %s not found in Terms params", paramName) + } + if reflect.TypeOf(paramVal).Name() !=
paramType { // TODO I'll make a small rewrite to not use reflect here + return fmt.Errorf("required parameter %s is not of type %s, but %T", paramName, paramType, paramVal) + } + } + + // check that only required/optional params are present + for paramName := range params { + if _, isRequired := requiredParams[paramName]; !isRequired { + wantedType, isOptional := optionalParams[paramName] + if !isOptional { + return fmt.Errorf("unexpected parameter %s found in Terms params %v", paramName, params) + } + if wantedType == "not-checking-type-now-complicated" || wantedType == "order" || wantedType == "float64|string" { + continue // TODO: add that later + } + if reflect.TypeOf(params[paramName]).Name() != wantedType { // TODO I'll make a small rewrite to not use reflect here + return fmt.Errorf("optional parameter %s is not of type %s, but %T", paramName, wantedType, params[paramName]) + } + } + } + + // log a warning if any of these params are present + for _, warnParam := range logIfYouSeeThemParams { + if _, exists := params[warnParam]; exists { + logger.WarnWithCtxAndThrottling(ctx, "terms", warnParam, "we didn't expect %s in Terms params %v", warnParam, params) + } + } + + return nil +} diff --git a/quesma/model/expr.go b/quesma/model/expr.go index 3cf0a20d0..3ad5b330e 100644 --- a/quesma/model/expr.go +++ b/quesma/model/expr.go @@ -16,6 +16,7 @@ var ( InvalidExpr = Expr(nil) TrueExpr = NewLiteral(true) FalseExpr = NewLiteral(false) + NullExpr = NewLiteral("NULL") ) // ColumnRef is a reference to a column in a table, we can enrich it with more information (e.g. type used) as we go @@ -129,8 +130,14 @@ func NewLiteral(value any) LiteralExpr { return LiteralExpr{Value: value} } -func NewLiteralSingleQuoted(value string) LiteralExpr { - return LiteralExpr{Value: fmt.Sprintf("'%s'", value)} +// NewLiteralSingleQuoteString simply does: string -> 'string', anything else is left unchanged +func NewLiteralSingleQuoteString(value any) LiteralExpr { + switch v := value.(type) { + case string: + return LiteralExpr{Value: fmt.Sprintf("'%s'", v)} + default: + return LiteralExpr{Value: v} + } } // DistinctExpr is a representation of DISTINCT keyword in SQL, e.g. `SELECT DISTINCT` ... or `SELECT COUNT(DISTINCT ...)` diff --git a/quesma/model/expr_string_renderer.go b/quesma/model/expr_string_renderer.go index 7ab4adc28..f4c73fa6c 100644 --- a/quesma/model/expr_string_renderer.go +++ b/quesma/model/expr_string_renderer.go @@ -101,7 +101,7 @@ func (v *renderer) VisitInfix(e InfixExpr) interface{} { // I think in the future every infix op should be in braces.
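+ // Three rendering modes: ops starting with "_" and AND/OR are parenthesized; word-like ops (LIKE, IS, IN, NOT IN, REGEXP, UNION) are separated from their operands by spaces; everything else is concatenated without spaces.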
if strings.HasPrefix(e.Op, "_") || e.Op == "AND" || e.Op == "OR" { return fmt.Sprintf("(%v %v %v)", lhs, e.Op, rhs) - } else if strings.Contains(e.Op, "LIKE") || e.Op == "IS" || e.Op == "IN" || e.Op == "REGEXP" || strings.Contains(e.Op, "UNION") { + } else if strings.Contains(e.Op, "LIKE") || e.Op == "IS" || e.Op == "IN" || e.Op == "NOT IN" || e.Op == "REGEXP" || strings.Contains(e.Op, "UNION") { return fmt.Sprintf("%v %v %v", lhs, e.Op, rhs) } else { return fmt.Sprintf("%v%v%v", lhs, e.Op, rhs) diff --git a/quesma/queryparser/pancake_aggregation_parser_buckets.go b/quesma/queryparser/pancake_aggregation_parser_buckets.go index 387e2662e..4b65d0319 100644 --- a/quesma/queryparser/pancake_aggregation_parser_buckets.go +++ b/quesma/queryparser/pancake_aggregation_parser_buckets.go @@ -152,20 +152,33 @@ func (cw *ClickhouseQueryTranslator) parseDateHistogram(aggregation *pancakeAggr // aggrName - "terms" or "significant_terms" func (cw *ClickhouseQueryTranslator) parseTermsAggregation(aggregation *pancakeAggregationTreeNode, params QueryMap, aggrName string) error { + if err := bucket_aggregations.CheckParamsTerms(cw.Ctx, params); err != nil { + return err + } + + terms := bucket_aggregations.NewTerms( + cw.Ctx, aggrName == "significant_terms", params["include"], params["exclude"], + ) + + var didWeAddMissing, didWeUpdateFieldHere bool field := cw.parseFieldField(params, aggrName) - field, didWeAddMissing := cw.addMissingParameterIfPresent(field, params) - if !didWeAddMissing { + field, didWeAddMissing = cw.addMissingParameterIfPresent(field, params) + field, didWeUpdateFieldHere = terms.UpdateFieldForIncludeAndExclude(field) + + // If we updated the field above, the select becomes if(condition, field, NULL), so we also need to filter out those NULL buckets later + if !didWeAddMissing || didWeUpdateFieldHere { aggregation.filterOutEmptyKeyBucket = true } const defaultSize = 10 size := cw.parseSize(params, defaultSize) + orderBy, err := cw.parseOrder(params, []model.Expr{field}) if err != nil { return err } - aggregation.queryType = bucket_aggregations.NewTerms(cw.Ctx, aggrName == "significant_terms", orderBy[0]) // TODO probably full, not [0] + aggregation.queryType = terms aggregation.selectedColumns = append(aggregation.selectedColumns, field) aggregation.limit = size aggregation.orderBy = orderBy diff --git a/quesma/queryparser/query_parser.go b/quesma/queryparser/query_parser.go index 0722c088b..e38f47de8 100644 --- a/quesma/queryparser/query_parser.go +++ b/quesma/queryparser/query_parser.go @@ -18,6 +18,7 @@ import ( "quesma/quesma/types" "quesma/schema" "quesma/util" + "quesma/util/regex" "strconv" "strings" "unicode" @@ -886,28 +887,13 @@ func (cw *ClickhouseQueryTranslator) parseRegexp(queryMap QueryMap) (result mode return model.NewSimpleQueryInvalid() } - // really simple == (out of all special characters, only . and .* may be present) - isPatternReallySimple := func(pattern string) bool { - // any special characters excluding . and * not allowed. Also (not the most important check) * can't be first character. - if strings.ContainsAny(pattern, `?+|{}[]()"\`) || (len(pattern) > 0 && pattern[0] == '*') { - return false - } - // .* allowed, but [any other char]* - not - for i, char := range pattern[1:] { - if char == '*' && pattern[i] != '.'
{ - return false - } - } - return true - } - - for fieldName, parametersRaw := range queryMap { - parameters, ok := parametersRaw.(QueryMap) + for fieldName, paramsRaw := range queryMap { + params, ok := paramsRaw.(QueryMap) if !ok { - logger.WarnWithCtx(cw.Ctx).Msgf("invalid regexp parameters type: %T, value: %v", parametersRaw, parametersRaw) + logger.WarnWithCtx(cw.Ctx).Msgf("invalid regexp parameters type: %T, value: %v", paramsRaw, paramsRaw) return model.NewSimpleQueryInvalid() } - patternRaw, exists := parameters["value"] + patternRaw, exists := params["value"] if !exists { logger.WarnWithCtx(cw.Ctx).Msgf("no value in regexp query: %v", queryMap) return model.NewSimpleQueryInvalid() @@ -918,21 +904,13 @@ func (cw *ClickhouseQueryTranslator) parseRegexp(queryMap QueryMap) (result mode return model.NewSimpleQueryInvalid() } - if len(parameters) > 1 { - logger.WarnWithCtx(cw.Ctx).Msgf("unsupported regexp parameters: %v", parameters) + if len(params) > 1 { + logger.WarnWithCtx(cw.Ctx).Msgf("unsupported regexp parameters: %v", params) } - var funcName string - if isPatternReallySimple(pattern) { - pattern = strings.ReplaceAll(pattern, "_", `\_`) - pattern = strings.ReplaceAll(pattern, ".*", "%") - pattern = strings.ReplaceAll(pattern, ".", "_") - funcName = "LIKE" - } else { // this Clickhouse function is much slower, so we use it only for complex regexps - funcName = "REGEXP" - } - return model.NewSimpleQuery( - model.NewInfixExpr(model.NewColumnRef(fieldName), funcName, model.NewLiteral("'"+pattern+"'")), true) + clickhouseFuncName, patternExpr := regex.ToClickhouseExpr(pattern) + clickhouseExpr := model.NewInfixExpr(model.NewColumnRef(fieldName), clickhouseFuncName, patternExpr) + return model.NewSimpleQuery(clickhouseExpr, true) } logger.ErrorWithCtx(cw.Ctx).Msg("parseRegexp: theoretically unreachable code") diff --git a/quesma/testdata/aggregation_requests_2.go b/quesma/testdata/aggregation_requests_2.go index 6530c5618..21acfba83 100644 --- a/quesma/testdata/aggregation_requests_2.go +++ b/quesma/testdata/aggregation_requests_2.go @@ -4689,4 +4689,605 @@ var AggregationTests2 = []AggregationTestCase{ "aggr__my_buckets__key_1" ASC LIMIT 4`, }, + { // [70] + TestName: "simplest terms with exclude (array of values)", + // TODO add ' somewhere in exclude after the merge! 
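+ // Note: "Fis._er*" below sits inside an array, so it is matched literally, never as a regex (hence the plain NOT IN tuple(...) in the expected SQL).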
+ QueryRequestJson: ` + { + "aggs": { + "1": { + "terms": { + "field": "chess_goat", + "size": 2, + "exclude": ["Carlsen", "Kasparov", "Fis._er*"] + } + } + }, + "size": 0, + "track_total_hits": true + }`, + // I omit "took", "timed_out", "_shards", and "hits" from the response for brevity (they can also be easily unit-tested) + ExpectedResponse: ` + { + "aggregations": { + "1": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 7416, + "buckets": [ + { + "key": "My dad", + "doc_count": 3323 + }, + { + "key": "Barack Obama", + "doc_count": 3261 + } + ] + } + } + }`, + ExpectedPancakeResults: []model.QueryResultRow{ + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__1__parent_count", int64(14000)), + model.NewQueryResultCol("aggr__1__key_0", "My dad"), + model.NewQueryResultCol("aggr__1__count", int64(3323)), + }}, + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__1__parent_count", int64(14000)), + model.NewQueryResultCol("aggr__1__key_0", "Barack Obama"), + model.NewQueryResultCol("aggr__1__count", int64(3261)), + }}, + }, + ExpectedPancakeSQL: ` + SELECT sum(count(*)) OVER () AS "aggr__1__parent_count", + if("chess_goat" NOT IN tuple('Carlsen', 'Kasparov', 'Fis._er*'), "chess_goat", NULL) + AS "aggr__1__key_0", count(*) AS "aggr__1__count" + FROM __quesma_table_name + GROUP BY if("chess_goat" NOT IN tuple('Carlsen', 'Kasparov', 'Fis._er*'), "chess_goat", NULL) AS "aggr__1__key_0" + ORDER BY "aggr__1__count" DESC, "aggr__1__key_0" ASC + LIMIT 3`, + }, + { // [71] + TestName: "simplest terms with exclude (single value, no regex)", + QueryRequestJson: ` + { + "aggs": { + "1": { + "terms": { + "field": "agi_birth_year", + "size": 1, + "exclude": 2025 + } + } + }, + "size": 0, + "track_total_hits": true + }`, + // I omit "took", "timed_out", "_shards", and "hits" from the response for brevity (they can also be easily unit-tested) + ExpectedResponse: ` + { + "aggregations": { + "1": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 10700, + "buckets": [ + { + "key": 2024, + "doc_count": 3300 + } + ] + } + } + }`, + ExpectedPancakeResults: []model.QueryResultRow{ + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__1__parent_count", int64(14000)), + model.NewQueryResultCol("aggr__1__key_0", nil), + model.NewQueryResultCol("aggr__1__count", int64(10000)), + }}, + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__1__parent_count", int64(14000)), + model.NewQueryResultCol("aggr__1__key_0", 2024), + model.NewQueryResultCol("aggr__1__count", int64(3300)), + }}, + }, + ExpectedPancakeSQL: ` + SELECT sum(count(*)) OVER () AS "aggr__1__parent_count", + if("agi_birth_year"!=2025, "agi_birth_year", NULL) AS "aggr__1__key_0", + count(*) AS "aggr__1__count" + FROM __quesma_table_name + GROUP BY if("agi_birth_year"!=2025, "agi_birth_year", NULL) AS "aggr__1__key_0" + ORDER BY "aggr__1__count" DESC, "aggr__1__key_0" ASC + LIMIT 2`, + }, + { // [72] + TestName: "simplest terms with exclude (empty array)", + QueryRequestJson: ` + { + "aggs": { + "1": { + "terms": { + "field": "agi_birth_year", + "size": 1, + "exclude": [] + } + } + }, + "size": 0, + "track_total_hits": true + }`, + // I omit "took", "timed_out", "_shards", and "hits" from the response for brevity (they can also be easily unit-tested) + ExpectedResponse: ` + { + "aggregations": { + "1": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 700, + "buckets": [ + { + "key": 2024, + "doc_count": 300 + } + ] + } + } + }`, + ExpectedPancakeResults: 
[]model.QueryResultRow{ + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__1__parent_count", int64(1000)), + model.NewQueryResultCol("aggr__1__key_0", nil), + model.NewQueryResultCol("aggr__1__count", int64(600)), + }}, + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__1__parent_count", int64(1000)), + model.NewQueryResultCol("aggr__1__key_0", 2024), + model.NewQueryResultCol("aggr__1__count", int64(300)), + }}, + }, + ExpectedPancakeSQL: ` + SELECT sum(count(*)) OVER () AS "aggr__1__parent_count", + "agi_birth_year" AS "aggr__1__key_0", count(*) AS "aggr__1__count" + FROM __quesma_table_name + GROUP BY "agi_birth_year" AS "aggr__1__key_0" + ORDER BY "aggr__1__count" DESC, "aggr__1__key_0" ASC + LIMIT 2`, + }, + { // [73] + TestName: "simplest terms with exclude (of strings), regression test", + QueryRequestJson: ` + { + "aggs": { + "1": { + "terms": { + "field": "chess_goat", + "size": 1, + "exclude": ["abc"] + } + } + }, + "size": 0, + "track_total_hits": true + }`, + // I omit "took", "timed_out", "_shards", and "hits" from the response for brevity (they can also be easily unit-tested) + ExpectedResponse: ` + { + "aggregations": { + "1": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 700, + "buckets": [ + { + "key": 2024, + "doc_count": 300 + } + ] + } + } + }`, + ExpectedPancakeResults: []model.QueryResultRow{ + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__1__parent_count", int64(1000)), + model.NewQueryResultCol("aggr__1__key_0", nil), + model.NewQueryResultCol("aggr__1__count", int64(600)), + }}, + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__1__parent_count", int64(1000)), + model.NewQueryResultCol("aggr__1__key_0", 2024), + model.NewQueryResultCol("aggr__1__count", int64(300)), + }}, + }, + ExpectedPancakeSQL: ` + SELECT sum(count(*)) OVER () AS "aggr__1__parent_count", + if("chess_goat" NOT IN 'abc', "chess_goat", NULL) AS "aggr__1__key_0", + count(*) AS "aggr__1__count" + FROM __quesma_table_name + GROUP BY if("chess_goat" NOT IN 'abc', "chess_goat", NULL) AS "aggr__1__key_0" + ORDER BY "aggr__1__count" DESC, "aggr__1__key_0" ASC + LIMIT 2`, + }, + { // [74] + TestName: "terms with exclude (more complex, string field with exclude regex)", + // One simple test, for more regex tests see util/regex unit tests + QueryRequestJson: ` + { + "aggs": { + "1": { + "terms": { + "field": "chess_goat", + "size": 1, + "exclude": "K.*" + } + } + }, + "size": 0, + "track_total_hits": true + }`, + // I omit "took", "timed_out", "_shards", and "hits" from the response for brevity (they can also be easily unit-tested) + ExpectedResponse: ` + { + "aggregations": { + "1": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 1, + "buckets": [ + { + "key": "Paul Morphy", + "doc_count": 13999 + } + ] + } + } + }`, + ExpectedPancakeResults: []model.QueryResultRow{ + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__1__parent_count", int64(14000)), + model.NewQueryResultCol("aggr__1__key_0", "Paul Morphy"), + model.NewQueryResultCol("aggr__1__count", int64(13999)), + }}, + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__1__parent_count", int64(14000)), + model.NewQueryResultCol("aggr__1__key_0", nil), + model.NewQueryResultCol("aggr__1__count", int64(1)), + }}, + }, + ExpectedPancakeSQL: ` + SELECT sum(count(*)) OVER () AS "aggr__1__parent_count", + if("chess_goat" NOT LIKE 'K%', "chess_goat", NULL) AS "aggr__1__key_0", + count(*) AS "aggr__1__count" + FROM __quesma_table_name + 
GROUP BY if("chess_goat" NOT LIKE 'K%', "chess_goat", NULL) AS "aggr__1__key_0" + ORDER BY "aggr__1__count" DESC, "aggr__1__key_0" ASC + LIMIT 2`, + }, + { // [75] + TestName: "complex terms with exclude: nested terms + 2 metrics", + QueryRequestJson: ` + { + "aggs": { + "terms1": { + "aggs": { + "metric1": { + "avg": { + "field": "DistanceMiles" + } + }, + "terms2": { + "aggs": { + "metric2": { + "sum": { + "field": "AvgTicketPrice" + } + } + }, + "terms": { + "field": "DestCityName", + "size": 1 + } + } + }, + "terms": { + "exclude": [ + "a", + "b" + ], + "field": "Carrier", + "size": 2 + } + } + }, + "size": 0, + "track_total_hits": true + }`, + // I omit "took", "timed_out", "_shards", and "hits" from the response for brevity (they can also be easily unit-tested) + ExpectedResponse: ` + { + "aggregations": { + "terms1": { + "buckets": [ + { + "doc_count": 3323, + "key": "Logstash Airways", + "metric1": { + "value": 4451.946294580208 + }, + "terms2": { + "buckets": [ + { + "doc_count": 173, + "key": "Zurich", + "metric2": { + "value": 102370.42402648926 + } + } + ], + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 3150 + } + }, + { + "doc_count": 3261, + "key": "JetBeats", + "metric1": { + "value": 4434.670874554115 + }, + "terms2": { + "buckets": [ + { + "doc_count": 167, + "key": "Zurich", + "metric2": { + "value": 92215.76377868652 + } + } + ], + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 3094 + } + } + ], + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 6430 + } + } + }`, + ExpectedPancakeResults: []model.QueryResultRow{ + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__terms1__parent_count", int64(13014)), + model.NewQueryResultCol("aggr__terms1__key_0", "Logstash Airways"), + model.NewQueryResultCol("aggr__terms1__count", int64(3323)), + model.NewQueryResultCol("metric__terms1__metric1_col_0", 4451.946294580208), + model.NewQueryResultCol("aggr__terms1__terms2__parent_count", int64(3323)), + model.NewQueryResultCol("aggr__terms1__terms2__key_0", "Zurich"), + model.NewQueryResultCol("aggr__terms1__terms2__count", int64(173)), + model.NewQueryResultCol("metric__terms1__terms2__metric2_col_0", 102370.42402648926), + }}, + {Cols: []model.QueryResultCol{ // should be discarded by us because of terms2's size=1 + model.NewQueryResultCol("aggr__terms1__parent_count", int64(13014)), + model.NewQueryResultCol("aggr__terms1__key_0", "Logstash Airways"), + model.NewQueryResultCol("aggr__terms1__count", int64(3323)), + model.NewQueryResultCol("metric__terms1__metric1_col_0", 4451.946294580208), + model.NewQueryResultCol("aggr__terms1__terms2__parent_count", int64(3323)), + model.NewQueryResultCol("aggr__terms1__terms2__key_0", "Wąchock"), + model.NewQueryResultCol("aggr__terms1__terms2__count", int64(150)), + model.NewQueryResultCol("metric__terms1__terms2__metric2_col_0", nil), + }}, + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__terms1__parent_count", int64(13014)), + model.NewQueryResultCol("aggr__terms1__key_0", "JetBeats"), + model.NewQueryResultCol("aggr__terms1__count", int64(3261)), + model.NewQueryResultCol("metric__terms1__metric1_col_0", 4434.670878262596), + model.NewQueryResultCol("aggr__terms1__terms2__parent_count", int64(3261)), + model.NewQueryResultCol("aggr__terms1__terms2__key_0", "Zurich"), + model.NewQueryResultCol("aggr__terms1__terms2__count", int64(167)), + model.NewQueryResultCol("metric__terms1__terms2__metric2_col_0", 92215.763779), + }}, + {Cols: []model.QueryResultCol{ // should be 
discarded by us because of terms2's size=1 + model.NewQueryResultCol("aggr__terms1__parent_count", int64(13014)), + model.NewQueryResultCol("aggr__terms1__key_0", "JetBeats"), + model.NewQueryResultCol("aggr__terms1__count", int64(3261)), + model.NewQueryResultCol("metric__terms1__metric1_col_0", 4434.670878262596), + model.NewQueryResultCol("aggr__terms1__terms2__parent_count", int64(3261)), + model.NewQueryResultCol("aggr__terms1__terms2__key_0", "Wąchock"), + model.NewQueryResultCol("aggr__terms1__terms2__count", int64(147)), + model.NewQueryResultCol("metric__terms1__terms2__metric2_col_0", 90242.31663285477), + }}, + {Cols: []model.QueryResultCol{ // should be discarded by us because of terms1's size=2 + model.NewQueryResultCol("aggr__terms1__parent_count", int64(13014)), + model.NewQueryResultCol("aggr__terms1__key_0", "Kibana Airlines"), + model.NewQueryResultCol("aggr__terms1__count", int64(3219)), + model.NewQueryResultCol("metric__terms1__metric1_col_0", 4335.019248495363), + model.NewQueryResultCol("aggr__terms1__terms2__parent_count", int64(3219)), + model.NewQueryResultCol("aggr__terms1__terms2__key_0", "Zurich"), + model.NewQueryResultCol("aggr__terms1__terms2__count", int64(173)), + model.NewQueryResultCol("metric__terms1__terms2__metric2_col_0", 99314.3501429406), + }}, + }, + ExpectedPancakeSQL: ` + SELECT "aggr__terms1__parent_count", "aggr__terms1__key_0", + "aggr__terms1__count", "metric__terms1__metric1_col_0", + "aggr__terms1__terms2__parent_count", "aggr__terms1__terms2__key_0", + "aggr__terms1__terms2__count", "metric__terms1__terms2__metric2_col_0" + FROM ( + SELECT "aggr__terms1__parent_count", "aggr__terms1__key_0", + "aggr__terms1__count", "metric__terms1__metric1_col_0", + "aggr__terms1__terms2__parent_count", "aggr__terms1__terms2__key_0", + "aggr__terms1__terms2__count", "metric__terms1__terms2__metric2_col_0", + dense_rank() OVER (ORDER BY "aggr__terms1__count" DESC, + "aggr__terms1__key_0" ASC) AS "aggr__terms1__order_1_rank", + dense_rank() OVER (PARTITION BY "aggr__terms1__key_0" ORDER BY + "aggr__terms1__terms2__count" DESC, "aggr__terms1__terms2__key_0" ASC) AS + "aggr__terms1__terms2__order_1_rank" + FROM ( + SELECT sum(count(*)) OVER () AS "aggr__terms1__parent_count", + if("Carrier" NOT IN tuple('a', 'b'), "Carrier", NULL) AS "aggr__terms1__key_0", + sum(count(*)) OVER (PARTITION BY "aggr__terms1__key_0") AS + "aggr__terms1__count", + avgOrNullMerge(avgOrNullState("DistanceMiles")) OVER (PARTITION BY + "aggr__terms1__key_0") AS "metric__terms1__metric1_col_0", + sum(count(*)) OVER (PARTITION BY "aggr__terms1__key_0") AS + "aggr__terms1__terms2__parent_count", + "DestCityName" AS "aggr__terms1__terms2__key_0", + count(*) AS "aggr__terms1__terms2__count", + sumOrNull("AvgTicketPrice") AS "metric__terms1__terms2__metric2_col_0" + FROM __quesma_table_name + GROUP BY if("Carrier" NOT IN tuple('a', 'b'), "Carrier", NULL) AS + "aggr__terms1__key_0", "DestCityName" AS "aggr__terms1__terms2__key_0")) + WHERE ("aggr__terms1__order_1_rank"<=3 AND "aggr__terms1__terms2__order_1_rank" + <=2) + ORDER BY "aggr__terms1__order_1_rank" ASC, + "aggr__terms1__terms2__order_1_rank" ASC`, + }, + { // [76] + TestName: "terms with exclude, but with branched off aggregation tree", + QueryRequestJson: ` + { + "aggs": { + "terms1": { + "aggs": { + "metric1": { + "avg": { + "field": "DistanceMiles" + } + } + }, + "terms": { + "exclude": [ + "a", + "b" + ], + "field": "Carrier", + "size": 1 + } + }, + "terms2": { + "aggs": { + "metric1": { + "avg": { + "field": "DistanceMiles" + 
} + } + }, + "terms": { + "exclude": [ + "Logstash Airways", + ".*" + ], + "field": "Carrier", + "size": 2 + } + } + }, + "size": 0, + "track_total_hits": true + }`, + // Unlike in the tests above, the full response (including "took", "timed_out", "_shards", and "hits") is kept here + ExpectedResponse: ` + { + "_shards": { + "failed": 0, + "skipped": 0, + "successful": 1, + "total": 1 + }, + "aggregations": { + "terms1": { + "buckets": [ + { + "doc_count": 3323, + "key": "Logstash Airways", + "metric1": { + "value": 4451.946294580208 + } + } + ], + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 9691 + }, + "terms2": { + "buckets": [ + { + "doc_count": 3261, + "key": "JetBeats", + "metric1": { + "value": 4434.670874554115 + } + }, + { + "doc_count": 3219, + "key": "Kibana Airlines", + "metric1": { + "value": 4335.019245198367 + } + } + ], + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 6534 + } + }, + "hits": { + "hits": [], + "max_score": null, + "total": { + "relation": "eq", + "value": 13014 + } + }, + "timed_out": false, + "took": 18 + }`, + ExpectedPancakeResults: []model.QueryResultRow{ + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__terms1__parent_count", int64(13014)), + model.NewQueryResultCol("aggr__terms1__key_0", "Logstash Airways"), + model.NewQueryResultCol("aggr__terms1__count", int64(3323)), + model.NewQueryResultCol("metric__terms1__metric1_col_0", 4451.946294580208), + }}, + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__terms1__parent_count", int64(13014)), + model.NewQueryResultCol("aggr__terms1__key_0", "Discard"), + model.NewQueryResultCol("aggr__terms1__count", int64(5)), + model.NewQueryResultCol("metric__terms1__metric1_col_0", 6.20), + }}, + }, + ExpectedPancakeSQL: ` + SELECT sum(count(*)) OVER () AS "aggr__terms1__parent_count", + if("Carrier" NOT IN tuple('a', 'b'), "Carrier", NULL) AS "aggr__terms1__key_0" + , count(*) AS "aggr__terms1__count", + avgOrNull("DistanceMiles") AS "metric__terms1__metric1_col_0" + FROM __quesma_table_name + GROUP BY if("Carrier" NOT IN tuple('a', 'b'), "Carrier", NULL) AS + "aggr__terms1__key_0" + ORDER BY "aggr__terms1__count" DESC, "aggr__terms1__key_0" ASC + LIMIT 2`, + ExpectedAdditionalPancakeResults: [][]model.QueryResultRow{{ + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__terms2__parent_count", int64(13014)), + model.NewQueryResultCol("aggr__terms2__key_0", "JetBeats"), + model.NewQueryResultCol("aggr__terms2__count", int64(3261)), + model.NewQueryResultCol("metric__terms2__metric1_col_0", 4434.670874554115), + }}, + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__terms2__parent_count", int64(13014)), + model.NewQueryResultCol("aggr__terms2__key_0", "Kibana Airlines"), + model.NewQueryResultCol("aggr__terms2__count", int64(3219)), + model.NewQueryResultCol("metric__terms2__metric1_col_0", 4335.019245198367), + }}, + {Cols: []model.QueryResultCol{ + model.NewQueryResultCol("aggr__terms2__parent_count", int64(13014)), + model.NewQueryResultCol("aggr__terms2__key_0", "Discard"), + model.NewQueryResultCol("aggr__terms2__count", int64(11)), + model.NewQueryResultCol("metric__terms2__metric1_col_0", 42), + }}, + }}, + ExpectedAdditionalPancakeSQLs: []string{` + SELECT sum(count(*)) OVER () AS "aggr__terms2__parent_count", + if("Carrier" NOT IN tuple('Logstash Airways', '.*'), "Carrier", NULL) AS + "aggr__terms2__key_0", count(*) AS "aggr__terms2__count", + avgOrNull("DistanceMiles") AS "metric__terms2__metric1_col_0" + FROM
__quesma_table_name + GROUP BY if("Carrier" NOT IN tuple('Logstash Airways', '.*'), "Carrier", NULL) + AS "aggr__terms2__key_0" + ORDER BY "aggr__terms2__count" DESC, "aggr__terms2__key_0" ASC + LIMIT 3`}, + }, } diff --git a/quesma/util/regex/regex.go b/quesma/util/regex/regex.go new file mode 100644 index 000000000..e5a42aa89 --- /dev/null +++ b/quesma/util/regex/regex.go @@ -0,0 +1,40 @@ +// Copyright Quesma, licensed under the Elastic License 2.0. +// SPDX-License-Identifier: Elastic-2.0 +package regex + +import ( + "quesma/model" + "strings" +) + +// ToClickhouseExpr converts a regex pattern into a Clickhouse operator ("LIKE" or "REGEXP") and the corresponding pattern literal. +// It's our old heuristic; it may need to be improved. +func ToClickhouseExpr(pattern string) (clickhouseFuncName string, patternExpr model.Expr) { + // "really simple" == out of all special characters, only . and .* may be present + isPatternReallySimple := func(pattern string) bool { + // no special characters other than . and * are allowed; also (a less important check) * can't be the first character + if strings.ContainsAny(pattern, `?+|{}[]()"\`) || (len(pattern) > 0 && pattern[0] == '*') { + return false + } + // .* is allowed, but [any other char]* is not + for i, char := range pattern[1:] { + prevChar := pattern[i] + if char == '*' && prevChar != '.' { + return false + } + } + return true + } + + var funcName string + if isPatternReallySimple(pattern) { + pattern = strings.ReplaceAll(pattern, "_", `\_`) + pattern = strings.ReplaceAll(pattern, ".*", "%") + pattern = strings.ReplaceAll(pattern, ".", "_") + funcName = "LIKE" + } else { // Clickhouse's REGEXP is much slower, so we use it only for complex regexps + funcName = "REGEXP" + } + + return funcName, model.NewLiteral("'" + pattern + "'") +}
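For illustration, a minimal sketch of how ToClickhouseExpr above classifies a few patterns (it assumes the quesma/util/regex package added in this diff; the expected literals in the comments follow from the ReplaceAll rules and test [74], and are not printed by the program):

package main

import (
	"fmt"

	"quesma/util/regex"
)

func main() {
	// "Really simple" patterns (only . and .* as special characters) become LIKE:
	//   .* -> %, . -> _, and a literal _ is escaped as \_ .
	// Everything else falls back to the slower REGEXP with the pattern unchanged.
	for _, pattern := range []string{"K.*", "Fis._er", "K+.*"} {
		funcName, _ := regex.ToClickhouseExpr(pattern)
		fmt.Printf("%-8s => %s\n", pattern, funcName)
	}
	// Per the rules above, the returned pattern literals would be:
	//   K.*     => LIKE 'K%'        (.* becomes %)
	//   Fis._er => LIKE 'Fis_\_er'  (. becomes _, the literal _ is escaped)
	//   K+.*    => REGEXP 'K+.*'    (+ makes it complex, pattern unchanged)
}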