From d085fa58b01335984838223f311e20aab418d860 Mon Sep 17 00:00:00 2001 From: Piotr Grabowski Date: Wed, 4 Dec 2024 11:26:20 +0100 Subject: [PATCH] Fix `match` query for integer fields (#1037) The `match` query is in most cases a full-text search, e.g.: ```json { "query": { "match": { "message": "this is a test" } } } ``` However, it can also be used to find an exact value against a integer field: ```json { "query": { "match": { "products_count": "5" } } } ``` In such case Quesma would generate an invalid SQL, trying to do an `ILIKE` against an `Int64` column: ``` Illegal type Int64 of argument of function ilike ``` Fix the issue by introducing an internal __quesma_match operator and a transformation which transforms it either to `ILIKE` or `=`. Fixes #1018 --------- Co-authored-by: Krzysztof Kiewicz --- quesma/model/constants.go | 1 + quesma/model/expr_string_renderer.go | 2 +- quesma/model/highlighter.go | 4 +- quesma/queryparser/query_parser.go | 2 +- quesma/quesma/schema_transformer.go | 45 ++++++++++ quesma/quesma/schema_transformer_test.go | 104 +++++++++++++++++++++++ quesma/testdata/aggregation_requests.go | 18 ++-- quesma/testdata/clients/clover.go | 2 +- quesma/testdata/requests.go | 18 ++-- 9 files changed, 173 insertions(+), 23 deletions(-) diff --git a/quesma/model/constants.go b/quesma/model/constants.go index 1ea242c27..a409d9c24 100644 --- a/quesma/model/constants.go +++ b/quesma/model/constants.go @@ -8,4 +8,5 @@ const ( TimestampFieldName = "@timestamp" DateHourFunction = "__quesma_date_hour" + MatchOperator = "__quesma_match" ) diff --git a/quesma/model/expr_string_renderer.go b/quesma/model/expr_string_renderer.go index 448feff33..1d3f465b9 100644 --- a/quesma/model/expr_string_renderer.go +++ b/quesma/model/expr_string_renderer.go @@ -82,7 +82,7 @@ func (v *renderer) VisitInfix(e InfixExpr) interface{} { } // This might look like a strange heuristics to but is aligned with the way we are currently generating the statement // I think in the future every infix op should be in braces. - if e.Op == "AND" || e.Op == "OR" { + if strings.HasPrefix(e.Op, "_") || e.Op == "AND" || e.Op == "OR" { return fmt.Sprintf("(%v %v %v)", lhs, e.Op, rhs) } else if strings.Contains(e.Op, "LIKE") || e.Op == "IS" || e.Op == "IN" || e.Op == "REGEXP" { return fmt.Sprintf("%v %v %v", lhs, e.Op, rhs) diff --git a/quesma/model/highlighter.go b/quesma/model/highlighter.go index e52b192bc..d716750e4 100644 --- a/quesma/model/highlighter.go +++ b/quesma/model/highlighter.go @@ -57,13 +57,13 @@ func (h *Highlighter) SetTokensToHighlight(selectCmd SelectCommand) { visitor.OverrideVisitInfix = func(b *BaseExprVisitor, e InfixExpr) interface{} { switch e.Op { - case "iLIKE", "LIKE", "IN", "=": + case "iLIKE", "LIKE", "IN", "=", MatchOperator: lhs, isColumnRef := e.Left.(ColumnRef) rhs, isLiteral := e.Right.(LiteralExpr) if isLiteral && isColumnRef { // we only highlight in this case switch literalAsString := rhs.Value.(type) { case string: - literalAsString = strings.TrimPrefix(literalAsString, "'%") + literalAsString = strings.TrimPrefix(literalAsString, "'") literalAsString = strings.TrimPrefix(literalAsString, "%") literalAsString = strings.TrimSuffix(literalAsString, "'") literalAsString = strings.TrimSuffix(literalAsString, "%") diff --git a/quesma/queryparser/query_parser.go b/quesma/queryparser/query_parser.go index 11b3415fd..8619c4304 100644 --- a/quesma/queryparser/query_parser.go +++ b/quesma/queryparser/query_parser.go @@ -588,7 +588,7 @@ func (cw *ClickhouseQueryTranslator) parseMatch(queryMap QueryMap, matchPhrase b computedIdMatchingQuery := cw.parseIds(QueryMap{"values": []interface{}{subQuery}}) statements = append(statements, computedIdMatchingQuery.WhereClause) } else { - simpleStat := model.NewInfixExpr(model.NewColumnRef(fieldName), "iLIKE", model.NewLiteral("'%"+subQuery+"%'")) + simpleStat := model.NewInfixExpr(model.NewColumnRef(fieldName), model.MatchOperator, model.NewLiteral("'"+subQuery+"'")) statements = append(statements, simpleStat) } } diff --git a/quesma/quesma/schema_transformer.go b/quesma/quesma/schema_transformer.go index 9f6e3ad36..164deba1c 100644 --- a/quesma/quesma/schema_transformer.go +++ b/quesma/quesma/schema_transformer.go @@ -751,6 +751,7 @@ func (s *SchemaCheckPass) Transform(queries []*model.Query) ([]*model.Query, err {TransformationName: "GeoTransformation", Transformation: s.applyGeoTransformations}, {TransformationName: "ArrayTransformation", Transformation: s.applyArrayTransformations}, {TransformationName: "MapTransformation", Transformation: s.applyMapTransformations}, + {TransformationName: "MatchOperatorTransformation", Transformation: s.applyMatchOperator}, // Section 4: compensations and checks {TransformationName: "BooleanLiteralTransformation", Transformation: s.applyBooleanLiteralLowering}, @@ -789,3 +790,47 @@ func (s *SchemaCheckPass) Transform(queries []*model.Query) ([]*model.Query, err } return queries, nil } + +func (s *SchemaCheckPass) applyMatchOperator(indexSchema schema.Schema, query *model.Query) (*model.Query, error) { + + visitor := model.NewBaseVisitor() + + var err error + + visitor.OverrideVisitInfix = func(b *model.BaseExprVisitor, e model.InfixExpr) interface{} { + lhs, ok := e.Left.(model.ColumnRef) + rhs, ok2 := e.Right.(model.LiteralExpr) + + if ok && ok2 && e.Op == model.MatchOperator { + field, found := indexSchema.ResolveFieldByInternalName(lhs.ColumnName) + if !found { + logger.Error().Msgf("Field %s not found in schema for table %s, should never happen here", lhs.ColumnName, query.TableName) + } + + rhsValue := rhs.Value.(string) + rhsValue = strings.TrimPrefix(rhsValue, "'") + rhsValue = strings.TrimSuffix(rhsValue, "'") + + switch field.Type.String() { + case schema.QuesmaTypeInteger.Name, schema.QuesmaTypeLong.Name, schema.QuesmaTypeUnsignedLong.Name, schema.QuesmaTypeBoolean.Name: + return model.NewInfixExpr(lhs, "=", model.NewLiteral(rhsValue)) + default: + return model.NewInfixExpr(lhs, "iLIKE", model.NewLiteral("'%"+rhsValue+"%'")) + } + } + + return model.NewInfixExpr(e.Left.Accept(b).(model.Expr), e.Op, e.Right.Accept(b).(model.Expr)) + } + + expr := query.SelectCommand.Accept(visitor) + + if err != nil { + return nil, err + } + + if _, ok := expr.(*model.SelectCommand); ok { + query.SelectCommand = *expr.(*model.SelectCommand) + } + return query, nil + +} diff --git a/quesma/quesma/schema_transformer_test.go b/quesma/quesma/schema_transformer_test.go index bf2510f63..565f27d0b 100644 --- a/quesma/quesma/schema_transformer_test.go +++ b/quesma/quesma/schema_transformer_test.go @@ -979,3 +979,107 @@ func TestFullTextFields(t *testing.T) { }) } } + +func Test_applyMatchOperator(t *testing.T) { + schemaTable := schema.Table{ + Columns: map[string]schema.Column{ + "message": {Name: "message", Type: "String"}, + "count": {Name: "count", Type: "Int64"}, + }, + } + + tests := []struct { + name string + query *model.Query + expected *model.Query + }{ + { + name: "match operator transformation for String (ILIKE)", + query: &model.Query{ + TableName: "test", + SelectCommand: model.SelectCommand{ + FromClause: model.NewTableRef("test"), + Columns: []model.Expr{model.NewColumnRef("message")}, + WhereClause: model.NewInfixExpr( + model.NewColumnRef("message"), + model.MatchOperator, + model.NewLiteral("'needle'"), + ), + }, + }, + expected: &model.Query{ + TableName: "test", + SelectCommand: model.SelectCommand{ + FromClause: model.NewTableRef("test"), + Columns: []model.Expr{model.NewColumnRef("message")}, + WhereClause: model.NewInfixExpr( + model.NewColumnRef("message"), + "iLIKE", + model.NewLiteral("'%needle%'"), + ), + }, + }, + }, + { + name: "match operator transformation for Int64 (=)", + query: &model.Query{ + TableName: "test", + SelectCommand: model.SelectCommand{ + FromClause: model.NewTableRef("test"), + Columns: []model.Expr{model.NewColumnRef("message")}, + WhereClause: model.NewInfixExpr( + model.NewColumnRef("count"), + model.MatchOperator, + model.NewLiteral("'123'"), + ), + }, + }, + expected: &model.Query{ + TableName: "test", + SelectCommand: model.SelectCommand{ + FromClause: model.NewTableRef("test"), + Columns: []model.Expr{model.NewColumnRef("message")}, + WhereClause: model.NewInfixExpr( + model.NewColumnRef("count"), + "=", + model.NewLiteral("123"), + ), + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tableDiscovery := + fixedTableProvider{tables: map[string]schema.Table{ + "test": schemaTable, + }} + + indexConfig := map[string]config.IndexConfiguration{ + "test": { + Name: "test", + }, + } + + cfg := config.QuesmaConfiguration{ + IndexConfig: indexConfig, + } + + s := schema.NewSchemaRegistry(tableDiscovery, &cfg, clickhouse.SchemaTypeAdapter{}) + transform := &SchemaCheckPass{cfg: &cfg} + + indexSchema, ok := s.FindSchema("test") + if !ok { + t.Fatal("schema not found") + } + + actual, err := transform.applyMatchOperator(indexSchema, tt.query) + if err != nil { + t.Fatal(err) + } + + assert.Equal(t, model.AsString(tt.expected.SelectCommand), model.AsString(actual.SelectCommand)) + }) + } +} diff --git a/quesma/testdata/aggregation_requests.go b/quesma/testdata/aggregation_requests.go index c8b5e9591..2ed685ae1 100644 --- a/quesma/testdata/aggregation_requests.go +++ b/quesma/testdata/aggregation_requests.go @@ -1844,7 +1844,7 @@ var AggregationTests = []AggregationTestCase{ "@timestamp", 'Europe/Warsaw'))*1000) / 10800000) AS "aggr__0__1__key_0", count(*) AS "aggr__0__1__count" FROM __quesma_table_name - WHERE ("host.name" iLIKE '%prometheus%' AND ("@timestamp">= + WHERE (("host.name" __quesma_match 'prometheus') AND ("@timestamp">= fromUnixTimestamp64Milli(1706891809940) AND "@timestamp"<= fromUnixTimestamp64Milli(1707496609940))) GROUP BY "severity" AS "aggr__0__key_0", @@ -2848,8 +2848,8 @@ var AggregationTests = []AggregationTestCase{ "@timestamp") AS "metric__earliest_timestamp_col_0", maxOrNull("@timestamp") AS "metric__latest_timestamp_col_0" FROM ` + TableName + ` - WHERE ((` + fullTextFieldName + ` iLIKE '%posei%' AND "message" iLIKE '%User logged out%') AND - "host.name" iLIKE '%poseidon%')`, + WHERE ((` + fullTextFieldName + ` iLIKE '%posei%' AND ("message" __quesma_match 'User logged out')) AND + ("host.name" __quesma_match 'poseidon'))`, }, { // [15] TestName: "date_histogram: regression test", @@ -6195,7 +6195,7 @@ var AggregationTests = []AggregationTestCase{ "aggr__0__1__parent_count", "message" AS "aggr__0__1__key_0", count(*) AS "aggr__0__1__count" FROM __quesma_table_name - WHERE ("message" IS NOT NULL AND NOT ("message" iLIKE '%US%')) + WHERE ("message" IS NOT NULL AND NOT (("message" __quesma_match 'US'))) GROUP BY "host.name" AS "aggr__0__key_0", "message" AS "aggr__0__1__key_0")) WHERE ("aggr__0__order_1_rank"<=11 AND "aggr__0__1__order_1_rank"<=4) ORDER BY "aggr__0__order_1_rank" ASC, "aggr__0__1__order_1_rank" ASC`, @@ -6311,7 +6311,7 @@ var AggregationTests = []AggregationTestCase{ "aggr__0__1__2__parent_count", "message" AS "aggr__0__1__2__key_0", count(*) AS "aggr__0__1__2__count" FROM __quesma_table_name - WHERE ("message" IS NOT NULL AND NOT ("message" iLIKE '%US%')) + WHERE ("message" IS NOT NULL AND NOT (("message" __quesma_match 'US'))) GROUP BY "host.name" AS "aggr__0__key_0", "message" AS "aggr__0__1__key_0", "message" AS "aggr__0__1__2__key_0")) WHERE (("aggr__0__order_1_rank"<=11 AND "aggr__0__1__order_1_rank"<=4) AND @@ -6403,7 +6403,7 @@ var AggregationTests = []AggregationTestCase{ sum(count(*)) OVER (PARTITION BY "aggr__0__key_0") AS "aggr__0__count", "FlightDelayMin" AS "aggr__0__1__key_0", count(*) AS "aggr__0__1__count" FROM ` + TableName + ` - WHERE ("message" IS NOT NULL AND NOT ("message" iLIKE '%US%')) + WHERE ("message" IS NOT NULL AND NOT (("message" __quesma_match 'US'))) GROUP BY "host.name" AS "aggr__0__key_0", "FlightDelayMin" AS "aggr__0__1__key_0")) WHERE "aggr__0__order_1_rank"<=9 @@ -6513,7 +6513,7 @@ var AggregationTests = []AggregationTestCase{ sum(count(*)) OVER (PARTITION BY "aggr__0__key_0") AS "aggr__0__count", "FlightDelayMin" AS "aggr__0__1__key_0", count(*) AS "aggr__0__1__count" FROM ` + TableName + ` - WHERE ("message" IS NOT NULL AND NOT ("message" iLIKE '%US%')) + WHERE ("message" IS NOT NULL AND NOT (("message" __quesma_match 'US'))) GROUP BY "host.name" AS "aggr__0__key_0", "FlightDelayMin" AS "aggr__0__1__key_0")) WHERE "aggr__0__order_1_rank"<=11 @@ -6610,7 +6610,7 @@ var AggregationTests = []AggregationTestCase{ sum(count(*)) OVER (PARTITION BY "aggr__0__key_0") AS "aggr__0__count", "FlightDelayMin" AS "aggr__0__1__key_0", count(*) AS "aggr__0__1__count" FROM __quesma_table_name - WHERE ("message" IS NOT NULL AND NOT ("message" iLIKE '%US%')) + WHERE ("message" IS NOT NULL AND NOT (("message" __quesma_match 'US'))) GROUP BY "host.name" AS "aggr__0__key_0", "FlightDelayMin" AS "aggr__0__1__key_0")) WHERE "aggr__0__order_1_rank"<=11 @@ -6826,7 +6826,7 @@ var AggregationTests = []AggregationTestCase{ count(*) AS "aggr__2__count", sumOrNull("total") AS "metric__2__1_col_0" FROM ` + TableName + ` - WHERE NOT ((("abc">=0 AND "abc"<600) OR "type" iLIKE '%def%')) + WHERE NOT ((("abc">=0 AND "abc"<600) OR ("type" __quesma_match 'def'))) GROUP BY "name" AS "aggr__2__key_0" ORDER BY "metric__2__1_col_0" DESC, "aggr__2__key_0" ASC LIMIT 11`, diff --git a/quesma/testdata/clients/clover.go b/quesma/testdata/clients/clover.go index 2bc58c61e..bb1ec73e6 100644 --- a/quesma/testdata/clients/clover.go +++ b/quesma/testdata/clients/clover.go @@ -765,7 +765,7 @@ var CloverTests = []testdata.AggregationTestCase{ "field" AS "aggr__other-filter__3__key_0", count(*) AS "aggr__other-filter__3__count" FROM __quesma_table_name - WHERE ("a" iLIKE '%b%' AND "c" iLIKE '%d%') + WHERE (("a" __quesma_match 'b') AND ("c" __quesma_match 'd')) GROUP BY "field" AS "aggr__other-filter__3__key_0" ORDER BY "aggr__other-filter__3__count" DESC, "aggr__other-filter__3__key_0" ASC diff --git a/quesma/testdata/requests.go b/quesma/testdata/requests.go index 52a81085b..99b45df94 100644 --- a/quesma/testdata/requests.go +++ b/quesma/testdata/requests.go @@ -1131,7 +1131,7 @@ var TestsSearch = []SearchTestCase{ }, "track_total_hits": false }`, - []string{`"host_name" iLIKE '%prometheus%'`}, + []string{`("host_name" __quesma_match 'prometheus')`}, model.ListAllFields, []string{`SELECT "message" FROM ` + TableName + ` WHERE "host_name" iLIKE '%prometheus%' LIMIT 10`}, []string{}, @@ -1148,7 +1148,7 @@ var TestsSearch = []SearchTestCase{ "size": 100, "track_total_hits": false }`, - []string{`((("message" iLIKE '%this%' OR "message" iLIKE '%is%') OR "message" iLIKE '%a%') OR "message" iLIKE '%test%')`}, + []string{`(((("message" __quesma_match 'this') OR ("message" __quesma_match 'is')) OR ("message" __quesma_match 'a')) OR ("message" __quesma_match 'test'))`}, model.ListAllFields, []string{ `SELECT "message" FROM ` + TableName + ` WHERE ((("message" iLIKE '%this%' OR "message" iLIKE '%is%') ` + @@ -1405,7 +1405,7 @@ var TestsSearch = []SearchTestCase{ }, "track_total_hits": false }`, - []string{`"message" iLIKE '%this is a test%'`}, + []string{`("message" __quesma_match 'this is a test')`}, model.ListAllFields, []string{`SELECT "message" FROM ` + TableName + ` WHERE "message" iLIKE '%this is a test%'`}, []string{}, @@ -1423,7 +1423,7 @@ var TestsSearch = []SearchTestCase{ }, "track_total_hits": false }`, - []string{`"message" iLIKE '%this is a test%'`}, + []string{`("message" __quesma_match 'this is a test')`}, model.ListAllFields, []string{`SELECT "message" FROM ` + TableName + ` WHERE "message" iLIKE '%this is a test%'`}, []string{}, @@ -1687,9 +1687,9 @@ var TestsSearch = []SearchTestCase{ "track_total_hits": true }`, []string{ - `(("message" iLIKE '%User logged out%' AND "host.name" iLIKE '%poseidon%') ` + + `((("message" __quesma_match 'User logged out') AND ("host.name" __quesma_match 'poseidon')) ` + `AND ("@timestamp">=fromUnixTimestamp64Milli(1706542596491) AND "@timestamp"<=fromUnixTimestamp64Milli(1706551896491)))`, - `((("message" iLIKE '%User logged out%' AND "host.name" iLIKE '%poseidon%') ` + + `(((("message" __quesma_match 'User logged out') AND ("host.name" __quesma_match 'poseidon')) ` + `AND ("@timestamp">=fromUnixTimestamp64Milli(1706542596491) AND "@timestamp"<=fromUnixTimestamp64Milli(1706551896491))) ` + `AND "stream.namespace" IS NOT NULL)`, }, @@ -1847,10 +1847,10 @@ var TestsSearch = []SearchTestCase{ "timeout": "1000ms" }`, []string{ - `((("message" iLIKE '%User logged out%' AND "host.name" iLIKE '%poseidon%') ` + + `(((("message" __quesma_match 'User logged out') AND ("host.name" __quesma_match 'poseidon')) ` + `AND ("@timestamp">=fromUnixTimestamp64Milli(1706542596491) AND "@timestamp"<=fromUnixTimestamp64Milli(1706551896491))) ` + `AND "namespace" IS NOT NULL)`, - `(("message" iLIKE '%User logged out%' AND "host.name" iLIKE '%poseidon%') ` + + `((("message" __quesma_match 'User logged out') AND ("host.name" __quesma_match 'poseidon')) ` + `AND ("@timestamp">=fromUnixTimestamp64Milli(1706542596491) AND "@timestamp"<=fromUnixTimestamp64Milli(1706551896491)))`, }, model.Normal, @@ -2085,7 +2085,7 @@ var TestsSearch = []SearchTestCase{ "track_total_hits": false, "size": 12 }`, - []string{`("message" iLIKE '%User logged out%' AND "message" iLIKE '%User logged out%')`}, + []string{`(("message" __quesma_match 'User logged out') AND ("message" __quesma_match 'User logged out'))`}, model.ListAllFields, []string{ `SELECT "message" ` +