Skip to content

Commit

Permalink
Terms: start supporting exclude param (#1140)
Browse files Browse the repository at this point in the history
#1085

TODO (next PRs):
* `include` parameter can probably be quite easily handled the same way
* our regex translator is pretty basic, it requires tests + possibly a
better implementation

Issue from the issue works fine now:
<img width="1420" alt="Screenshot 2024-12-27 at 01 30 32"
src="https://github.com/user-attachments/assets/2b943135-7d10-4dd7-b418-600cd3a2632c"
/>
<img width="1416" alt="Screenshot 2024-12-27 at 01 32 17"
src="https://github.com/user-attachments/assets/39b82688-1816-42b9-8823-77a93af6e108"
/>
  • Loading branch information
trzysiek authored Dec 28, 2024
1 parent c7f0c19 commit c5d1baa
Show file tree
Hide file tree
Showing 8 changed files with 795 additions and 43 deletions.
4 changes: 2 additions & 2 deletions quesma/model/bucket_aggregations/ip_range.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ func (interval IpInterval) ToWhereClause(field model.Expr) model.Expr {
isBegin := interval.begin != UnboundedInterval
isEnd := interval.end != UnboundedInterval && interval.end != BiggestIpv4

begin := model.NewInfixExpr(field, ">=", model.NewLiteralSingleQuoted(interval.begin))
end := model.NewInfixExpr(field, "<", model.NewLiteralSingleQuoted(interval.end))
begin := model.NewInfixExpr(field, ">=", model.NewLiteralSingleQuoteString(interval.begin))
end := model.NewInfixExpr(field, "<", model.NewLiteralSingleQuoteString(interval.end))

if isBegin && isEnd {
return model.NewInfixExpr(begin, "AND", end)
Expand Down
119 changes: 116 additions & 3 deletions quesma/model/bucket_aggregations/terms.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,31 @@ package bucket_aggregations

import (
"context"
"fmt"
"quesma/logger"
"quesma/model"
"quesma/util"
"quesma/util/regex"
"reflect"
)

type Terms struct {
ctx context.Context
significant bool // true <=> significant_terms, false <=> terms
OrderByExpr model.Expr
// include is either:
// - single value: then for strings, it can be a regex.
// - array: then field must match exactly one of the values (never a regex)
// Nil if missing in request.
include any
// exclude is either:
// - single value: then for strings, it can be a regex.
// - array: then field must match exactly one of the values (never a regex)
// Nil if missing in request.
exclude any
}

func NewTerms(ctx context.Context, significant bool, orderByExpr model.Expr) Terms {
return Terms{ctx: ctx, significant: significant, OrderByExpr: orderByExpr}
func NewTerms(ctx context.Context, significant bool, include, exclude any) Terms {
return Terms{ctx: ctx, significant: significant, include: include, exclude: exclude}
}

func (query Terms) AggregationType() model.AggregationType {
Expand Down Expand Up @@ -106,3 +118,104 @@ func (query Terms) key(row model.QueryResultRow) any {
func (query Terms) parentCount(row model.QueryResultRow) any {
return row.Cols[len(row.Cols)-3].Value
}

func (query Terms) UpdateFieldForIncludeAndExclude(field model.Expr) (updatedField model.Expr, didWeUpdateField bool) {
// We'll use here everywhere Clickhouse 'if' function: if(condition, then, else)
// In our case field becomes: if(condition that field is not excluded, field, NULL)
ifOrNull := func(condition model.Expr) model.FunctionExpr {
return model.NewFunction("if", condition, field, model.NullExpr)
}

hasExclude := query.exclude != nil
excludeArr, excludeIsArray := query.exclude.([]any)
switch {
case hasExclude && excludeIsArray:
if len(excludeArr) == 0 {
return field, false
}

// Select expr will be: if(field NOT IN (excludeArr[0], excludeArr[1], ...), field, NULL)
exprs := make([]model.Expr, 0, len(excludeArr))
for _, excludeVal := range excludeArr {
exprs = append(exprs, model.NewLiteralSingleQuoteString(excludeVal))
}
return ifOrNull(model.NewInfixExpr(field, "NOT IN", model.NewTupleExpr(exprs...))), true
case hasExclude:
switch exclude := query.exclude.(type) {
case string: // hard case, might be regex
funcName, patternExpr := regex.ToClickhouseExpr(exclude)
return ifOrNull(model.NewInfixExpr(field, "NOT "+funcName, patternExpr)), true
default: // easy case, never regex
return ifOrNull(model.NewInfixExpr(field, "!=", model.NewLiteral(query.exclude))), true
}

default:
return field, false // TODO implement similar support for 'include' in next PR
}
}

// TODO make part of QueryType interface and implement for all aggregations
// TODO add bad requests to tests
// Doing so will ensure we see 100% of what we're interested in in our logs (now we see ~95%)
func CheckParamsTerms(ctx context.Context, paramsRaw any) error {
requiredParams := map[string]string{"field": "string"}
optionalParams := map[string]string{
"size": "float64|string", // TODO should be int|string, will be fixed
"shard_size": "float64", // TODO should be int, will be fixed
"order": "order", // TODO add order type
"min_doc_count": "float64", // TODO should be int, will be fixed
"shard_min_doc_count": "float64", // TODO should be int, will be fixed
"show_term_doc_count_error": "bool",
"exclude": "not-checking-type-now-complicated",
"include": "not-checking-type-now-complicated",
"collect_mode": "string",
"execution_hint": "string",
"missing": "string",
"value_type": "string",
}
logIfYouSeeThemParams := []string{
"shard_size", "min_doc_count", "shard_min_doc_count",
"show_term_doc_count_error", "collect_mode", "execution_hint", "value_type",
}

params, ok := paramsRaw.(model.JsonMap)
if !ok {
return fmt.Errorf("params is not a map, but %+v", paramsRaw)
}

// check if required are present
for paramName, paramType := range requiredParams {
paramVal, exists := params[paramName]
if !exists {
return fmt.Errorf("required parameter %s not found in Terms params", paramName)
}
if reflect.TypeOf(paramVal).Name() != paramType { // TODO I'll make a small rewrite to not use reflect here
return fmt.Errorf("required parameter %s is not of type %s, but %T", paramName, paramType, paramVal)
}
}

// check if only required/optional are present
for paramName := range params {
if _, isRequired := requiredParams[paramName]; !isRequired {
wantedType, isOptional := optionalParams[paramName]
if !isOptional {
return fmt.Errorf("unexpected parameter %s found in Terms params %v", paramName, params)
}
if wantedType == "not-checking-type-now-complicated" || wantedType == "order" || wantedType == "float64|string" {
continue // TODO: add that later
}
if reflect.TypeOf(params[paramName]).Name() != wantedType { // TODO I'll make a small rewrite to not use reflect here
return fmt.Errorf("optional parameter %s is not of type %s, but %T", paramName, wantedType, params[paramName])
}
}
}

// log if you see them
for _, warnParam := range logIfYouSeeThemParams {
if _, exists := params[warnParam]; exists {
logger.WarnWithCtxAndThrottling(ctx, "terms", warnParam, "we didn't expect %s in Terms params %v", warnParam, params)
}
}

return nil
}
11 changes: 9 additions & 2 deletions quesma/model/expr.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ var (
InvalidExpr = Expr(nil)
TrueExpr = NewLiteral(true)
FalseExpr = NewLiteral(false)
NullExpr = NewLiteral("NULL")
)

// ColumnRef is a reference to a column in a table, we can enrich it with more information (e.g. type used) as we go
Expand Down Expand Up @@ -129,8 +130,14 @@ func NewLiteral(value any) LiteralExpr {
return LiteralExpr{Value: value}
}

func NewLiteralSingleQuoted(value string) LiteralExpr {
return LiteralExpr{Value: fmt.Sprintf("'%s'", value)}
// NewLiteralSingleQuoteString simply does: string -> 'string', anything_else -> anything_else
func NewLiteralSingleQuoteString(value any) LiteralExpr {
switch v := value.(type) {
case string:
return LiteralExpr{Value: fmt.Sprintf("'%s'", v)}
default:
return LiteralExpr{Value: v}
}
}

// DistinctExpr is a representation of DISTINCT keyword in SQL, e.g. `SELECT DISTINCT` ... or `SELECT COUNT(DISTINCT ...)`
Expand Down
2 changes: 1 addition & 1 deletion quesma/model/expr_string_renderer.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ func (v *renderer) VisitInfix(e InfixExpr) interface{} {
// I think in the future every infix op should be in braces.
if strings.HasPrefix(e.Op, "_") || e.Op == "AND" || e.Op == "OR" {
return fmt.Sprintf("(%v %v %v)", lhs, e.Op, rhs)
} else if strings.Contains(e.Op, "LIKE") || e.Op == "IS" || e.Op == "IN" || e.Op == "REGEXP" || strings.Contains(e.Op, "UNION") {
} else if strings.Contains(e.Op, "LIKE") || e.Op == "IS" || e.Op == "IN" || e.Op == "NOT IN" || e.Op == "REGEXP" || strings.Contains(e.Op, "UNION") {
return fmt.Sprintf("%v %v %v", lhs, e.Op, rhs)
} else {
return fmt.Sprintf("%v%v%v", lhs, e.Op, rhs)
Expand Down
19 changes: 16 additions & 3 deletions quesma/queryparser/pancake_aggregation_parser_buckets.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,20 +152,33 @@ func (cw *ClickhouseQueryTranslator) parseDateHistogram(aggregation *pancakeAggr

// aggrName - "terms" or "significant_terms"
func (cw *ClickhouseQueryTranslator) parseTermsAggregation(aggregation *pancakeAggregationTreeNode, params QueryMap, aggrName string) error {
if err := bucket_aggregations.CheckParamsTerms(cw.Ctx, params); err != nil {
return err
}

terms := bucket_aggregations.NewTerms(
cw.Ctx, aggrName == "significant_terms", params["include"], params["exclude"],
)

var didWeAddMissing, didWeUpdateFieldHere bool
field := cw.parseFieldField(params, aggrName)
field, didWeAddMissing := cw.addMissingParameterIfPresent(field, params)
if !didWeAddMissing {
field, didWeAddMissing = cw.addMissingParameterIfPresent(field, params)
field, didWeUpdateFieldHere = terms.UpdateFieldForIncludeAndExclude(field)

// If we updated above, we change our select to if(condition, field, NULL), so we also need to filter out those NULLs later
if !didWeAddMissing || didWeUpdateFieldHere {
aggregation.filterOutEmptyKeyBucket = true
}

const defaultSize = 10
size := cw.parseSize(params, defaultSize)

orderBy, err := cw.parseOrder(params, []model.Expr{field})
if err != nil {
return err
}

aggregation.queryType = bucket_aggregations.NewTerms(cw.Ctx, aggrName == "significant_terms", orderBy[0]) // TODO probably full, not [0]
aggregation.queryType = terms
aggregation.selectedColumns = append(aggregation.selectedColumns, field)
aggregation.limit = size
aggregation.orderBy = orderBy
Expand Down
42 changes: 10 additions & 32 deletions quesma/queryparser/query_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"quesma/quesma/types"
"quesma/schema"
"quesma/util"
"quesma/util/regex"
"strconv"
"strings"
"unicode"
Expand Down Expand Up @@ -886,28 +887,13 @@ func (cw *ClickhouseQueryTranslator) parseRegexp(queryMap QueryMap) (result mode
return model.NewSimpleQueryInvalid()
}

// really simple == (out of all special characters, only . and .* may be present)
isPatternReallySimple := func(pattern string) bool {
// any special characters excluding . and * not allowed. Also (not the most important check) * can't be first character.
if strings.ContainsAny(pattern, `?+|{}[]()"\`) || (len(pattern) > 0 && pattern[0] == '*') {
return false
}
// .* allowed, but [any other char]* - not
for i, char := range pattern[1:] {
if char == '*' && pattern[i] != '.' {
return false
}
}
return true
}

for fieldName, parametersRaw := range queryMap {
parameters, ok := parametersRaw.(QueryMap)
for fieldName, paramsRaw := range queryMap {
params, ok := paramsRaw.(QueryMap)
if !ok {
logger.WarnWithCtx(cw.Ctx).Msgf("invalid regexp parameters type: %T, value: %v", parametersRaw, parametersRaw)
logger.WarnWithCtx(cw.Ctx).Msgf("invalid regexp parameters type: %T, value: %v", paramsRaw, paramsRaw)
return model.NewSimpleQueryInvalid()
}
patternRaw, exists := parameters["value"]
patternRaw, exists := params["value"]
if !exists {
logger.WarnWithCtx(cw.Ctx).Msgf("no value in regexp query: %v", queryMap)
return model.NewSimpleQueryInvalid()
Expand All @@ -918,21 +904,13 @@ func (cw *ClickhouseQueryTranslator) parseRegexp(queryMap QueryMap) (result mode
return model.NewSimpleQueryInvalid()
}

if len(parameters) > 1 {
logger.WarnWithCtx(cw.Ctx).Msgf("unsupported regexp parameters: %v", parameters)
if len(params) > 1 {
logger.WarnWithCtx(cw.Ctx).Msgf("unsupported regexp parameters: %v", params)
}

var funcName string
if isPatternReallySimple(pattern) {
pattern = strings.ReplaceAll(pattern, "_", `\_`)
pattern = strings.ReplaceAll(pattern, ".*", "%")
pattern = strings.ReplaceAll(pattern, ".", "_")
funcName = "LIKE"
} else { // this Clickhouse function is much slower, so we use it only for complex regexps
funcName = "REGEXP"
}
return model.NewSimpleQuery(
model.NewInfixExpr(model.NewColumnRef(fieldName), funcName, model.NewLiteral("'"+pattern+"'")), true)
clickhouseFuncName, patternExpr := regex.ToClickhouseExpr(pattern)
clickhouseExpr := model.NewInfixExpr(model.NewColumnRef(fieldName), clickhouseFuncName, patternExpr)
return model.NewSimpleQuery(clickhouseExpr, true)
}

logger.ErrorWithCtx(cw.Ctx).Msg("parseRegexp: theoretically unreachable code")
Expand Down
Loading

0 comments on commit c5d1baa

Please sign in to comment.