Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Terms: start supporting exclude param #1140

Merged
merged 6 commits into from
Dec 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions quesma/model/bucket_aggregations/ip_range.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ func (interval IpInterval) ToWhereClause(field model.Expr) model.Expr {
isBegin := interval.begin != UnboundedInterval
isEnd := interval.end != UnboundedInterval && interval.end != BiggestIpv4

begin := model.NewInfixExpr(field, ">=", model.NewLiteralSingleQuoted(interval.begin))
end := model.NewInfixExpr(field, "<", model.NewLiteralSingleQuoted(interval.end))
begin := model.NewInfixExpr(field, ">=", model.NewLiteralSingleQuoteString(interval.begin))
end := model.NewInfixExpr(field, "<", model.NewLiteralSingleQuoteString(interval.end))

if isBegin && isEnd {
return model.NewInfixExpr(begin, "AND", end)
Expand Down
119 changes: 116 additions & 3 deletions quesma/model/bucket_aggregations/terms.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,31 @@ package bucket_aggregations

import (
"context"
"fmt"
"quesma/logger"
"quesma/model"
"quesma/util"
"quesma/util/regex"
"reflect"
)

type Terms struct {
ctx context.Context
significant bool // true <=> significant_terms, false <=> terms
OrderByExpr model.Expr
// include is either:
// - single value: then for strings, it can be a regex.
// - array: then field must match exactly one of the values (never a regex)
// Nil if missing in request.
include any
// exclude is either:
// - single value: then for strings, it can be a regex.
// - array: then field must match exactly one of the values (never a regex)
// Nil if missing in request.
exclude any
}

func NewTerms(ctx context.Context, significant bool, orderByExpr model.Expr) Terms {
return Terms{ctx: ctx, significant: significant, OrderByExpr: orderByExpr}
func NewTerms(ctx context.Context, significant bool, include, exclude any) Terms {
return Terms{ctx: ctx, significant: significant, include: include, exclude: exclude}
}

func (query Terms) AggregationType() model.AggregationType {
Expand Down Expand Up @@ -106,3 +118,104 @@ func (query Terms) key(row model.QueryResultRow) any {
func (query Terms) parentCount(row model.QueryResultRow) any {
return row.Cols[len(row.Cols)-3].Value
}

func (query Terms) UpdateFieldForIncludeAndExclude(field model.Expr) (updatedField model.Expr, didWeUpdateField bool) {
// We'll use here everywhere Clickhouse 'if' function: if(condition, then, else)
// In our case field becomes: if(condition that field is not excluded, field, NULL)
ifOrNull := func(condition model.Expr) model.FunctionExpr {
return model.NewFunction("if", condition, field, model.NullExpr)
}

hasExclude := query.exclude != nil
excludeArr, excludeIsArray := query.exclude.([]any)
switch {
case hasExclude && excludeIsArray:
if len(excludeArr) == 0 {
return field, false
}

// Select expr will be: if(field NOT IN (excludeArr[0], excludeArr[1], ...), field, NULL)
exprs := make([]model.Expr, 0, len(excludeArr))
for _, excludeVal := range excludeArr {
exprs = append(exprs, model.NewLiteralSingleQuoteString(excludeVal))
}
return ifOrNull(model.NewInfixExpr(field, "NOT IN", model.NewTupleExpr(exprs...))), true
case hasExclude:
switch exclude := query.exclude.(type) {
case string: // hard case, might be regex
funcName, patternExpr := regex.ToClickhouseExpr(exclude)
return ifOrNull(model.NewInfixExpr(field, "NOT "+funcName, patternExpr)), true
default: // easy case, never regex
return ifOrNull(model.NewInfixExpr(field, "!=", model.NewLiteral(query.exclude))), true
}

default:
return field, false // TODO implement similar support for 'include' in next PR
}
}

// TODO make part of QueryType interface and implement for all aggregations
// TODO add bad requests to tests
// Doing so will ensure we see 100% of what we're interested in in our logs (now we see ~95%)
func CheckParamsTerms(ctx context.Context, paramsRaw any) error {
requiredParams := map[string]string{"field": "string"}
optionalParams := map[string]string{
"size": "float64|string", // TODO should be int|string, will be fixed
"shard_size": "float64", // TODO should be int, will be fixed
"order": "order", // TODO add order type
"min_doc_count": "float64", // TODO should be int, will be fixed
"shard_min_doc_count": "float64", // TODO should be int, will be fixed
"show_term_doc_count_error": "bool",
"exclude": "not-checking-type-now-complicated",
"include": "not-checking-type-now-complicated",
"collect_mode": "string",
"execution_hint": "string",
"missing": "string",
"value_type": "string",
}
logIfYouSeeThemParams := []string{
"shard_size", "min_doc_count", "shard_min_doc_count",
"show_term_doc_count_error", "collect_mode", "execution_hint", "value_type",
}

params, ok := paramsRaw.(model.JsonMap)
if !ok {
return fmt.Errorf("params is not a map, but %+v", paramsRaw)
}

// check if required are present
for paramName, paramType := range requiredParams {
paramVal, exists := params[paramName]
if !exists {
return fmt.Errorf("required parameter %s not found in Terms params", paramName)
}
if reflect.TypeOf(paramVal).Name() != paramType { // TODO I'll make a small rewrite to not use reflect here
return fmt.Errorf("required parameter %s is not of type %s, but %T", paramName, paramType, paramVal)
}
}

// check if only required/optional are present
for paramName := range params {
if _, isRequired := requiredParams[paramName]; !isRequired {
wantedType, isOptional := optionalParams[paramName]
if !isOptional {
return fmt.Errorf("unexpected parameter %s found in Terms params %v", paramName, params)
}
if wantedType == "not-checking-type-now-complicated" || wantedType == "order" || wantedType == "float64|string" {
continue // TODO: add that later
}
if reflect.TypeOf(params[paramName]).Name() != wantedType { // TODO I'll make a small rewrite to not use reflect here
return fmt.Errorf("optional parameter %s is not of type %s, but %T", paramName, wantedType, params[paramName])
}
}
}

// log if you see them
for _, warnParam := range logIfYouSeeThemParams {
if _, exists := params[warnParam]; exists {
logger.WarnWithCtxAndThrottling(ctx, "terms", warnParam, "we didn't expect %s in Terms params %v", warnParam, params)
}
}

return nil
}
11 changes: 9 additions & 2 deletions quesma/model/expr.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ var (
InvalidExpr = Expr(nil)
TrueExpr = NewLiteral(true)
FalseExpr = NewLiteral(false)
NullExpr = NewLiteral("NULL")
)

// ColumnRef is a reference to a column in a table, we can enrich it with more information (e.g. type used) as we go
Expand Down Expand Up @@ -129,8 +130,14 @@ func NewLiteral(value any) LiteralExpr {
return LiteralExpr{Value: value}
}

func NewLiteralSingleQuoted(value string) LiteralExpr {
return LiteralExpr{Value: fmt.Sprintf("'%s'", value)}
// NewLiteralSingleQuoteString simply does: string -> 'string', anything_else -> anything_else
func NewLiteralSingleQuoteString(value any) LiteralExpr {
switch v := value.(type) {
case string:
return LiteralExpr{Value: fmt.Sprintf("'%s'", v)}
default:
return LiteralExpr{Value: v}
}
}

// DistinctExpr is a representation of DISTINCT keyword in SQL, e.g. `SELECT DISTINCT` ... or `SELECT COUNT(DISTINCT ...)`
Expand Down
2 changes: 1 addition & 1 deletion quesma/model/expr_string_renderer.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ func (v *renderer) VisitInfix(e InfixExpr) interface{} {
// I think in the future every infix op should be in braces.
if strings.HasPrefix(e.Op, "_") || e.Op == "AND" || e.Op == "OR" {
return fmt.Sprintf("(%v %v %v)", lhs, e.Op, rhs)
} else if strings.Contains(e.Op, "LIKE") || e.Op == "IS" || e.Op == "IN" || e.Op == "REGEXP" || strings.Contains(e.Op, "UNION") {
} else if strings.Contains(e.Op, "LIKE") || e.Op == "IS" || e.Op == "IN" || e.Op == "NOT IN" || e.Op == "REGEXP" || strings.Contains(e.Op, "UNION") {
return fmt.Sprintf("%v %v %v", lhs, e.Op, rhs)
} else {
return fmt.Sprintf("%v%v%v", lhs, e.Op, rhs)
Expand Down
19 changes: 16 additions & 3 deletions quesma/queryparser/pancake_aggregation_parser_buckets.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,20 +152,33 @@ func (cw *ClickhouseQueryTranslator) parseDateHistogram(aggregation *pancakeAggr

// aggrName - "terms" or "significant_terms"
func (cw *ClickhouseQueryTranslator) parseTermsAggregation(aggregation *pancakeAggregationTreeNode, params QueryMap, aggrName string) error {
if err := bucket_aggregations.CheckParamsTerms(cw.Ctx, params); err != nil {
return err
}

terms := bucket_aggregations.NewTerms(
cw.Ctx, aggrName == "significant_terms", params["include"], params["exclude"],
)

var didWeAddMissing, didWeUpdateFieldHere bool
field := cw.parseFieldField(params, aggrName)
field, didWeAddMissing := cw.addMissingParameterIfPresent(field, params)
if !didWeAddMissing {
field, didWeAddMissing = cw.addMissingParameterIfPresent(field, params)
field, didWeUpdateFieldHere = terms.UpdateFieldForIncludeAndExclude(field)

// If we updated above, we change our select to if(condition, field, NULL), so we also need to filter out those NULLs later
if !didWeAddMissing || didWeUpdateFieldHere {
aggregation.filterOutEmptyKeyBucket = true
}

const defaultSize = 10
size := cw.parseSize(params, defaultSize)

orderBy, err := cw.parseOrder(params, []model.Expr{field})
if err != nil {
return err
}

aggregation.queryType = bucket_aggregations.NewTerms(cw.Ctx, aggrName == "significant_terms", orderBy[0]) // TODO probably full, not [0]
aggregation.queryType = terms
aggregation.selectedColumns = append(aggregation.selectedColumns, field)
aggregation.limit = size
aggregation.orderBy = orderBy
Expand Down
42 changes: 10 additions & 32 deletions quesma/queryparser/query_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"quesma/quesma/types"
"quesma/schema"
"quesma/util"
"quesma/util/regex"
"strconv"
"strings"
"unicode"
Expand Down Expand Up @@ -886,28 +887,13 @@ func (cw *ClickhouseQueryTranslator) parseRegexp(queryMap QueryMap) (result mode
return model.NewSimpleQueryInvalid()
}

// really simple == (out of all special characters, only . and .* may be present)
isPatternReallySimple := func(pattern string) bool {
// any special characters excluding . and * not allowed. Also (not the most important check) * can't be first character.
if strings.ContainsAny(pattern, `?+|{}[]()"\`) || (len(pattern) > 0 && pattern[0] == '*') {
return false
}
// .* allowed, but [any other char]* - not
for i, char := range pattern[1:] {
if char == '*' && pattern[i] != '.' {
return false
}
}
return true
}

for fieldName, parametersRaw := range queryMap {
parameters, ok := parametersRaw.(QueryMap)
for fieldName, paramsRaw := range queryMap {
params, ok := paramsRaw.(QueryMap)
if !ok {
logger.WarnWithCtx(cw.Ctx).Msgf("invalid regexp parameters type: %T, value: %v", parametersRaw, parametersRaw)
logger.WarnWithCtx(cw.Ctx).Msgf("invalid regexp parameters type: %T, value: %v", paramsRaw, paramsRaw)
return model.NewSimpleQueryInvalid()
}
patternRaw, exists := parameters["value"]
patternRaw, exists := params["value"]
if !exists {
logger.WarnWithCtx(cw.Ctx).Msgf("no value in regexp query: %v", queryMap)
return model.NewSimpleQueryInvalid()
Expand All @@ -918,21 +904,13 @@ func (cw *ClickhouseQueryTranslator) parseRegexp(queryMap QueryMap) (result mode
return model.NewSimpleQueryInvalid()
}

if len(parameters) > 1 {
logger.WarnWithCtx(cw.Ctx).Msgf("unsupported regexp parameters: %v", parameters)
if len(params) > 1 {
logger.WarnWithCtx(cw.Ctx).Msgf("unsupported regexp parameters: %v", params)
}

var funcName string
if isPatternReallySimple(pattern) {
pattern = strings.ReplaceAll(pattern, "_", `\_`)
pattern = strings.ReplaceAll(pattern, ".*", "%")
pattern = strings.ReplaceAll(pattern, ".", "_")
funcName = "LIKE"
} else { // this Clickhouse function is much slower, so we use it only for complex regexps
funcName = "REGEXP"
}
return model.NewSimpleQuery(
model.NewInfixExpr(model.NewColumnRef(fieldName), funcName, model.NewLiteral("'"+pattern+"'")), true)
clickhouseFuncName, patternExpr := regex.ToClickhouseExpr(pattern)
clickhouseExpr := model.NewInfixExpr(model.NewColumnRef(fieldName), clickhouseFuncName, patternExpr)
return model.NewSimpleQuery(clickhouseExpr, true)
}

logger.ErrorWithCtx(cw.Ctx).Msg("parseRegexp: theoretically unreachable code")
Expand Down
Loading
Loading