Skip to content

Commit

Permalink
Merge branch 'main' into pr-e2e-1
Browse files Browse the repository at this point in the history
  • Loading branch information
nablaone authored May 14, 2024
2 parents 51d9985 + ca10087 commit c3b4917
Show file tree
Hide file tree
Showing 3 changed files with 326 additions and 20 deletions.
103 changes: 87 additions & 16 deletions quesma/queryparser/aggregation_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"mitmproxy/quesma/model/bucket_aggregations"
"mitmproxy/quesma/model/metrics_aggregations"
"mitmproxy/quesma/util"
"regexp"
"slices"
"strconv"
"strings"
Expand All @@ -34,14 +35,15 @@ type aggrQueryBuilder struct {
}

type metricsAggregation struct {
AggrType string
FieldNames []string // on these fields we're doing aggregation. Array, because e.g. 'top_hits' can have multiple fields
FieldType clickhouse.DateTimeType // field type of FieldNames[0]. If it's a date field, a slightly different response is needed
Percentiles map[string]float64 // Only for percentiles aggregation
Keyed bool // Only for percentiles aggregation
SortBy string // Only for top_metrics
Size int // Only for top_metrics
Order string // Only for top_metrics
AggrType string
FieldNames []string // on these fields we're doing aggregation. Array, because e.g. 'top_hits' can have multiple fields
FieldType clickhouse.DateTimeType // field type of FieldNames[0]. If it's a date field, a slightly different response is needed
Percentiles map[string]float64 // Only for percentiles aggregation
Keyed bool // Only for percentiles aggregation
SortBy string // Only for top_metrics
Size int // Only for top_metrics
Order string // Only for top_metrics
IsFieldNameCompound bool // Only for a few aggregations, where we have only 1 field. It's a compound, so e.g. toHour(timestamp), not just "timestamp"
}

const metricsAggregationDefaultFieldType = clickhouse.Invalid
Expand Down Expand Up @@ -84,7 +86,13 @@ func (b *aggrQueryBuilder) buildMetricsAggregation(metricsAggr metricsAggregatio
query := b.buildAggregationCommon(metadata)
switch metricsAggr.AggrType {
case "sum", "min", "max", "avg":
query.NonSchemaFields = append(query.NonSchemaFields, metricsAggr.AggrType+`OrNull("`+getFirstFieldName()+`")`)
var fieldNameProperlyQuoted string
if metricsAggr.IsFieldNameCompound {
fieldNameProperlyQuoted = getFirstFieldName()
} else {
fieldNameProperlyQuoted = strconv.Quote(getFirstFieldName())
}
query.NonSchemaFields = append(query.NonSchemaFields, metricsAggr.AggrType+`OrNull(`+fieldNameProperlyQuoted+`)`)
case "quantile":
// Sorting here useful mostly for determinism in tests.
// It wasn't there before, and everything worked fine. We could safely remove it, if needed.
Expand Down Expand Up @@ -430,11 +438,12 @@ func (cw *ClickhouseQueryTranslator) tryMetricsAggregation(queryMap QueryMap) (m
metricsAggregations := []string{"sum", "avg", "min", "max", "cardinality", "value_count", "stats"}
for k, v := range queryMap {
if slices.Contains(metricsAggregations, k) {
fieldName := cw.parseFieldField(v, k)
fieldName, isFieldNameFromScript := cw.parseFieldFieldMaybeScript(v, k)
return metricsAggregation{
AggrType: k,
FieldNames: []string{fieldName},
FieldType: cw.Table.GetDateTimeType(cw.Ctx, fieldName),
AggrType: k,
FieldNames: []string{fieldName},
FieldType: cw.Table.GetDateTimeType(cw.Ctx, fieldName),
IsFieldNameCompound: isFieldNameFromScript,
}, true
}
}
Expand Down Expand Up @@ -536,7 +545,13 @@ func (cw *ClickhouseQueryTranslator) tryBucketAggregation(currentAggr *aggrQuery
success = true // returned in most cases
if histogram, ok := queryMap["histogram"]; ok {
currentAggr.Type = bucket_aggregations.NewHistogram(cw.Ctx)
fieldName := strconv.Quote(cw.parseFieldField(histogram, "histogram"))
fieldName, isFieldNameFromScript := cw.parseFieldFieldMaybeScript(histogram, "histogram")
var fieldNameProperlyQuoted string
if isFieldNameFromScript {
fieldNameProperlyQuoted = fieldName
} else {
fieldNameProperlyQuoted = strconv.Quote(fieldName)
}
var interval float64
intervalQueryMap, ok := histogram.(QueryMap)["interval"]
if !ok {
Expand All @@ -556,9 +571,9 @@ func (cw *ClickhouseQueryTranslator) tryBucketAggregation(currentAggr *aggrQuery
default:
logger.ErrorWithCtx(cw.Ctx).Msgf("unexpected type of interval: %T, value: %v", intervalRaw, intervalRaw)
}
groupByStr := fieldName
groupByStr := fieldNameProperlyQuoted
if interval != 1 {
groupByStr = fmt.Sprintf("floor(%s / %f) * %f", fieldName, interval, interval)
groupByStr = fmt.Sprintf("floor(%s / %f) * %f", fieldNameProperlyQuoted, interval, interval)
}
currentAggr.GroupByFields = append(currentAggr.GroupByFields, groupByStr)
currentAggr.NonSchemaFields = append(currentAggr.NonSchemaFields, groupByStr)
Expand Down Expand Up @@ -664,6 +679,62 @@ func (cw *ClickhouseQueryTranslator) parseFieldField(shouldBeMap any, aggregatio
return ""
}

// parseFieldFieldMaybeScript is basically almost a copy of parseFieldField above, but it also handles a basic script, if "field" is missing.
func (cw *ClickhouseQueryTranslator) parseFieldFieldMaybeScript(shouldBeMap any, aggregationType string) (field string, isFromScript bool) {
isFromScript = false
Map, ok := shouldBeMap.(QueryMap)
if !ok {
logger.WarnWithCtx(cw.Ctx).Msgf("%s aggregation is not a map, but %T, value: %v", aggregationType, shouldBeMap, shouldBeMap)
return
}
// maybe "field" field
if fieldRaw, ok := Map["field"]; ok {
if field, ok = fieldRaw.(string); ok {
return
} else {
logger.WarnWithCtx(cw.Ctx).Msgf("field is not a string, but %T, value: %v", fieldRaw, fieldRaw)
}
}

// else: maybe script
if fieldName, ok := cw.parseFieldFromScriptField(Map); ok {
return fmt.Sprintf("toHour(`%s`)", fieldName), true
}

logger.WarnWithCtx(cw.Ctx).Msgf("field not found in %s aggregation: %v", aggregationType, Map)
return
}

func (cw *ClickhouseQueryTranslator) parseFieldFromScriptField(queryMap QueryMap) (fieldName string, success bool) {
scriptRaw, exists := queryMap["script"]
if !exists {
return
}
script, ok := scriptRaw.(QueryMap)
if !ok {
logger.WarnWithCtx(cw.Ctx).Msgf("script is not a JsonMap, but %T, value: %v", scriptRaw, scriptRaw)
return
}

sourceRaw, exists := script["source"]
if !exists {
logger.WarnWithCtx(cw.Ctx).Msgf("source not found in script: %v", script)
return
}
source, ok := sourceRaw.(string)
if !ok {
logger.WarnWithCtx(cw.Ctx).Msgf("source is not a string, but %T, value: %v", sourceRaw, sourceRaw)
}

// source must look like "doc['field_name'].value.getHour()" or "doc['field_name'].value.hourOfDay"
wantedRegex := regexp.MustCompile(`^doc\['(\w+)']\.value\.(?:getHour\(\)|hourOfDay)$`)
matches := wantedRegex.FindStringSubmatch(source)
if len(matches) == 2 {
return matches[1], true
}
return
}

func (cw *ClickhouseQueryTranslator) parseFilters(filtersMap QueryMap) []filter {
var filters []filter
filtersMap = filtersMap["filters"].(QueryMap)
Expand Down
33 changes: 29 additions & 4 deletions quesma/queryparser/aggregation_parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -581,10 +581,7 @@ func Test2AggregationParserExternalTestcases(t *testing.T) {

// Let's leave those commented debugs for now, they'll be useful in next PRs
for j, aggregation := range aggregations {
// fmt.Println("--- Aggregation "+strconv.Itoa(j)+":", aggregation)
// fmt.Println()
// fmt.Println("--- SQL string ", aggregation.String())
// fmt.Println()
// fmt.Printf("--- Aggregation %d: %+v\n\n---SQL string: %s\n\n", j, aggregation, aggregation.String())
// fmt.Println("--- Group by: ", aggregation.GroupByFields)
if test.ExpectedSQLs[j] != "NoDBQuery" {
util.AssertSqlEqual(t, test.ExpectedSQLs[j], aggregation.String())
Expand Down Expand Up @@ -637,3 +634,31 @@ func Test_quoteArray(t *testing.T) {
assert.Equal(t, inputs[i], test.input) // check that original array isn't changed
}
}

func Test_parseFieldFromScriptField(t *testing.T) {
goodQueryMap := func(sourceField string) QueryMap {
return QueryMap{"script": QueryMap{"source": sourceField}}
}
testcases := []struct {
queryMap QueryMap
expectedMatch string
expectedSuccess bool
}{
{goodQueryMap("doc['field1'].value.getHour()"), "field1", true},
{goodQueryMap("doc['field1'].value.getHour() + doc['field2'].value.getHour()"), "", false},
{goodQueryMap("doc['field1'].value.hourOfDay"), "field1", true},
{goodQueryMap("doc['field1'].value"), "", false},
{goodQueryMap("value.getHour() + doc['field2'].value.getHour()"), "", false},
{QueryMap{}, "", false},
{QueryMap{"script": QueryMap{}}, "", false},
{QueryMap{"script": QueryMap{"source": ""}}, "", false},
{QueryMap{"script": "script"}, "", false},
{QueryMap{"script": QueryMap{"source": 1}}, "", false},
}
cw := ClickhouseQueryTranslator{Ctx: context.Background()}
for _, tc := range testcases {
fieldName, success := cw.parseFieldFromScriptField(tc.queryMap)
assert.Equal(t, tc.expectedSuccess, success)
assert.Equal(t, tc.expectedMatch, fieldName)
}
}
Loading

0 comments on commit c3b4917

Please sign in to comment.