Skip to content

Commit

Permalink
Fix Hydrolix dates 1 (#835)
Browse files Browse the repository at this point in the history
This eliminates an error we encountered with Hydrolix.
```
Q3006: Unspecified database error.  clickhouse: query failed. err: code: 169, message: Key expression contains comparison between inconvertible types: DateTime64(3) and Float64 inside reqTimeSec >= 1727858503270
```
It's possible to compare dates and floats in Clickhouse from 2022 year,
but in 2021 it wasn't possible, and Hydrolix must use some earlier
version, so I created a solution which works in both scenarios

I move date parsing from Clickhouse to Quesma, removing usage of
Clickhouse's `parseDateTimeBestEffort()`. Now we parse date ourselves,
and use `toDateTime64()` to compare this date with date field (checked,
it works fine also for fields of basic `DateTime` type, not
`DateTime64`, even in Clickhouse 4 years ago). Elastic has like 50
different date formats available, so I doubt all of them are available
in Clickhouse or other databases, so it's a move in the right direction,
I guess.

After:
<img width="1728" alt="Screenshot 2024-10-06 at 17 15 22"
src="https://github.com/user-attachments/assets/952c6a9e-2a3e-43ce-85f5-fd19c1373018">
Timestamps are fine: there's `"gte": 1727858503270` in the request,
which is `Wed Oct 02 2024 08:41:43 GMT+0000`, so histogram starts fine
from `8:41:30`:
<img width="505" alt="Screenshot 2024-10-06 at 17 15 49"
src="https://github.com/user-attachments/assets/3d9acfb9-90c6-4341-b149-48c214e3d0d2">

---------

Co-authored-by: Jacek Migdal <[email protected]>
  • Loading branch information
trzysiek and jakozaur authored Oct 14, 2024
1 parent e97ce49 commit 939452c
Show file tree
Hide file tree
Showing 17 changed files with 360 additions and 458 deletions.
1 change: 0 additions & 1 deletion quesma/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ require (
github.com/knadh/koanf/providers/file v1.1.2
github.com/knadh/koanf/v2 v2.1.1
github.com/markbates/goth v1.80.0
github.com/relvacode/iso8601 v1.4.0
github.com/rs/zerolog v1.33.0
github.com/shirou/gopsutil/v3 v3.24.5
github.com/stretchr/testify v1.9.0
Expand Down
2 changes: 0 additions & 2 deletions quesma/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,6 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw=
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
github.com/relvacode/iso8601 v1.4.0 h1:GsInVSEJfkYuirYFxa80nMLbH2aydgZpIf52gYZXUJs=
github.com/relvacode/iso8601 v1.4.0/go.mod h1:FlNp+jz+TXpyRqgmM7tnzHHzBnz776kmAH2h3sZCn0I=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
Expand Down
33 changes: 25 additions & 8 deletions quesma/kibana/dates.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package kibana

import (
"quesma/model"
"quesma/util"
"strconv"
"time"
Expand All @@ -17,19 +18,18 @@ func NewDateManager() DateManager {
var acceptableDateTimeFormats = []string{"2006", "2006-01", "2006-01-02", "2006-01-02", "2006-01-02T15",
"2006-01-02T15:04", "2006-01-02T15:04:05", "2006-01-02T15:04:05Z07", "2006-01-02T15:04:05Z07:00"}

// MissingInDateHistogramToUnixTimestamp parses date_histogram's missing field.
// If missing is present, it's in [strict_date_optional_time || epoch_millis] format
// parseStrictDateOptionalTimeOrEpochMillis parses date, which is in [strict_date_optional_time || epoch_millis] format
// (https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html)
func (dm DateManager) MissingInDateHistogramToUnixTimestamp(missing any) (unixTimestamp int64, parsingSucceeded bool) {
if asInt, success := util.ExtractInt64Maybe(missing); success {
func (dm DateManager) parseStrictDateOptionalTimeOrEpochMillis(date any) (unixTimestamp int64, parsingSucceeded bool) {
if asInt, success := util.ExtractInt64Maybe(date); success {
return asInt, true
}

if asFloat, success := util.ExtractFloat64Maybe(missing); success {
if asFloat, success := util.ExtractFloat64Maybe(date); success {
return int64(asFloat), true
}

asString, success := missing.(string)
asString, success := date.(string)
if !success {
return -1, false
}
Expand All @@ -41,9 +41,9 @@ func (dm DateManager) MissingInDateHistogramToUnixTimestamp(missing any) (unixTi
const yearOrTsDelimiter = 10000

if asInt, err := strconv.ParseInt(asString, 10, 64); err == nil && asInt >= yearOrTsDelimiter {
return dm.MissingInDateHistogramToUnixTimestamp(asInt)
return dm.parseStrictDateOptionalTimeOrEpochMillis(asInt)
} else if asFloat, err := strconv.ParseFloat(asString, 64); err == nil && asFloat >= yearOrTsDelimiter {
return dm.MissingInDateHistogramToUnixTimestamp(asFloat)
return dm.parseStrictDateOptionalTimeOrEpochMillis(asFloat)
}

// It could be replaced with iso8601.ParseString() after the fixes to 1.4.0:
Expand All @@ -56,3 +56,20 @@ func (dm DateManager) MissingInDateHistogramToUnixTimestamp(missing any) (unixTi

return -1, false
}

// ParseMissingInDateHistogram parses date_histogram's missing field.
// If missing is present, it's in [strict_date_optional_time || epoch_millis] format
// (https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html)
func (dm DateManager) ParseMissingInDateHistogram(missing any) (unixTimestamp int64, parsingSucceeded bool) {
return dm.parseStrictDateOptionalTimeOrEpochMillis(missing)
}

// ParseRange parses range filter.
// We assume it's in [strict_date_optional_time || epoch_millis] format (TODO: other formats)
// (https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html)
func (dm DateManager) ParseRange(Range any) (timestampExpr model.Expr, parsingSucceeded bool) {
if timestamp, success := dm.parseStrictDateOptionalTimeOrEpochMillis(Range); success {
return model.NewFunction("fromUnixTimestamp64Milli", model.NewLiteral(timestamp)), true
}
return nil, false
}
4 changes: 2 additions & 2 deletions quesma/kibana/dates_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
"testing"
)

func TestDateManager_MissingInDateHistogramToUnixTimestamp(t *testing.T) {
func TestDateManager_parseStrictDateOptionalTimeOrEpochMillis(t *testing.T) {
tests := []struct {
missing any
wantUnixTimestamp int64
Expand Down Expand Up @@ -38,7 +38,7 @@ func TestDateManager_MissingInDateHistogramToUnixTimestamp(t *testing.T) {
for _, tt := range tests {
t.Run(fmt.Sprintf("%v", tt.missing), func(t *testing.T) {
dm := NewDateManager()
gotUnixTs, gotParsingSucceeded := dm.MissingInDateHistogramToUnixTimestamp(tt.missing)
gotUnixTs, gotParsingSucceeded := dm.parseStrictDateOptionalTimeOrEpochMillis(tt.missing)
assert.Equalf(t, tt.wantUnixTimestamp, gotUnixTs, "MissingInDateHistogramToUnixTimestamp(%v)", tt.missing)
assert.Equalf(t, tt.wantParsingSucceeded, gotParsingSucceeded, "MissingInDateHistogramToUnixTimestamp(%v)", tt.missing)
})
Expand Down
12 changes: 6 additions & 6 deletions quesma/queryparser/pancake_aggregation_parser_buckets.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,14 @@ func (cw *ClickhouseQueryTranslator) pancakeTryBucketAggregation(aggregation *pa
}
field := cw.parseFieldField(dateHistogram, "date_histogram")

didWeAddMissing := false
weAddedMissing := false
if missingRaw, exists := dateHistogram["missing"]; exists {
if missing, ok := missingRaw.(string); ok {
dateManager := kibana.NewDateManager()
timestamp, parsingTimestampOk := dateManager.MissingInDateHistogramToUnixTimestamp(missing)
if parsingTimestampOk {
if unixTimestamp, parsingOk := dateManager.ParseMissingInDateHistogram(missing); parsingOk {
field = model.NewFunction("COALESCE", field,
model.NewFunction("toDateTime", model.NewLiteral(timestamp)))
didWeAddMissing = true
model.NewFunction("fromUnixTimestamp64Milli", model.NewLiteral(unixTimestamp)))
weAddedMissing = true
} else {
logger.ErrorWithCtx(cw.Ctx).Msgf("unknown format of missing in date_histogram: %v. Skipping it.", missing)
}
Expand All @@ -95,7 +94,8 @@ func (cw *ClickhouseQueryTranslator) pancakeTryBucketAggregation(aggregation *pa
}
}

if !didWeAddMissing {
if !weAddedMissing {
// if we don't add missing, we need to filter out nulls later
aggregation.filterOutEmptyKeyBucket = true
}

Expand Down
131 changes: 55 additions & 76 deletions quesma/queryparser/query_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ import (
"encoding/hex"
"encoding/json"
"fmt"
"github.com/k0kubun/pp"
"quesma/clickhouse"
"quesma/kibana"
"quesma/logger"
"quesma/model"
"quesma/model/bucket_aggregations"
Expand All @@ -19,9 +21,6 @@ import (
"strconv"
"strings"
"unicode"

"github.com/k0kubun/pp"
"github.com/relvacode/iso8601"
)

type QueryMap = map[string]interface{}
Expand Down Expand Up @@ -764,7 +763,6 @@ func (cw *ClickhouseQueryTranslator) parseDateMathExpression(expr string) (strin

exp, err := ParseDateMathExpression(expr)
if err != nil {
logger.Warn().Msgf("error parsing date math expression: %s", expr)
return "", err
}

Expand All @@ -775,7 +773,6 @@ func (cw *ClickhouseQueryTranslator) parseDateMathExpression(expr string) (strin

sql, err := builder.RenderSQL(exp)
if err != nil {
logger.Warn().Msgf("error rendering date math expression: %s", expr)
return "", err
}

Expand All @@ -792,84 +789,81 @@ func (cw *ClickhouseQueryTranslator) parseRange(queryMap QueryMap) model.SimpleQ
return model.NewSimpleQuery(nil, false)
}

for field, v := range queryMap {
field = cw.ResolveField(cw.Ctx, field)
for fieldName, v := range queryMap {
fieldName = cw.ResolveField(cw.Ctx, fieldName)
fieldType := cw.Table.GetDateTimeType(cw.Ctx, cw.ResolveField(cw.Ctx, fieldName))
stmts := make([]model.Expr, 0)
if _, ok := v.(QueryMap); !ok {
logger.WarnWithCtx(cw.Ctx).Msgf("invalid range type: %T, value: %v", v, v)
continue
}
isDatetimeInDefaultFormat := true // in 99% requests, format is "strict_date_optional_time", which we can parse with time.Parse(time.RFC3339Nano, ..)
if format, ok := v.(QueryMap)["format"]; ok && format == "epoch_millis" {
isDatetimeInDefaultFormat = false
}

keysSorted := util.MapKeysSorted(v.(QueryMap))
for _, op := range keysSorted {
v := v.(QueryMap)[op]
var timeFormatFuncName string
var finalLHS, valueToCompare model.Expr
fieldType := cw.Table.GetDateTimeType(cw.Ctx, cw.ResolveField(cw.Ctx, field))
vToPrint := sprint(v)
valueToCompare = model.NewLiteral(vToPrint)
finalLHS = model.NewColumnRef(field)
if !isDatetimeInDefaultFormat {
timeFormatFuncName = "toUnixTimestamp64Milli"
finalLHS = model.NewFunction(timeFormatFuncName, model.NewColumnRef(field))
} else {
switch fieldType {
case clickhouse.DateTime64, clickhouse.DateTime:
if dateTime, ok := v.(string); ok {
// if it's a date, we need to parse it to Clickhouse's DateTime format
// how to check if it does not contain date math expression?
if _, err := iso8601.ParseString(dateTime); err == nil {
_, timeFormatFuncName = cw.parseDateTimeString(cw.Table, field, dateTime)
// TODO Investigate the quotation below
valueToCompare = model.NewFunction(timeFormatFuncName, model.NewLiteral(fmt.Sprintf("'%s'", dateTime)))
} else if op == "gte" || op == "lte" || op == "gt" || op == "lt" {
vToPrint, err = cw.parseDateMathExpression(vToPrint)
valueToCompare = model.NewLiteral(vToPrint)
if err != nil {
logger.WarnWithCtx(cw.Ctx).Msgf("error parsing date math expression: %s", vToPrint)
return model.NewSimpleQuery(nil, false)
}
}
} else if v == nil {
vToPrint = "NULL"
valueToCompare = model.NewLiteral("NULL")
valueRaw := v.(QueryMap)[op]
value := sprint(valueRaw)
defaultValue := model.NewLiteral(value)
dateManager := kibana.NewDateManager()

// Three stages:
// 1. dateManager.ParseRange
// 2. cw.parseDateMathExpression
// 3. just a number
// Dates use 1-3 and finish as soon as any succeeds
// Numbers use just 3rd

var finalValue model.Expr
doneParsing, isQuoted := false, len(value) > 2 && value[0] == '\'' && value[len(value)-1] == '\''
switch fieldType {
case clickhouse.DateTime, clickhouse.DateTime64:
// TODO add support for "time_zone" parameter in ParseRange
finalValue, doneParsing = dateManager.ParseRange(value) // stage 1

if !doneParsing && (op == "gte" || op == "lte" || op == "gt" || op == "lt") { // stage 2
parsed, err := cw.parseDateMathExpression(value)
if err == nil {
doneParsing = true
finalValue = model.NewLiteral(parsed)
}
case clickhouse.Invalid: // assumes it is number that does not need formatting
if len(vToPrint) > 2 && vToPrint[0] == '\'' && vToPrint[len(vToPrint)-1] == '\'' {
isNumber := true
for _, c := range vToPrint[1 : len(vToPrint)-1] {
if !unicode.IsDigit(c) && c != '.' {
isNumber = false
}
}
if isNumber {
vToPrint = vToPrint[1 : len(vToPrint)-1]
} else {
logger.WarnWithCtx(cw.Ctx).Msgf("we use range with unknown literal %s, field %s", vToPrint, field)
}

if !doneParsing && isQuoted { // stage 3
finalValue, doneParsing = dateManager.ParseRange(value[1 : len(value)-1])
}
case clickhouse.Invalid:
if isQuoted {
isNumber, unquoted := true, value[1:len(value)-1]
for _, c := range unquoted {
if !unicode.IsDigit(c) && c != '.' {
isNumber = false
}
valueToCompare = model.NewLiteral(vToPrint)
}
default:
logger.WarnWithCtx(cw.Ctx).Msgf("invalid DateTime type for field: %s, parsed dateTime value: %s", field, vToPrint)
if isNumber {
finalValue = model.NewLiteral(unquoted)
doneParsing = true
}
}
default:
logger.ErrorWithCtx(cw.Ctx).Msgf("invalid DateTime type for field: %s, parsed dateTime value: %s", fieldName, value)
}

if !doneParsing {
finalValue = defaultValue
}

field := model.NewColumnRef(fieldName)
switch op {
case "gte":
stmt := model.NewInfixExpr(finalLHS, ">=", valueToCompare)
stmt := model.NewInfixExpr(field, ">=", finalValue)
stmts = append(stmts, stmt)
case "lte":
stmt := model.NewInfixExpr(finalLHS, "<=", valueToCompare)
stmt := model.NewInfixExpr(field, "<=", finalValue)
stmts = append(stmts, stmt)
case "gt":
stmt := model.NewInfixExpr(finalLHS, ">", valueToCompare)
stmt := model.NewInfixExpr(field, ">", finalValue)
stmts = append(stmts, stmt)
case "lt":
stmt := model.NewInfixExpr(finalLHS, "<", valueToCompare)
stmt := model.NewInfixExpr(field, "<", finalValue)
stmts = append(stmts, stmt)
case "format":
// ignored
Expand All @@ -885,21 +879,6 @@ func (cw *ClickhouseQueryTranslator) parseRange(queryMap QueryMap) model.SimpleQ
return model.NewSimpleQuery(nil, false)
}

// parseDateTimeString returns string used to parse DateTime in Clickhouse (depends on column type)

func (cw *ClickhouseQueryTranslator) parseDateTimeString(table *clickhouse.Table, field, dateTime string) (string, string) {
typ := table.GetDateTimeType(cw.Ctx, cw.ResolveField(cw.Ctx, field))
switch typ {
case clickhouse.DateTime64:
return "parseDateTime64BestEffort('" + dateTime + "')", "parseDateTime64BestEffort"
case clickhouse.DateTime:
return "parseDateTimeBestEffort('" + dateTime + "')", "parseDateTimeBestEffort"
default:
logger.Error().Msgf("invalid DateTime type: %T for field: %s, parsed dateTime value: %s", typ, field, dateTime)
return "", ""
}
}

// TODO: not supported:
// - The field has "index" : false and "doc_values" : false set in the mapping
// - The length of the field value exceeded an ignore_above setting in the mapping
Expand Down
6 changes: 3 additions & 3 deletions quesma/queryparser/query_parser_range_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ var parseRangeTests = []parseRangeTest{
`CREATE TABLE ` + tableName + `
( "message" String, "timestamp" DateTime64(3, 'UTC') )
ENGINE = Memory`,
`("timestamp">=parseDateTime64BestEffort('2024-02-02T13:47:16.029Z') AND "timestamp"<=parseDateTime64BestEffort('2024-02-09T13:47:16.029Z'))`,
`("timestamp">=fromUnixTimestamp64Milli(1706881636029) AND "timestamp"<=fromUnixTimestamp64Milli(1707486436029))`,
},
{
"parseDateTimeBestEffort",
Expand All @@ -46,7 +46,7 @@ var parseRangeTests = []parseRangeTest{
`CREATE TABLE ` + tableName + `
( "message" String, "timestamp" DateTime )
ENGINE = Memory`,
`("timestamp">=parseDateTimeBestEffort('2024-02-02T13:47:16.029Z') AND "timestamp"<=parseDateTimeBestEffort('2024-02-09T13:47:16.029Z'))`,
`("timestamp">=fromUnixTimestamp64Milli(1706881636029) AND "timestamp"<=fromUnixTimestamp64Milli(1707486436029))`,
},
{
"numeric range",
Expand All @@ -72,7 +72,7 @@ var parseRangeTests = []parseRangeTest{
`CREATE TABLE ` + tableName + `
( "message" String, "timestamp" DateTime64(3, 'UTC') )
ENGINE = Memory`,
`("timestamp">=parseDateTime64BestEffort('2024-02-02T13:47:16') AND "timestamp"<=parseDateTime64BestEffort('2024-02-09T13:47:16'))`,
`("timestamp">=fromUnixTimestamp64Milli(1706881636000) AND "timestamp"<=fromUnixTimestamp64Milli(1707486436000))`,
},
}

Expand Down
4 changes: 2 additions & 2 deletions quesma/quesma/functionality/terms_enum/terms_enum_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,8 @@ func testHandleTermsEnumRequest(t *testing.T, requestBody []byte) {
}
qt := &queryparser.ClickhouseQueryTranslator{ClickhouseLM: lm, Table: table, Ctx: context.Background(), Schema: s.Tables[schema.TableName(testTableName)]}
// Here we additionally verify that terms for `_tier` are **NOT** included in the SQL query
expectedQuery1 := `SELECT DISTINCT "client_name" FROM ` + testTableName + ` WHERE ("epoch_time">=parseDateTimeBestEffort('2024-02-27T12:25:00.000Z') AND "epoch_time"<=parseDateTimeBestEffort('2024-02-27T12:40:59.999Z')) LIMIT 13`
expectedQuery2 := `SELECT DISTINCT "client_name" FROM ` + testTableName + ` WHERE ("epoch_time"<=parseDateTimeBestEffort('2024-02-27T12:40:59.999Z') AND "epoch_time">=parseDateTimeBestEffort('2024-02-27T12:25:00.000Z')) LIMIT 13`
expectedQuery1 := `SELECT DISTINCT "client_name" FROM ` + testTableName + ` WHERE ("epoch_time">=fromUnixTimestamp64Milli(1709036700000) AND "epoch_time"<=fromUnixTimestamp64Milli(1709037659999)) LIMIT 13`
expectedQuery2 := `SELECT DISTINCT "client_name" FROM ` + testTableName + ` WHERE ("epoch_time">=fromUnixTimestamp64Milli(1709036700000) AND "epoch_time"<=fromUnixTimestamp64Milli(1709037659999)) LIMIT 13`

// Once in a while `AND` conditions could be swapped, so we match both cases
mock.ExpectQuery(fmt.Sprintf("%s|%s", regexp.QuoteMeta(expectedQuery1), regexp.QuoteMeta(expectedQuery2))).
Expand Down
Loading

0 comments on commit 939452c

Please sign in to comment.