From a7ed64c7110198444260b958bb79b6193d53a4e2 Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Wed, 11 Sep 2024 11:23:21 +0800 Subject: [PATCH 01/13] introduce simple date format --- velox/functions/lib/DateTimeFormatter.cpp | 125 +++++++++++++++++++ velox/functions/lib/DateTimeFormatter.h | 13 +- velox/functions/sparksql/CMakeLists.txt | 1 + velox/functions/sparksql/DateTimeFunctions.h | 31 +++-- velox/functions/sparksql/flags.cpp | 24 ++++ 5 files changed, 184 insertions(+), 10 deletions(-) create mode 100644 velox/functions/sparksql/flags.cpp diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index e9fcc238a919..84c14acdf9ec 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -1697,4 +1697,129 @@ std::shared_ptr buildJodaDateTimeFormatter( return builder.setType(DateTimeFormatterType::JODA).build(); } +std::shared_ptr buildSimpleDateTimeFormatter( + const std::string_view& format, + bool lenient) { + if (format.empty()) { + VELOX_USER_FAIL("Invalid pattern specification"); + } + + DateTimeFormatterBuilder builder(format.size()); + const char* cur = format.data(); + const char* end = cur + format.size(); + + while (cur < end) { + const char* startTokenPtr = cur; + + // Literal case + if (*startTokenPtr == '\'') { + // Case 1: 2 consecutive single quote + if (cur + 1 < end && *(cur + 1) == '\'') { + builder.appendLiteral("'"); + cur += 2; + } else { + // Case 2: find closing single quote + int64_t count = numLiteralChars(startTokenPtr + 1, end); + if (count == -1) { + VELOX_USER_FAIL("No closing single quote for literal"); + } else { + for (int64_t i = 1; i <= count; i++) { + builder.appendLiteral(startTokenPtr + i, 1); + if (*(startTokenPtr + i) == '\'') { + i += 1; + } + } + cur += count + 2; + } + } + } else { + int count = 1; + ++cur; + while (cur < end && *startTokenPtr == *cur) { + ++count; + ++cur; + } + switch (*startTokenPtr) { + case 'G': + 
builder.appendEra(); + break; + case 'C': + builder.appendCenturyOfEra(count); + break; + case 'Y': + builder.appendYearOfEra(count); + break; + case 'x': + builder.appendWeekYear(count); + break; + case 'w': + builder.appendWeekOfWeekYear(count); + break; + case 'e': + builder.appendDayOfWeek1Based(count); + break; + case 'E': + builder.appendDayOfWeekText(count); + break; + case 'y': + builder.appendYear(count); + break; + case 'D': + builder.appendDayOfYear(count); + break; + case 'M': + if (count <= 2) { + builder.appendMonthOfYear(count); + } else { + builder.appendMonthOfYearText(count); + } + break; + case 'd': + builder.appendDayOfMonth(count); + break; + case 'a': + builder.appendHalfDayOfDay(); + break; + case 'K': + builder.appendHourOfHalfDay(count); + break; + case 'h': + builder.appendClockHourOfHalfDay(count); + break; + case 'H': + builder.appendHourOfDay(count); + break; + case 'k': + builder.appendClockHourOfDay(count); + break; + case 'm': + builder.appendMinuteOfHour(count); + break; + case 's': + builder.appendSecondOfMinute(count); + break; + case 'S': + builder.appendFractionOfSecond(count); + break; + case 'z': + builder.appendTimeZone(count); + break; + case 'Z': + builder.appendTimeZoneOffsetId(count); + break; + default: + if (isalpha(*startTokenPtr)) { + VELOX_UNSUPPORTED("Specifier {} is not supported.", *startTokenPtr); + } else { + builder.appendLiteral(startTokenPtr, cur - startTokenPtr); + } + break; + } + } + } + DateTimeFormatterType type = lenient ? 
DateTimeFormatterType::LENIENT_SIMPLE + : DateTimeFormatterType::STRICT_SIMPLE; + return builder.setType(type).build(); +} + } // namespace facebook::velox::functions diff --git a/velox/functions/lib/DateTimeFormatter.h b/velox/functions/lib/DateTimeFormatter.h index 06165c5dbc48..d3d357342efc 100644 --- a/velox/functions/lib/DateTimeFormatter.h +++ b/velox/functions/lib/DateTimeFormatter.h @@ -23,7 +23,15 @@ namespace facebook::velox::functions { -enum class DateTimeFormatterType { JODA, MYSQL, UNKNOWN }; +// LENIENT_SIMPLE and STRICT_SIMPLE are respectively aligned with +// java.text.SimpleDateFormat in lenient and non-lenient modes. +enum class DateTimeFormatterType { + JODA, + MYSQL, + LENIENT_SIMPLE, + STRICT_SIMPLE, + UNKNOWN +}; enum class DateTimeFormatSpecifier : uint8_t { // Era, e.g: "AD" @@ -209,6 +217,9 @@ std::shared_ptr buildMysqlDateTimeFormatter( std::shared_ptr buildJodaDateTimeFormatter( const std::string_view& format); +std::shared_ptr buildSimpleDateTimeFormatter( + const std::string_view& format, bool lenient); + } // namespace facebook::velox::functions template <> diff --git a/velox/functions/sparksql/CMakeLists.txt b/velox/functions/sparksql/CMakeLists.txt index 1b940591e2a4..70e51466b249 100644 --- a/velox/functions/sparksql/CMakeLists.txt +++ b/velox/functions/sparksql/CMakeLists.txt @@ -15,6 +15,7 @@ add_subdirectory(specialforms) velox_add_library( velox_functions_spark + flags.cpp ArrayGetFunction.cpp ArraySort.cpp Bitwise.cpp diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index a5a70e0b568b..cf3940fd503a 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -23,8 +23,21 @@ #include "velox/type/TimestampConversion.h" #include "velox/type/tz/TimeZoneMap.h" +DECLARE_bool(spark_sql_legacy_timeParserPolicy); + namespace facebook::velox::functions::sparksql { +std::shared_ptr getDateTimeFormatter( + const std::string_view& 
format, + bool lenient) { + if (FLAGS_spark_sql_legacy_timeParserPolicy) { + return buildSimpleDateTimeFormatter(format, lenient); + } else { + return buildJodaDateTimeFormatter( + std::string_view(format.data(), format.size())); + } +} + template struct YearFunction : public InitSessionTimezone { VELOX_DEFINE_FUNCTION_TYPES(T); @@ -156,7 +169,7 @@ struct UnixTimestampParseFunction { const std::vector& /*inputTypes*/, const core::QueryConfig& config, const arg_type* /*input*/) { - format_ = buildJodaDateTimeFormatter(kDefaultFormat_); + format_ = getDateTimeFormatter(kDefaultFormat_, false); setTimezone(config); } @@ -207,8 +220,8 @@ struct UnixTimestampParseWithFormatFunction const arg_type* format) { if (format != nullptr) { try { - this->format_ = buildJodaDateTimeFormatter( - std::string_view(format->data(), format->size())); + this->format_ = getDateTimeFormatter( + std::string_view(format->data(), format->size()), false); } catch (const VeloxUserError&) { invalidFormat_ = true; } @@ -228,8 +241,8 @@ struct UnixTimestampParseWithFormatFunction // Format error returns null. 
try { if (!isConstFormat_) { - this->format_ = buildJodaDateTimeFormatter( - std::string_view(format.data(), format.size())); + this->format_ = getDateTimeFormatter( + std::string_view(format.data(), format.size()), false); } } catch (const VeloxUserError&) { return false; @@ -284,8 +297,8 @@ struct FromUnixtimeFunction { private: FOLLY_ALWAYS_INLINE void setFormatter(const arg_type& format) { - formatter_ = buildJodaDateTimeFormatter( - std::string_view(format.data(), format.size())); + formatter_ = getDateTimeFormatter( + std::string_view(format.data(), format.size()), false); maxResultSize_ = formatter_->maxResultSize(sessionTimeZone_); } @@ -371,7 +384,7 @@ struct GetTimestampFunction { sessionTimeZone_ = tz::locateZone(sessionTimezoneName); } if (format != nullptr) { - formatter_ = buildJodaDateTimeFormatter(std::string_view(*format)); + formatter_ = getDateTimeFormatter(std::string_view(*format), false); isConstantTimeFormat_ = true; } } @@ -381,7 +394,7 @@ struct GetTimestampFunction { const arg_type& input, const arg_type& format) { if (!isConstantTimeFormat_) { - formatter_ = buildJodaDateTimeFormatter(std::string_view(format)); + formatter_ = getDateTimeFormatter(std::string_view(format), false); } auto dateTimeResult = formatter_->parse(std::string_view(input)); // Null as result for parsing error. diff --git a/velox/functions/sparksql/flags.cpp b/velox/functions/sparksql/flags.cpp new file mode 100644 index 000000000000..6ffed498d1d8 --- /dev/null +++ b/velox/functions/sparksql/flags.cpp @@ -0,0 +1,24 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +DEFINE_bool( + spark_sql_legacy_timeParserPolicy, + false, + "When true, SIMPLE_STRICT or SIMPLE_LENIENT date formatter is used for " + "formatting and parsing. This is aligned with Spark legacy date formatting " + "and parsing behavior."); From 212bf50b030ffc7624c26dff94ba0d86ac242f9b Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Wed, 11 Sep 2024 12:03:06 +0800 Subject: [PATCH 02/13] reformat --- velox/functions/lib/DateTimeFormatter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/velox/functions/lib/DateTimeFormatter.h b/velox/functions/lib/DateTimeFormatter.h index d3d357342efc..f29db21e63bd 100644 --- a/velox/functions/lib/DateTimeFormatter.h +++ b/velox/functions/lib/DateTimeFormatter.h @@ -218,7 +218,8 @@ std::shared_ptr buildJodaDateTimeFormatter( const std::string_view& format); std::shared_ptr buildSimpleDateTimeFormatter( - const std::string_view& format, bool lenient); + const std::string_view& format, + bool lenient); } // namespace facebook::velox::functions From 41d77f26522cae510b8187d6ccc0cd1a01a12bb3 Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Wed, 11 Sep 2024 17:53:16 +0800 Subject: [PATCH 03/13] update --- velox/core/QueryConfig.h | 8 +++++ velox/docs/configs.rst | 4 +++ velox/functions/lib/DateTimeFormatter.cpp | 25 ++++++-------- velox/functions/sparksql/CMakeLists.txt | 1 - velox/functions/sparksql/DateTimeFunctions.h | 36 ++++++++++++++------ velox/functions/sparksql/flags.cpp | 24 ------------- 6 files changed, 47 insertions(+), 51 deletions(-) delete mode 100644 
velox/functions/sparksql/flags.cpp diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h index 814bdef03a26..4fcd15639eb6 100644 --- a/velox/core/QueryConfig.h +++ b/velox/core/QueryConfig.h @@ -287,6 +287,10 @@ class QueryConfig { /// The current spark partition id. static constexpr const char* kSparkPartitionId = "spark.partition_id"; + /// If true, simple date formatter is used for time formatting and parsing. + static constexpr const char* kSparkLegacyTimeParser = + "spark.sql_legacy_timeParserPolicy"; + /// The number of local parallel table writer operators per task. static constexpr const char* kTaskWriterCount = "task_writer_count"; @@ -677,6 +681,10 @@ class QueryConfig { return value; } + bool sparkLegacyTimeParser() const { + return get(kSparkLegacyTimeParser, false); + } + bool exprTrackCpuUsage() const { return get(kExprTrackCpuUsage, false); } diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst index 746ad0d8cd44..72407ed24dd6 100644 --- a/velox/docs/configs.rst +++ b/velox/docs/configs.rst @@ -697,6 +697,10 @@ Spark-specific Configuration - integer - - The current task's Spark partition ID. It's set by the query engine (Spark) prior to task execution. + * - spark.sql_legacy_timeParserPolicy + - bool + - false + - If true, simple date formatter is used for time formatting and parsing. 
Tracing -------- diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index 84c14acdf9ec..1ac14736435a 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -1700,9 +1700,7 @@ std::shared_ptr buildJodaDateTimeFormatter( std::shared_ptr buildSimpleDateTimeFormatter( const std::string_view& format, bool lenient) { - if (format.empty()) { - VELOX_USER_FAIL("Invalid pattern specification"); - } + VELOX_USER_CHECK(!format.empty(), "Format pattern should not be empty."); DateTimeFormatterBuilder builder(format.size()); const char* cur = format.data(); @@ -1711,26 +1709,23 @@ std::shared_ptr buildSimpleDateTimeFormatter( while (cur < end) { const char* startTokenPtr = cur; - // Literal case + // Literal case. if (*startTokenPtr == '\'') { - // Case 1: 2 consecutive single quote + // 2 consecutive single quote. if (cur + 1 < end && *(cur + 1) == '\'') { builder.appendLiteral("'"); cur += 2; } else { - // Case 2: find closing single quote + // Find closing single quote. 
int64_t count = numLiteralChars(startTokenPtr + 1, end); - if (count == -1) { - VELOX_USER_FAIL("No closing single quote for literal"); - } else { - for (int64_t i = 1; i <= count; i++) { - builder.appendLiteral(startTokenPtr + i, 1); - if (*(startTokenPtr + i) == '\'') { - i += 1; - } + VELOX_USER_CHECK_NE(count, -1, "No closing single quote for literal"); + for (int64_t i = 1; i <= count; i++) { + builder.appendLiteral(startTokenPtr + i, 1); + if (*(startTokenPtr + i) == '\'') { + i += 1; } - cur += count + 2; } + cur += count + 2; } } else { int count = 1; diff --git a/velox/functions/sparksql/CMakeLists.txt b/velox/functions/sparksql/CMakeLists.txt index 70e51466b249..1b940591e2a4 100644 --- a/velox/functions/sparksql/CMakeLists.txt +++ b/velox/functions/sparksql/CMakeLists.txt @@ -15,7 +15,6 @@ add_subdirectory(specialforms) velox_add_library( velox_functions_spark - flags.cpp ArrayGetFunction.cpp ArraySort.cpp Bitwise.cpp diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index cf3940fd503a..46382709677d 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -23,14 +23,13 @@ #include "velox/type/TimestampConversion.h" #include "velox/type/tz/TimeZoneMap.h" -DECLARE_bool(spark_sql_legacy_timeParserPolicy); - namespace facebook::velox::functions::sparksql { std::shared_ptr getDateTimeFormatter( + const bool legacyTimeParser, const std::string_view& format, bool lenient) { - if (FLAGS_spark_sql_legacy_timeParserPolicy) { + if (legacyTimeParser) { return buildSimpleDateTimeFormatter(format, lenient); } else { return buildJodaDateTimeFormatter( @@ -169,7 +168,8 @@ struct UnixTimestampParseFunction { const std::vector& /*inputTypes*/, const core::QueryConfig& config, const arg_type* /*input*/) { - format_ = getDateTimeFormatter(kDefaultFormat_, false); + format_ = getDateTimeFormatter( + config.sparkLegacyTimeParser(), kDefaultFormat_, false); 
setTimezone(config); } @@ -218,10 +218,13 @@ struct UnixTimestampParseWithFormatFunction const core::QueryConfig& config, const arg_type* /*input*/, const arg_type* format) { + legacyTimeParser = config.sparkLegacyTimeParser(); if (format != nullptr) { try { this->format_ = getDateTimeFormatter( - std::string_view(format->data(), format->size()), false); + legacyTimeParser, + std::string_view(format->data(), format->size()), + false); } catch (const VeloxUserError&) { invalidFormat_ = true; } @@ -242,7 +245,9 @@ struct UnixTimestampParseWithFormatFunction try { if (!isConstFormat_) { this->format_ = getDateTimeFormatter( - std::string_view(format.data(), format.size()), false); + legacyTimeParser, + std::string_view(format.data(), format.size()), + false); } } catch (const VeloxUserError&) { return false; @@ -261,6 +266,7 @@ struct UnixTimestampParseWithFormatFunction private: bool isConstFormat_{false}; bool invalidFormat_{false}; + bool legacyTimeParser{false}; }; // Parses unix time in seconds to a formatted string. 
@@ -275,7 +281,7 @@ struct FromUnixtimeFunction { const arg_type* format) { sessionTimeZone_ = getTimeZoneFromConfig(config); if (format != nullptr) { - setFormatter(*format); + setFormatter(*format, config.sparkLegacyTimeParser()); isConstantTimeFormat_ = true; } } @@ -296,9 +302,13 @@ struct FromUnixtimeFunction { } private: - FOLLY_ALWAYS_INLINE void setFormatter(const arg_type& format) { + FOLLY_ALWAYS_INLINE void setFormatter( + const arg_type& format, + bool legacyTimeParser) { formatter_ = getDateTimeFormatter( - std::string_view(format.data(), format.size()), false); + legacyTimeParser, + std::string_view(format.data(), format.size()), + false); maxResultSize_ = formatter_->maxResultSize(sessionTimeZone_); } @@ -379,12 +389,14 @@ struct GetTimestampFunction { const core::QueryConfig& config, const arg_type* /*input*/, const arg_type* format) { + legacyTimeParser = config.sparkLegacyTimeParser(); auto sessionTimezoneName = config.sessionTimezone(); if (!sessionTimezoneName.empty()) { sessionTimeZone_ = tz::locateZone(sessionTimezoneName); } if (format != nullptr) { - formatter_ = getDateTimeFormatter(std::string_view(*format), false); + formatter_ = getDateTimeFormatter( + legacyTimeParser, std::string_view(*format), false); isConstantTimeFormat_ = true; } } @@ -394,7 +406,8 @@ struct GetTimestampFunction { const arg_type& input, const arg_type& format) { if (!isConstantTimeFormat_) { - formatter_ = getDateTimeFormatter(std::string_view(format), false); + formatter_ = getDateTimeFormatter( + legacyTimeParser, std::string_view(format), false); } auto dateTimeResult = formatter_->parse(std::string_view(input)); // Null as result for parsing error. @@ -417,6 +430,7 @@ struct GetTimestampFunction { std::shared_ptr formatter_{nullptr}; bool isConstantTimeFormat_{false}; const tz::TimeZone* sessionTimeZone_{tz::locateZone(0)}; // default to GMT. 
+ bool legacyTimeParser{false}; }; template diff --git a/velox/functions/sparksql/flags.cpp b/velox/functions/sparksql/flags.cpp deleted file mode 100644 index 6ffed498d1d8..000000000000 --- a/velox/functions/sparksql/flags.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -DEFINE_bool( - spark_sql_legacy_timeParserPolicy, - false, - "When true, SIMPLE_STRICT or SIMPLE_LENIENT date formatter is used for " - "formatting and parsing. 
This is aligned with Spark legacy date formatting " - "and parsing behavior."); From 298d52caf84b2f2948969e2c940735755444f71a Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Wed, 11 Sep 2024 19:41:42 +0800 Subject: [PATCH 04/13] update --- velox/functions/sparksql/DateTimeFunctions.h | 22 +++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index 46382709677d..8b67146fd1dd 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -218,11 +218,11 @@ struct UnixTimestampParseWithFormatFunction const core::QueryConfig& config, const arg_type* /*input*/, const arg_type* format) { - legacyTimeParser = config.sparkLegacyTimeParser(); + legacyTimeParser_ = config.sparkLegacyTimeParser(); if (format != nullptr) { try { this->format_ = getDateTimeFormatter( - legacyTimeParser, + legacyTimeParser_, std::string_view(format->data(), format->size()), false); } catch (const VeloxUserError&) { @@ -245,7 +245,7 @@ struct UnixTimestampParseWithFormatFunction try { if (!isConstFormat_) { this->format_ = getDateTimeFormatter( - legacyTimeParser, + legacyTimeParser_, std::string_view(format.data(), format.size()), false); } @@ -266,7 +266,7 @@ struct UnixTimestampParseWithFormatFunction private: bool isConstFormat_{false}; bool invalidFormat_{false}; - bool legacyTimeParser{false}; + bool legacyTimeParser_{false}; }; // Parses unix time in seconds to a formatted string. 
@@ -279,9 +279,10 @@ struct FromUnixtimeFunction { const core::QueryConfig& config, const arg_type* /*unixtime*/, const arg_type* format) { + legacyTimeParser_ = config.sparkLegacyTimeParser(); sessionTimeZone_ = getTimeZoneFromConfig(config); if (format != nullptr) { - setFormatter(*format, config.sparkLegacyTimeParser()); + setFormatter(*format, legacyTimeParser_); isConstantTimeFormat_ = true; } } @@ -291,7 +292,7 @@ struct FromUnixtimeFunction { const arg_type& second, const arg_type& format) { if (!isConstantTimeFormat_) { - setFormatter(format); + setFormatter(format, legacyTimeParser_); } const Timestamp timestamp{second, 0}; result.reserve(maxResultSize_); @@ -316,6 +317,7 @@ struct FromUnixtimeFunction { std::shared_ptr formatter_; uint32_t maxResultSize_; bool isConstantTimeFormat_{false}; + bool legacyTimeParser_{false}; }; template @@ -389,14 +391,14 @@ struct GetTimestampFunction { const core::QueryConfig& config, const arg_type* /*input*/, const arg_type* format) { - legacyTimeParser = config.sparkLegacyTimeParser(); + legacyTimeParser_ = config.sparkLegacyTimeParser(); auto sessionTimezoneName = config.sessionTimezone(); if (!sessionTimezoneName.empty()) { sessionTimeZone_ = tz::locateZone(sessionTimezoneName); } if (format != nullptr) { formatter_ = getDateTimeFormatter( - legacyTimeParser, std::string_view(*format), false); + legacyTimeParser_, std::string_view(*format), false); isConstantTimeFormat_ = true; } } @@ -407,7 +409,7 @@ struct GetTimestampFunction { const arg_type& format) { if (!isConstantTimeFormat_) { formatter_ = getDateTimeFormatter( - legacyTimeParser, std::string_view(format), false); + legacyTimeParser_, std::string_view(format), false); } auto dateTimeResult = formatter_->parse(std::string_view(input)); // Null as result for parsing error. 
@@ -430,7 +432,7 @@ struct GetTimestampFunction { std::shared_ptr formatter_{nullptr}; bool isConstantTimeFormat_{false}; const tz::TimeZone* sessionTimeZone_{tz::locateZone(0)}; // default to GMT. - bool legacyTimeParser{false}; + bool legacyTimeParser_{false}; }; template From ab30f2ff5c646d734ce233035caa53aa69c93d48 Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Fri, 13 Sep 2024 13:09:21 +0800 Subject: [PATCH 05/13] update --- velox/core/QueryConfig.h | 3 +- velox/docs/configs.rst | 4 +- velox/functions/lib/DateTimeFormatter.cpp | 71 ++++++++++++----------- velox/functions/lib/DateTimeFormatter.h | 4 +- 4 files changed, 44 insertions(+), 38 deletions(-) diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h index 4fcd15639eb6..1d6c4f0a1563 100644 --- a/velox/core/QueryConfig.h +++ b/velox/core/QueryConfig.h @@ -288,8 +288,9 @@ class QueryConfig { static constexpr const char* kSparkPartitionId = "spark.partition_id"; /// If true, simple date formatter is used for time formatting and parsing. + /// False by default. static constexpr const char* kSparkLegacyTimeParser = - "spark.sql_legacy_timeParserPolicy"; + "spark.legacy_time_parser_policy"; /// The number of local parallel table writer operators per task. static constexpr const char* kTaskWriterCount = "task_writer_count"; diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst index 72407ed24dd6..b934f2461e0e 100644 --- a/velox/docs/configs.rst +++ b/velox/docs/configs.rst @@ -697,10 +697,10 @@ Spark-specific Configuration - integer - - The current task's Spark partition ID. It's set by the query engine (Spark) prior to task execution. - * - spark.sql_legacy_timeParserPolicy + * - spark.legacy_time_parser_policy - bool - false - - If true, simple date formatter is used for time formatting and parsing. + - If true, simple date formatter is used for time formatting and parsing. False by default. 
Tracing -------- diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index 1ac14736435a..580e233dab2e 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -1709,14 +1709,17 @@ std::shared_ptr buildSimpleDateTimeFormatter( while (cur < end) { const char* startTokenPtr = cur; - // Literal case. + // For literal case, literal should be quoted using single quotes ('). If + // there is no quotes, it is interpreted as pattern letters. If there is + // only single quote, a user error will be thrown. if (*startTokenPtr == '\'') { - // 2 consecutive single quote. + // Append single literal quote for 2 consecutive single quote. if (cur + 1 < end && *(cur + 1) == '\'') { builder.appendLiteral("'"); cur += 2; } else { - // Find closing single quote. + // Append literal characters from the start until the next closing + // literal sequence single quote. int64_t count = numLiteralChars(startTokenPtr + 1, end); VELOX_USER_CHECK_NE(count, -1, "No closing single quote for literal"); for (int64_t i = 1; i <= count; i++) { @@ -1728,6 +1731,8 @@ std::shared_ptr buildSimpleDateTimeFormatter( cur += count + 2; } } else { + // Append format specifier according to pattern letters. If pattern letter + // is not supported, a user error will be thrown. 
int count = 1; ++cur; while (cur < end && *startTokenPtr == *cur) { @@ -1735,20 +1740,17 @@ std::shared_ptr buildSimpleDateTimeFormatter( ++cur; } switch (*startTokenPtr) { - case 'G': - builder.appendEra(); + case 'a': + builder.appendHalfDayOfDay(); break; case 'C': builder.appendCenturyOfEra(count); break; - case 'Y': - builder.appendYearOfEra(count); - break; - case 'x': - builder.appendWeekYear(count); + case 'd': + builder.appendDayOfMonth(count); break; - case 'w': - builder.appendWeekOfWeekYear(count); + case 'D': + builder.appendDayOfYear(count); break; case 'e': builder.appendDayOfWeek1Based(count); @@ -1756,27 +1758,8 @@ std::shared_ptr buildSimpleDateTimeFormatter( case 'E': builder.appendDayOfWeekText(count); break; - case 'y': - builder.appendYear(count); - break; - case 'D': - builder.appendDayOfYear(count); - break; - case 'M': - if (count <= 2) { - builder.appendMonthOfYear(count); - } else { - builder.appendMonthOfYearText(count); - } - break; - case 'd': - builder.appendDayOfMonth(count); - break; - case 'a': - builder.appendHalfDayOfDay(); - break; - case 'K': - builder.appendHourOfHalfDay(count); + case 'G': + builder.appendEra(); break; case 'h': builder.appendClockHourOfHalfDay(count); @@ -1784,18 +1767,40 @@ std::shared_ptr buildSimpleDateTimeFormatter( case 'H': builder.appendHourOfDay(count); break; + case 'K': + builder.appendHourOfHalfDay(count); + break; case 'k': builder.appendClockHourOfDay(count); break; case 'm': builder.appendMinuteOfHour(count); break; + case 'M': + if (count <= 2) { + builder.appendMonthOfYear(count); + } else { + builder.appendMonthOfYearText(count); + } + break; case 's': builder.appendSecondOfMinute(count); break; case 'S': builder.appendFractionOfSecond(count); break; + case 'w': + builder.appendWeekOfWeekYear(count); + break; + case 'x': + builder.appendWeekYear(count); + break; + case 'y': + builder.appendYear(count); + break; + case 'Y': + builder.appendYearOfEra(count); + break; case 'z': 
builder.appendTimeZone(count); break; diff --git a/velox/functions/lib/DateTimeFormatter.h b/velox/functions/lib/DateTimeFormatter.h index f29db21e63bd..4355da1d48a1 100644 --- a/velox/functions/lib/DateTimeFormatter.h +++ b/velox/functions/lib/DateTimeFormatter.h @@ -23,12 +23,12 @@ namespace facebook::velox::functions { -// LENIENT_SIMPLE and STRICT_SIMPLE are respectively aligned with -// java.text.SimpleDateFormat in lenient and non-lenient modes. enum class DateTimeFormatterType { JODA, MYSQL, + // Corresponding to java.text.SimpleDateFormat in lenient mode. LENIENT_SIMPLE, + // Corresponding to java.text.SimpleDateFormat in strict(lenient=false) mode STRICT_SIMPLE, UNKNOWN }; From 4923d217ef5d62c8f1d39e32aa11251335b36ebc Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Wed, 18 Sep 2024 17:57:04 +0800 Subject: [PATCH 06/13] add doc --- velox/functions/lib/DateTimeFormatter.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/velox/functions/lib/DateTimeFormatter.h b/velox/functions/lib/DateTimeFormatter.h index 4355da1d48a1..a61942a88231 100644 --- a/velox/functions/lib/DateTimeFormatter.h +++ b/velox/functions/lib/DateTimeFormatter.h @@ -26,9 +26,12 @@ namespace facebook::velox::functions { enum class DateTimeFormatterType { JODA, MYSQL, - // Corresponding to java.text.SimpleDateFormat in lenient mode. + // Corresponding to java.text.SimpleDateFormat in lenient mode. It is used by + // the 'date_format', 'from_unixtime', 'unix_timestamp' and + // 'to_unix_timestamp' Spark functions. LENIENT_SIMPLE, - // Corresponding to java.text.SimpleDateFormat in strict(lenient=false) mode + // Corresponding to java.text.SimpleDateFormat in strict(lenient=false) mode. + // It is used by Spark 'cast date to string'. 
STRICT_SIMPLE, UNKNOWN }; From 899c835ff643dbb2ff6d694d1265623da52f3b8b Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Wed, 18 Sep 2024 18:31:43 +0800 Subject: [PATCH 07/13] update --- velox/core/QueryConfig.h | 2 +- velox/docs/configs.rst | 2 +- velox/functions/sparksql/DateTimeFunctions.h | 7 +++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h index 1d6c4f0a1563..0f9dfdd4007d 100644 --- a/velox/core/QueryConfig.h +++ b/velox/core/QueryConfig.h @@ -288,7 +288,7 @@ class QueryConfig { static constexpr const char* kSparkPartitionId = "spark.partition_id"; /// If true, simple date formatter is used for time formatting and parsing. - /// False by default. + /// Strict date formatter is used by default. static constexpr const char* kSparkLegacyTimeParser = "spark.legacy_time_parser_policy"; diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst index b934f2461e0e..8a078afffe75 100644 --- a/velox/docs/configs.rst +++ b/velox/docs/configs.rst @@ -700,7 +700,7 @@ Spark-specific Configuration * - spark.legacy_time_parser_policy - bool - false - - If true, simple date formatter is used for time formatting and parsing. False by default. + - If true, simple date formatter is used for time formatting and parsing. Strict date formatter is used by default. 
Tracing -------- diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index 8b67146fd1dd..d803978f9a03 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -26,15 +26,14 @@ namespace facebook::velox::functions::sparksql { std::shared_ptr getDateTimeFormatter( - const bool legacyTimeParser, + bool legacyTimeParser, const std::string_view& format, bool lenient) { if (legacyTimeParser) { return buildSimpleDateTimeFormatter(format, lenient); - } else { - return buildJodaDateTimeFormatter( - std::string_view(format.data(), format.size())); } + return buildJodaDateTimeFormatter( + std::string_view(format.data(), format.size())); } template From d5ba5c9c5446d80a448c50f4112fb121bfee9359 Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Fri, 20 Sep 2024 14:17:11 +0800 Subject: [PATCH 08/13] updated --- velox/core/QueryConfig.h | 2 +- velox/docs/configs.rst | 5 ++++- velox/functions/sparksql/DateTimeFunctions.h | 10 ++++------ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h index 0f9dfdd4007d..008486739d8f 100644 --- a/velox/core/QueryConfig.h +++ b/velox/core/QueryConfig.h @@ -288,7 +288,7 @@ class QueryConfig { static constexpr const char* kSparkPartitionId = "spark.partition_id"; /// If true, simple date formatter is used for time formatting and parsing. - /// Strict date formatter is used by default. + /// Joda date formatter is used by default. static constexpr const char* kSparkLegacyTimeParser = "spark.legacy_time_parser_policy"; diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst index 8a078afffe75..fe97db7a844f 100644 --- a/velox/docs/configs.rst +++ b/velox/docs/configs.rst @@ -700,7 +700,10 @@ Spark-specific Configuration * - spark.legacy_time_parser_policy - bool - false - - If true, simple date formatter is used for time formatting and parsing. 
Strict date formatter is used by default. + - If true, simple date formatter is used for time formatting and parsing. Joda date formatter is used by default. + - Joda date formatter performs strict checking of its input and uses different pattern string. + - For example, the 2015-07-22 10:00:00 timestamp cannot be parsed if pattern is yyyy-MM-dd because the parser does not consume whole input. + - Another example is that the 'W' pattern, which means week in month, is not supported. For more differences, see #10354. Tracing -------- diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index d803978f9a03..6878fd335b28 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -281,7 +281,7 @@ struct FromUnixtimeFunction { legacyTimeParser_ = config.sparkLegacyTimeParser(); sessionTimeZone_ = getTimeZoneFromConfig(config); if (format != nullptr) { - setFormatter(*format, legacyTimeParser_); + setFormatter(*format); isConstantTimeFormat_ = true; } } @@ -291,7 +291,7 @@ struct FromUnixtimeFunction { const arg_type& second, const arg_type& format) { if (!isConstantTimeFormat_) { - setFormatter(format, legacyTimeParser_); + setFormatter(format); } const Timestamp timestamp{second, 0}; result.reserve(maxResultSize_); @@ -302,11 +302,9 @@ struct FromUnixtimeFunction { } private: - FOLLY_ALWAYS_INLINE void setFormatter( - const arg_type& format, - bool legacyTimeParser) { + FOLLY_ALWAYS_INLINE void setFormatter(const arg_type& format) { formatter_ = getDateTimeFormatter( - legacyTimeParser, + legacyTimeParser_, std::string_view(format.data(), format.size()), false); maxResultSize_ = formatter_->maxResultSize(sessionTimeZone_); From 00cb1405df8475c386f9a1900a657c11f7eb1fb6 Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Fri, 20 Sep 2024 17:57:07 +0800 Subject: [PATCH 09/13] updated --- velox/core/QueryConfig.h | 8 +-- velox/docs/configs.rst | 6 +- 
velox/functions/sparksql/DateTimeFunctions.h | 59 ++++++++++++-------- 3 files changed, 42 insertions(+), 31 deletions(-) diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h index 008486739d8f..0cad5c622af4 100644 --- a/velox/core/QueryConfig.h +++ b/velox/core/QueryConfig.h @@ -289,8 +289,8 @@ class QueryConfig { /// If true, simple date formatter is used for time formatting and parsing. /// Joda date formatter is used by default. - static constexpr const char* kSparkLegacyTimeParser = - "spark.legacy_time_parser_policy"; + static constexpr const char* kSparkLegacyDateFormatter = + "spark.legacy_date_formatter"; /// The number of local parallel table writer operators per task. static constexpr const char* kTaskWriterCount = "task_writer_count"; @@ -682,8 +682,8 @@ class QueryConfig { return value; } - bool sparkLegacyTimeParser() const { - return get(kSparkLegacyTimeParser, false); + bool sparkLegacyDateFormatter() const { + return get(kSparkLegacyDateFormatter, false); } bool exprTrackCpuUsage() const { diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst index fe97db7a844f..e8d972c16701 100644 --- a/velox/docs/configs.rst +++ b/velox/docs/configs.rst @@ -697,13 +697,13 @@ Spark-specific Configuration - integer - - The current task's Spark partition ID. It's set by the query engine (Spark) prior to task execution. - * - spark.legacy_time_parser_policy + * - spark.legacy_date_formatter - bool - false - - If true, simple date formatter is used for time formatting and parsing. Joda date formatter is used by default. + - If true, `Simple ` date formatter is used for time formatting and parsing. Joda date formatter is used by default. - Joda date formatter performs strict checking of its input and uses different pattern string. - For example, the 2015-07-22 10:00:00 timestamp cannot be parsed if pattern is yyyy-MM-dd because the parser does not consume whole input. 
- - Another example is that the 'W' pattern, which means week in month, is not supported. For more differences, see #10354. + - Another example is that the 'W' pattern, which means week in month, is not supported. For more differences, see :issue:`10354`. Tracing -------- diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index 6878fd335b28..0653bedaaa36 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -25,16 +25,21 @@ namespace facebook::velox::functions::sparksql { +namespace detail { std::shared_ptr getDateTimeFormatter( - bool legacyTimeParser, const std::string_view& format, - bool lenient) { - if (legacyTimeParser) { - return buildSimpleDateTimeFormatter(format, lenient); + DateTimeFormatterType type) { + switch (type) { + case DateTimeFormatterType::STRICT_SIMPLE: + return buildSimpleDateTimeFormatter(format, /*lenient=*/false); + case DateTimeFormatterType::LENIENT_SIMPLE: + return buildSimpleDateTimeFormatter(format, /*lenient=*/true); + default: + return buildJodaDateTimeFormatter( + std::string_view(format.data(), format.size())); } - return buildJodaDateTimeFormatter( - std::string_view(format.data(), format.size())); } +} // namespace detail template struct YearFunction : public InitSessionTimezone { @@ -167,8 +172,10 @@ struct UnixTimestampParseFunction { const std::vector& /*inputTypes*/, const core::QueryConfig& config, const arg_type* /*input*/) { - format_ = getDateTimeFormatter( - config.sparkLegacyTimeParser(), kDefaultFormat_, false); + format_ = detail::getDateTimeFormatter( + kDefaultFormat_, + config.sparkLegacyDateFormatter() ? 
DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); setTimezone(config); } @@ -217,13 +224,13 @@ struct UnixTimestampParseWithFormatFunction const core::QueryConfig& config, const arg_type* /*input*/, const arg_type* format) { - legacyTimeParser_ = config.sparkLegacyTimeParser(); + legacyTimeParser_ = config.sparkLegacyDateFormatter(); if (format != nullptr) { try { - this->format_ = getDateTimeFormatter( - legacyTimeParser_, + this->format_ = detail::getDateTimeFormatter( std::string_view(format->data(), format->size()), - false); + legacyTimeParser_ ? DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); } catch (const VeloxUserError&) { invalidFormat_ = true; } @@ -243,10 +250,10 @@ struct UnixTimestampParseWithFormatFunction // Format error returns null. try { if (!isConstFormat_) { - this->format_ = getDateTimeFormatter( - legacyTimeParser_, + this->format_ = detail::getDateTimeFormatter( std::string_view(format.data(), format.size()), - false); + legacyTimeParser_ ? DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); } } catch (const VeloxUserError&) { return false; @@ -278,7 +285,7 @@ struct FromUnixtimeFunction { const core::QueryConfig& config, const arg_type* /*unixtime*/, const arg_type* format) { - legacyTimeParser_ = config.sparkLegacyTimeParser(); + legacyTimeParser_ = config.sparkLegacyDateFormatter(); sessionTimeZone_ = getTimeZoneFromConfig(config); if (format != nullptr) { setFormatter(*format); @@ -303,10 +310,10 @@ struct FromUnixtimeFunction { private: FOLLY_ALWAYS_INLINE void setFormatter(const arg_type& format) { - formatter_ = getDateTimeFormatter( - legacyTimeParser_, + formatter_ = detail::getDateTimeFormatter( std::string_view(format.data(), format.size()), - false); + legacyTimeParser_ ? 
DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); maxResultSize_ = formatter_->maxResultSize(sessionTimeZone_); } @@ -388,14 +395,16 @@ struct GetTimestampFunction { const core::QueryConfig& config, const arg_type* /*input*/, const arg_type* format) { - legacyTimeParser_ = config.sparkLegacyTimeParser(); + legacyTimeParser_ = config.sparkLegacyDateFormatter(); auto sessionTimezoneName = config.sessionTimezone(); if (!sessionTimezoneName.empty()) { sessionTimeZone_ = tz::locateZone(sessionTimezoneName); } if (format != nullptr) { - formatter_ = getDateTimeFormatter( - legacyTimeParser_, std::string_view(*format), false); + formatter_ = detail::getDateTimeFormatter( + std::string_view(*format), + legacyTimeParser_ ? DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); isConstantTimeFormat_ = true; } } @@ -405,8 +414,10 @@ struct GetTimestampFunction { const arg_type& input, const arg_type& format) { if (!isConstantTimeFormat_) { - formatter_ = getDateTimeFormatter( - legacyTimeParser_, std::string_view(format), false); + formatter_ = detail::getDateTimeFormatter( + std::string_view(format), + legacyTimeParser_ ? DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); } auto dateTimeResult = formatter_->parse(std::string_view(input)); // Null as result for parsing error. From b67a6da5969a5bb1af286c0138e108f1be506b25 Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Fri, 20 Sep 2024 18:10:32 +0800 Subject: [PATCH 10/13] updated --- velox/docs/functions/spark/datetime.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/velox/docs/functions/spark/datetime.rst b/velox/docs/functions/spark/datetime.rst index 85ca72628933..3ff759c5254c 100644 --- a/velox/docs/functions/spark/datetime.rst +++ b/velox/docs/functions/spark/datetime.rst @@ -82,7 +82,9 @@ These functions support TIMESTAMP and DATE input types. 
Adjusts ``unixTime`` (elapsed seconds since UNIX epoch) to configured session timezone, then converts it to a formatted time string according to ``format``. Only supports BIGINT type for - ``unixTime``. + ``unixTime``. Using `Simple ` + date formatter or `Joda ` date formatter depends on + the ``spark.legacy_date_formatter`` configuration. `Valid patterns for date format `_. Throws exception for invalid ``format``. This function will convert input to milliseconds, and integer overflow is @@ -285,7 +287,10 @@ These functions support TIMESTAMP and DATE input types. .. spark:function:: unix_timestamp() -> integer - Returns the current UNIX timestamp in seconds. + Returns the current UNIX timestamp in seconds. Using + `Simple ` date formatter or + `Joda ` date formatter depends on the ``spark.legacy_date_formatter`` + configuration. .. spark:function:: unix_timestamp(string) -> integer :noindex: From 1b5b2c84817a223f4792c8b3863191d01df2e9ac Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Fri, 20 Sep 2024 19:42:58 +0800 Subject: [PATCH 11/13] updated --- velox/docs/functions/spark/datetime.rst | 10 ++++----- velox/functions/lib/DateTimeFormatter.h | 1 + velox/functions/sparksql/DateTimeFunctions.h | 22 ++++++++++---------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/velox/docs/functions/spark/datetime.rst b/velox/docs/functions/spark/datetime.rst index 3ff759c5254c..4b3305891dae 100644 --- a/velox/docs/functions/spark/datetime.rst +++ b/velox/docs/functions/spark/datetime.rst @@ -83,8 +83,8 @@ These functions support TIMESTAMP and DATE input types. Adjusts ``unixTime`` (elapsed seconds since UNIX epoch) to configured session timezone, then converts it to a formatted time string according to ``format``. Only supports BIGINT type for ``unixTime``. Using `Simple ` - date formatter or `Joda ` date formatter depends on - the ``spark.legacy_date_formatter`` configuration. 
+ date formatter in lenient mode that is aligned with Spark legacy date parser behavior or + `Joda ` date formatter depends on ``spark.legacy_date_formatter`` configuration. `Valid patterns for date format `_. Throws exception for invalid ``format``. This function will convert input to milliseconds, and integer overflow is @@ -288,9 +288,9 @@ These functions support TIMESTAMP and DATE input types. .. spark:function:: unix_timestamp() -> integer Returns the current UNIX timestamp in seconds. Using - `Simple ` date formatter or - `Joda ` date formatter depends on the ``spark.legacy_date_formatter`` - configuration. + `Simple ` date formatter in lenient mode + that is aligned with Spark legacy date parser behavior or `Joda ` date formatter + depends on the ``spark.legacy_date_formatter`` configuration. .. spark:function:: unix_timestamp(string) -> integer :noindex: diff --git a/velox/functions/lib/DateTimeFormatter.h b/velox/functions/lib/DateTimeFormatter.h index a61942a88231..dc107e7580f3 100644 --- a/velox/functions/lib/DateTimeFormatter.h +++ b/velox/functions/lib/DateTimeFormatter.h @@ -29,6 +29,7 @@ enum class DateTimeFormatterType { // Corresponding to java.text.SimpleDateFormat in lenient mode. It is used by // the 'date_format', 'from_unixtime', 'unix_timestamp' and // 'to_unix_timestamp' Spark functions. + // TODO: this is currently no different from STRICT_SIMPLE. LENIENT_SIMPLE, // Corresponding to java.text.SimpleDateFormat in strict(lenient=false) mode. // It is used by Spark 'cast date to string'. 
diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index 0653bedaaa36..b3fa18e08ca7 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -224,12 +224,12 @@ struct UnixTimestampParseWithFormatFunction const core::QueryConfig& config, const arg_type* /*input*/, const arg_type* format) { - legacyTimeParser_ = config.sparkLegacyDateFormatter(); + legacyDateFormatter_ = config.sparkLegacyDateFormatter(); if (format != nullptr) { try { this->format_ = detail::getDateTimeFormatter( std::string_view(format->data(), format->size()), - legacyTimeParser_ ? DateTimeFormatterType::STRICT_SIMPLE + legacyDateFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE : DateTimeFormatterType::JODA); } catch (const VeloxUserError&) { invalidFormat_ = true; @@ -252,7 +252,7 @@ struct UnixTimestampParseWithFormatFunction if (!isConstFormat_) { this->format_ = detail::getDateTimeFormatter( std::string_view(format.data(), format.size()), - legacyTimeParser_ ? DateTimeFormatterType::STRICT_SIMPLE + legacyDateFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE : DateTimeFormatterType::JODA); } } catch (const VeloxUserError&) { @@ -272,7 +272,7 @@ struct UnixTimestampParseWithFormatFunction private: bool isConstFormat_{false}; bool invalidFormat_{false}; - bool legacyTimeParser_{false}; + bool legacyDateFormatter_{false}; }; // Parses unix time in seconds to a formatted string. 
@@ -285,7 +285,7 @@ struct FromUnixtimeFunction { const core::QueryConfig& config, const arg_type* /*unixtime*/, const arg_type* format) { - legacyTimeParser_ = config.sparkLegacyDateFormatter(); + legacyDateFormatter_ = config.sparkLegacyDateFormatter(); sessionTimeZone_ = getTimeZoneFromConfig(config); if (format != nullptr) { setFormatter(*format); @@ -312,7 +312,7 @@ struct FromUnixtimeFunction { FOLLY_ALWAYS_INLINE void setFormatter(const arg_type& format) { formatter_ = detail::getDateTimeFormatter( std::string_view(format.data(), format.size()), - legacyTimeParser_ ? DateTimeFormatterType::STRICT_SIMPLE + legacyDateFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE : DateTimeFormatterType::JODA); maxResultSize_ = formatter_->maxResultSize(sessionTimeZone_); } @@ -321,7 +321,7 @@ struct FromUnixtimeFunction { std::shared_ptr formatter_; uint32_t maxResultSize_; bool isConstantTimeFormat_{false}; - bool legacyTimeParser_{false}; + bool legacyDateFormatter_{false}; }; template @@ -395,7 +395,7 @@ struct GetTimestampFunction { const core::QueryConfig& config, const arg_type* /*input*/, const arg_type* format) { - legacyTimeParser_ = config.sparkLegacyDateFormatter(); + legacyDateFormatter_ = config.sparkLegacyDateFormatter(); auto sessionTimezoneName = config.sessionTimezone(); if (!sessionTimezoneName.empty()) { sessionTimeZone_ = tz::locateZone(sessionTimezoneName); @@ -403,7 +403,7 @@ struct GetTimestampFunction { if (format != nullptr) { formatter_ = detail::getDateTimeFormatter( std::string_view(*format), - legacyTimeParser_ ? DateTimeFormatterType::STRICT_SIMPLE + legacyDateFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE : DateTimeFormatterType::JODA); isConstantTimeFormat_ = true; } @@ -416,7 +416,7 @@ struct GetTimestampFunction { if (!isConstantTimeFormat_) { formatter_ = detail::getDateTimeFormatter( std::string_view(format), - legacyTimeParser_ ? DateTimeFormatterType::STRICT_SIMPLE + legacyDateFormatter_ ? 
DateTimeFormatterType::STRICT_SIMPLE : DateTimeFormatterType::JODA); } auto dateTimeResult = formatter_->parse(std::string_view(input)); @@ -440,7 +440,7 @@ struct GetTimestampFunction { std::shared_ptr formatter_{nullptr}; bool isConstantTimeFormat_{false}; const tz::TimeZone* sessionTimeZone_{tz::locateZone(0)}; // default to GMT. - bool legacyTimeParser_{false}; + bool legacyDateFormatter_{false}; }; template From e370169742eb489a3d611bc8112f574274682033 Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Fri, 20 Sep 2024 20:03:57 +0800 Subject: [PATCH 12/13] reformat --- velox/functions/sparksql/DateTimeFunctions.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index b3fa18e08ca7..ba4a8f520a67 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -175,7 +175,7 @@ struct UnixTimestampParseFunction { format_ = detail::getDateTimeFormatter( kDefaultFormat_, config.sparkLegacyDateFormatter() ? DateTimeFormatterType::STRICT_SIMPLE - : DateTimeFormatterType::JODA); + : DateTimeFormatterType::JODA); setTimezone(config); } @@ -230,7 +230,7 @@ struct UnixTimestampParseWithFormatFunction this->format_ = detail::getDateTimeFormatter( std::string_view(format->data(), format->size()), legacyDateFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE - : DateTimeFormatterType::JODA); + : DateTimeFormatterType::JODA); } catch (const VeloxUserError&) { invalidFormat_ = true; } @@ -253,7 +253,7 @@ struct UnixTimestampParseWithFormatFunction this->format_ = detail::getDateTimeFormatter( std::string_view(format.data(), format.size()), legacyDateFormatter_ ? 
DateTimeFormatterType::STRICT_SIMPLE - : DateTimeFormatterType::JODA); + : DateTimeFormatterType::JODA); } } catch (const VeloxUserError&) { return false; @@ -313,7 +313,7 @@ struct FromUnixtimeFunction { formatter_ = detail::getDateTimeFormatter( std::string_view(format.data(), format.size()), legacyDateFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE - : DateTimeFormatterType::JODA); + : DateTimeFormatterType::JODA); maxResultSize_ = formatter_->maxResultSize(sessionTimeZone_); } @@ -404,7 +404,7 @@ struct GetTimestampFunction { formatter_ = detail::getDateTimeFormatter( std::string_view(*format), legacyDateFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE - : DateTimeFormatterType::JODA); + : DateTimeFormatterType::JODA); isConstantTimeFormat_ = true; } } @@ -417,7 +417,7 @@ struct GetTimestampFunction { formatter_ = detail::getDateTimeFormatter( std::string_view(format), legacyDateFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE - : DateTimeFormatterType::JODA); + : DateTimeFormatterType::JODA); } auto dateTimeResult = formatter_->parse(std::string_view(input)); // Null as result for parsing error. 
From ca7141203a0ac4fe113d85c56a489d71d56c50b5 Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Sat, 21 Sep 2024 23:39:14 +0800 Subject: [PATCH 13/13] fix ci --- velox/functions/sparksql/DateTimeFunctions.h | 32 ++++++++++---------- velox/functions/sparksql/Split.h | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index ba4a8f520a67..2908f5d62826 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -224,13 +224,13 @@ struct UnixTimestampParseWithFormatFunction const core::QueryConfig& config, const arg_type* /*input*/, const arg_type* format) { - legacyDateFormatter_ = config.sparkLegacyDateFormatter(); + legacyFormatter_ = config.sparkLegacyDateFormatter(); if (format != nullptr) { try { this->format_ = detail::getDateTimeFormatter( std::string_view(format->data(), format->size()), - legacyDateFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE - : DateTimeFormatterType::JODA); + legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); } catch (const VeloxUserError&) { invalidFormat_ = true; } @@ -252,8 +252,8 @@ struct UnixTimestampParseWithFormatFunction if (!isConstFormat_) { this->format_ = detail::getDateTimeFormatter( std::string_view(format.data(), format.size()), - legacyDateFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE - : DateTimeFormatterType::JODA); + legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); } } catch (const VeloxUserError&) { return false; @@ -272,7 +272,7 @@ struct UnixTimestampParseWithFormatFunction private: bool isConstFormat_{false}; bool invalidFormat_{false}; - bool legacyDateFormatter_{false}; + bool legacyFormatter_{false}; }; // Parses unix time in seconds to a formatted string. 
@@ -285,7 +285,7 @@ struct FromUnixtimeFunction { const core::QueryConfig& config, const arg_type* /*unixtime*/, const arg_type* format) { - legacyDateFormatter_ = config.sparkLegacyDateFormatter(); + legacyFormatter_ = config.sparkLegacyDateFormatter(); sessionTimeZone_ = getTimeZoneFromConfig(config); if (format != nullptr) { setFormatter(*format); @@ -312,8 +312,8 @@ struct FromUnixtimeFunction { FOLLY_ALWAYS_INLINE void setFormatter(const arg_type& format) { formatter_ = detail::getDateTimeFormatter( std::string_view(format.data(), format.size()), - legacyDateFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE - : DateTimeFormatterType::JODA); + legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); maxResultSize_ = formatter_->maxResultSize(sessionTimeZone_); } @@ -321,7 +321,7 @@ struct FromUnixtimeFunction { std::shared_ptr formatter_; uint32_t maxResultSize_; bool isConstantTimeFormat_{false}; - bool legacyDateFormatter_{false}; + bool legacyFormatter_{false}; }; template @@ -395,7 +395,7 @@ struct GetTimestampFunction { const core::QueryConfig& config, const arg_type* /*input*/, const arg_type* format) { - legacyDateFormatter_ = config.sparkLegacyDateFormatter(); + legacyFormatter_ = config.sparkLegacyDateFormatter(); auto sessionTimezoneName = config.sessionTimezone(); if (!sessionTimezoneName.empty()) { sessionTimeZone_ = tz::locateZone(sessionTimezoneName); @@ -403,8 +403,8 @@ struct GetTimestampFunction { if (format != nullptr) { formatter_ = detail::getDateTimeFormatter( std::string_view(*format), - legacyDateFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE - : DateTimeFormatterType::JODA); + legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); isConstantTimeFormat_ = true; } } @@ -416,8 +416,8 @@ struct GetTimestampFunction { if (!isConstantTimeFormat_) { formatter_ = detail::getDateTimeFormatter( std::string_view(format), - legacyDateFormatter_ ? 
DateTimeFormatterType::STRICT_SIMPLE - : DateTimeFormatterType::JODA); + legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); } auto dateTimeResult = formatter_->parse(std::string_view(input)); // Null as result for parsing error. @@ -440,7 +440,7 @@ struct GetTimestampFunction { std::shared_ptr formatter_{nullptr}; bool isConstantTimeFormat_{false}; const tz::TimeZone* sessionTimeZone_{tz::locateZone(0)}; // default to GMT. - bool legacyDateFormatter_{false}; + bool legacyFormatter_{false}; }; template diff --git a/velox/functions/sparksql/Split.h b/velox/functions/sparksql/Split.h index 86fdde503ced..2cee345f77b2 100644 --- a/velox/functions/sparksql/Split.h +++ b/velox/functions/sparksql/Split.h @@ -165,6 +165,6 @@ struct Split { result.add_item().setNoCopy(StringView(start + pos, end - pos)); } - mutable detail::ReCache cache_; + mutable facebook::velox::functions::detail::ReCache cache_; }; } // namespace facebook::velox::functions::sparksql