diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h
index ed57ef8258ad2..3c0e377c1bae8 100644
--- a/velox/core/QueryConfig.h
+++ b/velox/core/QueryConfig.h
@@ -297,6 +297,22 @@ class QueryConfig {
   static constexpr const char* kSparkLegacyDateFormatter =
       "spark.legacy_date_formatter";
 
+  /// The first day-of-week varies by culture.
+  /// firstDayOfWeek is a 1-based weekday number starting with Sunday. It
+  /// determines how the week-based calendar works. For example, ISO-8601 uses
+  /// Monday (2) and the US uses Sunday (1). It should be set to match
+  /// 'Calendar.getFirstDayOfWeek()' in Java. Sunday (1) is used by default.
+  static constexpr const char* kSparkFirstDayOfWeek =
+      "spark.legacy_date_formatter.first_day_of_week";
+
+  /// The minimal number of days in the first week by culture.
+  /// The week that includes January 1st and has 'minimalDaysInFirstWeek' or
+  /// more days is referred to as week 1. It determines how the week-based
+  /// calendar works. It should be set to match
+  /// 'Calendar.getMinimalDaysInFirstWeek()' in Java. 1 day is used by default.
+  static constexpr const char* kSparkMinimalDaysInFirstWeek =
+      "spark.legacy_date_formatter.minimal_days_in_first_week";
+
   /// The number of local parallel table writer operators per task.
   static constexpr const char* kTaskWriterCount = "task_writer_count";
 
@@ -759,6 +775,22 @@ class QueryConfig {
     return get(kSparkLegacyDateFormatter, false);
   }
 
+  uint8_t sparkFirstDayOfWeek() const {
+    auto value = get(kSparkFirstDayOfWeek, 1);
+    VELOX_CHECK(
+        1 <= value && value <= 7,
+        "firstDayOfWeek must be a number between 1 and 7");
+    return static_cast<uint8_t>(value);
+  }
+
+  uint8_t sparkMinimalDaysInFirstWeek() const {
+    auto value = get(kSparkMinimalDaysInFirstWeek, 1);
+    VELOX_CHECK(
+        1 <= value && value <= 7,
+        "minimalDaysInFirstWeek must be a number between 1 and 7");
+    return static_cast<uint8_t>(value);
+  }
+
   bool exprTrackCpuUsage() const {
     return get(kExprTrackCpuUsage, false);
   }
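To make the week-year semantics controlled by these two properties concrete: under ISO-8601 settings (firstDayOfWeek = 2, minimalDaysInFirstWeek = 4) the week containing 2017-01-01 has only one day in 2017, so that date falls in week year 2016, while under the defaults (1, 1) it falls in week year 2017. The following standalone C++20 sketch of the Java-Calendar-style rule is illustrative only (the function name is made up, and it is not the Velox implementation), but it reproduces the expectations in the new tests at the end of this patch.

```cpp
#include <chrono>
#include <cstdint>

// Sketch: week 1 of year Y is the first week that starts on `firstDayOfWeek`
// and contains at least `minimalDaysInFirstWeek` days of Y; a date that falls
// before week 1 of its own calendar year belongs to the previous week year.
int32_t weekYearSketch(
    std::chrono::year_month_day date,
    uint8_t firstDayOfWeek, // 1 = Sunday ... 7 = Saturday.
    uint8_t minimalDaysInFirstWeek) {
  using namespace std::chrono;
  // First day of "week 1" of year y.
  auto startOfWeek1 = [&](int y) -> sys_days {
    const sys_days jan1{year{y} / January / 1};
    // c_encoding(): Sunday = 0 ... Saturday = 6; the config is 1-based.
    const int jan1Dow = static_cast<int>(weekday{jan1}.c_encoding()) + 1;
    const int daysBeforeJan1 = (jan1Dow - firstDayOfWeek + 7) % 7;
    const sys_days weekStart = jan1 - days{daysBeforeJan1};
    // If too few days of that week fall in year y, week 1 starts a week later.
    return (7 - daysBeforeJan1) >= minimalDaysInFirstWeek ? weekStart
                                                          : weekStart + days{7};
  };
  const sys_days d{date};
  const int y = static_cast<int>(date.year());
  if (d >= startOfWeek1(y + 1)) {
    return y + 1;
  }
  return d >= startOfWeek1(y) ? y : y - 1;
}
```

With (2, 4) this matches the ISO-8601 cases in the new tests (2017-01-01 maps to 2016), and with (1, 1) it matches the Spark legacy default cases (2017-01-01 maps to 2017).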
diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp
index 0d9a6d49c98d2..9f7fc89374235 100644
--- a/velox/functions/lib/DateTimeFormatter.cpp
+++ b/velox/functions/lib/DateTimeFormatter.cpp
@@ -1334,9 +1334,8 @@ int32_t DateTimeFormatter::format(
             static_cast(calDate.year()),
             static_cast(calDate.month()),
             static_cast(calDate.day()),
-            2, // (ISO 8601) Monday = 2
-            4 // At least 4 days in first week
-            );
+            firstDayOfWeek_,
+            minimalDaysInFirstWeek_);
 
         result += padContent(
             static_cast(year),
@@ -1839,14 +1838,11 @@ Expected<std::shared_ptr<DateTimeFormatter>> buildSimpleDateTimeFormatter(
       case 'w':
         builder.appendWeekOfWeekYear(count);
         break;
-      case 'x':
-        builder.appendWeekYear(count);
-        break;
       case 'y':
         builder.appendYear(count);
         break;
       case 'Y':
-        builder.appendYearOfEra(count);
+        builder.appendWeekYear(count);
         break;
       case 'z':
         builder.appendTimeZone(count);
diff --git a/velox/functions/lib/DateTimeFormatter.h b/velox/functions/lib/DateTimeFormatter.h
index 82fba6037cb1b..2691aae021738 100644
--- a/velox/functions/lib/DateTimeFormatter.h
+++ b/velox/functions/lib/DateTimeFormatter.h
@@ -208,11 +208,31 @@ class DateTimeFormatter {
       char* result,
       bool allowOverflow = false) const;
 
+  void setFirstDayOfWeek(uint8_t firstDayOfWeek) {
+    firstDayOfWeek_ = firstDayOfWeek;
+  }
+
+  void setMinimalDaysInFirstWeek(uint8_t minimalDaysInFirstWeek) {
+    minimalDaysInFirstWeek_ = minimalDaysInFirstWeek;
+  }
+
  private:
   std::unique_ptr<char[]> literalBuf_;
   size_t bufSize_;
   std::vector<DateTimeToken> tokens_;
   DateTimeFormatterType type_;
+
+  /// The first day-of-week varies by culture.
+  /// firstDayOfWeek is a 1-based weekday number starting with Sunday. It
+  /// determines how the week-based calendar works. For example, ISO-8601 uses
+  /// Monday (2) and the US uses Sunday (1).
+  uint8_t firstDayOfWeek_ = 2;
+
+  /// The minimal number of days in the first week by culture.
+  /// The week that includes January 1st and has 'minimalDaysInFirstWeek' or
+  /// more days is referred to as week 1. It determines how the week-based
+  /// calendar works. For example, ISO-8601 uses 4 days.
+  uint8_t minimalDaysInFirstWeek_ = 4;
 };
 
 Expected<std::shared_ptr<DateTimeFormatter>> buildMysqlDateTimeFormatter(
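The two new setters are only consulted by the simple (legacy) formatter; the Joda path keeps its ISO defaults (Monday, at least 4 days). A minimal sketch of how a caller could combine them, mirroring what the new detail::DateTimeFormatterProvider in DateTimeFunctions.h does below; the helper name is made up and the includes/namespaces are assumed, not part of this patch:

```cpp
#include "velox/functions/lib/DateTimeFormatter.h"

namespace facebook::velox::functions {

// Illustrative helper (not part of this patch): build a legacy SIMPLE
// formatter for a week-year pattern and apply per-query week settings.
std::shared_ptr<DateTimeFormatter> makeLegacyWeekYearFormatter(
    uint8_t firstDayOfWeek,
    uint8_t minimalDaysInFirstWeek) {
  auto formatter = buildSimpleDateTimeFormatter("YYYY", /*lenient=*/false)
                       .thenOrThrow(folly::identity, [](const Status& status) {
                         VELOX_USER_FAIL("{}", status.message());
                       });
  formatter->setFirstDayOfWeek(firstDayOfWeek);
  formatter->setMinimalDaysInFirstWeek(minimalDaysInFirstWeek);
  return formatter;
}

} // namespace facebook::velox::functions
```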
diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h
index c7a0695e23b31..cdd5bca64b522 100644
--- a/velox/functions/sparksql/DateTimeFunctions.h
+++ b/velox/functions/sparksql/DateTimeFunctions.h
@@ -26,19 +26,32 @@ namespace facebook::velox::functions::sparksql {
 
 namespace detail {
-Expected<std::shared_ptr<DateTimeFormatter>> getDateTimeFormatter(
-    const std::string_view& format,
-    DateTimeFormatterType type) {
-  switch (type) {
-    case DateTimeFormatterType::STRICT_SIMPLE:
-      return buildSimpleDateTimeFormatter(format, /*lenient=*/false);
-    case DateTimeFormatterType::LENIENT_SIMPLE:
-      return buildSimpleDateTimeFormatter(format, /*lenient=*/true);
-    default:
+struct DateTimeFormatterProvider {
+  FOLLY_ALWAYS_INLINE void init(const core::QueryConfig& config) {
+    legacyFormatter_ = config.sparkLegacyDateFormatter();
+    firstDayOfWeek_ = config.sparkFirstDayOfWeek();
+    minimalDaysInFirstWeek_ = config.sparkMinimalDaysInFirstWeek();
+  }
+
+  FOLLY_ALWAYS_INLINE auto get(const std::string_view& format) {
+    if (legacyFormatter_) {
+      return buildSimpleDateTimeFormatter(format, /*lenient=*/false)
+          .then([this](std::shared_ptr<DateTimeFormatter> f) {
+            f->setFirstDayOfWeek(firstDayOfWeek_);
+            f->setMinimalDaysInFirstWeek(minimalDaysInFirstWeek_);
+            return f;
+          });
+    } else {
       return buildJodaDateTimeFormatter(
           std::string_view(format.data(), format.size()));
+    }
   }
-}
+
+  std::shared_ptr<DateTimeFormatter> formatter_;
+  bool legacyFormatter_{false};
+  uint8_t firstDayOfWeek_ = 2;
+  uint8_t minimalDaysInFirstWeek_ = 4;
+};
 } // namespace detail
 
 template <typename T>
@@ -127,10 +140,8 @@ struct UnixTimestampParseFunction {
       const std::vector<TypePtr>& /*inputTypes*/,
       const core::QueryConfig& config,
      const arg_type<Varchar>* /*input*/) {
-    auto formatter = detail::getDateTimeFormatter(
-        kDefaultFormat_,
-        config.sparkLegacyDateFormatter() ? DateTimeFormatterType::STRICT_SIMPLE
-                                          : DateTimeFormatterType::JODA);
+    formatter_provider_.init(config);
+    auto formatter = formatter_provider_.get(kDefaultFormat_);
     VELOX_CHECK(!formatter.hasError(), "Default format should always be valid");
     format_ = formatter.value();
     setTimezone(config);
@@ -166,6 +177,7 @@ struct UnixTimestampParseFunction {
   // Default if format is not specified, as per Spark documentation.
   constexpr static std::string_view kDefaultFormat_{"yyyy-MM-dd HH:mm:ss"};
   std::shared_ptr<DateTimeFormatter> format_;
+  detail::DateTimeFormatterProvider formatter_provider_;
   const tz::TimeZone* sessionTimeZone_{tz::locateZone(0)}; // fallback to GMT.
 };
 
@@ -181,12 +193,10 @@ struct UnixTimestampParseWithFormatFunction
       const core::QueryConfig& config,
       const arg_type<Varchar>* /*input*/,
       const arg_type<Varchar>* format) {
-    legacyFormatter_ = config.sparkLegacyDateFormatter();
+    this->formatter_provider_.init(config);
     if (format != nullptr) {
-      auto formatter = detail::getDateTimeFormatter(
-          std::string_view(format->data(), format->size()),
-          legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE
-                           : DateTimeFormatterType::JODA);
+      auto formatter = this->formatter_provider_.get(
+          std::string_view(format->data(), format->size()));
       if (formatter.hasError()) {
         invalidFormat_ = true;
       } else {
@@ -207,10 +217,8 @@ struct UnixTimestampParseWithFormatFunction
 
     // Format error returns null.
     if (!isConstFormat_) {
-      auto formatter = detail::getDateTimeFormatter(
-          std::string_view(format.data(), format.size()),
-          legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE
-                           : DateTimeFormatterType::JODA);
+      auto formatter = this->formatter_provider_.get(
+          std::string_view(format.data(), format.size()));
       if (formatter.hasError()) {
         return false;
       }
@@ -230,7 +238,6 @@ struct UnixTimestampParseWithFormatFunction
  private:
   bool isConstFormat_{false};
   bool invalidFormat_{false};
-  bool legacyFormatter_{false};
 };
 
 // Parses unix time in seconds to a formatted string.
@@ -243,7 +250,7 @@ struct FromUnixtimeFunction {
       const core::QueryConfig& config,
       const arg_type<int64_t>* /*unixtime*/,
       const arg_type<Varchar>* format) {
-    legacyFormatter_ = config.sparkLegacyDateFormatter();
+    formatter_provider_.init(config);
     sessionTimeZone_ = getTimeZoneFromConfig(config);
     if (format != nullptr) {
       setFormatter(*format);
@@ -268,13 +275,11 @@ struct FromUnixtimeFunction {
 
  private:
   FOLLY_ALWAYS_INLINE void setFormatter(const arg_type<Varchar>& format) {
-    formatter_ = detail::getDateTimeFormatter(
-                     std::string_view(format.data(), format.size()),
-                     legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE
-                                      : DateTimeFormatterType::JODA)
-                     .thenOrThrow(folly::identity, [&](const Status& status) {
-                       VELOX_USER_FAIL("{}", status.message());
-                     });
+    formatter_ =
+        formatter_provider_.get(std::string_view(format.data(), format.size()))
+            .thenOrThrow(folly::identity, [&](const Status& status) {
+              VELOX_USER_FAIL("{}", status.message());
+            });
     maxResultSize_ = formatter_->maxResultSize(sessionTimeZone_);
   }
 
@@ -282,7 +287,7 @@ struct FromUnixtimeFunction {
   std::shared_ptr<DateTimeFormatter> formatter_;
   uint32_t maxResultSize_;
   bool isConstantTimeFormat_{false};
-  bool legacyFormatter_{false};
+  detail::DateTimeFormatterProvider formatter_provider_;
 };
 
 template <typename T>
@@ -356,16 +361,14 @@ struct GetTimestampFunction {
       const core::QueryConfig& config,
       const arg_type<Varchar>* /*input*/,
       const arg_type<Varchar>* format) {
-    legacyFormatter_ = config.sparkLegacyDateFormatter();
+    formatter_provider_.init(config);
     auto sessionTimezoneName = config.sessionTimezone();
     if (!sessionTimezoneName.empty()) {
       sessionTimeZone_ = tz::locateZone(sessionTimezoneName);
     }
     if (format != nullptr) {
-      formatter_ = detail::getDateTimeFormatter(
-                       std::string_view(*format),
-                       legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE
-                                        : DateTimeFormatterType::JODA)
+      formatter_ = formatter_provider_
+                       .get(std::string_view(format->data(), format->size()))
                        .thenOrThrow(folly::identity, [&](const Status& status) {
                          VELOX_USER_FAIL("{}", status.message());
                        });
@@ -378,10 +381,8 @@ struct GetTimestampFunction {
       const arg_type<Varchar>& input,
       const arg_type<Varchar>& format) {
     if (!isConstantTimeFormat_) {
-      formatter_ = detail::getDateTimeFormatter(
-                       std::string_view(format),
-                       legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE
-                                        : DateTimeFormatterType::JODA)
+      formatter_ = formatter_provider_
+                       .get(std::string_view(format.data(), format.size()))
                        .thenOrThrow(folly::identity, [&](const Status& status) {
                          VELOX_USER_FAIL("{}", status.message());
                        });
@@ -407,7 +408,7 @@ struct GetTimestampFunction {
   std::shared_ptr<DateTimeFormatter> formatter_{nullptr};
   bool isConstantTimeFormat_{false};
   const tz::TimeZone* sessionTimeZone_{tz::locateZone(0)}; // default to GMT.
-  bool legacyFormatter_{false};
+  detail::DateTimeFormatterProvider formatter_provider_;
 };
 
 template <typename T>
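The provider above is driven entirely by session properties. For completeness, here is a sketch of the raw configuration a client would send to opt into Java-Calendar-compatible week fields; the function name is illustrative, the keys are the constants added to QueryConfig.h, and the two week settings are only honored when the legacy formatter is enabled, as the tests below verify:

```cpp
#include <string>
#include <unordered_map>

// Illustrative only: session properties enabling the legacy Simple formatter
// with Java's default week settings (Sunday as first day, 1 day in week 1).
std::unordered_map<std::string, std::string> legacyWeekYearSessionProperties() {
  return {
      {"spark.legacy_date_formatter", "true"},
      {"spark.legacy_date_formatter.first_day_of_week", "1"},
      {"spark.legacy_date_formatter.minimal_days_in_first_week", "1"},
  };
}
```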
diff --git a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp
index ca5f9f4fb2c71..6c0d0e477ce67 100644
--- a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp
+++ b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp
@@ -917,6 +917,71 @@ TEST_F(DateTimeFunctionsTest, fromUnixtime) {
       fromUnixTime(getUnixTime("2020-06-30 23:59:59"), "yyyy-MM-dd HH:mm:ss"),
       "2020-07-01 07:59:59");
 
+  // Weekyear cases of the ISO-8601 standard.
+  queryCtx_->testingOverrideConfigUnsafe({
+      {core::QueryConfig::kSparkLegacyDateFormatter, "true"},
+      {core::QueryConfig::kSparkFirstDayOfWeek, std::to_string(2)},
+      {core::QueryConfig::kSparkMinimalDaysInFirstWeek, std::to_string(4)},
+  });
+  EXPECT_EQ(fromUnixTime(getUnixTime("2017-01-01 00:00:00"), "YYYY"), "2016");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2017-12-31 00:00:00"), "YYYY"), "2017");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2018-01-01 00:00:00"), "YYYY"), "2018");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2018-12-31 00:00:00"), "YYYY"), "2019");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2019-01-01 00:00:00"), "YYYY"), "2019");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2019-12-30 00:00:00"), "YYYY"), "2020");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2019-12-31 00:00:00"), "YYYY"), "2020");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2020-01-01 00:00:00"), "YYYY"), "2020");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2020-12-31 00:00:00"), "YYYY"), "2020");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2021-01-01 00:00:00"), "YYYY"), "2020");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2021-01-02 00:00:00"), "YYYY"), "2020");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2021-01-03 00:00:00"), "YYYY"), "2020");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-31 00:00:00"), "YYYY"), "2021");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2022-01-01 00:00:00"), "YYYY"), "2021");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2022-01-02 00:00:00"), "YYYY"), "2021");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2022-12-31 00:00:00"), "YYYY"), "2022");
+
+  // Weekyear cases of the Spark legacy date formatter with default config.
+  queryCtx_->testingOverrideConfigUnsafe({
+      {core::QueryConfig::kSparkLegacyDateFormatter, "true"},
+      {core::QueryConfig::kSparkFirstDayOfWeek, std::to_string(1)},
+      {core::QueryConfig::kSparkMinimalDaysInFirstWeek, std::to_string(1)},
+  });
+  EXPECT_EQ(fromUnixTime(getUnixTime("2017-01-01 00:00:00"), "YYYY"), "2017");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2017-12-31 00:00:00"), "YYYY"), "2018");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2018-01-01 00:00:00"), "YYYY"), "2018");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2018-12-30 00:00:00"), "YYYY"), "2019");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2018-12-31 00:00:00"), "YYYY"), "2019");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2019-01-01 00:00:00"), "YYYY"), "2019");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2019-12-29 00:00:00"), "YYYY"), "2020");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2019-12-30 00:00:00"), "YYYY"), "2020");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2019-12-31 00:00:00"), "YYYY"), "2020");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2020-01-01 00:00:00"), "YYYY"), "2020");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2020-12-27 00:00:00"), "YYYY"), "2021");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2020-12-28 00:00:00"), "YYYY"), "2021");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2020-12-29 00:00:00"), "YYYY"), "2021");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2020-12-30 00:00:00"), "YYYY"), "2021");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2020-12-31 00:00:00"), "YYYY"), "2021");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2021-01-01 00:00:00"), "YYYY"), "2021");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-26 00:00:00"), "YYYY"), "2022");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-27 00:00:00"), "YYYY"), "2022");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-28 00:00:00"), "YYYY"), "2022");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-29 00:00:00"), "YYYY"), "2022");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-30 00:00:00"), "YYYY"), "2022");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-31 00:00:00"), "YYYY"), "2022");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2022-01-01 00:00:00"), "YYYY"), "2022");
+  EXPECT_EQ(fromUnixTime(getUnixTime("2022-12-31 00:00:00"), "YYYY"), "2022");
+
+  // Week config should only apply to the Spark legacy date formatter.
+  queryCtx_->testingOverrideConfigUnsafe({
+      {core::QueryConfig::kSparkLegacyDateFormatter, "false"},
+      {core::QueryConfig::kSparkFirstDayOfWeek, std::to_string(1)},
+      {core::QueryConfig::kSparkMinimalDaysInFirstWeek, std::to_string(1)},
+  });
+  EXPECT_EQ(fromUnixTime(getUnixTime("2017-12-31 00:00:00"), "x"), "2017");
+
+  // Reset config.
+  queryCtx_->testingOverrideConfigUnsafe({});
+
   // Invalid format.
   VELOX_ASSERT_THROW(
       fromUnixTime(0, "yyyy-AA"), "Specifier A is not supported.");