From a1d923ed1ea357a957bc897c67d426ce2991af22 Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Fri, 25 Oct 2024 18:06:04 -0700 Subject: [PATCH] Add support for "week of month" in SimpleDateTimeFormatter (#11103) Summary: `java.text.SimpleDateFormat` supports using 'week of month' to parse/format date. The specifier of 'week of month' is 'W'. Now DateTimeFormatter supports 3 group of fields specifying the day within the year. They are following combinations: ``` year + week + dayOfWeek year + dayOfYear year + month + day ``` This PR introduces a new combination that is `year + month + weekOfMonth + dayOfWeek` and adds support for "week of month" in SimpleDateTimeFormatter. Relates issue : https://github.com/facebookincubator/velox/issues/10354 Pull Request resolved: https://github.com/facebookincubator/velox/pull/11103 Reviewed By: Yuhta, amitkdutta Differential Revision: D64920551 Pulled By: pedroerp fbshipit-source-id: 3db9b6d33783aac0ee41791aeac96142e63fb22a --- velox/functions/lib/DateTimeFormatter.cpp | 66 +++++++++- velox/functions/lib/DateTimeFormatter.h | 5 +- .../lib/DateTimeFormatterBuilder.cpp | 7 + .../functions/lib/DateTimeFormatterBuilder.h | 10 ++ .../lib/tests/DateTimeFormatterTest.cpp | 124 ++++++++++++++++++ 5 files changed, 205 insertions(+), 7 deletions(-) diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index dc58e6d527ae..b86d3eb24b8a 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -47,10 +47,14 @@ struct Date { int32_t dayOfYear = 1; bool dayOfYearFormat = false; + int32_t weekOfMonth = 1; + bool weekOfMonthDateFormat = false; + bool centuryFormat = false; bool isYearOfEra = false; // Year of era cannot be zero or negative. bool hasYear = false; // Whether year was explicitly specified. + bool hasDayOfWeek = false; // Whether dayOfWeek was explicitly specified. int32_t hour = 0; int32_t minute = 0; @@ -456,6 +460,7 @@ int64_t parseDayOfWeekText(const char* cur, const char* end, Date& date) { auto it = dayOfWeekMap.find(std::string_view(cur, 3)); if (it != dayOfWeekMap.end()) { date.dayOfWeek = it->second.second; + date.hasDayOfWeek = true; if (end - cur >= it->second.first.size() + 3) { if (std::strncmp( cur + 3, it->second.first.data(), it->second.first.size()) == @@ -612,6 +617,8 @@ std::string_view getSpecifierName(DateTimeFormatSpecifier specifier) { return "TIMEZONE_OFFSET_ID"; case DateTimeFormatSpecifier::LITERAL_PERCENT: return "LITERAL_PERCENT"; + case DateTimeFormatSpecifier::WEEK_OF_MONTH: + return "WEEK_OF_MONTH"; default: { VELOX_UNREACHABLE("[Unexpected date format specifier]"); return ""; // Make compiler happy. @@ -628,6 +635,7 @@ int getMaxDigitConsume( case DateTimeFormatSpecifier::CENTURY_OF_ERA: case DateTimeFormatSpecifier::DAY_OF_WEEK_1_BASED: case DateTimeFormatSpecifier::FRACTION_OF_SECOND: + case DateTimeFormatSpecifier::WEEK_OF_MONTH: return curPattern.minRepresentDigits; case DateTimeFormatSpecifier::YEAR_OF_ERA: @@ -720,7 +728,9 @@ int32_t parseFromPattern( return -1; } cur += size; - date.weekDateFormat = true; + if (!date.weekOfMonthDateFormat) { + date.weekDateFormat = true; + } date.dayOfYearFormat = false; if (!date.hasYear) { date.hasYear = true; @@ -838,8 +848,10 @@ int32_t parseFromPattern( break; case DateTimeFormatSpecifier::MONTH_OF_YEAR: - if (number < 1 || number > 12) { - return -1; + if (type != DateTimeFormatterType::LENIENT_SIMPLE) { + if (number < 1 || number > 12) { + return -1; + } } date.month = number; date.weekDateFormat = false; @@ -858,6 +870,7 @@ int32_t parseFromPattern( date.day = number; date.weekDateFormat = false; date.dayOfYearFormat = false; + date.weekOfMonthDateFormat = false; // Joda has this weird behavior where it returns 1970 as the year by // default (if no year is specified), but if either day or month are // specified, it fallsback to 2000. @@ -872,6 +885,7 @@ int32_t parseFromPattern( date.dayOfYear = number; date.dayOfYearFormat = true; date.weekDateFormat = false; + date.weekOfMonthDateFormat = false; // Joda has this weird behavior where it returns 1970 as the year by // default (if no year is specified), but if either day or month are // specified, it fallsback to 2000. @@ -944,6 +958,7 @@ int32_t parseFromPattern( date.weekDateFormat = true; date.dayOfYearFormat = false; date.centuryFormat = false; + date.weekOfMonthDateFormat = false; date.hasYear = true; break; @@ -954,6 +969,7 @@ int32_t parseFromPattern( date.week = number; date.weekDateFormat = true; date.dayOfYearFormat = false; + date.weekOfMonthDateFormat = false; if (!date.hasYear) { date.hasYear = true; date.year = 2000; @@ -961,17 +977,32 @@ int32_t parseFromPattern( break; case DateTimeFormatSpecifier::DAY_OF_WEEK_1_BASED: - if (number < 1 || number > 7) { - return -1; + if (type != DateTimeFormatterType::LENIENT_SIMPLE) { + if (number < 1 || number > 7) { + return -1; + } } date.dayOfWeek = number; - date.weekDateFormat = true; + date.hasDayOfWeek = true; + if (!date.weekOfMonthDateFormat) { + date.weekDateFormat = true; + } date.dayOfYearFormat = false; if (!date.hasYear) { date.hasYear = true; date.year = 2000; } break; + case DateTimeFormatSpecifier::WEEK_OF_MONTH: + date.weekOfMonthDateFormat = true; + date.weekOfMonth = number; + date.weekDateFormat = false; + date.hasYear = true; + // For week of month date format, the default value of dayOfWeek is 7. + if (!date.hasDayOfWeek) { + date.dayOfWeek = 7; + } + break; default: VELOX_NYI( @@ -1003,6 +1034,7 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const { break; case DateTimeFormatSpecifier::DAY_OF_WEEK_0_BASED: case DateTimeFormatSpecifier::DAY_OF_WEEK_1_BASED: + case DateTimeFormatSpecifier::WEEK_OF_MONTH: size += std::max((int)token.pattern.minRepresentDigits, 1); break; case DateTimeFormatSpecifier::DAY_OF_WEEK_TEXT: @@ -1321,6 +1353,18 @@ int32_t DateTimeFormatter::format( result); break; } + case DateTimeFormatSpecifier::WEEK_OF_MONTH: { + result += padContent( + ceil( + (7 + static_cast(calDate.day()) - + weekday.c_encoding() - 1) / + 7.0), + '0', + token.pattern.minRepresentDigits, + maxResultEnd, + result); + break; + } case DateTimeFormatSpecifier::WEEK_YEAR: default: VELOX_UNSUPPORTED( @@ -1423,6 +1467,13 @@ Expected DateTimeFormatter::parse( } else if (date.dayOfYearFormat) { daysSinceEpoch = util::daysSinceEpochFromDayOfYear(date.year, date.dayOfYear); + } else if (date.weekOfMonthDateFormat) { + daysSinceEpoch = util::daysSinceEpochFromWeekOfMonthDate( + date.year, + date.month, + date.weekOfMonth, + date.dayOfWeek, + this->type_ == DateTimeFormatterType::LENIENT_SIMPLE); } else { daysSinceEpoch = util::daysSinceEpochFromDate(date.year, date.month, date.day); @@ -1815,6 +1866,9 @@ Expected> buildSimpleDateTimeFormatter( case 'w': builder.appendWeekOfWeekYear(count); break; + case 'W': + builder.appendWeekOfMonth(count); + break; case 'x': builder.appendWeekYear(count); break; diff --git a/velox/functions/lib/DateTimeFormatter.h b/velox/functions/lib/DateTimeFormatter.h index 4b104f6ecef2..62678f0f0334 100644 --- a/velox/functions/lib/DateTimeFormatter.h +++ b/velox/functions/lib/DateTimeFormatter.h @@ -112,7 +112,10 @@ enum class DateTimeFormatSpecifier : uint8_t { TIMEZONE_OFFSET_ID = 22, // A literal % character - LITERAL_PERCENT = 23 + LITERAL_PERCENT = 23, + + // Week of month based on java.text.SimpleDateFormat, e.g: 2 + WEEK_OF_MONTH = 24 }; struct FormatPattern { diff --git a/velox/functions/lib/DateTimeFormatterBuilder.cpp b/velox/functions/lib/DateTimeFormatterBuilder.cpp index 5fa2b0bacdcd..aa80058ac8f3 100644 --- a/velox/functions/lib/DateTimeFormatterBuilder.cpp +++ b/velox/functions/lib/DateTimeFormatterBuilder.cpp @@ -56,6 +56,13 @@ DateTimeFormatterBuilder& DateTimeFormatterBuilder::appendWeekOfWeekYear( return *this; } +DateTimeFormatterBuilder& DateTimeFormatterBuilder::appendWeekOfMonth( + size_t minDigits) { + tokens_.emplace_back( + FormatPattern{DateTimeFormatSpecifier::WEEK_OF_MONTH, minDigits}); + return *this; +} + DateTimeFormatterBuilder& DateTimeFormatterBuilder::appendDayOfWeek0Based( size_t minDigits) { tokens_.emplace_back( diff --git a/velox/functions/lib/DateTimeFormatterBuilder.h b/velox/functions/lib/DateTimeFormatterBuilder.h index b5e27d1103ca..aa508869432e 100644 --- a/velox/functions/lib/DateTimeFormatterBuilder.h +++ b/velox/functions/lib/DateTimeFormatterBuilder.h @@ -88,6 +88,16 @@ class DateTimeFormatterBuilder { /// will be 001 DateTimeFormatterBuilder& appendWeekOfWeekYear(size_t minDigits); + /// Appends week of month to formatter builder, e.g: 2 + /// + /// \param minDigits describes the minimum number of digits this format is + /// required to represent week of month. The format by default is going + /// use as few digits as possible greater than or equal to minDigits to + /// represent week of month. e.g. 1999-01-01, with min digit being 1 the + /// formatted result will be 1, with min digit being 4 the formatted result + /// will be 0001 + DateTimeFormatterBuilder& appendWeekOfMonth(size_t minDigits); + /// Appends day of week to formatter builder. The number is 0 based with 0 ~ 6 /// representing Sunday to Saturday respectively /// diff --git a/velox/functions/lib/tests/DateTimeFormatterTest.cpp b/velox/functions/lib/tests/DateTimeFormatterTest.cpp index 588ba950c96d..e81c44728dba 100644 --- a/velox/functions/lib/tests/DateTimeFormatterTest.cpp +++ b/velox/functions/lib/tests/DateTimeFormatterTest.cpp @@ -2317,4 +2317,128 @@ TEST_F(MysqlDateTimeTest, parseConsecutiveSpecifiers) { EXPECT_THROW(parseMysql("1212", "%Y%H"), VeloxUserError); } +class SimpleDateTimeFormatterTest : public DateTimeFormatterTest { + protected: + DateTimeResult parseSimple( + const std::string_view& input, + const std::string_view& format, + bool lenient) { + auto dateTimeResultExpected = + (*buildSimpleDateTimeFormatter(format, lenient))->parse(input); + return dateTimeResult(dateTimeResultExpected); + } + + std::string formatSimpleDateTime( + const std::string& format, + const Timestamp& timestamp, + const tz::TimeZone* timezone, + bool lenient) const { + auto formatter = buildSimpleDateTimeFormatter(format, lenient).value(); + const auto maxSize = formatter->maxResultSize(timezone); + std::string result(maxSize, '\0'); + auto resultSize = + formatter->format(timestamp, timezone, maxSize, result.data()); + result.resize(resultSize); + return result; + } +}; + +TEST_F(SimpleDateTimeFormatterTest, validSimpleBuild) { + // W specifier case. + std::vector expected = { + DateTimeToken(FormatPattern{DateTimeFormatSpecifier::WEEK_OF_MONTH, 1})}; + EXPECT_EQ(expected, (*buildSimpleDateTimeFormatter("W", true))->tokens()); + EXPECT_EQ(expected, (*buildSimpleDateTimeFormatter("W", false))->tokens()); +} + +TEST_F(SimpleDateTimeFormatterTest, parseSimpleWeekOfMonth) { + // Common cases for lenient and strict mode. + for (bool lenient : {true, false}) { + // Format contains year, month, weekOfMonth and dayOfWeek. + EXPECT_EQ( + fromTimestampString("2024-08-02"), + parseSimple("2024 08 01 5", "yyyy MM WW e", lenient).timestamp); + EXPECT_EQ( + fromTimestampString("2024-08-03"), + parseSimple("2024 08 01 6", "yyyy MM WW e", lenient).timestamp); + EXPECT_EQ( + fromTimestampString("2024-08-09"), + parseSimple("2024 08 02 5", "yyyy MM WW e", lenient).timestamp); + EXPECT_EQ( + fromTimestampString("2024-08-12"), + parseSimple("2024 08 03 1", "yyyy MM WW e", lenient).timestamp); + + // Format contains year, month and weekOfMonth. + EXPECT_EQ( + fromTimestampString("2024-07-28"), + parseSimple("2024 08 01", "yyyy MM WW", lenient).timestamp); + + // Format contains year and weekOfMonth. + EXPECT_EQ( + fromTimestampString("2023-12-31"), + parseSimple("2024 01", "yyyy WW", lenient).timestamp); + + // Format contains weekOfMonth. + EXPECT_EQ( + fromTimestampString("1969-12-28"), + parseSimple("1", "W", lenient).timestamp); + } + + // Field out of range for lenient mode. + EXPECT_EQ( + fromTimestampString("2024-09-30"), + parseSimple("2024 08 10 1", "yyyy MM WW e", true).timestamp); + EXPECT_EQ( + fromTimestampString("2024-07-28"), + parseSimple("2024 08 01", "yyyy MM WW", true).timestamp); + EXPECT_EQ( + fromTimestampString("2025-02-24"), + parseSimple("2024 15 01 1", "yyyy MM WW e", true).timestamp); + EXPECT_EQ( + fromTimestampString("2024-07-29"), + parseSimple("2024 08 01 9", "yyyy MM WW e", true).timestamp); + + // Field out of range for strict mode. + EXPECT_THROW( + parseSimple("2024 08 10 1", "yyyy MM WW e", false), VeloxUserError); + EXPECT_THROW(parseSimple("2024 08 10", "yyyy MM WW", false), VeloxUserError); + EXPECT_THROW( + parseSimple("2024 15 01 1", "yyyy MM WW e", false), VeloxUserError); + EXPECT_THROW( + parseSimple("2024 08 01 9", "yyyy MM WW e", false), VeloxUserError); +} + +TEST_F(SimpleDateTimeFormatterTest, formatResultSize) { + EXPECT_EQ( + (*buildSimpleDateTimeFormatter("WW", false))->maxResultSize(nullptr), 2); + EXPECT_EQ( + (*buildSimpleDateTimeFormatter("WW", true))->maxResultSize(nullptr), 2); +} + +TEST_F(SimpleDateTimeFormatterTest, formatWeekOfMonth) { + auto* timezone = tz::locateZone("GMT"); + for (bool lenient : {true, false}) { + EXPECT_EQ( + formatSimpleDateTime( + "W", fromTimestampString("2024-08-01"), timezone, lenient), + "1"); + EXPECT_EQ( + formatSimpleDateTime( + "W", fromTimestampString("2024-08-10"), timezone, lenient), + "2"); + EXPECT_EQ( + formatSimpleDateTime( + "W", fromTimestampString("2024-08-11"), timezone, lenient), + "3"); + EXPECT_EQ( + formatSimpleDateTime( + "W", fromTimestampString("2024-08-15"), timezone, lenient), + "3"); + EXPECT_EQ( + formatSimpleDateTime( + "W", fromTimestampString("2024-08-30"), timezone, lenient), + "5"); + } +} + } // namespace facebook::velox::functions