diff --git a/velox/external/date/patches/0006-add_get_time_zone_names.patch b/velox/external/date/patches/0006-add_get_time_zone_names.patch new file mode 100644 index 0000000000000..f7b60effe4f3d --- /dev/null +++ b/velox/external/date/patches/0006-add_get_time_zone_names.patch @@ -0,0 +1,30 @@ +diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp +--- a/velox/external/date/tz.cpp ++++ b/velox/external/date/tz.cpp +@@ -3538,6 +3538,14 @@ + return get_tzdb_list().front(); + } + ++std::vector get_time_zone_names() { ++ std::vector result; ++ for (const auto& z : get_tzdb().zones) { ++ result.push_back(z.name()); ++ } ++ return result; ++} ++ + const time_zone* + #if HAS_STRING_VIEW + tzdb::locate_zone(std::string_view tz_name) const +diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h +--- a/velox/external/date/tz.h ++++ b/velox/external/date/tz.h +@@ -1258,6 +1258,8 @@ + + DATE_API const tzdb& get_tzdb(); + ++std::vector get_time_zone_names(); ++ + class tzdb_list + { + std::atomic head_{nullptr}; diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp index 69513d7d3145a..13ebe93561da9 100644 --- a/velox/external/date/tz.cpp +++ b/velox/external/date/tz.cpp @@ -3538,6 +3538,14 @@ get_tzdb() return get_tzdb_list().front(); } +std::vector get_time_zone_names() { + std::vector result; + for (const auto& z : get_tzdb().zones) { + result.push_back(z.name()); + } + return result; +} + const time_zone* #if HAS_STRING_VIEW tzdb::locate_zone(std::string_view tz_name) const diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h index 4ec0dbb44cfd2..aa6d42c8d3596 100644 --- a/velox/external/date/tz.h +++ b/velox/external/date/tz.h @@ -1258,6 +1258,8 @@ operator<<(std::ostream& os, const tzdb& db); DATE_API const tzdb& get_tzdb(); +std::vector get_time_zone_names(); + class tzdb_list { std::atomic head_{nullptr}; diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index e0ab43bd29d46..77950f60147f7 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -347,6 +347,128 @@ int64_t parseTimezone(const char* cur, const char* end, Date& date) { return -1; } +// Contains a list of all time zone names in a convenient format for searching. +// +// Time zone names without the '/' character (without a prefix) are stored in +// timeZoneNamesWithoutPrefix ordered by size desc. +// +// Time zone names with the '/' character (with a prefix) are stored in a map +// timeZoneNamePrefixMap from prefix (the string before the first '/') to a +// vector of strings which contains the suffixes (the strings after the first +// '/') ordered by size desc. +struct TimeZoneNameMappings { + std::vector timeZoneNamesWithoutPrefix; + std::unordered_map> + timeZoneNamePrefixMap; +}; + +TimeZoneNameMappings getTimeZoneNameMappings() { + // Here we use get_time_zone_names instead of calling get_tzdb and + // constructing the list ourselves because there is some unknown issue with + // the tz library where the time_zone objects after the first one in the tzdb + // will be invalid (contain nullptrs) after the get_tzdb function returns. + const std::vector timeZoneNames = date::get_time_zone_names(); + + TimeZoneNameMappings result; + for (size_t i = 0; i < timeZoneNames.size(); i++) { + const auto& timeZoneName = timeZoneNames[i]; + auto separatorPoint = timeZoneName.find('/'); + + if (separatorPoint == std::string::npos) { + result.timeZoneNamesWithoutPrefix.push_back(timeZoneName); + } else { + std::string prefix = timeZoneName.substr(0, separatorPoint); + std::string suffix = timeZoneName.substr(separatorPoint + 1); + + result.timeZoneNamePrefixMap[prefix].push_back(suffix); + } + } + + std::sort( + result.timeZoneNamesWithoutPrefix.begin(), + result.timeZoneNamesWithoutPrefix.end(), + [](const std::string& a, const std::string& b) { + return b.size() < a.size(); + }); + + for (auto& [prefix, suffixes] : result.timeZoneNamePrefixMap) { + std::sort( + suffixes.begin(), + suffixes.end(), + [](const std::string& a, const std::string& b) { + return b.size() < a.size(); + }); + } + + return result; +} + +int64_t parseTimezoneName(const char* cur, const char* end, Date& date) { + static const TimeZoneNameMappings timeZoneNameMappings = + getTimeZoneNameMappings(); + + if (cur < end) { + // Find the first instance of '/' in the remainder of the string + const char* separatorPoint = cur; + while (separatorPoint < end && *separatorPoint != '/') { + ++separatorPoint; + } + + // Try to find a time zone with a prefix that includes the speratorPoint. + if (separatorPoint != end) { + std::string prefix(cur, separatorPoint); + + auto it = timeZoneNameMappings.timeZoneNamePrefixMap.find(prefix); + if (it != timeZoneNameMappings.timeZoneNamePrefixMap.end()) { + // This is greedy, find the longest suffix for the given prefix that + // fits the string. We know the value in the map is already sorted by + // length in decreasing order. + for (const auto& suffixName : it->second) { + if (suffixName.size() <= end - separatorPoint - 1 && + suffixName == + std::string_view( + separatorPoint + 1, + separatorPoint + 1 + suffixName.size())) { + auto timeZoneNameSize = prefix.size() + 1 + suffixName.size(); + date.timezone = + tz::locateZone(std::string_view(cur, timeZoneNameSize), false); + + if (!date.timezone) { + return -1; + } + + return timeZoneNameSize; + } + } + } + } + + // If we found a '/' but didn't find a match in the set of time zones with + // prefixes, try search before the '/' for a time zone without a prefix. If + // we didn't find a '/' then end already equals separatorPoint. + end = separatorPoint; + + for (const auto& timeZoneName : + timeZoneNameMappings.timeZoneNamesWithoutPrefix) { + // Again, this is greedy, find the largest time zone name without a prefix + // that fits the string. We know timeZoneNamesWithoutPrefix is already + // sorted by length in decreasing order. + if (timeZoneName.size() <= end - cur && + timeZoneName == std::string_view(cur, cur + timeZoneName.size())) { + date.timezone = tz::locateZone(timeZoneName, false); + + if (!date.timezone) { + return -1; + } + + return timeZoneName.size(); + } + } + } + + return -1; +} + int64_t parseTimezoneOffset(const char* cur, const char* end, Date& date) { // For timezone offset ids, there are three formats allowed by Joda: // @@ -639,8 +761,8 @@ int getMaxDigitConsume( return curPattern.minRepresentDigits; } else { if (type == DateTimeFormatterType::MYSQL) { - // MySQL format will try to read in at most 4 digits when supplied a - // year, never more. + // MySQL format will try to read in at most 4 digits when + // supplied a year, never more. return 4; } return curPattern.minRepresentDigits > 9 ? curPattern.minRepresentDigits @@ -681,7 +803,13 @@ int32_t parseFromPattern( bool specifierNext, DateTimeFormatterType type) { if (curPattern.specifier == DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID) { - auto size = parseTimezoneOffset(cur, end, date); + int64_t size; + if (curPattern.minRepresentDigits < 3) { + size = parseTimezoneOffset(cur, end, date); + } else { + size = parseTimezoneName(cur, end, date); + } + if (size == -1) { return -1; } @@ -762,12 +890,13 @@ int32_t parseFromPattern( curPattern.specifier == DateTimeFormatSpecifier::YEAR_OF_ERA || curPattern.specifier == DateTimeFormatSpecifier::WEEK_YEAR) && curPattern.minRepresentDigits == 2) { - // If abbreviated two year digit is provided in format string, try to read - // in two digits of year and convert to appropriate full length year The - // two-digit mapping is as follows: [00, 69] -> [2000, 2069] + // If abbreviated two year digit is provided in format string, try + // to read in two digits of year and convert to appropriate full + // length year The two-digit mapping is as follows: [00, 69] -> + // [2000, 2069] // [70, 99] -> [1970, 1999] - // If more than two digits are provided, then simply read in full year - // normally without conversion + // If more than two digits are provided, then simply read in full + // year normally without conversion int count = 0; while (cur < end && cur < startPos + maxDigitConsume && characterIsDigit(*cur)) { @@ -782,8 +911,8 @@ int32_t parseFromPattern( number += 2000; } } else if (type == DateTimeFormatterType::MYSQL) { - // In MySQL format, year read in must have exactly two digits, otherwise - // return -1 to indicate parsing error. + // In MySQL format, year read in must have exactly two digits, + // otherwise return -1 to indicate parsing error. if (count > 2) { // Larger than expected, print suffix. cur = cur - count + 2; @@ -813,7 +942,8 @@ int32_t parseFromPattern( switch (curPattern.specifier) { case DateTimeFormatSpecifier::CENTURY_OF_ERA: - // Enforce Joda's year range if year was specified as "century of year". + // Enforce Joda's year range if year was specified as "century of + // year". if (number < 0 || number > 2922789) { return -1; } @@ -827,7 +957,8 @@ int32_t parseFromPattern( date.centuryFormat = false; date.isYearOfEra = (curPattern.specifier == DateTimeFormatSpecifier::YEAR_OF_ERA); - // Enforce Joda's year range if year was specified as "year of era". + // Enforce Joda's year range if year was specified as "year of + // era". if (date.isYearOfEra && (number > 292278993 || number < 1)) { return -1; } @@ -846,9 +977,9 @@ int32_t parseFromPattern( date.month = number; date.weekDateFormat = false; date.dayOfYearFormat = false; - // Joda has this weird behavior where it returns 1970 as the year by - // default (if no year is specified), but if either day or month are - // specified, it fallsback to 2000. + // Joda has this weird behavior where it returns 1970 as the year + // by default (if no year is specified), but if either day or + // month are specified, it fallsback to 2000. if (!date.hasYear) { date.hasYear = true; date.year = 2000; @@ -860,9 +991,9 @@ int32_t parseFromPattern( date.day = number; date.weekDateFormat = false; date.dayOfYearFormat = false; - // Joda has this weird behavior where it returns 1970 as the year by - // default (if no year is specified), but if either day or month are - // specified, it fallsback to 2000. + // Joda has this weird behavior where it returns 1970 as the year + // by default (if no year is specified), but if either day or + // month are specified, it fallsback to 2000. if (!date.hasYear) { date.hasYear = true; date.year = 2000; @@ -874,9 +1005,9 @@ int32_t parseFromPattern( date.dayOfYear = number; date.dayOfYearFormat = true; date.weekDateFormat = false; - // Joda has this weird behavior where it returns 1970 as the year by - // default (if no year is specified), but if either day or month are - // specified, it fallsback to 2000. + // Joda has this weird behavior where it returns 1970 as the year + // by default (if no year is specified), but if either day or + // month are specified, it fallsback to 2000. if (!date.hasYear) { date.hasYear = true; date.year = 2000; @@ -1284,14 +1415,14 @@ int32_t DateTimeFormatter::format( } break; case DateTimeFormatSpecifier::TIMEZONE: { - // TODO: implement short name time zone, need a map from full name to - // short name + // TODO: implement short name time zone, need a map from full name + // to short name VELOX_UNSUPPORTED("time zone name is not yet supported"); } break; case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: { - // Zone: 'Z' outputs offset without a colon, 'ZZ' outputs the offset - // with a colon, 'ZZZ' or more outputs the zone id. + // Zone: 'Z' outputs offset without a colon, 'ZZ' outputs the + // offset with a colon, 'ZZZ' or more outputs the zone id. // TODO Add support for 'Z' and 'ZZZ'. if (offset == 0 && zeroOffsetText.has_value()) { std::memcpy(result, zeroOffsetText->data(), zeroOffsetText->size()); @@ -1448,8 +1579,8 @@ Expected> buildMysqlDateTimeFormatter( Status::UserError("Both printing and parsing not supported")); } - // For %r we should reserve 1 extra space because it has 3 literals ':' ':' - // and ' ' + // For %r we should reserve 1 extra space because it has 3 literals ':' + // ':' and ' ' DateTimeFormatterBuilder builder( format.size() + countOccurence(format, "%r")); @@ -1725,9 +1856,9 @@ Expected> buildSimpleDateTimeFormatter( while (cur < end) { const char* startTokenPtr = cur; - // For literal case, literal should be quoted using single quotes ('). If - // there is no quotes, it is interpreted as pattern letters. If there is - // only single quote, a user error will be thrown. + // For literal case, literal should be quoted using single quotes ('). + // If there is no quotes, it is interpreted as pattern letters. If there + // is only single quote, a user error will be thrown. if (*startTokenPtr == '\'') { // Append single literal quote for 2 consecutive single quote. if (cur + 1 < end && *(cur + 1) == '\'') { @@ -1753,8 +1884,8 @@ Expected> buildSimpleDateTimeFormatter( cur += count + 2; } } else { - // Append format specifier according to pattern letters. If pattern letter - // is not supported, a user error will be thrown. + // Append format specifier according to pattern letters. If pattern + // letter is not supported, a user error will be thrown. int count = 1; ++cur; while (cur < end && *startTokenPtr == *cur) { diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index be8618bf51759..5b50612d37977 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -2976,10 +2976,52 @@ TEST_F(DateTimeFunctionsTest, parseDatetime) { ts, parseDatetime("2024-02-25+06:00:99 UTC", "yyyy-MM-dd+HH:mm:99 ZZZ")); EXPECT_EQ( ts, parseDatetime("2024-02-25+06:00:99 UTC", "yyyy-MM-dd+HH:mm:99 ZZZ")); - + // Test a time zone with a prefix. + EXPECT_EQ( + TimestampWithTimezone(1708869600000, "America/Los_Angeles"), + parseDatetime( + "2024-02-25+06:00:99 America/Los_Angeles", + "yyyy-MM-dd+HH:mm:99 ZZZ")); + // Test a time zone with a prefix is greedy. Etc/GMT-1 and Etc/GMT-10 are both + // valid time zone names. + EXPECT_EQ( + TimestampWithTimezone(1708804800000, "Etc/GMT-10"), + parseDatetime( + "2024-02-25+06:00:99 Etc/GMT-10", "yyyy-MM-dd+HH:mm:99 ZZZ")); + // Test a time zone without a prefix is greedy. NZ and NZ-CHAT are both + // valid time zone names. + EXPECT_EQ( + TimestampWithTimezone(1708791300000, "NZ-CHAT"), + parseDatetime("2024-02-25+06:00:99 NZ-CHAT", "yyyy-MM-dd+HH:mm:99 ZZZ")); + // Test a time zone with a prefix can handle trailing data. + EXPECT_EQ( + TimestampWithTimezone(1708869600000, "America/Los_Angeles"), + parseDatetime( + "America/Los_Angeles2024-02-25+06:00:99", "ZZZyyyy-MM-dd+HH:mm:99")); + // Test a time zone without a prefix can handle trailing data. + EXPECT_EQ( + TimestampWithTimezone(1708840800000, "GMT"), + parseDatetime("GMT2024-02-25+06:00:99", "ZZZyyyy-MM-dd+HH:mm:99")); + // Test parsing can fall back to checking for time zones without a prefix when + // a '/' is present but not part of the time zone name. + EXPECT_EQ( + TimestampWithTimezone(1708840800000, "GMT"), + parseDatetime("GMT/2024-02-25+06:00:99", "ZZZ/yyyy-MM-dd+HH:mm:99")); + + // Test an invalid time zone without a prefix. (zzz should be used to match + // abbreviations) VELOX_ASSERT_THROW( parseDatetime("2024-02-25+06:00:99 PST", "yyyy-MM-dd+HH:mm:99 ZZZ"), "Invalid date format: '2024-02-25+06:00:99 PST'"); + // Test an invalid time zone with a prefix that doesn't appear at all. + VELOX_ASSERT_THROW( + parseDatetime("2024-02-25+06:00:99 ABC/XYZ", "yyyy-MM-dd+HH:mm:99 ZZZ"), + "Invalid date format: '2024-02-25+06:00:99 ABC/XYZ'"); + // Test an invalid time zone with a prefix that does appear. + VELOX_ASSERT_THROW( + parseDatetime( + "2024-02-25+06:00:99 America/XYZ", "yyyy-MM-dd+HH:mm:99 ZZZ"), + "Invalid date format: '2024-02-25+06:00:99 America/XYZ'"); } TEST_F(DateTimeFunctionsTest, formatDateTime) {