-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for z, zz, zzz in format_datetime #11323
Closed
kevinwilfong
wants to merge
3
commits into
facebookincubator:main
from
kevinwilfong:export-D64774281
Closed
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
30 changes: 30 additions & 0 deletions
30
velox/external/date/patches/0006-add_get_time_zone_names.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp | ||
--- a/velox/external/date/tz.cpp | ||
+++ b/velox/external/date/tz.cpp | ||
@@ -3538,6 +3538,14 @@ | ||
return get_tzdb_list().front(); | ||
} | ||
|
||
+std::vector<std::string> get_time_zone_names() { | ||
+ std::vector<std::string> result; | ||
+ for (const auto& z : get_tzdb().zones) { | ||
+ result.push_back(z.name()); | ||
+ } | ||
+ return result; | ||
+} | ||
+ | ||
const time_zone* | ||
#if HAS_STRING_VIEW | ||
tzdb::locate_zone(std::string_view tz_name) const | ||
diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h | ||
--- a/velox/external/date/tz.h | ||
+++ b/velox/external/date/tz.h | ||
@@ -1258,6 +1258,8 @@ | ||
|
||
DATE_API const tzdb& get_tzdb(); | ||
|
||
+std::vector<std::string> get_time_zone_names(); | ||
+ | ||
class tzdb_list | ||
{ | ||
std::atomic<tzdb*> head_{nullptr}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -351,6 +351,133 @@ int64_t parseTimezone(const char* cur, const char* end, Date& date) { | |
return -1; | ||
} | ||
|
||
// Contains a list of all time zone names in a convenient format for searching. | ||
// | ||
// Time zone names without the '/' character (without a prefix) are stored in | ||
// timeZoneNamesWithoutPrefix ordered by size desc. | ||
// | ||
// Time zone names with the '/' character (with a prefix) are stored in a map | ||
// timeZoneNamePrefixMap from prefix (the string before the first '/') to a | ||
// vector of strings which contains the suffixes (the strings after the first | ||
// '/') ordered by size desc. | ||
struct TimeZoneNameMappings { | ||
std::vector<std::string> timeZoneNamesWithoutPrefix; | ||
std::unordered_map<std::string, std::vector<std::string>> | ||
timeZoneNamePrefixMap; | ||
}; | ||
|
||
TimeZoneNameMappings getTimeZoneNameMappings() { | ||
// Here we use get_time_zone_names instead of calling get_tzdb and | ||
// constructing the list ourselves because there is some unknown issue with | ||
// the tz library where the time_zone objects after the first one in the tzdb | ||
// will be invalid (contain nullptrs) after the get_tzdb function returns. | ||
const std::vector<std::string> timeZoneNames = date::get_time_zone_names(); | ||
|
||
TimeZoneNameMappings result; | ||
for (size_t i = 0; i < timeZoneNames.size(); i++) { | ||
const auto& timeZoneName = timeZoneNames[i]; | ||
auto separatorPoint = timeZoneName.find('/'); | ||
|
||
if (separatorPoint == std::string::npos) { | ||
result.timeZoneNamesWithoutPrefix.push_back(timeZoneName); | ||
} else { | ||
std::string prefix = timeZoneName.substr(0, separatorPoint); | ||
std::string suffix = timeZoneName.substr(separatorPoint + 1); | ||
|
||
result.timeZoneNamePrefixMap[prefix].push_back(suffix); | ||
} | ||
} | ||
|
||
std::sort( | ||
result.timeZoneNamesWithoutPrefix.begin(), | ||
result.timeZoneNamesWithoutPrefix.end(), | ||
[](const std::string& a, const std::string& b) { | ||
return b.size() < a.size(); | ||
}); | ||
|
||
for (auto& [prefix, suffixes] : result.timeZoneNamePrefixMap) { | ||
std::sort( | ||
suffixes.begin(), | ||
suffixes.end(), | ||
[](const std::string& a, const std::string& b) { | ||
return b.size() < a.size(); | ||
}); | ||
} | ||
|
||
return result; | ||
} | ||
|
||
int64_t parseTimezoneName(const char* cur, const char* end, Date& date) { | ||
// For time zone names we try to greedily find the longest substring starting | ||
// from cur that is a valid time zone name. To help speed things along we | ||
// treat time zone names as {prefix}/{suffix} (for the first instance of '/') | ||
// and create lists of suffixes per prefix. We order these lists by length of | ||
// the suffix so once we identify the prefix, we can return the first suffix | ||
// we find in the string. We treat time zone names without a prefix (i.e. | ||
// without a '/') separately but similarly. | ||
static const TimeZoneNameMappings timeZoneNameMappings = | ||
getTimeZoneNameMappings(); | ||
|
||
if (cur < end) { | ||
// Find the first instance of '/' in the remainder of the string | ||
const char* separatorPoint = cur; | ||
while (separatorPoint < end && *separatorPoint != '/') { | ||
++separatorPoint; | ||
} | ||
|
||
// Try to find a time zone with a prefix that includes the speratorPoint. | ||
if (separatorPoint != end) { | ||
std::string prefix(cur, separatorPoint); | ||
|
||
auto it = timeZoneNameMappings.timeZoneNamePrefixMap.find(prefix); | ||
if (it != timeZoneNameMappings.timeZoneNamePrefixMap.end()) { | ||
// This is greedy, find the longest suffix for the given prefix that | ||
// fits the string. We know the value in the map is already sorted by | ||
// length in decreasing order. | ||
for (const auto& suffixName : it->second) { | ||
if (suffixName.size() <= end - separatorPoint - 1 && | ||
suffixName == | ||
std::string_view(separatorPoint + 1, suffixName.size())) { | ||
auto timeZoneNameSize = prefix.size() + 1 + suffixName.size(); | ||
date.timezone = | ||
tz::locateZone(std::string_view(cur, timeZoneNameSize), false); | ||
|
||
if (!date.timezone) { | ||
return -1; | ||
} | ||
|
||
return timeZoneNameSize; | ||
} | ||
} | ||
} | ||
} | ||
|
||
// If we found a '/' but didn't find a match in the set of time zones with | ||
// prefixes, try search before the '/' for a time zone without a prefix. If | ||
// we didn't find a '/' then end already equals separatorPoint. | ||
end = separatorPoint; | ||
|
||
for (const auto& timeZoneName : | ||
timeZoneNameMappings.timeZoneNamesWithoutPrefix) { | ||
// Again, this is greedy, find the largest time zone name without a prefix | ||
// that fits the string. We know timeZoneNamesWithoutPrefix is already | ||
// sorted by length in decreasing order. | ||
if (timeZoneName.size() <= end - cur && | ||
timeZoneName == std::string_view(cur, timeZoneName.size())) { | ||
date.timezone = tz::locateZone(timeZoneName, false); | ||
|
||
if (!date.timezone) { | ||
return -1; | ||
} | ||
|
||
return timeZoneName.size(); | ||
} | ||
} | ||
} | ||
|
||
return -1; | ||
} | ||
|
||
int64_t parseTimezoneOffset(const char* cur, const char* end, Date& date) { | ||
// For timezone offset ids, there are three formats allowed by Joda: | ||
// | ||
|
@@ -518,7 +645,7 @@ std::string formatFractionOfSecond( | |
return toAdd; | ||
} | ||
|
||
int32_t appendTimezoneOffset(int64_t offset, char* result) { | ||
int32_t appendTimezoneOffset(int64_t offset, char* result, bool includeColon) { | ||
int pos = 0; | ||
if (offset >= 0) { | ||
result[pos++] = '+'; | ||
|
@@ -536,7 +663,9 @@ int32_t appendTimezoneOffset(int64_t offset, char* result) { | |
result[pos++] = char(hours % 10 + '0'); | ||
} | ||
|
||
result[pos++] = ':'; | ||
if (includeColon) { | ||
result[pos++] = ':'; | ||
} | ||
|
||
const auto minutes = (offset / 60) % 60; | ||
if LIKELY (minutes == 0) { | ||
|
@@ -687,7 +816,13 @@ int32_t parseFromPattern( | |
bool specifierNext, | ||
DateTimeFormatterType type) { | ||
if (curPattern.specifier == DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID) { | ||
auto size = parseTimezoneOffset(cur, end, date); | ||
int64_t size; | ||
if (curPattern.minRepresentDigits < 3) { | ||
size = parseTimezoneOffset(cur, end, date); | ||
} else { | ||
size = parseTimezoneName(cur, end, date); | ||
} | ||
|
||
if (size == -1) { | ||
return -1; | ||
} | ||
|
@@ -1068,20 +1203,33 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const { | |
size += std::max((int)token.pattern.minRepresentDigits, 9); | ||
break; | ||
case DateTimeFormatSpecifier::TIMEZONE: | ||
if (timezone == nullptr) { | ||
VELOX_USER_FAIL("Timezone unknown"); | ||
if (token.pattern.minRepresentDigits <= 3) { | ||
// The longest abbreviation according to here is 5, e.g. some time | ||
// zones use the offset as the abbreviation, like +0530. | ||
// https://en.wikipedia.org/wiki/List_of_tz_database_time_zones | ||
size += 5; | ||
} else { | ||
VELOX_NYI( | ||
"Date format specifier is not yet implemented: {} ({})", | ||
getSpecifierName(token.pattern.specifier), | ||
token.pattern.minRepresentDigits); | ||
} | ||
size += std::max( | ||
token.pattern.minRepresentDigits, timezone->name().length()); | ||
|
||
break; | ||
case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: | ||
if (token.pattern.minRepresentDigits != 2) { | ||
VELOX_UNSUPPORTED( | ||
"Date format specifier is not supported: {} ({})", | ||
getSpecifierName(token.pattern.specifier), | ||
token.pattern.minRepresentDigits); | ||
if (token.pattern.minRepresentDigits == 1) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe a comment explaining what these magic numbers mean? |
||
// 'Z' means output the time zone offset without a colon. | ||
size += 8; | ||
} else if (token.pattern.minRepresentDigits == 2) { | ||
// 'ZZ' means output the time zone offset with a colon. | ||
size += 9; | ||
} else { | ||
// 'ZZZ' (or more) means otuput the time zone ID. | ||
if (timezone == nullptr) { | ||
VELOX_USER_FAIL("Timezone unknown"); | ||
} | ||
size += timezone->name().length(); | ||
} | ||
size += 9; | ||
break; | ||
// Not supported. | ||
case DateTimeFormatSpecifier::WEEK_YEAR: | ||
|
@@ -1310,37 +1458,40 @@ int32_t DateTimeFormatter::format( | |
} break; | ||
|
||
case DateTimeFormatSpecifier::TIMEZONE: { | ||
// TODO: implement short name time zone, need a map from full name to | ||
// short name | ||
VELOX_USER_CHECK_NOT_NULL( | ||
timezone, | ||
"The time zone cannot be formatted if it is not present."); | ||
if (token.pattern.minRepresentDigits <= 3) { | ||
VELOX_UNSUPPORTED("short name time zone is not yet supported"); | ||
} | ||
if (timezone == nullptr) { | ||
VELOX_USER_FAIL("Timezone unknown"); | ||
const std::string& abbrev = timezone->getShortName( | ||
std::chrono::milliseconds(timestamp.toMillis()), | ||
tz::TimeZone::TChoose::kEarliest); | ||
std::memcpy(result, abbrev.data(), abbrev.length()); | ||
result += abbrev.length(); | ||
} else { | ||
// TODO: implement full name time zone | ||
VELOX_NYI("full time zone name is not yet supported"); | ||
} | ||
const auto& piece = timezone->name(); | ||
std::memcpy(result, piece.data(), piece.length()); | ||
result += piece.length(); | ||
} break; | ||
|
||
case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: { | ||
// Zone: 'Z' outputs offset without a colon, 'ZZ' outputs the offset | ||
// with a colon, 'ZZZ' or more outputs the zone id. | ||
// TODO Add support for 'Z' and 'ZZZ'. | ||
if (token.pattern.minRepresentDigits != 2) { | ||
VELOX_UNSUPPORTED( | ||
"format is not supported for specifier {} ({})", | ||
getSpecifierName(token.pattern.specifier), | ||
token.pattern.minRepresentDigits); | ||
} | ||
|
||
if (offset == 0 && zeroOffsetText.has_value()) { | ||
std::memcpy(result, zeroOffsetText->data(), zeroOffsetText->size()); | ||
result += zeroOffsetText->size(); | ||
break; | ||
} | ||
|
||
result += appendTimezoneOffset(offset, result); | ||
if (token.pattern.minRepresentDigits >= 3) { | ||
// Append the time zone ID. | ||
const auto& piece = timezone->name(); | ||
std::memcpy(result, piece.data(), piece.length()); | ||
result += piece.length(); | ||
break; | ||
} | ||
|
||
result += appendTimezoneOffset( | ||
offset, result, token.pattern.minRepresentDigits == 2); | ||
break; | ||
} | ||
case DateTimeFormatSpecifier::WEEK_OF_WEEK_YEAR: { | ||
|
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
see my comment below, but should we encapsulate this logic within the TimeZone object? So these things could be exposed under a more convenient API like