Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinwilfong authored and facebook-github-bot committed Oct 24, 2024
1 parent 95f952e commit fad8c8d
Show file tree
Hide file tree
Showing 5 changed files with 244 additions and 33 deletions.
30 changes: 30 additions & 0 deletions velox/external/date/patches/0006-add_get_time_zone_names.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp
--- a/velox/external/date/tz.cpp
+++ b/velox/external/date/tz.cpp
@@ -3538,6 +3538,14 @@
return get_tzdb_list().front();
}

+std::vector<std::string> get_time_zone_names() {
+ std::vector<std::string> result;
+ for (const auto& z : get_tzdb().zones) {
+ result.push_back(z.name());
+ }
+ return result;
+}
+
const time_zone*
#if HAS_STRING_VIEW
tzdb::locate_zone(std::string_view tz_name) const
diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h
--- a/velox/external/date/tz.h
+++ b/velox/external/date/tz.h
@@ -1258,6 +1258,8 @@

DATE_API const tzdb& get_tzdb();

+std::vector<std::string> get_time_zone_names();
+
class tzdb_list
{
std::atomic<tzdb*> head_{nullptr};
8 changes: 8 additions & 0 deletions velox/external/date/tz.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3538,6 +3538,14 @@ get_tzdb()
return get_tzdb_list().front();
}

std::vector<std::string> get_time_zone_names() {
std::vector<std::string> result;
for (const auto& z : get_tzdb().zones) {
result.push_back(z.name());
}
return result;
}

const time_zone*
#if HAS_STRING_VIEW
tzdb::locate_zone(std::string_view tz_name) const
Expand Down
2 changes: 2 additions & 0 deletions velox/external/date/tz.h
Original file line number Diff line number Diff line change
Expand Up @@ -1258,6 +1258,8 @@ operator<<(std::ostream& os, const tzdb& db);

DATE_API const tzdb& get_tzdb();

std::vector<std::string> get_time_zone_names();

class tzdb_list
{
std::atomic<tzdb*> head_{nullptr};
Expand Down
193 changes: 161 additions & 32 deletions velox/functions/lib/DateTimeFormatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,126 @@ int64_t parseTimezone(const char* cur, const char* end, Date& date) {
return -1;
}

// Contains a list of all time zone names in a convenient format for searching.
//
// Time zone names without the '/' character (without a prefix) are stored in
// timeZoneNamesWithoutPrefix ordered by size desc.
//
// Time zone names with the '/' character (with a prefix) are stored in a map
// timeZoneNamePrefixMap from prefix (the string before the first '/') to a
// vector of strings which contains the suffixes (the strings after the first
// '/') ordered by size desc.
struct TimeZoneNameMappings {
std::vector<std::string> timeZoneNamesWithoutPrefix;
std::unordered_map<std::string, std::vector<std::string>>
timeZoneNamePrefixMap;
};

TimeZoneNameMappings getTimeZoneNameMappings() {
// Here we use get_time_zone_names instead of calling get_tzdb and
// constructing the list ourselves because there is some unknown issue with
// the tz library where the time_zone objects after the first one in the tzdb
// will be invalid (contain nullptrs) after the get_tzdb function returns.
const std::vector<std::string> timeZoneNames = date::get_time_zone_names();

TimeZoneNameMappings result;
for (size_t i = 0; i < timeZoneNames.size(); i++) {
const auto& timeZoneName = timeZoneNames[i];
auto separatorPoint = timeZoneName.find('/');

if (separatorPoint == std::string::npos) {
result.timeZoneNamesWithoutPrefix.push_back(timeZoneName);
} else {
std::string prefix = timeZoneName.substr(0, separatorPoint);
std::string suffix = timeZoneName.substr(separatorPoint + 1);

result.timeZoneNamePrefixMap[prefix].push_back(suffix);
}
}

std::sort(
result.timeZoneNamesWithoutPrefix.begin(),
result.timeZoneNamesWithoutPrefix.end(),
[](const std::string& a, const std::string& b) {
return b.size() < a.size();
});

for (auto& [prefix, suffixes] : result.timeZoneNamePrefixMap) {
std::sort(
suffixes.begin(),
suffixes.end(),
[](const std::string& a, const std::string& b) {
return b.size() < a.size();
});
}

return result;
}

int64_t parseTimezoneName(const char* cur, const char* end, Date& date) {
static const TimeZoneNameMappings timeZoneNameMappings =
getTimeZoneNameMappings();

if (cur < end) {
// Find the first instance of '/' in the remainder of the string
const char* separatorPoint = cur;
while (separatorPoint < end && *separatorPoint != '/') {
++separatorPoint;
}

// Try to find a time zone with a prefix that includes the speratorPoint.
if (separatorPoint != end) {
std::string prefix(cur, separatorPoint);

auto it = timeZoneNameMappings.timeZoneNamePrefixMap.find(prefix);
if (it != timeZoneNameMappings.timeZoneNamePrefixMap.end()) {
// This is greedy, find the longest suffix for the given prefix that
// fits the string. We know the value in the map is already sorted by
// length in decreasing order.
for (const auto& suffixName : it->second) {
if (suffixName.size() <= end - separatorPoint - 1 &&
suffixName ==
std::string_view(separatorPoint + 1, suffixName.size())) {
auto timeZoneNameSize = prefix.size() + 1 + suffixName.size();
date.timezone =
tz::locateZone(std::string_view(cur, timeZoneNameSize), false);

if (!date.timezone) {
return -1;
}

return timeZoneNameSize;
}
}
}
}

// If we found a '/' but didn't find a match in the set of time zones with
// prefixes, try search before the '/' for a time zone without a prefix. If
// we didn't find a '/' then end already equals separatorPoint.
end = separatorPoint;

for (const auto& timeZoneName :
timeZoneNameMappings.timeZoneNamesWithoutPrefix) {
// Again, this is greedy, find the largest time zone name without a prefix
// that fits the string. We know timeZoneNamesWithoutPrefix is already
// sorted by length in decreasing order.
if (timeZoneName.size() <= end - cur &&
timeZoneName == std::string_view(cur, timeZoneName.size())) {
date.timezone = tz::locateZone(timeZoneName, false);

if (!date.timezone) {
return -1;
}

return timeZoneName.size();
}
}
}

return -1;
}

int64_t parseTimezoneOffset(const char* cur, const char* end, Date& date) {
// For timezone offset ids, there are three formats allowed by Joda:
//
Expand Down Expand Up @@ -639,8 +759,8 @@ int getMaxDigitConsume(
return curPattern.minRepresentDigits;
} else {
if (type == DateTimeFormatterType::MYSQL) {
// MySQL format will try to read in at most 4 digits when supplied a
// year, never more.
// MySQL format will try to read in at most 4 digits when
// supplied a year, never more.
return 4;
}
return curPattern.minRepresentDigits > 9 ? curPattern.minRepresentDigits
Expand Down Expand Up @@ -681,7 +801,13 @@ int32_t parseFromPattern(
bool specifierNext,
DateTimeFormatterType type) {
if (curPattern.specifier == DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID) {
auto size = parseTimezoneOffset(cur, end, date);
int64_t size;
if (curPattern.minRepresentDigits < 3) {
size = parseTimezoneOffset(cur, end, date);
} else {
size = parseTimezoneName(cur, end, date);
}

if (size == -1) {
return -1;
}
Expand Down Expand Up @@ -762,12 +888,13 @@ int32_t parseFromPattern(
curPattern.specifier == DateTimeFormatSpecifier::YEAR_OF_ERA ||
curPattern.specifier == DateTimeFormatSpecifier::WEEK_YEAR) &&
curPattern.minRepresentDigits == 2) {
// If abbreviated two year digit is provided in format string, try to read
// in two digits of year and convert to appropriate full length year The
// two-digit mapping is as follows: [00, 69] -> [2000, 2069]
// If abbreviated two year digit is provided in format string, try
// to read in two digits of year and convert to appropriate full
// length year The two-digit mapping is as follows: [00, 69] ->
// [2000, 2069]
// [70, 99] -> [1970, 1999]
// If more than two digits are provided, then simply read in full year
// normally without conversion
// If more than two digits are provided, then simply read in full
// year normally without conversion
int count = 0;
while (cur < end && cur < startPos + maxDigitConsume &&
characterIsDigit(*cur)) {
Expand All @@ -782,8 +909,8 @@ int32_t parseFromPattern(
number += 2000;
}
} else if (type == DateTimeFormatterType::MYSQL) {
// In MySQL format, year read in must have exactly two digits, otherwise
// return -1 to indicate parsing error.
// In MySQL format, year read in must have exactly two digits,
// otherwise return -1 to indicate parsing error.
if (count > 2) {
// Larger than expected, print suffix.
cur = cur - count + 2;
Expand Down Expand Up @@ -813,7 +940,8 @@ int32_t parseFromPattern(

switch (curPattern.specifier) {
case DateTimeFormatSpecifier::CENTURY_OF_ERA:
// Enforce Joda's year range if year was specified as "century of year".
// Enforce Joda's year range if year was specified as "century of
// year".
if (number < 0 || number > 2922789) {
return -1;
}
Expand All @@ -827,7 +955,8 @@ int32_t parseFromPattern(
date.centuryFormat = false;
date.isYearOfEra =
(curPattern.specifier == DateTimeFormatSpecifier::YEAR_OF_ERA);
// Enforce Joda's year range if year was specified as "year of era".
// Enforce Joda's year range if year was specified as "year of
// era".
if (date.isYearOfEra && (number > 292278993 || number < 1)) {
return -1;
}
Expand All @@ -846,9 +975,9 @@ int32_t parseFromPattern(
date.month = number;
date.weekDateFormat = false;
date.dayOfYearFormat = false;
// Joda has this weird behavior where it returns 1970 as the year by
// default (if no year is specified), but if either day or month are
// specified, it fallsback to 2000.
// Joda has this weird behavior where it returns 1970 as the year
// by default (if no year is specified), but if either day or
// month are specified, it fallsback to 2000.
if (!date.hasYear) {
date.hasYear = true;
date.year = 2000;
Expand All @@ -860,9 +989,9 @@ int32_t parseFromPattern(
date.day = number;
date.weekDateFormat = false;
date.dayOfYearFormat = false;
// Joda has this weird behavior where it returns 1970 as the year by
// default (if no year is specified), but if either day or month are
// specified, it fallsback to 2000.
// Joda has this weird behavior where it returns 1970 as the year
// by default (if no year is specified), but if either day or
// month are specified, it fallsback to 2000.
if (!date.hasYear) {
date.hasYear = true;
date.year = 2000;
Expand All @@ -874,9 +1003,9 @@ int32_t parseFromPattern(
date.dayOfYear = number;
date.dayOfYearFormat = true;
date.weekDateFormat = false;
// Joda has this weird behavior where it returns 1970 as the year by
// default (if no year is specified), but if either day or month are
// specified, it fallsback to 2000.
// Joda has this weird behavior where it returns 1970 as the year
// by default (if no year is specified), but if either day or
// month are specified, it fallsback to 2000.
if (!date.hasYear) {
date.hasYear = true;
date.year = 2000;
Expand Down Expand Up @@ -1286,14 +1415,14 @@ int32_t DateTimeFormatter::format(
} break;

case DateTimeFormatSpecifier::TIMEZONE: {
// TODO: implement short name time zone, need a map from full name to
// short name
// TODO: implement short name time zone, need a map from full name
// to short name
VELOX_UNSUPPORTED("time zone name is not yet supported");
} break;

case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: {
// Zone: 'Z' outputs offset without a colon, 'ZZ' outputs the offset
// with a colon, 'ZZZ' or more outputs the zone id.
// Zone: 'Z' outputs offset without a colon, 'ZZ' outputs the
// offset with a colon, 'ZZZ' or more outputs the zone id.
if (offset == 0 && zeroOffsetText.has_value()) {
std::memcpy(result, zeroOffsetText->data(), zeroOffsetText->size());
result += zeroOffsetText->size();
Expand Down Expand Up @@ -1450,8 +1579,8 @@ Expected<std::shared_ptr<DateTimeFormatter>> buildMysqlDateTimeFormatter(
Status::UserError("Both printing and parsing not supported"));
}

// For %r we should reserve 1 extra space because it has 3 literals ':' ':'
// and ' '
// For %r we should reserve 1 extra space because it has 3 literals ':'
// ':' and ' '
DateTimeFormatterBuilder builder(
format.size() + countOccurence(format, "%r"));

Expand Down Expand Up @@ -1727,9 +1856,9 @@ Expected<std::shared_ptr<DateTimeFormatter>> buildSimpleDateTimeFormatter(
while (cur < end) {
const char* startTokenPtr = cur;

// For literal case, literal should be quoted using single quotes ('). If
// there is no quotes, it is interpreted as pattern letters. If there is
// only single quote, a user error will be thrown.
// For literal case, literal should be quoted using single quotes (').
// If there is no quotes, it is interpreted as pattern letters. If there
// is only single quote, a user error will be thrown.
if (*startTokenPtr == '\'') {
// Append single literal quote for 2 consecutive single quote.
if (cur + 1 < end && *(cur + 1) == '\'') {
Expand All @@ -1755,8 +1884,8 @@ Expected<std::shared_ptr<DateTimeFormatter>> buildSimpleDateTimeFormatter(
cur += count + 2;
}
} else {
// Append format specifier according to pattern letters. If pattern letter
// is not supported, a user error will be thrown.
// Append format specifier according to pattern letters. If pattern
// letter is not supported, a user error will be thrown.
int count = 1;
++cur;
while (cur < end && *startTokenPtr == *cur) {
Expand Down
Loading

0 comments on commit fad8c8d

Please sign in to comment.