Add support for ZZZ in parse_datetime (facebookincubator#11312)

Summary: This diff adds support for JODA's ZZZ pattern in Presto's parse_datetime function. This is used to parse time zone IDs (called "time zone names" in the tz library, but this means something else in JODA). I borrowed the algorithm from JODA to ensure it matches Presto Java's behavior. The idea is to greedily consume the longest substring that matches a known time zone. I borrowed their algorithm which is to break the set of known time zones into a list of those without a prefix (without the '/' character) and lists of suffixes for those with prefixes. This limits the number of strings that need to be compared. I modified it slightly to pre-sort these lists by size descending, so we don't have to necessarily compare every string, but can stop early if we find a match. One other change is I added a get_time_zone_names function to our copy of the tz library. I tried calling get_tzdb() from DateTimeFormatter directly and accessing its zones member to get the names, but for some reason after get_tzdb() returns every time_zone in zones (except the first one) has a string name_ that has nullptr for its data after get_tzdb() returns. I spent a good amount of time trying to figure out why, but couldn't figure it out, so I gave up and added this helper method (for whatever reason everything is fine as long as it's done in the tz.cpp file). If anyone has pointers as to what's going on I'm happy to investigate further, I'd much rather use the existing get_tzdb function if I can. Differential Revision: D64708598
kevinwilfong · Oct 21, 2024 · 5a93516 · 5a93516
1 parent 7629331
commit 5a93516
Show file tree

Hide file tree

Showing 5 changed files with 246 additions and 33 deletions.
diff --git a/velox/external/date/patches/0006-add_get_time_zone_names.patch b/velox/external/date/patches/0006-add_get_time_zone_names.patch
@@ -0,0 +1,30 @@
+diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp
+--- a/velox/external/date/tz.cpp
++++ b/velox/external/date/tz.cpp
+@@ -3538,6 +3538,14 @@
+     return get_tzdb_list().front();
+ }
+
++std::vector<std::string> get_time_zone_names() {
++  std::vector<std::string> result;
++  for (const auto& z : get_tzdb().zones) {
++    result.push_back(z.name());
++  }
++  return result;
++}
++
+ const time_zone*
+ #if HAS_STRING_VIEW
+ tzdb::locate_zone(std::string_view tz_name) const
+diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h
+--- a/velox/external/date/tz.h
++++ b/velox/external/date/tz.h
+@@ -1258,6 +1258,8 @@
+
+ DATE_API const tzdb& get_tzdb();
+
++std::vector<std::string> get_time_zone_names();
++
+ class tzdb_list
+ {
+     std::atomic<tzdb*> head_{nullptr};
diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp
@@ -3538,6 +3538,14 @@ get_tzdb()
     return get_tzdb_list().front();
 }
 
+std::vector<std::string> get_time_zone_names() {
+  std::vector<std::string> result;
+  for (const auto& z : get_tzdb().zones) {
+    result.push_back(z.name());
+  }
+  return result;
+}
+
 const time_zone*
 #if HAS_STRING_VIEW
 tzdb::locate_zone(std::string_view tz_name) const

diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h
@@ -1258,6 +1258,8 @@ operator<<(std::ostream& os, const tzdb& db);
 
 DATE_API const tzdb& get_tzdb();
 
+std::vector<std::string> get_time_zone_names();
+
 class tzdb_list
 {
     std::atomic<tzdb*> head_{nullptr};

diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp
@@ -347,6 +347,128 @@ int64_t parseTimezone(const char* cur, const char* end, Date& date) {
   return -1;
 }
 
+// Contains a list of all time zone names in a convenient format for searching.
+//
+// Time zone names without the '/' character (without a prefix) are stored in
+// timeZoneNamesWithoutPrefix ordered by size desc.
+//
+// Time zone names with the '/' character (with a prefix) are stored in a map
+// timeZoneNamePrefixMap from prefix (the string before the first '/') to a
+// vector of strings which contains the suffixes (the strings after the first
+// '/') ordered by size desc.
+struct TimeZoneNameMappings {
+  std::vector<std::string> timeZoneNamesWithoutPrefix;
+  std::unordered_map<std::string, std::vector<std::string>>
+      timeZoneNamePrefixMap;
+};
+
+TimeZoneNameMappings getTimeZoneNameMappings() {
+  // Here we use get_time_zone_names instead of calling get_tzdb and
+  // constructing the list ourselves because there is some unknown issue with
+  // the tz library where the time_zone objects after the first one in the tzdb
+  // will be invalid (contain nullptrs) after the get_tzdb function returns.
+  const std::vector<std::string> timeZoneNames = date::get_time_zone_names();
+
+  TimeZoneNameMappings result;
+  for (size_t i = 0; i < timeZoneNames.size(); i++) {
+    const auto& timeZoneName = timeZoneNames[i];
+    auto separatorPoint = timeZoneName.find('/');
+
+    if (separatorPoint == std::string::npos) {
+      result.timeZoneNamesWithoutPrefix.push_back(timeZoneName);
+    } else {
+      std::string prefix = timeZoneName.substr(0, separatorPoint);
+      std::string suffix = timeZoneName.substr(separatorPoint + 1);
+
+      result.timeZoneNamePrefixMap[prefix].push_back(suffix);
+    }
+  }
+
+  std::sort(
+      result.timeZoneNamesWithoutPrefix.begin(),
+      result.timeZoneNamesWithoutPrefix.end(),
+      [](const std::string& a, const std::string& b) {
+        return b.size() < a.size();
+      });
+
+  for (auto& [prefix, suffixes] : result.timeZoneNamePrefixMap) {
+    std::sort(
+        suffixes.begin(),
+        suffixes.end(),
+        [](const std::string& a, const std::string& b) {
+          return b.size() < a.size();
+        });
+  }
+
+  return result;
+}
+
+int64_t parseTimezoneName(const char* cur, const char* end, Date& date) {
+  static const TimeZoneNameMappings timeZoneNameMappings =
+      getTimeZoneNameMappings();
+
+  if (cur < end) {
+    // Find the first instance of '/' in the remainder of the string
+    const char* separatorPoint = cur;
+    while (separatorPoint < end && *separatorPoint != '/') {
+      ++separatorPoint;
+    }
+
+    // Try to find a time zone with a prefix that includes the speratorPoint.
+    if (separatorPoint != end) {
+      std::string prefix(cur, separatorPoint);
+
+      auto it = timeZoneNameMappings.timeZoneNamePrefixMap.find(prefix);
+      if (it != timeZoneNameMappings.timeZoneNamePrefixMap.end()) {
+        // This is greedy, find the longest suffix for the given prefix that
+        // fits the string. We know the value in the map is already sorted by
+        // length in decreasing order.
+        for (const auto& suffixName : it->second) {
+          if (suffixName.size() <= end - separatorPoint - 1 &&
+              suffixName ==
+                  std::string_view(
+                      separatorPoint + 1,
+                      separatorPoint + 1 + suffixName.size())) {
+            auto timeZoneNameSize = prefix.size() + 1 + suffixName.size();
+            date.timezone =
+                tz::locateZone(std::string_view(cur, timeZoneNameSize), false);
+
+            if (!date.timezone) {
+              return -1;
+            }
+
+            return timeZoneNameSize;
+          }
+        }
+      }
+    }
+
+    // If we found a '/' but didn't find a match in the set of time zones with
+    // prefixes, try search before the '/' for a time zone without a prefix. If
+    // we didn't find a '/' then end already equals separatorPoint.
+    end = separatorPoint;
+
+    for (const auto& timeZoneName :
+         timeZoneNameMappings.timeZoneNamesWithoutPrefix) {
+      // Again, this is greedy, find the largest time zone name without a prefix
+      // that fits the string. We know timeZoneNamesWithoutPrefix is already
+      // sorted by length in decreasing order.
+      if (timeZoneName.size() <= end - cur &&
+          timeZoneName == std::string_view(cur, cur + timeZoneName.size())) {
+        date.timezone = tz::locateZone(timeZoneName, false);
+
+        if (!date.timezone) {
+          return -1;
+        }
+
+        return timeZoneName.size();
+      }
+    }
+  }
+
+  return -1;
+}
+
 int64_t parseTimezoneOffset(const char* cur, const char* end, Date& date) {
   // For timezone offset ids, there are three formats allowed by Joda:
   //
@@ -639,8 +761,8 @@ int getMaxDigitConsume(
         return curPattern.minRepresentDigits;
       } else {
         if (type == DateTimeFormatterType::MYSQL) {
-          // MySQL format will try to read in at most 4 digits when supplied a
-          // year, never more.
+          // MySQL format will try to read in at most 4 digits when
+          // supplied a year, never more.
           return 4;
         }
         return curPattern.minRepresentDigits > 9 ? curPattern.minRepresentDigits
@@ -681,7 +803,13 @@ int32_t parseFromPattern(
     bool specifierNext,
     DateTimeFormatterType type) {
   if (curPattern.specifier == DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID) {
-    auto size = parseTimezoneOffset(cur, end, date);
+    int64_t size;
+    if (curPattern.minRepresentDigits < 3) {
+      size = parseTimezoneOffset(cur, end, date);
+    } else {
+      size = parseTimezoneName(cur, end, date);
+    }
+
     if (size == -1) {
       return -1;
     }
@@ -762,12 +890,13 @@ int32_t parseFromPattern(
          curPattern.specifier == DateTimeFormatSpecifier::YEAR_OF_ERA ||
          curPattern.specifier == DateTimeFormatSpecifier::WEEK_YEAR) &&
         curPattern.minRepresentDigits == 2) {
-      // If abbreviated two year digit is provided in format string, try to read
-      // in two digits of year and convert to appropriate full length year The
-      // two-digit mapping is as follows: [00, 69] -> [2000, 2069]
+      // If abbreviated two year digit is provided in format string, try
+      // to read in two digits of year and convert to appropriate full
+      // length year The two-digit mapping is as follows: [00, 69] ->
+      // [2000, 2069]
       //                                  [70, 99] -> [1970, 1999]
-      // If more than two digits are provided, then simply read in full year
-      // normally without conversion
+      // If more than two digits are provided, then simply read in full
+      // year normally without conversion
       int count = 0;
       while (cur < end && cur < startPos + maxDigitConsume &&
              characterIsDigit(*cur)) {
@@ -782,8 +911,8 @@ int32_t parseFromPattern(
           number += 2000;
         }
       } else if (type == DateTimeFormatterType::MYSQL) {
-        // In MySQL format, year read in must have exactly two digits, otherwise
-        // return -1 to indicate parsing error.
+        // In MySQL format, year read in must have exactly two digits,
+        // otherwise return -1 to indicate parsing error.
         if (count > 2) {
           // Larger than expected, print suffix.
           cur = cur - count + 2;
@@ -813,7 +942,8 @@ int32_t parseFromPattern(
 
     switch (curPattern.specifier) {
       case DateTimeFormatSpecifier::CENTURY_OF_ERA:
-        // Enforce Joda's year range if year was specified as "century of year".
+        // Enforce Joda's year range if year was specified as "century of
+        // year".
         if (number < 0 || number > 2922789) {
           return -1;
         }
@@ -827,7 +957,8 @@ int32_t parseFromPattern(
         date.centuryFormat = false;
         date.isYearOfEra =
             (curPattern.specifier == DateTimeFormatSpecifier::YEAR_OF_ERA);
-        // Enforce Joda's year range if year was specified as "year of era".
+        // Enforce Joda's year range if year was specified as "year of
+        // era".
         if (date.isYearOfEra && (number > 292278993 || number < 1)) {
           return -1;
         }
@@ -846,9 +977,9 @@ int32_t parseFromPattern(
         date.month = number;
         date.weekDateFormat = false;
         date.dayOfYearFormat = false;
-        // Joda has this weird behavior where it returns 1970 as the year by
-        // default (if no year is specified), but if either day or month are
-        // specified, it fallsback to 2000.
+        // Joda has this weird behavior where it returns 1970 as the year
+        // by default (if no year is specified), but if either day or
+        // month are specified, it fallsback to 2000.
         if (!date.hasYear) {
           date.hasYear = true;
           date.year = 2000;
@@ -860,9 +991,9 @@ int32_t parseFromPattern(
         date.day = number;
         date.weekDateFormat = false;
         date.dayOfYearFormat = false;
-        // Joda has this weird behavior where it returns 1970 as the year by
-        // default (if no year is specified), but if either day or month are
-        // specified, it fallsback to 2000.
+        // Joda has this weird behavior where it returns 1970 as the year
+        // by default (if no year is specified), but if either day or
+        // month are specified, it fallsback to 2000.
         if (!date.hasYear) {
           date.hasYear = true;
           date.year = 2000;
@@ -874,9 +1005,9 @@ int32_t parseFromPattern(
         date.dayOfYear = number;
         date.dayOfYearFormat = true;
         date.weekDateFormat = false;
-        // Joda has this weird behavior where it returns 1970 as the year by
-        // default (if no year is specified), but if either day or month are
-        // specified, it fallsback to 2000.
+        // Joda has this weird behavior where it returns 1970 as the year
+        // by default (if no year is specified), but if either day or
+        // month are specified, it fallsback to 2000.
         if (!date.hasYear) {
           date.hasYear = true;
           date.year = 2000;
@@ -1284,14 +1415,14 @@ int32_t DateTimeFormatter::format(
         } break;
 
         case DateTimeFormatSpecifier::TIMEZONE: {
-          // TODO: implement short name time zone, need a map from full name to
-          // short name
+          // TODO: implement short name time zone, need a map from full name
+          // to short name
           VELOX_UNSUPPORTED("time zone name is not yet supported");
         } break;
 
         case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: {
-          // Zone: 'Z' outputs offset without a colon, 'ZZ' outputs the offset
-          // with a colon, 'ZZZ' or more outputs the zone id.
+          // Zone: 'Z' outputs offset without a colon, 'ZZ' outputs the
+          // offset with a colon, 'ZZZ' or more outputs the zone id.
           // TODO Add support for 'Z' and 'ZZZ'.
           if (offset == 0 && zeroOffsetText.has_value()) {
             std::memcpy(result, zeroOffsetText->data(), zeroOffsetText->size());
@@ -1448,8 +1579,8 @@ Expected<std::shared_ptr<DateTimeFormatter>> buildMysqlDateTimeFormatter(
         Status::UserError("Both printing and parsing not supported"));
   }
 
-  // For %r we should reserve 1 extra space because it has 3 literals ':' ':'
-  // and ' '
+  // For %r we should reserve 1 extra space because it has 3 literals ':'
+  // ':' and ' '
   DateTimeFormatterBuilder builder(
       format.size() + countOccurence(format, "%r"));
 
@@ -1725,9 +1856,9 @@ Expected<std::shared_ptr<DateTimeFormatter>> buildSimpleDateTimeFormatter(
   while (cur < end) {
     const char* startTokenPtr = cur;
 
-    // For literal case, literal should be quoted using single quotes ('). If
-    // there is no quotes, it is interpreted as pattern letters. If there is
-    // only single quote, a user error will be thrown.
+    // For literal case, literal should be quoted using single quotes (').
+    // If there is no quotes, it is interpreted as pattern letters. If there
+    // is only single quote, a user error will be thrown.
     if (*startTokenPtr == '\'') {
       // Append single literal quote for 2 consecutive single quote.
       if (cur + 1 < end && *(cur + 1) == '\'') {
@@ -1753,8 +1884,8 @@ Expected<std::shared_ptr<DateTimeFormatter>> buildSimpleDateTimeFormatter(
         cur += count + 2;
       }
     } else {
-      // Append format specifier according to pattern letters. If pattern letter
-      // is not supported, a user error will be thrown.
+      // Append format specifier according to pattern letters. If pattern
+      // letter is not supported, a user error will be thrown.
       int count = 1;
       ++cur;
       while (cur < end && *startTokenPtr == *cur) {