facebookincubator · kevinwilfong · Oct 28, 2024 · Oct 28, 2024 · Oct 28, 2024 · pedroerp
diff --git a/velox/external/date/patches/0006-add_get_time_zone_names.patch b/velox/external/date/patches/0006-add_get_time_zone_names.patch
@@ -0,0 +1,30 @@
+diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp
+--- a/velox/external/date/tz.cpp
++++ b/velox/external/date/tz.cpp
+@@ -3538,6 +3538,14 @@
+     return get_tzdb_list().front();
+ }
+
++std::vector<std::string> get_time_zone_names() {
++  std::vector<std::string> result;
++  for (const auto& z : get_tzdb().zones) {
++    result.push_back(z.name());
++  }
++  return result;
++}
++
+ const time_zone*
+ #if HAS_STRING_VIEW
+ tzdb::locate_zone(std::string_view tz_name) const
+diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h
+--- a/velox/external/date/tz.h
++++ b/velox/external/date/tz.h
+@@ -1258,6 +1258,8 @@
+
+ DATE_API const tzdb& get_tzdb();
+
++std::vector<std::string> get_time_zone_names();
++
+ class tzdb_list
+ {
+     std::atomic<tzdb*> head_{nullptr};
diff --git a/velox/external/date/tz.cpp b/velox/external/date/tz.cpp
@@ -3538,6 +3538,14 @@ get_tzdb()
     return get_tzdb_list().front();
 }
 
+std::vector<std::string> get_time_zone_names() {
+  std::vector<std::string> result;
+  for (const auto& z : get_tzdb().zones) {
+    result.push_back(z.name());
+  }
+  return result;
+}
+
 const time_zone*
 #if HAS_STRING_VIEW
 tzdb::locate_zone(std::string_view tz_name) const

diff --git a/velox/external/date/tz.h b/velox/external/date/tz.h
@@ -1258,6 +1258,8 @@ operator<<(std::ostream& os, const tzdb& db);
 
 DATE_API const tzdb& get_tzdb();
 
+std::vector<std::string> get_time_zone_names();
+
 class tzdb_list
 {
     std::atomic<tzdb*> head_{nullptr};

diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp
@@ -351,6 +351,133 @@ int64_t parseTimezone(const char* cur, const char* end, Date& date) {
   return -1;
 }
 
+// Contains a list of all time zone names in a convenient format for searching.
+//
+// Time zone names without the '/' character (without a prefix) are stored in
+// timeZoneNamesWithoutPrefix ordered by size desc.
+//
+// Time zone names with the '/' character (with a prefix) are stored in a map
+// timeZoneNamePrefixMap from prefix (the string before the first '/') to a
+// vector of strings which contains the suffixes (the strings after the first
+// '/') ordered by size desc.
+struct TimeZoneNameMappings {
+  std::vector<std::string> timeZoneNamesWithoutPrefix;
+  std::unordered_map<std::string, std::vector<std::string>>
+      timeZoneNamePrefixMap;
+};
+
+TimeZoneNameMappings getTimeZoneNameMappings() {
+  // Here we use get_time_zone_names instead of calling get_tzdb and
+  // constructing the list ourselves because there is some unknown issue with
+  // the tz library where the time_zone objects after the first one in the tzdb
+  // will be invalid (contain nullptrs) after the get_tzdb function returns.
+  const std::vector<std::string> timeZoneNames = date::get_time_zone_names();
+
+  TimeZoneNameMappings result;
+  for (size_t i = 0; i < timeZoneNames.size(); i++) {
+    const auto& timeZoneName = timeZoneNames[i];
+    auto separatorPoint = timeZoneName.find('/');
+
+    if (separatorPoint == std::string::npos) {
+      result.timeZoneNamesWithoutPrefix.push_back(timeZoneName);
+    } else {
+      std::string prefix = timeZoneName.substr(0, separatorPoint);
+      std::string suffix = timeZoneName.substr(separatorPoint + 1);
+
+      result.timeZoneNamePrefixMap[prefix].push_back(suffix);
+    }
+  }
+
+  std::sort(
+      result.timeZoneNamesWithoutPrefix.begin(),
+      result.timeZoneNamesWithoutPrefix.end(),
+      [](const std::string& a, const std::string& b) {
+        return b.size() < a.size();
+      });
+
+  for (auto& [prefix, suffixes] : result.timeZoneNamePrefixMap) {
+    std::sort(
+        suffixes.begin(),
+        suffixes.end(),
+        [](const std::string& a, const std::string& b) {
+          return b.size() < a.size();
+        });
+  }
+
+  return result;
+}
+
+int64_t parseTimezoneName(const char* cur, const char* end, Date& date) {
+  // For time zone names we try to greedily find the longest substring starting
+  // from cur that is a valid time zone name. To help speed things along we
+  // treat time zone names as {prefix}/{suffix} (for the first instance of '/')
+  // and create lists of suffixes per prefix. We order these lists by length of
+  // the suffix so once we identify the prefix, we can return the first suffix
+  // we find in the string. We treat time zone names without a prefix (i.e.
+  // without a '/') separately but similarly.
+  static const TimeZoneNameMappings timeZoneNameMappings =
+      getTimeZoneNameMappings();
+
+  if (cur < end) {
+    // Find the first instance of '/' in the remainder of the string
+    const char* separatorPoint = cur;
+    while (separatorPoint < end && *separatorPoint != '/') {
+      ++separatorPoint;
+    }
+
+    // Try to find a time zone with a prefix that includes the speratorPoint.
+    if (separatorPoint != end) {
+      std::string prefix(cur, separatorPoint);
+
+      auto it = timeZoneNameMappings.timeZoneNamePrefixMap.find(prefix);
+      if (it != timeZoneNameMappings.timeZoneNamePrefixMap.end()) {
+        // This is greedy, find the longest suffix for the given prefix that
+        // fits the string. We know the value in the map is already sorted by
+        // length in decreasing order.
+        for (const auto& suffixName : it->second) {
+          if (suffixName.size() <= end - separatorPoint - 1 &&
+              suffixName ==
+                  std::string_view(separatorPoint + 1, suffixName.size())) {
+            auto timeZoneNameSize = prefix.size() + 1 + suffixName.size();
+            date.timezone =
+                tz::locateZone(std::string_view(cur, timeZoneNameSize), false);
+
+            if (!date.timezone) {
+              return -1;
+            }
+
+            return timeZoneNameSize;
+          }
+        }
+      }
+    }
+
+    // If we found a '/' but didn't find a match in the set of time zones with
+    // prefixes, try search before the '/' for a time zone without a prefix. If
+    // we didn't find a '/' then end already equals separatorPoint.
+    end = separatorPoint;
+
+    for (const auto& timeZoneName :
+         timeZoneNameMappings.timeZoneNamesWithoutPrefix) {
+      // Again, this is greedy, find the largest time zone name without a prefix
+      // that fits the string. We know timeZoneNamesWithoutPrefix is already
+      // sorted by length in decreasing order.
+      if (timeZoneName.size() <= end - cur &&
+          timeZoneName == std::string_view(cur, timeZoneName.size())) {
+        date.timezone = tz::locateZone(timeZoneName, false);
+
+        if (!date.timezone) {
+          return -1;
+        }
+
+        return timeZoneName.size();
+      }
+    }
+  }
+
+  return -1;
+}
+
 int64_t parseTimezoneOffset(const char* cur, const char* end, Date& date) {
   // For timezone offset ids, there are three formats allowed by Joda:
   //
@@ -518,7 +645,7 @@ std::string formatFractionOfSecond(
   return toAdd;
 }
 
-int32_t appendTimezoneOffset(int64_t offset, char* result) {
+int32_t appendTimezoneOffset(int64_t offset, char* result, bool includeColon) {
   int pos = 0;
   if (offset >= 0) {
     result[pos++] = '+';
@@ -536,7 +663,9 @@ int32_t appendTimezoneOffset(int64_t offset, char* result) {
     result[pos++] = char(hours % 10 + '0');
   }
 
-  result[pos++] = ':';
+  if (includeColon) {
+    result[pos++] = ':';
+  }
 
   const auto minutes = (offset / 60) % 60;
   if LIKELY (minutes == 0) {
@@ -687,7 +816,13 @@ int32_t parseFromPattern(
     bool specifierNext,
     DateTimeFormatterType type) {
   if (curPattern.specifier == DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID) {
-    auto size = parseTimezoneOffset(cur, end, date);
+    int64_t size;
+    if (curPattern.minRepresentDigits < 3) {
+      size = parseTimezoneOffset(cur, end, date);
+    } else {
+      size = parseTimezoneName(cur, end, date);
+    }
+
     if (size == -1) {
       return -1;
     }
@@ -1068,20 +1203,33 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const {
         size += std::max((int)token.pattern.minRepresentDigits, 9);
         break;
       case DateTimeFormatSpecifier::TIMEZONE:
-        if (timezone == nullptr) {
-          VELOX_USER_FAIL("Timezone unknown");
+        if (token.pattern.minRepresentDigits <= 3) {
+          // The longest abbreviation according to here is 5, e.g. some time
+          // zones use the offset as the abbreviation, like +0530.
+          // https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
+          size += 5;
+        } else {
+          VELOX_NYI(
+              "Date format specifier is not yet implemented: {} ({})",
+              getSpecifierName(token.pattern.specifier),
+              token.pattern.minRepresentDigits);
         }
-        size += std::max(
-            token.pattern.minRepresentDigits, timezone->name().length());
+
         break;
       case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID:
-        if (token.pattern.minRepresentDigits != 2) {
-          VELOX_UNSUPPORTED(
-              "Date format specifier is not supported: {} ({})",
-              getSpecifierName(token.pattern.specifier),
-              token.pattern.minRepresentDigits);
+        if (token.pattern.minRepresentDigits == 1) {
+          // 'Z' means output the time zone offset without a colon.
+          size += 8;
+        } else if (token.pattern.minRepresentDigits == 2) {
+          // 'ZZ' means output the time zone offset with a colon.
+          size += 9;
+        } else {
+          // 'ZZZ' (or more) means otuput the time zone ID.
+          if (timezone == nullptr) {
+            VELOX_USER_FAIL("Timezone unknown");
+          }
+          size += timezone->name().length();
         }
-        size += 9;
         break;
       // Not supported.
       case DateTimeFormatSpecifier::WEEK_YEAR:
@@ -1310,37 +1458,40 @@ int32_t DateTimeFormatter::format(
         } break;
 
         case DateTimeFormatSpecifier::TIMEZONE: {
-          // TODO: implement short name time zone, need a map from full name to
-          // short name
+          VELOX_USER_CHECK_NOT_NULL(
+              timezone,
+              "The time zone cannot be formatted if it is not present.");
           if (token.pattern.minRepresentDigits <= 3) {
-            VELOX_UNSUPPORTED("short name time zone is not yet supported");
-          }
-          if (timezone == nullptr) {
-            VELOX_USER_FAIL("Timezone unknown");
+            const std::string& abbrev = timezone->getShortName(
+                std::chrono::milliseconds(timestamp.toMillis()),
+                tz::TimeZone::TChoose::kEarliest);
+            std::memcpy(result, abbrev.data(), abbrev.length());
+            result += abbrev.length();
+          } else {
+            // TODO: implement full name time zone
+            VELOX_NYI("full time zone name is not yet supported");
           }
-          const auto& piece = timezone->name();
-          std::memcpy(result, piece.data(), piece.length());
-          result += piece.length();
         } break;
 
         case DateTimeFormatSpecifier::TIMEZONE_OFFSET_ID: {
           // Zone: 'Z' outputs offset without a colon, 'ZZ' outputs the offset
           // with a colon, 'ZZZ' or more outputs the zone id.
-          // TODO Add support for 'Z' and 'ZZZ'.
-          if (token.pattern.minRepresentDigits != 2) {
-            VELOX_UNSUPPORTED(
-                "format is not supported for specifier {} ({})",
-                getSpecifierName(token.pattern.specifier),
-                token.pattern.minRepresentDigits);
-          }
-
           if (offset == 0 && zeroOffsetText.has_value()) {
             std::memcpy(result, zeroOffsetText->data(), zeroOffsetText->size());
             result += zeroOffsetText->size();
             break;
           }
 
-          result += appendTimezoneOffset(offset, result);
+          if (token.pattern.minRepresentDigits >= 3) {
+            // Append the time zone ID.
+            const auto& piece = timezone->name();
+            std::memcpy(result, piece.data(), piece.length());
+            result += piece.length();
+            break;
+          }
+
+          result += appendTimezoneOffset(
+              offset, result, token.pattern.minRepresentDigits == 2);
           break;
         }
         case DateTimeFormatSpecifier::WEEK_OF_WEEK_YEAR: {