refactor: Move some logic from URLFunctions.h to URIParser (#11761)

Summary: Pull Request resolved: #11761 There are some UDFs outside of prestosql's URL functions where I'd like to reuse some of the logic I wrote for handling URIs. Specifically: * tryConsumeIPV6Address: this is generally useful for parsing IPv6 addresses * isMultipleInvalidSequences: this is generally useful for determining how many valid subsequences make up an invalid code point from tryGetUtf8CharLength. * extractParameter: this is generally useful for extracting the parameter from a URI's query string This change moves those functions into URIParser where they can be reused. Reviewed By: kgpai, yuandagits Differential Revision: D66832201 fbshipit-source-id: bf10289b49ac3b0abd572f2b5a985b7758f4bee4
facebookincubator · Dec 7, 2024 · 1b5d3db · 1b5d3db
1 parent 1bbf4f8
commit 1b5d3db
Show file tree

Hide file tree

Showing 3 changed files with 169 additions and 150 deletions.
diff --git a/velox/functions/prestosql/URIParser.cpp b/velox/functions/prestosql/URIParser.cpp
@@ -20,7 +20,7 @@
 
 namespace facebook::velox::functions {
 
-namespace detail {
+namespace {
 using Mask = std::bitset<128>;
 
 Mask createMask(size_t low, size_t high) {
@@ -321,101 +321,6 @@ bool isAtCompression(const char* str, const size_t len, const int32_t pos) {
   return pos < len - 1 && str[pos] == ':' && str[pos + 1] == ':';
 }
 
-// IPv6address   =                            6( h16 ":" ) ls32
-//               /                       "::" 5( h16 ":" ) ls32
-//               / [               h16 ] "::" 4( h16 ":" ) ls32
-//               / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
-//               / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
-//               / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
-//               / [ *4( h16 ":" ) h16 ] "::"              ls32
-//               / [ *5( h16 ":" ) h16 ] "::"              h16
-//               / [ *6( h16 ":" ) h16 ] "::"
-// h16           = 1*4HEXDIG
-// ls32          = ( h16 ":" h16 ) / IPv4address
-bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos) {
-  bool hasCompression = false;
-  uint8_t numBytes = 0;
-  int32_t posInAddress = pos;
-
-  if (isAtCompression(str, len, posInAddress)) {
-    hasCompression = true;
-    // Consume the compression '::'.
-    posInAddress += 2;
-  }
-
-  while (posInAddress < len && numBytes < 16) {
-    int32_t posInHex = posInAddress;
-    for (int i = 0; i < 4; i++) {
-      if (posInHex == len || !test(kHex, str[posInHex])) {
-        break;
-      }
-
-      posInHex++;
-    }
-
-    if (posInHex == posInAddress) {
-      // We need to be able to consume at least one hex digit.
-      break;
-    }
-
-    if (posInHex < len) {
-      if (str[posInHex] == '.') {
-        // We may be in the IPV4 Address.
-        if (tryConsumeIPV4Address(str, len, posInAddress)) {
-          numBytes += 4;
-          break;
-        } else {
-          // A '.' can't appear anywhere except in a valid IPV4 address.
-          return false;
-        }
-      }
-      if (str[posInHex] == ':') {
-        if (isAtCompression(str, len, posInHex)) {
-          if (hasCompression) {
-            // We can't have two compressions.
-            return false;
-          } else {
-            // We found a 2 byte hex value followed by a compression.
-            numBytes += 2;
-            hasCompression = true;
-            // Consume the hex block and the compression '::'.
-            posInAddress = posInHex + 2;
-
-            continue;
-          }
-        } else {
-          if (posInHex == len || !test(kHex, str[posInHex + 1])) {
-            // Peak ahead, we can't end on a single ':'.
-            return false;
-          }
-          // We found a 2 byte hex value followed by a single ':'.
-          numBytes += 2;
-          // Consume the hex block and the ':'.
-          posInAddress = posInHex + 1;
-
-          continue;
-        }
-      } else {
-        // We found a 2 byte hex value at the end of the string.
-        numBytes += 2;
-        posInAddress = posInHex;
-        break;
-      }
-    }
-
-    break;
-  }
-
-  // A valid IPv6 address must have exactly 16 bytes, or a compression.
-  if ((numBytes == 16 && !hasCompression) ||
-      (hasCompression && numBytes <= 14 && numBytes % 2 == 0)) {
-    pos = posInAddress;
-    return true;
-  } else {
-    return false;
-  }
-}
-
 // IPvFuture     = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
 bool tryConsumeIPVFuture(const char* str, const size_t len, int32_t& pos) {
   int32_t posInAddress = pos;
@@ -712,18 +617,113 @@ bool tryConsumeUri(const char* str, const size_t len, int32_t& pos, URI& uri) {
   return true;
 }
 
-} // namespace detail
+} // namespace
+
+// Consumes an IPv6 address, per the RFC 3986 grammar below, from str starting
+// at pos. On success returns true and advances pos past the address; otherwise
+// returns false and leaves pos unchanged. len is the total length of str.
+//
+// IPv6address   =                            6( h16 ":" ) ls32
+//               /                       "::" 5( h16 ":" ) ls32
+//               / [               h16 ] "::" 4( h16 ":" ) ls32
+//               / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
+//               / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
+//               / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
+//               / [ *4( h16 ":" ) h16 ] "::"              ls32
+//               / [ *5( h16 ":" ) h16 ] "::"              h16
+//               / [ *6( h16 ":" ) h16 ] "::"
+// h16           = 1*4HEXDIG
+// ls32          = ( h16 ":" h16 ) / IPv4address
+bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos) {
+  if (pos < 0 || static_cast<size_t>(pos) >= len) {
+    // Nothing to parse. This also keeps isAtCompression from underflowing len.
+    return false;
+  }
+
+  bool hasCompression = false;
+  uint8_t numBytes = 0;
+  int32_t posInAddress = pos;
+
+  if (isAtCompression(str, len, posInAddress)) {
+    hasCompression = true;
+    // Consume the compression '::'.
+    posInAddress += 2;
+  }
+
+  while (posInAddress < len && numBytes < 16) {
+    int32_t posInHex = posInAddress;
+    for (int i = 0; i < 4; i++) {
+      if (posInHex == len || !test(kHex, str[posInHex])) {
+        break;
+      }
+
+      posInHex++;
+    }
+
+    if (posInHex == posInAddress) {
+      // We need to be able to consume at least one hex digit.
+      break;
+    }
+
+    if (posInHex == len || (str[posInHex] != '.' && str[posInHex] != ':')) {
+      // We found a 2 byte hex value at the end of the string or the address.
+      numBytes += 2;
+      posInAddress = posInHex;
+      break;
+    }
+
+    if (str[posInHex] == '.') {
+      // We may be in the IPV4 Address.
+      if (tryConsumeIPV4Address(str, len, posInAddress)) {
+        numBytes += 4;
+        break;
+      }
+      // A '.' can't appear anywhere except in a valid IPV4 address.
+      return false;
+    }
+
+    if (isAtCompression(str, len, posInHex)) {
+      if (hasCompression) {
+        // We can't have two compressions.
+        return false;
+      }
+      // We found a 2 byte hex value followed by a compression.
+      numBytes += 2;
+      hasCompression = true;
+      // Consume the hex block and the compression '::'.
+      posInAddress = posInHex + 2;
+      continue;
+    }
+
+    if (posInHex + 1 == len || !test(kHex, str[posInHex + 1])) {
+      // Peek ahead, we can't end on a single ':'.
+      return false;
+    }
+    // We found a 2 byte hex value followed by a single ':'.
+    numBytes += 2;
+    // Consume the hex block and the ':'.
+    posInAddress = posInHex + 1;
+  }
+
+  // A valid IPv6 address must have exactly 16 bytes, or a compression.
+  if ((numBytes == 16 && !hasCompression) ||
+      (hasCompression && numBytes <= 14 && numBytes % 2 == 0)) {
+    pos = posInAddress;
+    return true;
+  }
+  return false;
+}
 
 // URI-reference = URI / relative-ref
 bool parseUri(const StringView& uriStr, URI& uri) {
   int32_t pos = 0;
-  if (detail::tryConsumeUri(uriStr.data(), uriStr.size(), pos, uri) &&
+  if (tryConsumeUri(uriStr.data(), uriStr.size(), pos, uri) &&
       pos == uriStr.size()) {
     return true;
   }
 
   pos = 0;
-  detail::consumeRelativeRef(uriStr.data(), uriStr.size(), pos, uri);
+  consumeRelativeRef(uriStr.data(), uriStr.size(), pos, uri);
 
   return pos == uriStr.size();
 }

diff --git a/velox/functions/prestosql/URIParser.h b/velox/functions/prestosql/URIParser.h
@@ -15,9 +15,16 @@
  */
 #pragma once
 
+#include <boost/regex.hpp>
 #include "velox/type/StringView.h"
 
 namespace facebook::velox::functions {
+namespace detail {
+/// Returns capture group `idx` of `match` as a StringView.
+FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
+  return StringView(match[idx].first, match[idx].length());
+}
+} // namespace detail
 /// A struct containing the parts of the URI that were extracted during parsing.
 /// If the field was not found, it is empty.
 ///
@@ -38,4 +45,64 @@ struct URI {
 
 /// Parse a URI string into a URI struct according to RFC 3986.
 bool parseUri(const StringView& uriStr, URI& uri);
+
+/// If str contains a valid IPv6 address starting at pos, returns true and pos
+/// is updated to the first character after the IP address. Otherwise returns
+/// false and pos is unchanged.
+bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos);
+
+/// May read inputBuffer[inputIndex + 1]; the caller must ensure it is readable.
+template <typename T>
+FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
+    const T& inputBuffer,
+    size_t inputIndex) {
+  return
+      // 0xe0 followed by a value less than 0xa0 or 0xf0 followed by a
+      // value less than 0x90 is considered an overlong encoding.
+      (inputBuffer[inputIndex] == '\xe0' &&
+       (inputBuffer[inputIndex + 1] & 0xe0) == 0x80) ||
+      (inputBuffer[inputIndex] == '\xf0' &&
+       (inputBuffer[inputIndex + 1] & 0xf0) == 0x80) ||
+      // 0xf4 followed by a byte >= 0x90 looks valid to tryGetUtf8CharLength,
+      // but is actually outside the range of valid code points.
+      (inputBuffer[inputIndex] == '\xf4' &&
+       (inputBuffer[inputIndex + 1] & 0xf0) != 0x80) ||
+      // The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of
+      // multi-byte code points to tryGetUtf8CharLength, but are not part of
+      // any valid code point.
+      static_cast<unsigned char>(inputBuffer[inputIndex]) > 0xf4 ||
+      inputBuffer[inputIndex] == '\xc0' || inputBuffer[inputIndex] == '\xc1';
+}
+
+/// Looks up the parameter with key `param` in `query`, the query portion of a
+/// URI. Returns the first matching parameter's value, or std::nullopt if there
+/// is none. `query` should already be decoded if necessary.
+template <typename TString>
+std::optional<StringView> extractParameter(
+    const StringView& query,
+    const TString& param) {
+  if (query.empty()) {
+    return std::nullopt;
+  }
+  static const boost::regex kQueryParamRegex(
+      "(^|&)" // start of query or start of parameter "&"
+      "([^=&]*)=?" // parameter name and "=" if value is expected
+      "([^&]*)" // parameter value (allows "=" to appear)
+      "(?=(&|$))" // lookahead: next should be end of query or
+                  // start of next parameter
+  );
+  const boost::cregex_iterator noMoreMatches;
+  for (boost::cregex_iterator paramIt(
+           query.data(), query.data() + query.size(), kQueryParamRegex);
+       paramIt != noMoreMatches;
+       ++paramIt) {
+    if (paramIt->length(2) == 0 || !(*paramIt)[2].matched) {
+      continue; // key shouldn't be empty.
+    }
+    if (param.compare(detail::submatch(*paramIt, 2)) == 0) {
+      return detail::submatch(*paramIt, 3);
+    }
+  }
+  return std::nullopt;
+}
 } // namespace facebook::velox::functions
diff --git a/velox/functions/prestosql/URLFunctions.h b/velox/functions/prestosql/URLFunctions.h
@@ -15,7 +15,6 @@
  */
 #pragma once
 
-#include <boost/regex.hpp>
 #include "velox/external/utf8proc/utf8procImpl.h"
 #include "velox/functions/Macros.h"
 #include "velox/functions/lib/Utf8Utils.h"
@@ -39,11 +38,6 @@ constexpr std::array<std::string_view, 6> kDecodedReplacementCharacterStrings{
     "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
     "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"};
 
-FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
-  const auto& sub = match[idx];
-  return StringView(sub.first, sub.length());
-}
-
 FOLLY_ALWAYS_INLINE unsigned char toHex(unsigned char c) {
   return c < 10 ? (c + '0') : (c + 'A' - 10);
 }
@@ -54,29 +48,6 @@ FOLLY_ALWAYS_INLINE void charEscape(unsigned char c, char* output) {
   output[2] = toHex(c % 16);
 }
 
-template <typename T>
-FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
-    const T& inputBuffer,
-    size_t inputIndex) {
-  return
-      // 0xe0 followed by a value less than 0xe0 or 0xf0 followed by a
-      // value less than 0x90 is considered an overlong encoding.
-      (inputBuffer[inputIndex] == '\xe0' &&
-       (inputBuffer[inputIndex + 1] & 0xe0) == 0x80) ||
-      (inputBuffer[inputIndex] == '\xf0' &&
-       (inputBuffer[inputIndex + 1] & 0xf0) == 0x80) ||
-      // 0xf4 followed by a byte >= 0x90 looks valid to
-      // tryGetUtf8CharLength, but is actually outside the range of valid
-      // code points.
-      (inputBuffer[inputIndex] == '\xf4' &&
-       (inputBuffer[inputIndex + 1] & 0xf0) != 0x80) ||
-      // The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of
-      // multi-byte code points to tryGetUtf8CharLength, but are not part of
-      // any valid code point.
-      (unsigned char)inputBuffer[inputIndex] > 0xf4 ||
-      inputBuffer[inputIndex] == '\xc0' || inputBuffer[inputIndex] == '\xc1';
-}
-
 /// Escapes ``input`` by encoding it so that it can be safely included in
 /// URL query parameter names and values:
 ///
@@ -441,35 +412,16 @@ struct UrlExtractParameterFunction {
     }
 
     if (!uri.query.empty()) {
-      // Parse query string.
-      static const boost::regex kQueryParamRegex(
-          "(^|&)" // start of query or start of parameter "&"
-          "([^=&]*)=?" // parameter name and "=" if value is expected
-          "([^&]*)" // parameter value (allows "=" to appear)
-          "(?=(&|$))" // forward reference, next should be end of query or
-                      // start of next parameter
-      );
-
       StringView query = uri.query;
       std::string unescapedQuery;
       if (uri.queryHasEncoded) {
         detail::urlUnescape(unescapedQuery, uri.query);
         query = StringView(unescapedQuery);
       }
 
-      const boost::cregex_iterator begin(
-          query.data(), query.data() + query.size(), kQueryParamRegex);
-      boost::cregex_iterator end;
-
-      for (auto it = begin; it != end; ++it) {
-        if (it->length(2) != 0 && (*it)[2].matched) { // key shouldnt be empty.
-          auto key = detail::submatch((*it), 2);
-          if (param.compare(key) == 0) {
-            auto value = detail::submatch((*it), 3);
-            result.copy_from(value);
-            return true;
-          }
-        }
+      if (const auto value = extractParameter(query, param)) {
+        result.copy_from(value.value());
+        return true;
       }
     }