diff --git a/velox/functions/prestosql/URIParser.cpp b/velox/functions/prestosql/URIParser.cpp
index f00d63d78971..a84547218aa6 100644
--- a/velox/functions/prestosql/URIParser.cpp
+++ b/velox/functions/prestosql/URIParser.cpp
@@ -20,7 +20,7 @@
 
 namespace facebook::velox::functions {
 
-namespace detail {
+namespace {
 using Mask = std::bitset<128>;
 
 Mask createMask(size_t low, size_t high) {
@@ -321,101 +321,6 @@ bool isAtCompression(const char* str, const size_t len, const int32_t pos) {
   return pos < len - 1 && str[pos] == ':' && str[pos + 1] == ':';
 }
 
-// IPv6address =                            6( h16 ":" ) ls32
-//             /                       "::" 5( h16 ":" ) ls32
-//             / [               h16 ] "::" 4( h16 ":" ) ls32
-//             / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
-//             / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
-//             / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
-//             / [ *4( h16 ":" ) h16 ] "::"              ls32
-//             / [ *5( h16 ":" ) h16 ] "::"              h16
-//             / [ *6( h16 ":" ) h16 ] "::"
-// h16         = 1*4HEXDIG
-// ls32        = ( h16 ":" h16 ) / IPv4address
-bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos) {
-  bool hasCompression = false;
-  uint8_t numBytes = 0;
-  int32_t posInAddress = pos;
-
-  if (isAtCompression(str, len, posInAddress)) {
-    hasCompression = true;
-    // Consume the compression '::'.
-    posInAddress += 2;
-  }
-
-  while (posInAddress < len && numBytes < 16) {
-    int32_t posInHex = posInAddress;
-    for (int i = 0; i < 4; i++) {
-      if (posInHex == len || !test(kHex, str[posInHex])) {
-        break;
-      }
-
-      posInHex++;
-    }
-
-    if (posInHex == posInAddress) {
-      // We need to be able to consume at least one hex digit.
-      break;
-    }
-
-    if (posInHex < len) {
-      if (str[posInHex] == '.') {
-        // We may be in the IPV4 Address.
-        if (tryConsumeIPV4Address(str, len, posInAddress)) {
-          numBytes += 4;
-          break;
-        } else {
-          // A '.' can't appear anywhere except in a valid IPV4 address.
-          return false;
-        }
-      }
-      if (str[posInHex] == ':') {
-        if (isAtCompression(str, len, posInHex)) {
-          if (hasCompression) {
-            // We can't have two compressions.
-            return false;
-          } else {
-            // We found a 2 byte hex value followed by a compression.
-            numBytes += 2;
-            hasCompression = true;
-            // Consume the hex block and the compression '::'.
-            posInAddress = posInHex + 2;
-
-            continue;
-          }
-        } else {
-          if (posInHex == len || !test(kHex, str[posInHex + 1])) {
-            // Peak ahead, we can't end on a single ':'.
-            return false;
-          }
-          // We found a 2 byte hex value followed by a single ':'.
-          numBytes += 2;
-          // Consume the hex block and the ':'.
-          posInAddress = posInHex + 1;
-
-          continue;
-        }
-      } else {
-        // We found a 2 byte hex value at the end of the string.
-        numBytes += 2;
-        posInAddress = posInHex;
-        break;
-      }
-    }
-
-    break;
-  }
-
-  // A valid IPv6 address must have exactly 16 bytes, or a compression.
-  if ((numBytes == 16 && !hasCompression) ||
-      (hasCompression && numBytes <= 14 && numBytes % 2 == 0)) {
-    pos = posInAddress;
-    return true;
-  } else {
-    return false;
-  }
-}
-
 // IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
 bool tryConsumeIPVFuture(const char* str, const size_t len, int32_t& pos) {
   int32_t posInAddress = pos;
@@ -712,18 +617,116 @@ bool tryConsumeUri(const char* str, const size_t len, int32_t& pos, URI& uri) {
   return true;
 }
 
-} // namespace detail
+} // namespace
+
+// IPv6address =                            6( h16 ":" ) ls32
+//             /                       "::" 5( h16 ":" ) ls32
+//             / [               h16 ] "::" 4( h16 ":" ) ls32
+//             / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
+//             / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
+//             / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
+//             / [ *4( h16 ":" ) h16 ] "::"              ls32
+//             / [ *5( h16 ":" ) h16 ] "::"              h16
+//             / [ *6( h16 ":" ) h16 ] "::"
+// h16         = 1*4HEXDIG
+// ls32        = ( h16 ":" h16 ) / IPv4address
+bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos) {
+  bool hasCompression = false;
+  uint8_t numBytes = 0;
+  int32_t posInAddress = pos;
+
+  if (isAtCompression(str, len, posInAddress)) {
+    hasCompression = true;
+    // Consume the compression '::'.
+    posInAddress += 2;
+  }
+
+  while (posInAddress < len && numBytes < 16) {
+    int32_t posInHex = posInAddress;
+    for (int i = 0; i < 4; i++) {
+      if (posInHex == len || !test(kHex, str[posInHex])) {
+        break;
+      }
+
+      posInHex++;
+    }
+
+    if (posInHex == posInAddress) {
+      // We need to be able to consume at least one hex digit.
+      break;
+    }
+
+    if (posInHex < len) {
+      if (str[posInHex] == '.') {
+        // We may be in the IPV4 Address.
+        if (tryConsumeIPV4Address(str, len, posInAddress)) {
+          numBytes += 4;
+          break;
+        } else {
+          // A '.' can't appear anywhere except in a valid IPV4 address.
+          return false;
+        }
+      }
+      if (str[posInHex] == ':') {
+        if (isAtCompression(str, len, posInHex)) {
+          if (hasCompression) {
+            // We can't have two compressions.
+            return false;
+          } else {
+            // We found a 2 byte hex value followed by a compression.
+            numBytes += 2;
+            hasCompression = true;
+            // Consume the hex block and the compression '::'.
+            posInAddress = posInHex + 2;
+
+            continue;
+          }
+        } else {
+          if (posInHex + 1 == len || !test(kHex, str[posInHex + 1])) {
+            // Peek ahead, we can't end on a single ':'.
+            return false;
+          }
+          // We found a 2 byte hex value followed by a single ':'.
+          numBytes += 2;
+          // Consume the hex block and the ':'.
+          posInAddress = posInHex + 1;
+
+          continue;
+        }
+      } else {
+        // We found a 2 byte hex value that ends the address.
+        numBytes += 2;
+        posInAddress = posInHex;
+        break;
+      }
+    }
+
+    // The hex block reached the end of the string, it is the last 2 bytes.
+    numBytes += 2;
+    posInAddress = posInHex;
+    break;
+  }
+
+  // A valid IPv6 address must have exactly 16 bytes, or a compression.
+  if ((numBytes == 16 && !hasCompression) ||
+      (hasCompression && numBytes <= 14 && numBytes % 2 == 0)) {
+    pos = posInAddress;
+    return true;
+  } else {
+    return false;
+  }
+}
 
 // URI-reference = URI / relative-ref
 bool parseUri(const StringView& uriStr, URI& uri) {
   int32_t pos = 0;
-  if (detail::tryConsumeUri(uriStr.data(), uriStr.size(), pos, uri) &&
+  if (tryConsumeUri(uriStr.data(), uriStr.size(), pos, uri) &&
       pos == uriStr.size()) {
     return true;
   }
 
   pos = 0;
-  detail::consumeRelativeRef(uriStr.data(), uriStr.size(), pos, uri);
+  consumeRelativeRef(uriStr.data(), uriStr.size(), pos, uri);
 
   return pos == uriStr.size();
 }
diff --git a/velox/functions/prestosql/URIParser.h b/velox/functions/prestosql/URIParser.h
index c3ad8ae14cd2..6a20c39b7550 100644
--- a/velox/functions/prestosql/URIParser.h
+++ b/velox/functions/prestosql/URIParser.h
@@ -15,9 +15,19 @@
  */
 #pragma once
 
+#include <boost/regex.hpp>
+#include <optional>
 #include "velox/type/StringView.h"
 
 namespace facebook::velox::functions {
+namespace detail {
+/// Wraps capture group `idx` of `match` in a StringView.
+FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
+  const auto& sub = match[idx];
+  return StringView(sub.first, sub.length());
+}
+} // namespace detail
+
 /// A struct containing the parts of the URI that were extracted during parsing.
 /// If the field was not found, it is empty.
 ///
@@ -38,4 +48,68 @@ struct URI {
 
 /// Parse a URI string into a URI struct according to RFC 3986.
 bool parseUri(const StringView& uriStr, URI& uri);
+
+/// If the text starting at str[pos] is a valid IPv6 address, returns true and
+/// pos is updated to the first character after the IP address. Otherwise
+/// returns false and pos is unchanged.
+bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos);
+
+/// Returns true if the byte at `inputIndex` begins a sequence that is never
+/// valid UTF-8 (the cases are listed below). May also read the byte at
+/// `inputIndex + 1`, so the caller must ensure that position is readable.
+template <typename T>
+FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
+    const T& inputBuffer,
+    size_t inputIndex) {
+  return
+      // 0xe0 followed by a value less than 0xa0 or 0xf0 followed by a
+      // value less than 0x90 is considered an overlong encoding.
+      (inputBuffer[inputIndex] == '\xe0' &&
+       (inputBuffer[inputIndex + 1] & 0xe0) == 0x80) ||
+      (inputBuffer[inputIndex] == '\xf0' &&
+       (inputBuffer[inputIndex + 1] & 0xf0) == 0x80) ||
+      // 0xf4 followed by a byte >= 0x90 looks valid to
+      // tryGetUtf8CharLength, but is actually outside the range of valid
+      // code points.
+      (inputBuffer[inputIndex] == '\xf4' &&
+       (inputBuffer[inputIndex + 1] & 0xf0) != 0x80) ||
+      // The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of
+      // multi-byte code points to tryGetUtf8CharLength, but are not part of
+      // any valid code point.
+      static_cast<unsigned char>(inputBuffer[inputIndex]) > 0xf4 ||
+      inputBuffer[inputIndex] == '\xc0' || inputBuffer[inputIndex] == '\xc1';
+}
+
+/// Find and extract the value for the parameter with key `param` from the
+/// query portion of a URI `query`. `query` should already be decoded if
+/// necessary. Returns std::nullopt if no parameter with that key is found.
+template <typename TString>
+std::optional<StringView> extractParameter(
+    const StringView& query,
+    const TString& param) {
+  if (!query.empty()) {
+    // Parse query string.
+    static const boost::regex kQueryParamRegex(
+        "(^|&)" // start of query or start of parameter "&"
+        "([^=&]*)=?" // parameter name and "=" if value is expected
+        "([^&]*)" // parameter value (allows "=" to appear)
+        "(?=(&|$))" // forward reference, next should be end of query or
+                    // start of next parameter
+    );
+
+    const boost::cregex_iterator begin(
+        query.data(), query.data() + query.size(), kQueryParamRegex);
+    boost::cregex_iterator end;
+
+    for (auto it = begin; it != end; ++it) {
+      if (it->length(2) != 0 && (*it)[2].matched) { // key shouldn't be empty.
+        auto key = detail::submatch((*it), 2);
+        if (param.compare(key) == 0) {
+          return detail::submatch((*it), 3);
+        }
+      }
+    }
+  }
+  return std::nullopt;
+}
 } // namespace facebook::velox::functions
diff --git a/velox/functions/prestosql/URLFunctions.h b/velox/functions/prestosql/URLFunctions.h
index 1fb1cd6befe4..74a5892ea874 100644
--- a/velox/functions/prestosql/URLFunctions.h
+++ b/velox/functions/prestosql/URLFunctions.h
@@ -15,7 +15,6 @@
  */
 #pragma once
 
-#include <boost/regex.hpp>
 #include "velox/external/utf8proc/utf8procImpl.h"
 #include "velox/functions/Macros.h"
 #include "velox/functions/lib/Utf8Utils.h"
@@ -39,11 +38,6 @@ constexpr std::array kDecodedReplacementCharacterStrings{
     "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
     "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"};
 
-FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
-  const auto& sub = match[idx];
-  return StringView(sub.first, sub.length());
-}
-
 FOLLY_ALWAYS_INLINE unsigned char toHex(unsigned char c) {
   return c < 10 ? (c + '0') : (c + 'A' - 10);
 }
@@ -54,29 +48,6 @@ FOLLY_ALWAYS_INLINE void charEscape(unsigned char c, char* output) {
   output[2] = toHex(c % 16);
 }
 
-template <typename T>
-FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
-    const T& inputBuffer,
-    size_t inputIndex) {
-  return
-      // 0xe0 followed by a value less than 0xe0 or 0xf0 followed by a
-      // value less than 0x90 is considered an overlong encoding.
-      (inputBuffer[inputIndex] == '\xe0' &&
-       (inputBuffer[inputIndex + 1] & 0xe0) == 0x80) ||
-      (inputBuffer[inputIndex] == '\xf0' &&
-       (inputBuffer[inputIndex + 1] & 0xf0) == 0x80) ||
-      // 0xf4 followed by a byte >= 0x90 looks valid to
-      // tryGetUtf8CharLength, but is actually outside the range of valid
-      // code points.
-      (inputBuffer[inputIndex] == '\xf4' &&
-       (inputBuffer[inputIndex + 1] & 0xf0) != 0x80) ||
-      // The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of
-      // multi-byte code points to tryGetUtf8CharLength, but are not part of
-      // any valid code point.
-      (unsigned char)inputBuffer[inputIndex] > 0xf4 ||
-      inputBuffer[inputIndex] == '\xc0' || inputBuffer[inputIndex] == '\xc1';
-}
-
 /// Escapes ``input`` by encoding it so that it can be safely included in
 /// URL query parameter names and values:
 ///
@@ -441,15 +412,6 @@ struct UrlExtractParameterFunction {
     }
 
     if (!uri.query.empty()) {
-      // Parse query string.
-      static const boost::regex kQueryParamRegex(
-          "(^|&)" // start of query or start of parameter "&"
-          "([^=&]*)=?" // parameter name and "=" if value is expected
-          "([^&]*)" // parameter value (allows "=" to appear)
-          "(?=(&|$))" // forward reference, next should be end of query or
-                      // start of next parameter
-      );
-
       StringView query = uri.query;
       std::string unescapedQuery;
       if (uri.queryHasEncoded) {
@@ -457,19 +419,9 @@ struct UrlExtractParameterFunction {
         query = StringView(unescapedQuery);
       }
 
-      const boost::cregex_iterator begin(
-          query.data(), query.data() + query.size(), kQueryParamRegex);
-      boost::cregex_iterator end;
-
-      for (auto it = begin; it != end; ++it) {
-        if (it->length(2) != 0 && (*it)[2].matched) { // key shouldnt be empty.
-          auto key = detail::submatch((*it), 2);
-          if (param.compare(key) == 0) {
-            auto value = detail::submatch((*it), 3);
-            result.copy_from(value);
-            return true;
-          }
-        }
+      if (const auto value = extractParameter(query, param)) {
+        result.copy_from(value.value());
+        return true;
       }
     }
 