Skip to content

Commit

Permalink
fix: Add support to parse illegal unicode in json_parse (facebookincu…
Browse files Browse the repository at this point in the history
…bator#11744)

Summary:
Currently json_parse will throw if it encounters bad / illegal unicode in json whereas Presto java will add placeholder chars (�) and carry on parsing the string. This PR attempts to replicate this behavior.


Reviewed By: kevinwilfong

Differential Revision: D66774226

Pulled By: kgpai
  • Loading branch information
kgpai authored and facebook-github-bot committed Dec 9, 2024
1 parent 929affe commit edf6c41
Show file tree
Hide file tree
Showing 7 changed files with 221 additions and 39 deletions.
71 changes: 71 additions & 0 deletions velox/functions/lib/Utf8Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,4 +173,75 @@ tryGetUtf8CharLength(const char* input, int64_t size, int32_t& codePoint) {
return -1;
}

bool hasInvalidUTF8(const char* input, int32_t len) {
for (size_t inputIndex = 0; inputIndex < len;) {
if (IS_ASCII(input[inputIndex])) {
// Ascii
inputIndex++;
} else {
// Unicode
int32_t codePoint;
auto charLength =
tryGetUtf8CharLength(input + inputIndex, len - inputIndex, codePoint);
if (charLength < 0) {
return true;
}
inputIndex += charLength;
}
}

return false;
}

size_t replaceInvalidUTF8Characters(
char* outputBuffer,
const char* input,
int32_t len) {
size_t inputIndex = 0;
size_t outputIndex = 0;

while (inputIndex < len) {
if (IS_ASCII(input[inputIndex])) {
outputBuffer[outputIndex++] = input[inputIndex++];
} else {
// Unicode
int32_t codePoint;
const auto charLength =
tryGetUtf8CharLength(input + inputIndex, len - inputIndex, codePoint);
if (charLength > 0) {
std::memcpy(outputBuffer + outputIndex, input + inputIndex, charLength);
outputIndex += charLength;
inputIndex += charLength;
} else {
size_t replaceCharactersToWriteOut = inputIndex < len - 1 &&
isMultipleInvalidSequences(input, inputIndex)
? -charLength
: 1;
const auto& replacementCharacterString =
kReplacementCharacterStrings[replaceCharactersToWriteOut - 1];
std::memcpy(
outputBuffer + outputIndex,
replacementCharacterString.data(),
replacementCharacterString.size());
outputIndex += replacementCharacterString.size();
inputIndex += -charLength;
}
}
}

return outputIndex;
}

template <>
void replaceInvalidUTF8Characters(
std::string& out,
const char* input,
int32_t len) {
auto maxLen = len * kReplacementCharacterStrings[0].size();
out.resize(maxLen);
auto outputBuffer = out.data();
auto outputIndex = replaceInvalidUTF8Characters(outputBuffer, input, len);
out.resize(outputIndex);
}

} // namespace facebook::velox::functions
73 changes: 73 additions & 0 deletions velox/functions/lib/Utf8Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@

namespace facebook::velox::functions {

#define IS_ASCII(x) !((x) & 0x80)

/// This function is not part of the original utf8proc.
/// Tries to get the length of UTF-8 encoded code point. A
/// positive return value means the UTF-8 sequence is valid, and
Expand Down Expand Up @@ -86,4 +88,75 @@ FOLLY_ALWAYS_INLINE int validateAndGetNextUtf8Length(
/// -1 for invalid UTF-8 first byte.
int firstByteCharLength(const char* u_input);

/// Invalid character replacement matrix.
constexpr std::array<std::string_view, 6> kReplacementCharacterStrings{
"\xef\xbf\xbd",
"\xef\xbf\xbd\xef\xbf\xbd",
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"};

/// Returns true if there are multiple UTF-8 invalid sequences.
template <typename T>
FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
const T& inputBuffer,
size_t inputIndex) {
return
// 0xe0 followed by a value less than 0xe0 or 0xf0 followed by a
// value less than 0x90 is considered an overlong encoding.
(inputBuffer[inputIndex] == '\xe0' &&
(inputBuffer[inputIndex + 1] & 0xe0) == 0x80) ||
(inputBuffer[inputIndex] == '\xf0' &&
(inputBuffer[inputIndex + 1] & 0xf0) == 0x80) ||
// 0xf4 followed by a byte >= 0x90 looks valid to
// tryGetUtf8CharLength, but is actually outside the range of valid
// code points.
(inputBuffer[inputIndex] == '\xf4' &&
(inputBuffer[inputIndex + 1] & 0xf0) != 0x80) ||
// The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of
// multi-byte code points to tryGetUtf8CharLength, but are not part of
// any valid code point.
(unsigned char)inputBuffer[inputIndex] > 0xf4 ||
inputBuffer[inputIndex] == '\xc0' || inputBuffer[inputIndex] == '\xc1';
}

/// Returns true only if invalid UTF-8 is present in the input string.
bool hasInvalidUTF8(const char* input, int32_t len);

/// Replaces invalid UTF-8 characters with replacement characters similar to
/// that produced by Presto java. The function requires that output have
/// sufficient capacity for the output string.
/// @param out Pointer to output string
/// @param input Pointer to input string
/// @param len Length of input string
/// @return number of bytes written
size_t
replaceInvalidUTF8Characters(char* output, const char* input, int32_t len);

/// Replaces invalid UTF-8 characters with replacement characters similar to
/// that produced by Presto java. The function will allocate 1 byte for each
/// orininal character plus extra 2 bytes for each maximal subpart of an
/// ill-formed subsequence for an upper bound of 3x size of the input string.
/// @param out Reference to output string
/// @param input Pointer to input string
/// @param len Length of input string
template <typename TOutString>
void replaceInvalidUTF8Characters(
TOutString& out,
const char* input,
int32_t len) {
auto maxLen = len * kReplacementCharacterStrings[0].size();
out.reserve(maxLen);
auto outputBuffer = out.data();
auto outputIndex = replaceInvalidUTF8Characters(outputBuffer, input, len);
out.resize(outputIndex);
}

template <>
void replaceInvalidUTF8Characters(
std::string& out,
const char* input,
int32_t len);

} // namespace facebook::velox::functions
43 changes: 43 additions & 0 deletions velox/functions/lib/tests/Utf8Test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,5 +104,48 @@ TEST(Utf8Test, tryCharLength) {
ASSERT_EQ(-1, tryCharLength({0xBF}));
}

TEST(UTF8Test, validUtf8) {
auto tryHasInvalidUTF8 = [](const std::vector<unsigned char>& bytes) {
return hasInvalidUTF8(
reinterpret_cast<const char*>(bytes.data()), bytes.size());
};

ASSERT_FALSE(tryHasInvalidUTF8({0x5c, 0x19, 0x7A}));
ASSERT_TRUE(tryHasInvalidUTF8({0x5c, 0x19, 0x7A, 0xBF}));
ASSERT_TRUE(tryHasInvalidUTF8({0x64, 0x65, 0x1A, 0b11100000, 0x81, 0xBF}));
}

TEST(UTF8Test, replaceInvalidUTF8Characters) {
auto testReplaceInvalidUTF8Chars = [](const std::string& input,
const std::string& expected) {
std::string output;
replaceInvalidUTF8Characters(output, input.data(), input.size());
ASSERT_EQ(expected, output);
};

// Good case
testReplaceInvalidUTF8Chars("Hello World", "Hello World");
// Bad encoding
testReplaceInvalidUTF8Chars("hello \xBF world", "hello � world");
// Bad encoding with 3 byte char
testReplaceInvalidUTF8Chars("hello \xe0\x94\x83 world", "hello ��� world");
// Bad encoding with 4 byte char
testReplaceInvalidUTF8Chars(
"hello \xf0\x80\x80\x80\x80 world", "hello ����� world");

// Overlong 4 byte utf8 character.
testReplaceInvalidUTF8Chars(
"hello \xef\xbf\xbd\xef\xbf\xbd world", "hello �� world");

// Test invalid byte 0xC0
testReplaceInvalidUTF8Chars(
"hello \xef\xbf\xbd\xef\xbf\xbd world", "hello �� world");

// Test long 4 byte utf8 with continuation byte
testReplaceInvalidUTF8Chars(
"hello \xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd world",
"hello ���� world");
}

} // namespace
} // namespace facebook::velox::functions
31 changes: 25 additions & 6 deletions velox/functions/prestosql/JsonFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "velox/expression/VectorFunction.h"
#include "velox/functions/lib/Utf8Utils.h"
#include "velox/functions/prestosql/json/JsonStringUtil.h"
#include "velox/functions/prestosql/json/SIMDJsonUtil.h"
#include "velox/functions/prestosql/types/JsonType.h"
Expand Down Expand Up @@ -166,15 +167,23 @@ class JsonParseFunction : public exec::VectorFunction {
const auto& arg = args[0];
if (arg->isConstantEncoding()) {
auto value = arg->as<ConstantVector<StringView>>()->valueAt(0);
paddedInput_.resize(value.size() + simdjson::SIMDJSON_PADDING);
memcpy(paddedInput_.data(), value.data(), value.size());
auto escapeSize = escapedStringSize(value.data(), value.size());
auto size = value.size();
if (FOLLY_UNLIKELY(hasInvalidUTF8(value.data(), value.size()))) {
size = replaceInvalidUTF8Characters(
paddedInput_.data(), value.data(), value.size());
paddedInput_.resize(size + simdjson::SIMDJSON_PADDING);
} else {
paddedInput_.resize(size + simdjson::SIMDJSON_PADDING);
memcpy(paddedInput_.data(), value.data(), size);
}

auto escapeSize = escapedStringSize(value.data(), size);
auto buffer = AlignedBuffer::allocate<char>(escapeSize, context.pool());
BufferTracker bufferTracker{buffer};

JsonViews jsonViews;

if (auto error = parse(value.size(), jsonViews)) {
if (auto error = parse(size, jsonViews)) {
context.setErrors(rows, errors_[error]);
return;
}
Expand Down Expand Up @@ -219,8 +228,18 @@ class JsonParseFunction : public exec::VectorFunction {
rows.applyToSelected([&](auto row) {
JsonViews jsonViews;
auto value = flatInput->valueAt(row);
memcpy(paddedInput_.data(), value.data(), value.size());
if (auto error = parse(value.size(), jsonViews)) {
auto size = value.size();
if (FOLLY_UNLIKELY(hasInvalidUTF8(value.data(), size))) {
size = replaceInvalidUTF8Characters(
paddedInput_.data(), value.data(), size);
if (maxSize < size) {
paddedInput_.resize(size + simdjson::SIMDJSON_PADDING);
}
} else {
memcpy(paddedInput_.data(), value.data(), size);
}

if (auto error = parse(size, jsonViews)) {
context.setVeloxExceptionError(row, errors_[error]);
} else {
auto canonicalString = bufferTracker.getCanonicalString(jsonViews);
Expand Down
24 changes: 1 addition & 23 deletions velox/functions/prestosql/URIParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include <boost/regex.hpp>
#include "velox/functions/lib/Utf8Utils.h"
#include "velox/type/StringView.h"

namespace facebook::velox::functions {
Expand Down Expand Up @@ -51,29 +52,6 @@ bool parseUri(const StringView& uriStr, URI& uri);
/// false and pos is unchanged.
bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos);

template <typename T>
FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
const T& inputBuffer,
size_t inputIndex) {
return
// 0xe0 followed by a value less than 0xe0 or 0xf0 followed by a
// value less than 0x90 is considered an overlong encoding.
(inputBuffer[inputIndex] == '\xe0' &&
(inputBuffer[inputIndex + 1] & 0xe0) == 0x80) ||
(inputBuffer[inputIndex] == '\xf0' &&
(inputBuffer[inputIndex + 1] & 0xf0) == 0x80) ||
// 0xf4 followed by a byte >= 0x90 looks valid to
// tryGetUtf8CharLength, but is actually outside the range of valid
// code points.
(inputBuffer[inputIndex] == '\xf4' &&
(inputBuffer[inputIndex + 1] & 0xf0) != 0x80) ||
// The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of
// multi-byte code points to tryGetUtf8CharLength, but are not part of
// any valid code point.
(unsigned char)inputBuffer[inputIndex] > 0xf4 ||
inputBuffer[inputIndex] == '\xc0' || inputBuffer[inputIndex] == '\xc1';
}

/// Find an extract the value for the parameter with key `param` from the query
/// portion of a URI `query`. `query` should already be decoded if necessary.
template <typename TString>
Expand Down
15 changes: 5 additions & 10 deletions velox/functions/prestosql/URLFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,15 @@
namespace facebook::velox::functions {

namespace detail {

/// Encoded replacement character strings.
constexpr std::array<std::string_view, 6> kEncodedReplacementCharacterStrings =
{"%EF%BF%BD",
"%EF%BF%BD%EF%BF%BD",
"%EF%BF%BD%EF%BF%BD%EF%BF%BD",
"%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD",
"%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD",
"%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD"};
constexpr std::array<std::string_view, 6> kDecodedReplacementCharacterStrings{
"\xef\xbf\xbd",
"\xef\xbf\xbd\xef\xbf\xbd",
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"};

FOLLY_ALWAYS_INLINE unsigned char toHex(unsigned char c) {
return c < 10 ? (c + '0') : (c + 'A' - 10);
Expand Down Expand Up @@ -178,7 +173,7 @@ FOLLY_ALWAYS_INLINE void urlUnescape(
} else if (charLength < 0) {
// This isn't the start of a valid UTF-8 character, write out the
// replacement character.
const auto& replacementString = kDecodedReplacementCharacterStrings[0];
const auto& replacementString = kReplacementCharacterStrings[0];
std::memcpy(
outputBuffer, replacementString.data(), replacementString.length());
outputBuffer += replacementString.length();
Expand Down Expand Up @@ -216,8 +211,8 @@ FOLLY_ALWAYS_INLINE void urlUnescape(
size_t charLength = outputBuffer - charStart;
size_t replaceCharactersToWriteOut =
isMultipleInvalidSequences(charStart, 0) ? charLength : 1;
const auto& replacementString = kDecodedReplacementCharacterStrings
[replaceCharactersToWriteOut - 1];
const auto& replacementString =
kReplacementCharacterStrings[replaceCharactersToWriteOut - 1];

outputBuffer = charStart;
std::memcpy(
Expand Down
3 changes: 3 additions & 0 deletions velox/functions/prestosql/tests/JsonFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ TEST_F(JsonFunctionsTest, jsonParse) {
R"("Items for D \ud835\udc52\ud835\udcc1 ")",
R"("Items for D \uD835\uDC52\uD835\uDCC1 ")");

// Test bad unicode characters
testJsonParse("\"Hello \xc0\xaf World\"", "\"Hello �� World\"");

VELOX_ASSERT_THROW(
jsonParse(R"({"k1":})"), "The JSON document has an improper structure");
VELOX_ASSERT_THROW(
Expand Down

0 comments on commit edf6c41

Please sign in to comment.