forked from facebookincubator/velox
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Change unicode escaping in JSON (facebookincubator#10887)
Summary: Change how unicode is escaped in JSON text. The objective is to make it consistent with Presto and to produce a canonical JSON representation. The implementation is consistent with Java Jackson 2.11.0 Utf8Generator. Pull Request resolved: facebookincubator#10887. Reviewed By: Yuhta. Differential Revision: D62591195. Pulled By: gggrace14. fbshipit-source-id: 25235f97e371103197e522d5ffb5c090a7e30888
- Loading branch information
1 parent
1e736ba
commit 00e65bc
Showing
7 changed files
with
335 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
/* | ||
* Copyright (c) Facebook, Inc. and its affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#include <array> | ||
|
||
#include "folly/Unicode.h" | ||
|
||
#include "velox/common/base/Exceptions.h" | ||
#include "velox/functions/lib/Utf8Utils.h" | ||
#include "velox/functions/prestosql/json/JsonStringUtil.h" | ||
|
||
using namespace facebook::velox::functions; | ||
|
||
namespace facebook::velox { | ||
namespace { | ||
|
||
// Converts a 4-bit value (0..15) into the matching uppercase hexadecimal
// character ('0'-'9', 'A'-'F'). The range is only verified in debug builds.
FOLLY_ALWAYS_INLINE char hexDigit(uint8_t c) {
  VELOX_DCHECK_LT(c, 16);
  if (c < 10) {
    return c + '0';
  }
  return c - 10 + 'A';
}
|
||
// Writes `value` to `out` as a 6-character JSON unicode escape: a backslash,
// 'u', and four uppercase hexadecimal digits, most significant nibble first.
// `out` is advanced by 6. The caller must guarantee that at least 6 writable
// chars are available at `out`.
FOLLY_ALWAYS_INLINE void writeHex(char16_t value, char*& out) {
  // No byte-order conversion is needed here: the shifts below operate on the
  // numeric value of `value`, not on its in-memory representation, so the
  // result is the same on little- and big-endian hosts.
  *out++ = '\\';
  *out++ = 'u';
  *out++ = hexDigit((value >> 12) & 0x0F);
  *out++ = hexDigit((value >> 8) & 0x0F);
  *out++ = hexDigit((value >> 4) & 0x0F);
  *out++ = hexDigit(value & 0x0F);
}
|
||
// Builds the escape table for the 128 ASCII characters. The entry for a
// character is interpreted by `encodeAscii` as follows:
//   0   - the character is written verbatim,
//   > 0 - the character is written as a backslash followed by the entry,
//   < 0 - the character is written as a \uXXXX hexadecimal escape.
std::array<int8_t, 128> getAsciiEscapes() {
  // Value-initialize so that every character without an explicit entry below
  // is 0 (verbatim) instead of holding an indeterminate value.
  std::array<int8_t, 128> escapes{};
  // Control characters [0, 0x1F] default to the hexadecimal escape.
  for (int c = 0; c < 32; ++c) {
    escapes[c] = -1;
  }
  // Characters that have a short two-character escape.
  escapes['"'] = '"';
  escapes['\\'] = '\\';
  escapes['\b'] = 'b';
  escapes['\t'] = 't';
  escapes['\n'] = 'n';
  escapes['\f'] = 'f';
  escapes['\r'] = 'r';
  return escapes;
}
static const std::array<int8_t, 128> asciiEscapes = getAsciiEscapes();
|
||
// Writes the JSON representation of a single-byte character to `out` and
// advances `out` past the written chars. Depending on the `asciiEscapes`
// entry for `value`, the output is the character itself (1 char), a
// backslash escape such as \n (2 chars), or a \uXXXX escape (6 chars).
// `value` is used directly as the table index, so it is assumed to be in
// [0, 127] -- TODO confirm against callers.
FOLLY_ALWAYS_INLINE void encodeAscii(int8_t value, char*& out) {
  const int8_t escape = asciiEscapes[value];
  if (escape < 0) {
    // No short escape exists: fall back to the hexadecimal form.
    writeHex(value, out);
    return;
  }
  if (escape > 0) {
    out[0] = '\\';
    out[1] = char(escape);
    out += 2;
    return;
  }
  *out++ = char(value);
}
|
||
std::array<int8_t, 128> getEncodedAsciiSizes() { | ||
std::array<int8_t, 128> sizes; | ||
for (int c = 0; c < 128; c++) { | ||
int8_t escapeCode = asciiEscapes[c]; | ||
if (escapeCode == 0) { | ||
sizes[c] = 1; | ||
} else if (escapeCode > 0) { | ||
sizes[c] = 2; | ||
} else { | ||
sizes[c] = 6; | ||
} | ||
} | ||
return sizes; | ||
} | ||
static const std::array<int8_t, 128> encodedAsciiSizes = getEncodedAsciiSizes(); | ||
|
||
// Encodes `codePoint` as UTF-16 and writes every resulting code unit to `out`
// as a 6-char prefixed hexadecimal (see `writeHex`). Code points below 0x10000
// produce one code unit; all others produce a high/low surrogate pair. `out`
// is advanced by 6 or 12 chars accordingly.
FOLLY_ALWAYS_INLINE void encodeUtf16Hex(char32_t codePoint, char*& out) {
  VELOX_DCHECK(codePoint <= 0x10FFFFu);
  if (codePoint < 0x10000u) {
    // A single 16-bit code unit is sufficient.
    writeHex(static_cast<char16_t>(codePoint), out);
    return;
  }
  // Surrogate pair: the upper 10 bits of the offset go into the high
  // surrogate, the lower 10 bits into the low surrogate.
  const auto highSurrogate = static_cast<char16_t>(
      0xD800u + (((codePoint - 0x10000u) >> 10) & 0x3FFu));
  const auto lowSurrogate =
      static_cast<char16_t>(0xDC00u + ((codePoint - 0x10000u) & 0x3FFu));
  writeHex(highSurrogate, out);
  writeHex(lowSurrogate, out);
}
|
||
} // namespace | ||
|
||
// Test-only entry point that forwards to the file-local `encodeUtf16Hex`, so
// the UTF-16 hexadecimal encoding can be exercised from outside this file.
void testingEncodeUtf16Hex(char32_t codePoint, char*& out) {
  encodeUtf16Hex(codePoint, out);
}
|
||
void escapeString(const char* input, size_t length, char* output) { | ||
char* pos = output; | ||
|
||
auto* start = reinterpret_cast<const unsigned char*>(input); | ||
auto* end = reinterpret_cast<const unsigned char*>(input + length); | ||
while (start < end) { | ||
int count = validateAndGetNextUtf8Length(start, end); | ||
switch (count) { | ||
case 1: { | ||
encodeAscii(int8_t(*start), pos); | ||
start++; | ||
continue; | ||
} | ||
case 2: { | ||
memcpy(pos, reinterpret_cast<const char*>(start), 2); | ||
pos += 2; | ||
start += 2; | ||
continue; | ||
} | ||
case 3: { | ||
memcpy(pos, reinterpret_cast<const char*>(start), 3); | ||
pos += 3; | ||
start += 3; | ||
continue; | ||
} | ||
case 4: { | ||
char32_t codePoint = folly::utf8ToCodePoint(start, end, true); | ||
if (codePoint == U'\ufffd') { | ||
writeHex(0xFFFDu, pos); | ||
continue; | ||
} | ||
encodeUtf16Hex(codePoint, pos); | ||
continue; | ||
} | ||
default: { | ||
writeHex(0xFFFDu, pos); | ||
start++; | ||
} | ||
} | ||
} | ||
} | ||
|
||
size_t escapedStringSize(const char* input, size_t length) { | ||
// 6 chars that is returned by `writeHex`. | ||
constexpr size_t kEncodedHexSize = 6; | ||
|
||
size_t outSize = 0; | ||
|
||
auto* start = reinterpret_cast<const unsigned char*>(input); | ||
auto* end = reinterpret_cast<const unsigned char*>(input + length); | ||
while (start < end) { | ||
int count = validateAndGetNextUtf8Length(start, end); | ||
switch (count) { | ||
case 1: | ||
outSize += encodedAsciiSizes[int8_t(*start)]; | ||
break; | ||
case 2: | ||
case 3: | ||
outSize += count; | ||
break; | ||
case 4: | ||
outSize += kEncodedHexSize * 2; | ||
break; | ||
default: | ||
outSize += kEncodedHexSize; | ||
count = 1; | ||
} | ||
start += count; | ||
} | ||
|
||
return outSize; | ||
} | ||
|
||
} // namespace facebook::velox |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
/* | ||
* Copyright (c) Facebook, Inc. and its affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#pragma once | ||
|
||
namespace facebook::velox { | ||
/// Escape the unicode characters of `input` to make it canonical for JSON | ||
/// and legal to print in JSON text. It is assumed that the input is UTF-8 | ||
/// encoded. | ||
/// It handles the different unicode planes or code point ranges as follows, | ||
/// 1. Basic Multilingual Plane [0, 0xFFFF] | ||
/// a. [0, 0x7F] ASCII. Input is encoded by one UTF-8 byte. Refer to | ||
/// the `encodeAscii` function for output. | ||
/// b. [0x80, 0x07FF]. Input is encoded by two UTF-8 bytes. Output the UTF-8 | ||
/// encoding of the code point, which are thus identical bytes as | ||
/// the input. | ||
/// c. [0x0800, 0xD7FF] + [0xE000, 0xFFFF]. Input is encoded by three UTF-8 | ||
/// bytes. Output the UTF-8 encoding of the code point, which are thus | ||
/// identical bytes as the input. | ||
/// 2. 16 Supplementary Planes [0x10000, 0x10FFFF] | ||
/// a. [0x10000, 0x10FFFF]. Input is encoded by four UTF-8 bytes. Output | ||
/// the UTF-16 encoding of the code point, with two UTF-16 code units in | ||
/// uppercase hexadecimal and prefixed with '\' and 'u'. | ||
/// For an illegal code point value or invalid UTF-8 input, output "\uFFFD". | ||
/// @param input: Input string to escape that is UTF-8 encoded. | ||
/// @param length: Length of the input string. | ||
/// @param output: Output buffer to write the escaped input to. The caller is | ||
/// responsible for allocating enough space for the output. | ||
void escapeString(const char* input, size_t length, char* output); | ||
|
||
/// Return the size of the string after the unicode characters of `input` are | ||
/// escaped using the same method as in `escapeString`. The function iterates | ||
/// over `input` once. | ||
/// @param input: Input string to escape that is UTF-8 encoded. | ||
/// @param length: Length of the input string. | ||
size_t escapedStringSize(const char* input, size_t length); | ||
|
||
/// For test only. Encode `codePoint` value by UTF-16 and write the one or two | ||
/// prefixed hexadecimals to `out`. Move `out` forward by 6 or 12 chars | ||
/// accordingly. The caller shall ensure there is enough space in `out`. | ||
void testingEncodeUtf16Hex(char32_t codePoint, char*& out); | ||
} // namespace facebook::velox |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.