Skip to content

Commit

Permalink
Fix CAST(JSON as ROW(ARRAY)) (#9447)
Browse files Browse the repository at this point in the history
Summary:

CAST(JSON as ROW(ARRAY)) used to fail with

```
OUT_OF_ORDER_ITERATION: Objects and arrays can only be iterated when they are first encountered.
```

According to simdjson documentation, https://github.com/simdjson/simdjson/blob/master/doc/basics.md, it is not allowed to store object values for later processing. These must be consumed or copied before proceeding.

Also, fixed the behavior when a JSON object contains duplicate keys. Presto throws in this case, but the previous implementation allowed duplicates.

Also, fixed the test so that it actually verifies JSON objects with mixed-case keys.

Reviewed By: xiaoxmeng, Yuhta

Differential Revision: D56013293
  • Loading branch information
mbasmanova authored and facebook-github-bot committed Apr 11, 2024
1 parent d4f8d85 commit f97f82f
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 24 deletions.
60 changes: 53 additions & 7 deletions velox/functions/prestosql/tests/JsonCastTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,52 @@ TEST_F(JsonCastTest, orderOfKeys) {
testCast(data, map);
}

// Casts JSON objects to a ROW whose single child is an array of 64-bit
// integers. Each input object also carries a "c1" key; the expected ROW below
// has no second child, so only the "c0" arrays are compared.
TEST_F(JsonCastTest, toRowOfArray) {
  // Inputs cover a non-empty array, an empty array, and an array with nulls.
  auto input = makeFlatVector<std::string>(
      {
          R"({"c0": [1, 2, 3], "c1": 1.2})",
          R"({"c0": [], "c1": 1.3})",
          R"({"c0": [10, null, 20, null], "c1": 1.4})",
      },
      JSON());

  // Expected array child, one entry per input row.
  auto arrayChild = makeArrayVectorFromJson<int64_t>({
      "[1, 2, 3]",
      "[]",
      "[10, null, 20, null]",
  });
  auto expectedRow = makeRowVector({arrayChild});

  testCast(input, expectedRow);
}

// Checks how duplicate keys in a JSON object are handled when casting to ROW:
// a plain cast is expected to throw "Duplicate field: c0", while try_cast is
// expected to produce a null row for each offending input.
TEST_F(JsonCastTest, toRowDuplicateKey) {
  std::vector<std::optional<std::string>> inputs = {
      R"({"c0": 1, "c1": 1.1})",
      R"({"c0": 2, "c1": 1.2, "C0": 45})", // Duplicate keys: c0, C0.
      R"({"c0": 3, "c1": 1.3, "c0": 55})", // Duplicate keys: c0, c0.
      R"({"c0": 4, "c1": 1.4, "c2": 65})",
  };

  // Plain cast: the duplicate keys must surface as an error.
  const auto targetType = ROW({"c0", "c1"}, {INTEGER(), REAL()});
  testThrow<std::string>(JSON(), targetType, inputs, "Duplicate field: c0");

  // try_cast: rows 1 and 2 (the ones with duplicate keys) become null, the
  // remaining rows are converted as usual.
  auto jsonVector = makeNullableFlatVector<std::string>(inputs, JSON());

  auto expectedC0 = makeFlatVector<int32_t>({1, 0, 0, 4});
  auto expectedC1 = makeFlatVector<float>({1.1, 0.0, 0.0, 1.4});
  auto expectedRows = makeRowVector({expectedC0, expectedC1});
  for (const auto nullRow : {1, 2}) {
    expectedRows->setNull(nullRow, true);
  }

  const bool tryCast = true;
  testCast(jsonVector, expectedRows, tryCast);
}

TEST_F(JsonCastTest, toRow) {
// Test casting to ROW from JSON arrays.
auto array = makeNullableFlatVector<JsonNativeType>(
Expand All @@ -1053,7 +1099,7 @@ TEST_F(JsonCastTest, toRow) {
auto map = makeNullableFlatVector<JsonNativeType>(
{R"({"c0":123,"c1":"abc","c2":true})"_sv,
R"({"c1":"abc","c2":true,"c0":123})"_sv,
R"({"c0":123,"c2":true,"c0":456})"_sv,
R"({"c10":123,"c2":true,"c0":456})"_sv,
R"({"c3":123,"c4":"abc","c2":false})"_sv,
R"({"c0":null,"c2":false})"_sv,
R"({"c0":null,"c2":null,"c1":null})"_sv},
Expand All @@ -1074,17 +1120,17 @@ TEST_F(JsonCastTest, toRow) {

// Use a mix of lower case and upper case JSON keys.
map = makeNullableFlatVector<JsonNativeType>(
{R"({"c0":123,"c1":"abc","c2":true})"_sv,
R"({"c1":"abc","c2":true,"c0":123})"_sv,
R"({"c0":123,"c2":true,"c0":456})"_sv,
R"({"c3":123,"c4":"abc","c2":false})"_sv,
{R"({"C0":123,"C1":"abc","C2":true})"_sv,
R"({"c1":"abc","C2":true,"c0":123})"_sv,
R"({"C10":123,"C2":true,"c0":456})"_sv,
R"({"c3":123,"C4":"abc","c2":false})"_sv,
R"({"c0":null,"c2":false})"_sv,
R"({"c0":null,"c2":null,"c1":null})"_sv},
R"({"c0":null,"c2":null,"C1":null})"_sv},
JSON());
testCast(map, makeRowVector({child4, child5, child6}));

// Use a mix of lower case and upper case field names in target ROW type.
testCast(map, makeRowVector({child4, child5, child6}));
testCast(map, makeRowVector({"c0", "C1", "C2"}, {child4, child5, child6}));

// Test casting to ROW from JSON null.
auto null = makeNullableFlatVector<JsonNativeType>({"null"_sv}, JSON());
Expand Down
48 changes: 31 additions & 17 deletions velox/functions/prestosql/types/JsonType.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -856,30 +856,44 @@ struct CastFromJsonTypedImpl {
}
} else {
SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object());
folly::F14FastMap<std::string, simdjson::ondemand::value> lowerCaseKeys(
object.count_fields());

// TODO Populate this mapping once, not per-row.
// Mapping from lower-case field names of the target RowType to their
// indices.
folly::F14FastMap<std::string, int32_t> fieldIndices;
const auto size = rowType.size();
for (auto i = 0; i < size; ++i) {
auto key = rowType.nameOf(i);
boost::algorithm::to_lower(key);
fieldIndices[key] = i;
}

std::string key;
for (auto fieldResult : object) {
SIMDJSON_ASSIGN_OR_RAISE(auto field, fieldResult);
if (!field.value().is_null()) {
SIMDJSON_ASSIGN_OR_RAISE(key, field.unescaped_key(true));
boost::algorithm::to_lower(key);
lowerCaseKeys[key] = field.value();

auto it = fieldIndices.find(key);
if (it != fieldIndices.end()) {
const auto index = it->second;

VELOX_USER_CHECK_GE(index, 0, "Duplicate field: {}", key);
it->second = -1;

SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH(
CastFromJsonTypedImpl<simdjson::ondemand::value>::apply,
rowType.childAt(index)->kind(),
field.value(),
writerTyped.get_writer_at(index)));
}
}
}
for (column_index_t numFields = rowType.size(), i = 0; i < numFields;
++i) {
key = rowType.nameOf(i);
boost::algorithm::to_lower(key);
auto it = lowerCaseKeys.find(key);
if (it == lowerCaseKeys.end()) {
writerTyped.set_null_at(i);
} else {
SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH(
CastFromJsonTypedImpl<simdjson::ondemand::value>::apply,
rowType.childAt(i)->kind(),
it->second,
writerTyped.get_writer_at(i)));

for (const auto& [key, index] : fieldIndices) {
if (index >= 0) {
writerTyped.set_null_at(index);
}
}
}
Expand Down Expand Up @@ -1038,7 +1052,7 @@ class JsonCastOperator : public exec::CastOperator {
maxSize = std::max(maxSize, input.size());
});
paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING);
rows.applyToSelected([&](auto row) {
context.applyToSelectedNoThrow(rows, [&](auto row) {
writer.setOffset(row);
if (inputVector->isNullAt(row)) {
writer.commitNull();
Expand Down

0 comments on commit f97f82f

Please sign in to comment.