From dc3cead329bd073f62167a81b9eb634d05ab09bf Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Thu, 11 Apr 2024 11:47:08 -0700 Subject: [PATCH] Fix CAST(JSON as ROW(ARRAY)) (#9447) Summary: CAST(JSON as ROW(ARRAY()) used to fail with ``` OUT_OF_ORDER_ITERATION: Objects and arrays can only be iterated when they are first encountered. ``` According to simdjson documentation, https://github.com/simdjson/simdjson/blob/master/doc/basics.md, it is not allowed to store object values for later processing. These must be consumed or copied before proceeding. Also, fixed behavior when JSON object contains duplicate keys. Presto throws, but previous implementation used to allow duplicates. Also, fix the test to actually verify JSON objects with mixed case keys. Reviewed By: xiaoxmeng, Yuhta Differential Revision: D56013293 --- .../prestosql/tests/JsonCastTest.cpp | 60 ++++++++++++++++--- velox/functions/prestosql/types/JsonType.cpp | 48 +++++++++------ 2 files changed, 84 insertions(+), 24 deletions(-) diff --git a/velox/functions/prestosql/tests/JsonCastTest.cpp b/velox/functions/prestosql/tests/JsonCastTest.cpp index 95943d7286cc9..cbfa048867376 100644 --- a/velox/functions/prestosql/tests/JsonCastTest.cpp +++ b/velox/functions/prestosql/tests/JsonCastTest.cpp @@ -1033,6 +1033,52 @@ TEST_F(JsonCastTest, orderOfKeys) { testCast(data, map); } +TEST_F(JsonCastTest, toRowOfArray) { + auto data = makeFlatVector( + { + R"({"c0": [1, 2, 3], "c1": 1.2})", + R"({"c0": [], "c1": 1.3})", + R"({"c0": [10, null, 20, null], "c1": 1.4})", + }, + JSON()); + + auto expected = makeRowVector({ + makeArrayVectorFromJson({ + "[1, 2, 3]", + "[]", + "[10, null, 20, null]", + }), + }); + + testCast(data, expected); +} + +TEST_F(JsonCastTest, toRowDuplicateKey) { + std::vector> jsonStrings = { + R"({"c0": 1, "c1": 1.1})", + R"({"c0": 2, "c1": 1.2, "C0": 45})", // Duplicate keys: c0, C0. + R"({"c0": 3, "c1": 1.3, "c0": 55})", // Duplicate keys: c0, c0. + R"({"c0": 4, "c1": 1.4, "c2": 65})", + }; + + testThrow( + JSON(), + ROW({"c0", "c1"}, {INTEGER(), REAL()}), + jsonStrings, + "Duplicate field: c0"); + + auto data = makeNullableFlatVector(jsonStrings, JSON()); + + auto expected = makeRowVector({ + makeFlatVector({1, 0, 0, 4}), + makeFlatVector({1.1, 0.0, 0.0, 1.4}), + }); + expected->setNull(1, true); + expected->setNull(2, true); + + testCast(data, expected, true /*try_cast*/); +} + TEST_F(JsonCastTest, toRow) { // Test casting to ROW from JSON arrays. auto array = makeNullableFlatVector( @@ -1053,7 +1099,7 @@ TEST_F(JsonCastTest, toRow) { auto map = makeNullableFlatVector( {R"({"c0":123,"c1":"abc","c2":true})"_sv, R"({"c1":"abc","c2":true,"c0":123})"_sv, - R"({"c0":123,"c2":true,"c0":456})"_sv, + R"({"c10":123,"c2":true,"c0":456})"_sv, R"({"c3":123,"c4":"abc","c2":false})"_sv, R"({"c0":null,"c2":false})"_sv, R"({"c0":null,"c2":null,"c1":null})"_sv}, @@ -1074,17 +1120,17 @@ TEST_F(JsonCastTest, toRow) { // Use a mix of lower case and upper case JSON keys. map = makeNullableFlatVector( - {R"({"c0":123,"c1":"abc","c2":true})"_sv, - R"({"c1":"abc","c2":true,"c0":123})"_sv, - R"({"c0":123,"c2":true,"c0":456})"_sv, - R"({"c3":123,"c4":"abc","c2":false})"_sv, + {R"({"C0":123,"C1":"abc","C2":true})"_sv, + R"({"c1":"abc","C2":true,"c0":123})"_sv, + R"({"C10":123,"C2":true,"c0":456})"_sv, + R"({"c3":123,"C4":"abc","c2":false})"_sv, R"({"c0":null,"c2":false})"_sv, - R"({"c0":null,"c2":null,"c1":null})"_sv}, + R"({"c0":null,"c2":null,"C1":null})"_sv}, JSON()); testCast(map, makeRowVector({child4, child5, child6})); // Use a mix of lower case and upper case field names in target ROW type. - testCast(map, makeRowVector({child4, child5, child6})); + testCast(map, makeRowVector({"c0", "C1", "C2"}, {child4, child5, child6})); // Test casting to ROW from JSON null. auto null = makeNullableFlatVector({"null"_sv}, JSON()); diff --git a/velox/functions/prestosql/types/JsonType.cpp b/velox/functions/prestosql/types/JsonType.cpp index 4f90f52008662..07c629e7f0520 100644 --- a/velox/functions/prestosql/types/JsonType.cpp +++ b/velox/functions/prestosql/types/JsonType.cpp @@ -856,30 +856,44 @@ struct CastFromJsonTypedImpl { } } else { SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object()); - folly::F14FastMap lowerCaseKeys( - object.count_fields()); + + // TODO Populate this mapping once, not per-row. + // Mapping from lower-case field names of the target RowType to their + // indices. + folly::F14FastMap fieldIndices; + const auto size = rowType.size(); + for (auto i = 0; i < size; ++i) { + auto key = rowType.nameOf(i); + boost::algorithm::to_lower(key); + fieldIndices[key] = i; + } + std::string key; for (auto fieldResult : object) { SIMDJSON_ASSIGN_OR_RAISE(auto field, fieldResult); if (!field.value().is_null()) { SIMDJSON_ASSIGN_OR_RAISE(key, field.unescaped_key(true)); boost::algorithm::to_lower(key); - lowerCaseKeys[key] = field.value(); + + auto it = fieldIndices.find(key); + if (it != fieldIndices.end()) { + const auto index = it->second; + + VELOX_USER_CHECK_GE(index, 0, "Duplicate field: {}", key); + it->second = -1; + + SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH( + CastFromJsonTypedImpl::apply, + rowType.childAt(index)->kind(), + field.value(), + writerTyped.get_writer_at(index))); + } } } - for (column_index_t numFields = rowType.size(), i = 0; i < numFields; - ++i) { - key = rowType.nameOf(i); - boost::algorithm::to_lower(key); - auto it = lowerCaseKeys.find(key); - if (it == lowerCaseKeys.end()) { - writerTyped.set_null_at(i); - } else { - SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH( - CastFromJsonTypedImpl::apply, - rowType.childAt(i)->kind(), - it->second, - writerTyped.get_writer_at(i))); + + for (const auto& [key, index] : fieldIndices) { + if (index >= 0) { + writerTyped.set_null_at(index); } } } @@ -1038,7 +1052,7 @@ class JsonCastOperator : public exec::CastOperator { maxSize = std::max(maxSize, input.size()); }); paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING); - rows.applyToSelected([&](auto row) { + context.applyToSelectedNoThrow(rows, [&](auto row) { writer.setOffset(row); if (inputVector->isNullAt(row)) { writer.commitNull();