diff --git a/velox/functions/prestosql/tests/JsonCastTest.cpp b/velox/functions/prestosql/tests/JsonCastTest.cpp index 95943d7286cc9..cbfa048867376 100644 --- a/velox/functions/prestosql/tests/JsonCastTest.cpp +++ b/velox/functions/prestosql/tests/JsonCastTest.cpp @@ -1033,6 +1033,52 @@ TEST_F(JsonCastTest, orderOfKeys) { testCast(data, map); } +TEST_F(JsonCastTest, toRowOfArray) { + auto data = makeFlatVector( + { + R"({"c0": [1, 2, 3], "c1": 1.2})", + R"({"c0": [], "c1": 1.3})", + R"({"c0": [10, null, 20, null], "c1": 1.4})", + }, + JSON()); + + auto expected = makeRowVector({ + makeArrayVectorFromJson({ + "[1, 2, 3]", + "[]", + "[10, null, 20, null]", + }), + }); + + testCast(data, expected); +} + +TEST_F(JsonCastTest, toRowDuplicateKey) { + std::vector> jsonStrings = { + R"({"c0": 1, "c1": 1.1})", + R"({"c0": 2, "c1": 1.2, "C0": 45})", // Duplicate keys: c0, C0. + R"({"c0": 3, "c1": 1.3, "c0": 55})", // Duplicate keys: c0, c0. + R"({"c0": 4, "c1": 1.4, "c2": 65})", + }; + + testThrow( + JSON(), + ROW({"c0", "c1"}, {INTEGER(), REAL()}), + jsonStrings, + "Duplicate field: c0"); + + auto data = makeNullableFlatVector(jsonStrings, JSON()); + + auto expected = makeRowVector({ + makeFlatVector({1, 0, 0, 4}), + makeFlatVector({1.1, 0.0, 0.0, 1.4}), + }); + expected->setNull(1, true); + expected->setNull(2, true); + + testCast(data, expected, true /*try_cast*/); +} + TEST_F(JsonCastTest, toRow) { // Test casting to ROW from JSON arrays. auto array = makeNullableFlatVector( @@ -1053,7 +1099,7 @@ TEST_F(JsonCastTest, toRow) { auto map = makeNullableFlatVector( {R"({"c0":123,"c1":"abc","c2":true})"_sv, R"({"c1":"abc","c2":true,"c0":123})"_sv, - R"({"c0":123,"c2":true,"c0":456})"_sv, + R"({"c10":123,"c2":true,"c0":456})"_sv, R"({"c3":123,"c4":"abc","c2":false})"_sv, R"({"c0":null,"c2":false})"_sv, R"({"c0":null,"c2":null,"c1":null})"_sv}, @@ -1074,17 +1120,17 @@ TEST_F(JsonCastTest, toRow) { // Use a mix of lower case and upper case JSON keys. map = makeNullableFlatVector( - {R"({"c0":123,"c1":"abc","c2":true})"_sv, - R"({"c1":"abc","c2":true,"c0":123})"_sv, - R"({"c0":123,"c2":true,"c0":456})"_sv, - R"({"c3":123,"c4":"abc","c2":false})"_sv, + {R"({"C0":123,"C1":"abc","C2":true})"_sv, + R"({"c1":"abc","C2":true,"c0":123})"_sv, + R"({"C10":123,"C2":true,"c0":456})"_sv, + R"({"c3":123,"C4":"abc","c2":false})"_sv, R"({"c0":null,"c2":false})"_sv, - R"({"c0":null,"c2":null,"c1":null})"_sv}, + R"({"c0":null,"c2":null,"C1":null})"_sv}, JSON()); testCast(map, makeRowVector({child4, child5, child6})); // Use a mix of lower case and upper case field names in target ROW type. - testCast(map, makeRowVector({child4, child5, child6})); + testCast(map, makeRowVector({"c0", "C1", "C2"}, {child4, child5, child6})); // Test casting to ROW from JSON null. auto null = makeNullableFlatVector({"null"_sv}, JSON()); diff --git a/velox/functions/prestosql/types/JsonType.cpp b/velox/functions/prestosql/types/JsonType.cpp index 4f90f52008662..682d70d348f60 100644 --- a/velox/functions/prestosql/types/JsonType.cpp +++ b/velox/functions/prestosql/types/JsonType.cpp @@ -856,30 +856,46 @@ struct CastFromJsonTypedImpl { } } else { SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object()); - folly::F14FastMap lowerCaseKeys( - object.count_fields()); + + // TODO Populate this mapping once, not per-row. + // Mapping from lower-case field names of the target RowType to their + // indices. + folly::F14FastMap fieldIndices; + for (auto i = 0; i < rowType.size(); ++i) { + auto key = rowType.nameOf(i); + boost::algorithm::to_lower(key); + fieldIndices[key] = i; + } + + std::vector foundIndices(rowType.size(), false); + std::string key; for (auto fieldResult : object) { SIMDJSON_ASSIGN_OR_RAISE(auto field, fieldResult); if (!field.value().is_null()) { SIMDJSON_ASSIGN_OR_RAISE(key, field.unescaped_key(true)); boost::algorithm::to_lower(key); - lowerCaseKeys[key] = field.value(); + + auto it = fieldIndices.find(key); + if (it != fieldIndices.end()) { + const auto index = it->second; + + VELOX_USER_CHECK( + !foundIndices[index], "Duplicate field: {}", key); + foundIndices[index] = true; + + SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH( + CastFromJsonTypedImpl::apply, + rowType.childAt(index)->kind(), + field.value(), + writerTyped.get_writer_at(index))); + } } } - for (column_index_t numFields = rowType.size(), i = 0; i < numFields; - ++i) { - key = rowType.nameOf(i); - boost::algorithm::to_lower(key); - auto it = lowerCaseKeys.find(key); - if (it == lowerCaseKeys.end()) { + + for (auto i = 0; i < foundIndices.size(); ++i) { + if (!foundIndices[i]) { writerTyped.set_null_at(i); - } else { - SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH( - CastFromJsonTypedImpl::apply, - rowType.childAt(i)->kind(), - it->second, - writerTyped.get_writer_at(i))); } } } @@ -1038,7 +1054,7 @@ class JsonCastOperator : public exec::CastOperator { maxSize = std::max(maxSize, input.size()); }); paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING); - rows.applyToSelected([&](auto row) { + context.applyToSelectedNoThrow(rows, [&](auto row) { writer.setOffset(row); if (inputVector->isNullAt(row)) { writer.commitNull();