From 829d20c79e5f133a3cdb1f3795693c6211d718f5 Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Thu, 11 Apr 2024 04:10:11 -0700 Subject: [PATCH] Fix CAST(JSON as ROW(ARRAY)) (#9447) Summary: CAST(JSON as ROW(ARRAY()) used to fail with ``` OUT_OF_ORDER_ITERATION: Objects and arrays can only be iterated when they are first encountered. ``` According to simdjson documentation, https://github.com/simdjson/simdjson/blob/master/doc/basics.md, it is not allowed to store object values for later processing. These must be consumed or copied before proceeding. Also, fixed behavior when JSON object contains duplicate keys. Presto throws, but previous implementation used to allow duplicates. Also, fix the test to actually verify JSON objects with mixed case keys. Differential Revision: D56013293 --- .../prestosql/tests/JsonCastTest.cpp | 60 ++++++++++++++++--- velox/functions/prestosql/types/JsonType.cpp | 48 ++++++++++----- 2 files changed, 85 insertions(+), 23 deletions(-) diff --git a/velox/functions/prestosql/tests/JsonCastTest.cpp b/velox/functions/prestosql/tests/JsonCastTest.cpp index 95943d7286cc9..cbfa048867376 100644 --- a/velox/functions/prestosql/tests/JsonCastTest.cpp +++ b/velox/functions/prestosql/tests/JsonCastTest.cpp @@ -1033,6 +1033,52 @@ TEST_F(JsonCastTest, orderOfKeys) { testCast(data, map); } +TEST_F(JsonCastTest, toRowOfArray) { + auto data = makeFlatVector( + { + R"({"c0": [1, 2, 3], "c1": 1.2})", + R"({"c0": [], "c1": 1.3})", + R"({"c0": [10, null, 20, null], "c1": 1.4})", + }, + JSON()); + + auto expected = makeRowVector({ + makeArrayVectorFromJson({ + "[1, 2, 3]", + "[]", + "[10, null, 20, null]", + }), + }); + + testCast(data, expected); +} + +TEST_F(JsonCastTest, toRowDuplicateKey) { + std::vector> jsonStrings = { + R"({"c0": 1, "c1": 1.1})", + R"({"c0": 2, "c1": 1.2, "C0": 45})", // Duplicate keys: c0, C0. + R"({"c0": 3, "c1": 1.3, "c0": 55})", // Duplicate keys: c0, c0. + R"({"c0": 4, "c1": 1.4, "c2": 65})", + }; + + testThrow( + JSON(), + ROW({"c0", "c1"}, {INTEGER(), REAL()}), + jsonStrings, + "Duplicate field: c0"); + + auto data = makeNullableFlatVector(jsonStrings, JSON()); + + auto expected = makeRowVector({ + makeFlatVector({1, 0, 0, 4}), + makeFlatVector({1.1, 0.0, 0.0, 1.4}), + }); + expected->setNull(1, true); + expected->setNull(2, true); + + testCast(data, expected, true /*try_cast*/); +} + TEST_F(JsonCastTest, toRow) { // Test casting to ROW from JSON arrays. auto array = makeNullableFlatVector( @@ -1053,7 +1099,7 @@ TEST_F(JsonCastTest, toRow) { auto map = makeNullableFlatVector( {R"({"c0":123,"c1":"abc","c2":true})"_sv, R"({"c1":"abc","c2":true,"c0":123})"_sv, - R"({"c0":123,"c2":true,"c0":456})"_sv, + R"({"c10":123,"c2":true,"c0":456})"_sv, R"({"c3":123,"c4":"abc","c2":false})"_sv, R"({"c0":null,"c2":false})"_sv, R"({"c0":null,"c2":null,"c1":null})"_sv}, @@ -1074,17 +1120,17 @@ TEST_F(JsonCastTest, toRow) { // Use a mix of lower case and upper case JSON keys. map = makeNullableFlatVector( - {R"({"c0":123,"c1":"abc","c2":true})"_sv, - R"({"c1":"abc","c2":true,"c0":123})"_sv, - R"({"c0":123,"c2":true,"c0":456})"_sv, - R"({"c3":123,"c4":"abc","c2":false})"_sv, + {R"({"C0":123,"C1":"abc","C2":true})"_sv, + R"({"c1":"abc","C2":true,"c0":123})"_sv, + R"({"C10":123,"C2":true,"c0":456})"_sv, + R"({"c3":123,"C4":"abc","c2":false})"_sv, R"({"c0":null,"c2":false})"_sv, - R"({"c0":null,"c2":null,"c1":null})"_sv}, + R"({"c0":null,"c2":null,"C1":null})"_sv}, JSON()); testCast(map, makeRowVector({child4, child5, child6})); // Use a mix of lower case and upper case field names in target ROW type. - testCast(map, makeRowVector({child4, child5, child6})); + testCast(map, makeRowVector({"c0", "C1", "C2"}, {child4, child5, child6})); // Test casting to ROW from JSON null. auto null = makeNullableFlatVector({"null"_sv}, JSON()); diff --git a/velox/functions/prestosql/types/JsonType.cpp b/velox/functions/prestosql/types/JsonType.cpp index 4f90f52008662..682d70d348f60 100644 --- a/velox/functions/prestosql/types/JsonType.cpp +++ b/velox/functions/prestosql/types/JsonType.cpp @@ -856,30 +856,46 @@ struct CastFromJsonTypedImpl { } } else { SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object()); - folly::F14FastMap lowerCaseKeys( - object.count_fields()); + + // TODO Populate this mapping once, not per-row. + // Mapping from lower-case field names of the target RowType to their + // indices. + folly::F14FastMap fieldIndices; + for (auto i = 0; i < rowType.size(); ++i) { + auto key = rowType.nameOf(i); + boost::algorithm::to_lower(key); + fieldIndices[key] = i; + } + + std::vector foundIndices(rowType.size(), false); + std::string key; for (auto fieldResult : object) { SIMDJSON_ASSIGN_OR_RAISE(auto field, fieldResult); if (!field.value().is_null()) { SIMDJSON_ASSIGN_OR_RAISE(key, field.unescaped_key(true)); boost::algorithm::to_lower(key); - lowerCaseKeys[key] = field.value(); + + auto it = fieldIndices.find(key); + if (it != fieldIndices.end()) { + const auto index = it->second; + + VELOX_USER_CHECK( + !foundIndices[index], "Duplicate field: {}", key); + foundIndices[index] = true; + + SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH( + CastFromJsonTypedImpl::apply, + rowType.childAt(index)->kind(), + field.value(), + writerTyped.get_writer_at(index))); + } } } - for (column_index_t numFields = rowType.size(), i = 0; i < numFields; - ++i) { - key = rowType.nameOf(i); - boost::algorithm::to_lower(key); - auto it = lowerCaseKeys.find(key); - if (it == lowerCaseKeys.end()) { + + for (auto i = 0; i < foundIndices.size(); ++i) { + if (!foundIndices[i]) { writerTyped.set_null_at(i); - } else { - SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH( - CastFromJsonTypedImpl::apply, - rowType.childAt(i)->kind(), - it->second, - writerTyped.get_writer_at(i))); } } } @@ -1038,7 +1054,7 @@ class JsonCastOperator : public exec::CastOperator { maxSize = std::max(maxSize, input.size()); }); paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING); - rows.applyToSelected([&](auto row) { + context.applyToSelectedNoThrow(rows, [&](auto row) { writer.setOffset(row); if (inputVector->isNullAt(row)) { writer.commitNull();