diff --git a/velox/functions/prestosql/JsonFunctions.cpp b/velox/functions/prestosql/JsonFunctions.cpp index 2387a38a9b09..444605fc00f4 100644 --- a/velox/functions/prestosql/JsonFunctions.cpp +++ b/velox/functions/prestosql/JsonFunctions.cpp @@ -474,6 +474,49 @@ class JsonParseFunction : public exec::VectorFunction { mutable std::vector fastSortKeys_; }; +// This function is called when $internal$json_string_to_array/map/row_cast +// is called. It is used for expressions like 'Cast(json_parse(x) as +// ARRAY<...>)' etc. This is an optimization to avoid parsing the json string +// twice. +class JsonInternalCastFunction : public exec::VectorFunction { + public: + void apply( + const SelectivityVector& rows, + std::vector& args, + const TypePtr& resultType, + exec::EvalCtx& context, + VectorPtr& result) const override { + VELOX_CHECK_EQ(args.size(), 1); + const auto& arg = *args[0]; + jsonCastOperator_.castFrom(arg, context, rows, resultType, result); + } + + static std::vector> + signaturesArray() { + return {exec::FunctionSignatureBuilder() + .argumentType("varchar") + .returnType("array(unknown)") + .build()}; + } + + static std::vector> signaturesMap() { + return {exec::FunctionSignatureBuilder() + .argumentType("varchar") + .returnType("map(unknown, unknown)") + .build()}; + } + + static std::vector> signaturesRow() { + return {exec::FunctionSignatureBuilder() + .argumentType("varchar") + .returnType("row(unknown)") + .build()}; + } + + private: + mutable JsonCastOperator jsonCastOperator_; +}; + } // namespace VELOX_DECLARE_VECTOR_FUNCTION( @@ -490,4 +533,19 @@ VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION( return std::make_shared(); }); +VELOX_DECLARE_VECTOR_FUNCTION( + udf_$internal$_json_string_to_array, + JsonInternalCastFunction::signaturesArray(), + std::make_unique()); + +VELOX_DECLARE_VECTOR_FUNCTION( + udf_$internal$_json_string_to_map, + JsonInternalCastFunction::signaturesMap(), + std::make_unique()); + +VELOX_DECLARE_VECTOR_FUNCTION( + 
udf_$internal$_json_string_to_row, + JsonInternalCastFunction::signaturesRow(), + std::make_unique()); + } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp b/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp index e9f2073224fe..1e0c90cff290 100644 --- a/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp @@ -71,6 +71,18 @@ void registerJsonFunctions(const std::string& prefix) { VELOX_REGISTER_VECTOR_FUNCTION(udf_json_format, prefix + "json_format"); VELOX_REGISTER_VECTOR_FUNCTION(udf_json_parse, prefix + "json_parse"); + + VELOX_REGISTER_VECTOR_FUNCTION( + udf_$internal$_json_string_to_array, + prefix + "$internal$json_string_to_array_cast"); + + VELOX_REGISTER_VECTOR_FUNCTION( + udf_$internal$_json_string_to_map, + prefix + "$internal$json_string_to_map_cast"); + + VELOX_REGISTER_VECTOR_FUNCTION( + udf_$internal$_json_string_to_row, + prefix + "$internal$json_string_to_row_cast"); } } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp index c336cfd76170..1d250ae56b85 100644 --- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp @@ -142,6 +142,26 @@ class JsonFunctionsTest : public functions::test::FunctionBaseTest { EXPECT_EQ(jsonResult, varcharResult); return jsonResult; } + + void checkInternalFn( + const std::string& functionName, + const TypePtr& returnType, + const RowVectorPtr& data, + const VectorPtr& expected) { + auto inputFeild = + std::make_shared(VARCHAR(), "c0"); + + auto expression = std::make_shared( + returnType, std::vector{inputFeild}, functionName); + + SelectivityVector rows(data->size()); + std::vector result(1); + exec::ExprSet exprSet({expression}, &execCtx_); + exec::EvalCtx 
evalCtx(&execCtx_, &exprSet, data.get()); + + exprSet.eval(rows, evalCtx, result); + velox::test::assertEqualVectors(expected, result[0]); + }; }; TEST_F(JsonFunctionsTest, jsonFormat) { @@ -876,6 +896,79 @@ TEST_F(JsonFunctionsTest, jsonExtract) { VELOX_ASSERT_THROW(jsonExtract(kJson, "$.store.keys()"), "Invalid JSON path"); } +// The following tests ensure that the internal json functions +// $internal$json_string_to_array/map/row_cast can be invoked without issues +// from Prestissimo. The actual functionality is tested in JsonCastTest. + +TEST_F(JsonFunctionsTest, jsonStringToArrayCast) { + // Array of strings. + auto data = makeRowVector({makeNullableFlatVector( + {R"(["red","blue"])"_sv, + R"([null,null,"purple"])"_sv, + "[]"_sv, + "null"_sv})}); + auto expected = makeNullableArrayVector( + {{{"red"_sv, "blue"_sv}}, + {{std::nullopt, std::nullopt, "purple"_sv}}, + {{}}, + std::nullopt}); + + checkInternalFn( + "$internal$json_string_to_array_cast", ARRAY(VARCHAR()), data, expected); + + // Array of integers. + data = makeRowVector({makeNullableFlatVector( + {R"(["10212","1015353"])"_sv, R"(["10322","285000"])"})}); + expected = + makeNullableArrayVector({{10212, 1015353}, {10322, 285000}}); + + checkInternalFn( + "$internal$json_string_to_array_cast", ARRAY(BIGINT()), data, expected); +} + +TEST_F(JsonFunctionsTest, jsonStringToMapCast) { + // Map of strings to integers. 
+ auto data = makeRowVector({makeFlatVector( + {R"({"red":1,"blue":2})"_sv, + R"({"green":3,"magenta":4})"_sv, + R"({"violet":1,"blue":2})"_sv, + R"({"yellow":1,"blue":2})"_sv, + R"({"purple":10,"cyan":5})"_sv})}); + + auto expected = makeMapVector( + {{{"red"_sv, 1}, {"blue"_sv, 2}}, + {{"green"_sv, 3}, {"magenta"_sv, 4}}, + {{"violet"_sv, 1}, {"blue"_sv, 2}}, + {{"yellow"_sv, 1}, {"blue"_sv, 2}}, + {{"purple"_sv, 10}, {"cyan"_sv, 5}}}); + + checkInternalFn( + "$internal$json_string_to_map_cast", + MAP(VARCHAR(), BIGINT()), + data, + expected); +} + +TEST_F(JsonFunctionsTest, jsonStringToRowCast) { + // Row of integers. + auto data = makeRowVector({makeFlatVector( + {R"({"red":1,"blue":2})"_sv, + R"({"red":3,"blue":4})"_sv, + R"({"red":1,"blue":2})"_sv, + R"({"red":1,"blue":2})"_sv, + R"({"red":10,"blue":5})"_sv})}); + + auto expected = makeRowVector( + {makeFlatVector({1, 3, 1, 1, 10}), + makeFlatVector({2, 4, 2, 2, 5})}); + + checkInternalFn( + "$internal$json_string_to_row_cast", + ROW({{"red", BIGINT()}, {"blue", BIGINT()}}), + data, + expected); +} + } // namespace } // namespace facebook::velox::functions::prestosql diff --git a/velox/functions/prestosql/types/JsonType.cpp b/velox/functions/prestosql/types/JsonType.cpp index 5c128815f27a..83a6ba140ad3 100644 --- a/velox/functions/prestosql/types/JsonType.cpp +++ b/velox/functions/prestosql/types/JsonType.cpp @@ -33,7 +33,6 @@ #include "velox/functions/lib/RowsTranslationUtil.h" #include "velox/functions/lib/string/StringCore.h" #include "velox/functions/prestosql/json/JsonStringUtil.h" -#include "velox/functions/prestosql/json/SIMDJsonUtil.h" #include "velox/type/Conversions.h" #include "velox/type/Type.h" @@ -1096,79 +1095,7 @@ bool isSupportedBasicType(const TypePtr& type) { } } -/// Custom operator for casts from and to Json type. 
-class JsonCastOperator : public exec::CastOperator { - public: - bool isSupportedFromType(const TypePtr& other) const override; - - bool isSupportedToType(const TypePtr& other) const override; - - void castTo( - const BaseVector& input, - exec::EvalCtx& context, - const SelectivityVector& rows, - const TypePtr& resultType, - VectorPtr& result) const override; - - void castTo( - const BaseVector& input, - exec::EvalCtx& context, - const SelectivityVector& rows, - const TypePtr& resultType, - VectorPtr& result, - const std::shared_ptr& hooks) const override; - - void castFrom( - const BaseVector& input, - exec::EvalCtx& context, - const SelectivityVector& rows, - const TypePtr& resultType, - VectorPtr& result) const override; - - private: - template - void castFromJson( - const BaseVector& input, - exec::EvalCtx& context, - const SelectivityVector& rows, - BaseVector& result) const { - // Result is guaranteed to be a flat writable vector. - auto* flatResult = result.as::type>(); - exec::VectorWriter writer; - writer.init(*flatResult); - // Input is guaranteed to be in flat or constant encodings when passed in. 
- auto* inputVector = input.as>(); - size_t maxSize = 0; - rows.applyToSelected([&](auto row) { - if (inputVector->isNullAt(row)) { - return; - } - auto& input = inputVector->valueAt(row); - maxSize = std::max(maxSize, input.size()); - }); - paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING); - context.applyToSelectedNoThrow(rows, [&](auto row) { - writer.setOffset(row); - if (inputVector->isNullAt(row)) { - writer.commitNull(); - return; - } - auto& input = inputVector->valueAt(row); - memcpy(paddedInput_.data(), input.data(), input.size()); - simdjson::padded_string_view paddedInput( - paddedInput_.data(), input.size(), paddedInput_.size()); - if (auto error = castFromJsonOneRow(paddedInput, writer)) { - context.setVeloxExceptionError(row, errors_[error]); - writer.commitNull(); - } - }); - writer.finish(); - } - - mutable folly::once_flag initializeErrors_; - mutable std::exception_ptr errors_[simdjson::NUM_ERROR_CODES]; - mutable std::string paddedInput_; -}; +} // namespace bool JsonCastOperator::isSupportedFromType(const TypePtr& other) const { if (isSupportedBasicType(other)) { @@ -1197,6 +1124,45 @@ bool JsonCastOperator::isSupportedFromType(const TypePtr& other) const { } } +template +void JsonCastOperator::castFromJson( + const BaseVector& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + BaseVector& result) const { + // Result is guaranteed to be a flat writable vector. + auto* flatResult = result.as::type>(); + exec::VectorWriter writer; + writer.init(*flatResult); + // Input is guaranteed to be in flat or constant encodings when passed in. 
+ auto* inputVector = input.as>(); + size_t maxSize = 0; + rows.applyToSelected([&](auto row) { + if (inputVector->isNullAt(row)) { + return; + } + auto& input = inputVector->valueAt(row); + maxSize = std::max(maxSize, input.size()); + }); + paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING); + context.applyToSelectedNoThrow(rows, [&](auto row) { + writer.setOffset(row); + if (inputVector->isNullAt(row)) { + writer.commitNull(); + return; + } + auto& input = inputVector->valueAt(row); + memcpy(paddedInput_.data(), input.data(), input.size()); + simdjson::padded_string_view paddedInput( + paddedInput_.data(), input.size(), paddedInput_.size()); + if (auto error = castFromJsonOneRow(paddedInput, writer)) { + context.setVeloxExceptionError(row, errors_[error]); + writer.commitNull(); + } + }); + writer.finish(); +} + bool JsonCastOperator::isSupportedToType(const TypePtr& other) const { if (other->isDate()) { return false; @@ -1315,8 +1281,6 @@ class JsonTypeFactories : public CustomTypeFactories { } }; -} // namespace - void registerJsonType() { registerCustomType("json", std::make_unique()); } diff --git a/velox/functions/prestosql/types/JsonType.h b/velox/functions/prestosql/types/JsonType.h index 224679d64208..902fc48c7bd0 100644 --- a/velox/functions/prestosql/types/JsonType.h +++ b/velox/functions/prestosql/types/JsonType.h @@ -16,6 +16,7 @@ #pragma once #include "velox/expression/CastExpr.h" +#include "velox/functions/prestosql/json/SIMDJsonUtil.h" #include "velox/type/SimpleFunctionApi.h" #include "velox/type/Type.h" @@ -72,4 +73,46 @@ using Json = CustomType; void registerJsonType(); +/// Custom operator for casts from and to Json type. 
+class JsonCastOperator : public exec::CastOperator { + public: + bool isSupportedFromType(const TypePtr& other) const override; + + bool isSupportedToType(const TypePtr& other) const override; + + void castTo( + const BaseVector& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + const TypePtr& resultType, + VectorPtr& result) const override; + + void castTo( + const BaseVector& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + const TypePtr& resultType, + VectorPtr& result, + const std::shared_ptr& hooks) const override; + + void castFrom( + const BaseVector& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + const TypePtr& resultType, + VectorPtr& result) const override; + + private: + template + void castFromJson( + const BaseVector& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + BaseVector& result) const; + + mutable folly::once_flag initializeErrors_; + mutable std::exception_ptr errors_[simdjson::NUM_ERROR_CODES]; + mutable std::string paddedInput_; +}; + } // namespace facebook::velox