From 437bfcda671e6b3003c14298f7fcc670c5cfc05e Mon Sep 17 00:00:00 2001 From: lgbo-ustc Date: Tue, 11 Apr 2023 15:43:56 +0800 Subject: [PATCH] improve json path compatibility --- .../Parsers/ParserJSONPathMemberAccess.cpp | 50 +++++- ...arserJSONPathMemberSquareBracketAccess.cpp | 51 ++++++ .../ParserJSONPathMemberSquareBracketAccess.h | 17 ++ .../JSONPath/Parsers/ParserJSONPathQuery.cpp | 3 + .../Functions/FunctionGetJsonObject.cpp | 15 ++ .../Functions/FunctionGetJsonObject.h | 148 ++++++++++++++++++ .../Parser/SerializedPlanParser.cpp | 2 +- .../Parser/SerializedPlanParser.h | 2 +- 8 files changed, 278 insertions(+), 10 deletions(-) create mode 100644 src/Functions/JSONPath/Parsers/ParserJSONPathMemberSquareBracketAccess.cpp create mode 100644 src/Functions/JSONPath/Parsers/ParserJSONPathMemberSquareBracketAccess.h create mode 100644 utils/local-engine/Functions/FunctionGetJsonObject.cpp create mode 100644 utils/local-engine/Functions/FunctionGetJsonObject.h diff --git a/src/Functions/JSONPath/Parsers/ParserJSONPathMemberAccess.cpp b/src/Functions/JSONPath/Parsers/ParserJSONPathMemberAccess.cpp index f0ed178e1c2e..f139989fb728 100644 --- a/src/Functions/JSONPath/Parsers/ParserJSONPathMemberAccess.cpp +++ b/src/Functions/JSONPath/Parsers/ParserJSONPathMemberAccess.cpp @@ -1,9 +1,13 @@ +#include #include #include +#include #include #include #include +#include + namespace DB { @@ -16,18 +20,48 @@ namespace DB */ bool ParserJSONPathMemberAccess::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { - if (pos->type != TokenType::Dot) + // Special case: a path member may begin with digits; it then arrives as a Number token (optionally with a leading '.') immediately followed by a BareWord + if (pos->type != TokenType::Dot && pos->type != TokenType::Number) return false; + if (pos->type != TokenType::Number) + ++pos; - ++pos; + ASTPtr member_name; - if (pos->type != TokenType::BareWord && pos->type !=TokenType::QuotedIdentifier) - return false; + if (pos->type == TokenType::Number)[[unlikely]] + { + for (const auto * c = pos->begin; c != pos->end; 
++c) + { + if (*c == '.' && c == pos->begin) + continue; + if (!isNumericASCII(*c)) + { + return false; + } + } + const auto * last_begin = *pos->begin == '.' ? pos->begin + 1 : pos->begin; + const auto * last_end = pos->end; + ++pos; - ParserIdentifier name_p; - ASTPtr member_name; - if (!name_p.parse(pos, member_name, expected)) - return false; + if (pos.isValid() && pos->type == TokenType::BareWord && pos->begin == last_end) + { + member_name = std::make_shared(String(last_begin, pos->end)); + ++pos; + } + else + { + return false; + } + } + else + { + if (pos->type != TokenType::BareWord && pos->type != TokenType::QuotedIdentifier) + return false; + + ParserIdentifier name_p; + if (!name_p.parse(pos, member_name, expected)) + return false; + } auto member_access = std::make_shared(); node = member_access; diff --git a/src/Functions/JSONPath/Parsers/ParserJSONPathMemberSquareBracketAccess.cpp b/src/Functions/JSONPath/Parsers/ParserJSONPathMemberSquareBracketAccess.cpp new file mode 100644 index 000000000000..b3059fcfd948 --- /dev/null +++ b/src/Functions/JSONPath/Parsers/ParserJSONPathMemberSquareBracketAccess.cpp @@ -0,0 +1,51 @@ +#include "ParserJSONPathMemberSquareBracketAccess.h" +#include +#include +#include +#include +#include +#include + +namespace DB +{ +bool ParserJSONPathMemberSquareBracketAccess::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + if (pos->type != TokenType::OpeningSquareBracket) + return false; + ++pos; + ASTPtr member_name; + if (pos->type == TokenType::BareWord || pos->type == TokenType::QuotedIdentifier) + { + ParserIdentifier name_p; + if (!name_p.parse(pos, member_name, expected)) + return false; + } + else if (pos->type == TokenType::StringLiteral) + { + try + { + ReadBufferFromMemory in(pos->begin, pos->size()); + String name; + readQuotedStringWithSQLStyle(name, in); + member_name = std::make_shared(name); + ++pos; + } + catch (const Exception &) + { + return false; + } + } + else + { + return false; + } + if 
(pos->type != TokenType::ClosingSquareBracket) + { + return false; + } + ++pos; + auto member_access = std::make_shared(); + node = member_access; + return tryGetIdentifierNameInto(member_name, member_access->member_name); +} +} diff --git a/src/Functions/JSONPath/Parsers/ParserJSONPathMemberSquareBracketAccess.h b/src/Functions/JSONPath/Parsers/ParserJSONPathMemberSquareBracketAccess.h new file mode 100644 index 000000000000..b682ec5bb966 --- /dev/null +++ b/src/Functions/JSONPath/Parsers/ParserJSONPathMemberSquareBracketAccess.h @@ -0,0 +1,17 @@ +#pragma once +#include +// Parses a JSONPath member access written in square brackets; accepted forms: +// - [ident] +// - ['ident'] +// - ["ident"] +namespace DB +{ +class ParserJSONPathMemberSquareBracketAccess : public IParserBase +{ +private: + const char * getName() const override { return "ParserJSONPathMemberSquareBracketAccess"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; +public: + explicit ParserJSONPathMemberSquareBracketAccess() = default; +}; +} diff --git a/src/Functions/JSONPath/Parsers/ParserJSONPathQuery.cpp b/src/Functions/JSONPath/Parsers/ParserJSONPathQuery.cpp index c18b2ad9b319..d8d633a1ec90 100644 --- a/src/Functions/JSONPath/Parsers/ParserJSONPathQuery.cpp +++ b/src/Functions/JSONPath/Parsers/ParserJSONPathQuery.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,7 @@ bool ParserJSONPathQuery::parseImpl(Pos & pos, ASTPtr & query, Expected & expect { query = std::make_shared(); ParserJSONPathMemberAccess parser_jsonpath_member_access; + ParserJSONPathMemberSquareBracketAccess parser_jsonpath_member_square_bracket_access; ParserJSONPathRange parser_jsonpath_range; ParserJSONPathStar parser_jsonpath_star; ParserJSONPathRoot parser_jsonpath_root; @@ -32,6 +34,7 @@ bool ParserJSONPathQuery::parseImpl(Pos & pos, ASTPtr & query, Expected & expect ASTPtr accessor; while (parser_jsonpath_member_access.parse(pos, accessor, expected) + || parser_jsonpath_member_square_bracket_access.parse(pos, accessor, 
expected) || parser_jsonpath_range.parse(pos, accessor, expected) || parser_jsonpath_star.parse(pos, accessor, expected)) { diff --git a/utils/local-engine/Functions/FunctionGetJsonObject.cpp b/utils/local-engine/Functions/FunctionGetJsonObject.cpp new file mode 100644 index 000000000000..f7e0079ed9b3 --- /dev/null +++ b/utils/local-engine/Functions/FunctionGetJsonObject.cpp @@ -0,0 +1,15 @@ +#include "FunctionGetJsonObject.h" +#include + + +using DB::Token; +using DB::TokenType; + +namespace local_engine +{ + +REGISTER_FUNCTION(GetJsonObject) +{ + factory.registerFunction>(); +} +} diff --git a/utils/local-engine/Functions/FunctionGetJsonObject.h b/utils/local-engine/Functions/FunctionGetJsonObject.h new file mode 100644 index 000000000000..5525d1672857 --- /dev/null +++ b/utils/local-engine/Functions/FunctionGetJsonObject.h @@ -0,0 +1,148 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} +} +namespace local_engine +{ +// Note that `get_json_object` behaves differently from `JSON_VALUE/JSON_QUERY`. +// - ('{"x":[{"y":1},{"y":2}]}' '$.x[*].y'), `json_value` returns only one element, but `get_json_object` +// returns a list. 
+// - ('{"x":[{"y":1}]}' '$.x[*].y'), `json_query`'s result is '[1]',
+// but `get_json_object`'s result is '1' (a single match is emitted without the enclosing brackets).
+//
+
+/// Holds the function name `get_json_object`.
+struct GetJsonOject
+{
+    static constexpr auto name{"get_json_object"};
+};
+/// Evaluates a JSONPath query against a parsed JSON document and stores the matched text in the result column.
+template
+class GetJsonObjectImpl
+{
+public:
+    using Element = typename JSONParser::Element;
+
+    static DB::DataTypePtr getReturnType(const char *, const DB::ColumnsWithTypeAndName &)
+    {
+        auto nested_type = std::make_shared();
+        return std::make_shared(nested_type);
+    }
+
+    static size_t getNumberOfIndexArguments(const DB::ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }
+    /// Runs `query_ptr` over `root`; returns false if nothing matched, otherwise inserts the result text into `dest` and returns true.
+    bool insertResultToColumn(DB::IColumn & dest, const Element & root, DB::ASTPtr & query_ptr)
+    {
+        if (!(has_array_wildcard_flag & 0x01)) [[unlikely]] // bit 0x01 clear: the path has not been inspected yet
+        {
+            setupArrayWildcardFlag(query_ptr);
+        }
+        DB::GeneratorJSONPath generator_json_path(query_ptr);
+        Element current_element = root;
+        DB::VisitorStatus status;
+        std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
+        /// Create json array of results: [res1, res2, ...]
+        bool success = false;
+        size_t element_count = 0;
+        out << "[";
+        while ((status = generator_json_path.getNextItem(current_element)) != DB::VisitorStatus::Exhausted)
+        {
+            if (status == DB::VisitorStatus::Ok)
+            {
+                if (success) // not the first match: separate it from the previous one
+                {
+                    out << ", ";
+                }
+                success = true;
+                element_count++;
+                out << current_element.getElement();
+            }
+            else if (status == DB::VisitorStatus::Error)
+            {
+                /// ON ERROR
+                /// Here it is possible to handle errors with ON ERROR (as described in ISO/IEC TR 19075-6),
+                /// however this functionality is not implemented yet
+            }
+            current_element = root; // reset the cursor to the root before requesting the next item
+        }
+        out << "]";
+        if (!success)
+        {
+            return false;
+        }
+        DB::ColumnNullable & col_str = assert_cast(dest);
+        auto output_str = out.str();
+        std::string_view final_out_str;
+        assert(element_count); // success == true implies at least one match was written
+        if (element_count == 1) // single match: emit the bare value instead of a one-element array
+        {
+            std::string_view output_str_view(output_str.data() + 1, output_str.size() - 2); // drop the surrounding '[' and ']'
+            if (output_str_view.size() >= 2 && output_str_view.front() == '\"' && output_str_view.back() == '\"')
+            {
+                final_out_str = std::string_view(output_str_view.data() + 1, output_str_view.size() - 2); // string value: drop its quotes too
+            }
+            else
+                final_out_str = output_str_view; // non-string value: previously the bracketed text (e.g. "[1]") leaked out here
+        }
+        else
+        {
+            final_out_str = std::string_view(output_str);
+        }
+        col_str.insertData(final_out_str.data(), final_out_str.size());
+        return true;
+    }
+private:
+    UInt8 has_array_wildcard_flag = 0; // bit 0x01: path already inspected; bit 0x02: set by setupArrayWildcardFlag (see conditions there)
+    /// Inspects the path once: always sets bit 0x01, and sets bit 0x02 for a star range, a range with first != second - 1, or a node matched by the second typeid_cast.
+    void setupArrayWildcardFlag(DB::ASTPtr & query_ptr)
+    {
+        has_array_wildcard_flag |= 0x01;
+        const auto * path = query_ptr->as();
+        if (!path)
+        {
+            throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Invalid path");
+        }
+        const auto * query = path->jsonpath_query;
+
+        for (const auto & child_ast : query->children)
+        {
+            if (auto * range_ast = typeid_cast(child_ast.get()))
+            {
+                if (range_ast->is_star)
+                {
+                    has_array_wildcard_flag |= 0x02;
+                    break;
+                }
+                for (const auto & range : range_ast->ranges)
+                {
+                    if (range.first != range.second - 1) // presumably a range covering more than one index
+                    {
+                        has_array_wildcard_flag |= 0x02;
+                        break; // leaves only this inner loop; the outer scan continues
+                    }
+                }
+            }
+            else if (typeid_cast(child_ast.get()))
+            {
+                has_array_wildcard_flag |= 0x02;
+                break;
+            }
+        }
+    }
+};
+
+
+}
diff --git a/utils/local-engine/Parser/SerializedPlanParser.cpp b/utils/local-engine/Parser/SerializedPlanParser.cpp
index 01d4b6e31f42..d9bcc00f2a6f 100644
--- a/utils/local-engine/Parser/SerializedPlanParser.cpp
+++ b/utils/local-engine/Parser/SerializedPlanParser.cpp
@@ -1527,7 +1527,7 @@ const ActionsDAG::Node * SerializedPlanParser::parseFunctionWithDAG(
                 SerializedPlanParser::parseType(rel.scalar_function().output_type())->getName(),
                 function_node->result_name);
         }
-        if (function_name == "JSON_VALUE")
+        if (function_name == "get_json_object")
         {
             result_node->function->setResolver(function_builder);
         }
diff --git a/utils/local-engine/Parser/SerializedPlanParser.h b/utils/local-engine/Parser/SerializedPlanParser.h
index d6fd8d90fe56..31e5d47b1cd6 100644
--- a/utils/local-engine/Parser/SerializedPlanParser.h
+++ b/utils/local-engine/Parser/SerializedPlanParser.h
@@ -188,7 +188,7 @@ static const std::map SCALAR_FUNCTIONS = {
     {"posexplode", "arrayJoin"},
 
     // json functions
-    {"get_json_object", "JSON_VALUE"},
+    {"get_json_object", "get_json_object"},
     {"to_json", "toJSONString"},
     {"from_json", "JSONExtract"},
 };