From 22dc4fdcb5197e7c4a7fdfd768f5abf7a85b354f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Thu, 27 Jun 2024 20:34:23 +0800 Subject: [PATCH] [GLUTEN-2790][CH] Fix diff between ch char and spark chr (#6236) [CH] Fix diff between ch char and spark chr --- .../Parser/SerializedPlanParser.h | 1 - .../Parser/scalar_function_parser/chr.cpp | 71 +++++++++++++++++++ .../clickhouse/ClickHouseTestSettings.scala | 1 - .../clickhouse/ClickHouseTestSettings.scala | 1 - .../clickhouse/ClickHouseTestSettings.scala | 1 - .../clickhouse/ClickHouseTestSettings.scala | 1 - 6 files changed, 71 insertions(+), 5 deletions(-) create mode 100644 cpp-ch/local-engine/Parser/scalar_function_parser/chr.cpp diff --git a/cpp-ch/local-engine/Parser/SerializedPlanParser.h b/cpp-ch/local-engine/Parser/SerializedPlanParser.h index 184065836e65..1785f64ee17c 100644 --- a/cpp-ch/local-engine/Parser/SerializedPlanParser.h +++ b/cpp-ch/local-engine/Parser/SerializedPlanParser.h @@ -133,7 +133,6 @@ static const std::map SCALAR_FUNCTIONS {"replace", "replaceAll"}, {"regexp_replace", "replaceRegexpAll"}, {"regexp_extract_all", "regexpExtractAllSpark"}, - {"chr", "char"}, {"rlike", "match"}, {"ascii", "ascii"}, {"split", "splitByRegexp"}, diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/chr.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/chr.cpp new file mode 100644 index 000000000000..d168e63d11dc --- /dev/null +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/chr.cpp @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+}
+}
+
+namespace local_engine
+{
+/// Function parser registered under the name "chr"; rewrites chr(number) into an expression tree.
+class FunctionParserChr : public FunctionParser
+{
+public:
+    explicit FunctionParserChr(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) { }
+    ~FunctionParserChr() override = default;
+    static constexpr auto name = "chr";
+    String getName() const override { return name; }
+
+    /// Parses chr(number) as if(number < 0, '', convertCharset(char(0, number), 'unicode', 'utf-8')).
+    /// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH unless exactly one argument is given.
+    const ActionsDAG::Node * parse(
+        const substrait::Expression_ScalarFunction & substrait_func,
+        ActionsDAGPtr & actions_dag) const override
+    {
+        auto parsed_args = parseFunctionArguments(substrait_func, "", actions_dag);
+        if (parsed_args.size() != 1)
+            throw Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly one argument", getName());
+
+        const auto & num_arg = parsed_args[0];
+        const auto * const_zero_node = addColumnToActionsDAG(actions_dag, std::make_shared(), 0);
+        const auto * const_empty_node = addColumnToActionsDAG(actions_dag, std::make_shared(), "");
+        const auto * const_unicode_node = addColumnToActionsDAG(actions_dag, std::make_shared(), "unicode");
+        const auto * const_utf8_node = addColumnToActionsDAG(actions_dag, std::make_shared(), "utf-8");
+
+        // Condition of the outer if(): a negative number selects the empty-string branch.
+        const auto * less_node = toFunctionNode(actions_dag, "less", {num_arg, const_zero_node});
+
+        // Otherwise the result is convertCharset(char(0, number), 'unicode', 'utf-8').
+        const auto * char_node = toFunctionNode(actions_dag, "char", {const_zero_node, num_arg});
+        const auto * convert_charset_node = toFunctionNode(actions_dag, "convertCharset", {char_node, const_unicode_node, const_utf8_node});
+
+        const auto * if_node = toFunctionNode(actions_dag, "if", {less_node, const_empty_node, convert_charset_node});
+        return convertNodeTypeIfNeeded(substrait_func, if_node, actions_dag);
+    }
+};
+
+static FunctionParserRegister register_chr;
+}
diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index d12a40b764f8..3048c3f9cab5 100644
--- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -897,7 +897,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("Substring")
     .exclude("string substring_index function")
     .exclude("ascii for string")
-    .exclude("string for ascii")
     .exclude("base64/unbase64 for string")
     .exclude("encode/decode for string")
     .exclude("overlay for string")
diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 52e7ebcbda49..769707d4eb5f 100644
--- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -857,7 +857,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("string substring_index function")
     .exclude("SPARK-40213: ascii for Latin-1 Supplement characters")
     .exclude("ascii for string")
-    
.exclude("string for ascii") .exclude("base64/unbase64 for string") .exclude("encode/decode for string") .exclude("overlay for string") diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 38ed2c53463b..268f22fe6981 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -760,7 +760,6 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("string substring_index function") .exclude("SPARK-40213: ascii for Latin-1 Supplement characters") .exclude("ascii for string") - .exclude("string for ascii") .exclude("base64/unbase64 for string") .exclude("encode/decode for string") .exclude("Levenshtein distance") diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 38ed2c53463b..268f22fe6981 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -760,7 +760,6 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("string substring_index function") .exclude("SPARK-40213: ascii for Latin-1 Supplement characters") .exclude("ascii for string") - .exclude("string for ascii") .exclude("base64/unbase64 for string") .exclude("encode/decode for string") .exclude("Levenshtein distance")