From f837f060f60bb8e5ad173ce92d2a85c294da5d1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=98=B3=E9=98=B3?=
Date: Thu, 27 Jun 2024 19:52:06 +0800
Subject: [PATCH 1/4] [UT] Remove isVeloxBackendLoaded usage from file metadata UT (#6249)

---
 .../datasources/GlutenFileMetadataStructSuite.scala | 8 ++++----
 .../datasources/GlutenFileMetadataStructSuite.scala | 8 ++++----
 .../datasources/GlutenFileMetadataStructSuite.scala | 8 ++++----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala
index efa0fbae062b..ed347d024c1c 100644
--- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala
+++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala
@@ -16,8 +16,8 @@
  */
 package org.apache.spark.sql.execution.datasources
 
+import org.apache.gluten.backendsapi.BackendsApiManager
 import org.apache.gluten.execution.{FileSourceScanExecTransformer, FilterExecTransformer}
-import org.apache.gluten.utils.BackendTestUtils
 
 import org.apache.spark.sql.{Column, DataFrame, Row}
 import org.apache.spark.sql.GlutenSQLTestsBaseTrait
@@ -108,7 +108,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS
         METADATA_FILE_MODIFICATION_TIME,
         "age")
       dfWithMetadata.collect
-      if (BackendTestUtils.isVeloxBackendLoaded()) {
+      if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) {
        checkOperatorMatch[FileSourceScanExecTransformer](dfWithMetadata)
       } else {
        checkOperatorMatch[FileSourceScanExec](dfWithMetadata)
@@ -133,7 +133,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS
           .where(Column(METADATA_FILE_NAME) === f0((METADATA_FILE_NAME)))
         val ret = filterDF.collect
         assert(ret.size == 1)
-        if (BackendTestUtils.isVeloxBackendLoaded()) {
+        if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) {
          checkOperatorMatch[FileSourceScanExecTransformer](filterDF)
         } else {
          checkOperatorMatch[FileSourceScanExec](filterDF)
@@ -149,7 +149,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS
             Row(f1(METADATA_FILE_PATH))
           )
         )
-        if (BackendTestUtils.isVeloxBackendLoaded()) {
+        if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) {
          checkOperatorMatch[FileSourceScanExecTransformer](filterDF)
         } else {
          checkOperatorMatch[FileSourceScanExec](filterDF)
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala
index b3b9ea7393c3..6e47a94e3c13 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala
@@ -16,8 +16,8 @@
  */
 package org.apache.spark.sql.execution.datasources
 
+import org.apache.gluten.backendsapi.BackendsApiManager
 import org.apache.gluten.execution.{FileSourceScanExecTransformer, FilterExecTransformer}
-import org.apache.gluten.utils.BackendTestUtils
 
 import org.apache.spark.sql.{Column, DataFrame, Row}
 import org.apache.spark.sql.GlutenSQLTestsBaseTrait
@@ -109,7 +109,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS
         METADATA_FILE_MODIFICATION_TIME,
         "age")
       dfWithMetadata.collect
-      if (BackendTestUtils.isVeloxBackendLoaded()) {
+      if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) {
        checkOperatorMatch[FileSourceScanExecTransformer](dfWithMetadata)
       } else {
        checkOperatorMatch[FileSourceScanExec](dfWithMetadata)
@@ -134,7 +134,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS
           .where(Column(METADATA_FILE_NAME) === f0((METADATA_FILE_NAME)))
         val ret = filterDF.collect
         assert(ret.size == 1)
-        if (BackendTestUtils.isVeloxBackendLoaded()) {
+        if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) {
          checkOperatorMatch[FileSourceScanExecTransformer](filterDF)
         } else {
          checkOperatorMatch[FileSourceScanExec](filterDF)
@@ -150,7 +150,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS
             Row(f1(METADATA_FILE_PATH))
           )
         )
-        if (BackendTestUtils.isVeloxBackendLoaded()) {
+        if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) {
          checkOperatorMatch[FileSourceScanExecTransformer](filterDF)
         } else {
          checkOperatorMatch[FileSourceScanExec](filterDF)
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala
index efa0fbae062b..ed347d024c1c 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenFileMetadataStructSuite.scala
@@ -16,8 +16,8 @@
  */
 package org.apache.spark.sql.execution.datasources
 
+import org.apache.gluten.backendsapi.BackendsApiManager
 import org.apache.gluten.execution.{FileSourceScanExecTransformer, FilterExecTransformer}
-import org.apache.gluten.utils.BackendTestUtils
 
 import org.apache.spark.sql.{Column, DataFrame, Row}
 import org.apache.spark.sql.GlutenSQLTestsBaseTrait
@@ -108,7 +108,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS
         METADATA_FILE_MODIFICATION_TIME,
         "age")
       dfWithMetadata.collect
-      if (BackendTestUtils.isVeloxBackendLoaded()) {
+      if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) {
        checkOperatorMatch[FileSourceScanExecTransformer](dfWithMetadata)
       } else {
        checkOperatorMatch[FileSourceScanExec](dfWithMetadata)
@@ -133,7 +133,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS
           .where(Column(METADATA_FILE_NAME) === f0((METADATA_FILE_NAME)))
         val ret = filterDF.collect
         assert(ret.size == 1)
-        if (BackendTestUtils.isVeloxBackendLoaded()) {
+        if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) {
          checkOperatorMatch[FileSourceScanExecTransformer](filterDF)
         } else {
          checkOperatorMatch[FileSourceScanExec](filterDF)
@@ -149,7 +149,7 @@ class GlutenFileMetadataStructSuite extends FileMetadataStructSuite with GlutenS
             Row(f1(METADATA_FILE_PATH))
           )
         )
-        if (BackendTestUtils.isVeloxBackendLoaded()) {
+        if (BackendsApiManager.getSettings.supportNativeMetadataColumns()) {
          checkOperatorMatch[FileSourceScanExecTransformer](filterDF)
         } else {
          checkOperatorMatch[FileSourceScanExec](filterDF)

From eaace6c16f5355ae30872bfd947ca36b550cd497 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=98=B3=E9=98=B3?=
Date: Thu, 27 Jun 2024 19:56:42 +0800
Subject: [PATCH 2/4] [CORE] Log unknown fallback reason (#6237)

---
 .../org/apache/spark/sql/execution/GlutenFallbackReporter.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/execution/GlutenFallbackReporter.scala b/gluten-core/src/main/scala/org/apache/spark/sql/execution/GlutenFallbackReporter.scala
index 00b0248aee77..721a30eb4f40 100644
--- a/gluten-core/src/main/scala/org/apache/spark/sql/execution/GlutenFallbackReporter.scala
+++ b/gluten-core/src/main/scala/org/apache/spark/sql/execution/GlutenFallbackReporter.scala
@@ -89,6 +89,7 @@ case class GlutenFallbackReporter(glutenConfig: GlutenConfig, spark: SparkSessio
           logicalPlan.setTagValue(FALLBACK_REASON_TAG, newReason)
         }
       case TRANSFORM_UNSUPPORTED(_, _) =>
+        logFallbackReason(validationLogLevel, p.nodeName, "unknown reason")
       case _ =>
         throw new IllegalStateException("Unreachable code")
     }

From 22dc4fdcb5197e7c4a7fdfd768f5abf7a85b354f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com>
Date: Thu, 27 Jun 2024 20:34:23 +0800
Subject: [PATCH 3/4] [GLUTEN-2790][CH] Fix diff between ch char and spark chr (#6236)

[CH] Fix diff between ch char and spark chr
---
 .../Parser/SerializedPlanParser.h             |  1 -
 .../Parser/scalar_function_parser/chr.cpp     | 71 +++++++++++++++++++
 .../clickhouse/ClickHouseTestSettings.scala   |  1 -
 .../clickhouse/ClickHouseTestSettings.scala   |  1 -
 .../clickhouse/ClickHouseTestSettings.scala   |  1 -
 .../clickhouse/ClickHouseTestSettings.scala   |  1 -
 6 files changed, 71 insertions(+), 5 deletions(-)
 create mode 100644 cpp-ch/local-engine/Parser/scalar_function_parser/chr.cpp

diff --git a/cpp-ch/local-engine/Parser/SerializedPlanParser.h b/cpp-ch/local-engine/Parser/SerializedPlanParser.h
index 184065836e65..1785f64ee17c 100644
--- a/cpp-ch/local-engine/Parser/SerializedPlanParser.h
+++ b/cpp-ch/local-engine/Parser/SerializedPlanParser.h
@@ -133,7 +133,6 @@ static const std::map<std::string, std::string> SCALAR_FUNCTIONS
     {"replace", "replaceAll"},
     {"regexp_replace", "replaceRegexpAll"},
     {"regexp_extract_all", "regexpExtractAllSpark"},
-    {"chr", "char"},
     {"rlike", "match"},
     {"ascii", "ascii"},
     {"split", "splitByRegexp"},
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/chr.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/chr.cpp
new file mode 100644
index 000000000000..d168e63d11dc
--- /dev/null
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/chr.cpp
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <DataTypes/DataTypesNumber.h>
+#include <DataTypes/DataTypeString.h>
+#include <Parser/FunctionParser.h>
+#include <Common/CHUtil.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+}
+}
+
+namespace local_engine
+{
+class FunctionParserChr : public FunctionParser
+{
+public:
+    explicit FunctionParserChr(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) { }
+    ~FunctionParserChr() override = default;
+    static constexpr auto name = "chr";
+    String getName() const override { return name; }
+
+    const ActionsDAG::Node * parse(
+        const substrait::Expression_ScalarFunction & substrait_func,
+        ActionsDAGPtr & actions_dag) const override
+    {
+        auto parsed_args = parseFunctionArguments(substrait_func, "", actions_dag);
+        if (parsed_args.size() != 1)
+            throw Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly one argument", getName());
+
+        /*
+            parse chr(number) as if(number < 0, '', convertCharset(char(0, number), 'unicode', 'utf-8'))
+        */
+        const auto & num_arg = parsed_args[0];
+        const auto * const_zero_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeInt32>(), 0);
+        const auto * const_empty_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), "");
+        const auto * const_four_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeInt32>(), 4);
+        const auto * const_unicode_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), "unicode");
+        const auto * const_utf8_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), "utf-8");
+
+        const auto * less_node = toFunctionNode(actions_dag, "less", {num_arg, const_zero_node});
+
+        const auto * char_node = toFunctionNode(actions_dag, "char", {const_zero_node, num_arg});
+        const auto * convert_charset_node = toFunctionNode(actions_dag, "convertCharset", {char_node, const_unicode_node, const_utf8_node});
+
+        const auto * if_node = toFunctionNode(actions_dag, "if", {less_node, const_empty_node, convert_charset_node});
+        const auto * result_node = convertNodeTypeIfNeeded(substrait_func, if_node, actions_dag);
+        return result_node;
+    }
+};
+
+static FunctionParserRegister<FunctionParserChr> register_chr;
+}
diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index d12a40b764f8..3048c3f9cab5 100644
--- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -897,7 +897,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("Substring")
     .exclude("string substring_index function")
     .exclude("ascii for string")
-    .exclude("string for ascii")
     .exclude("base64/unbase64 for string")
     .exclude("encode/decode for string")
     .exclude("overlay for string")
diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 52e7ebcbda49..769707d4eb5f 100644
--- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -857,7 +857,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("string substring_index function")
    .exclude("SPARK-40213: ascii for Latin-1 Supplement characters")
    .exclude("ascii for string")
-    .exclude("string for ascii")
    .exclude("base64/unbase64 for string")
    .exclude("encode/decode for string")
    .exclude("overlay for string")
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 38ed2c53463b..268f22fe6981 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -760,7 +760,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
    .exclude("string substring_index function")
    .exclude("SPARK-40213: ascii for Latin-1 Supplement characters")
    .exclude("ascii for string")
-    .exclude("string for ascii")
    .exclude("base64/unbase64 for string")
    .exclude("encode/decode for string")
    .exclude("Levenshtein distance")
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 38ed2c53463b..268f22fe6981 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -760,7 +760,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
    .exclude("string substring_index function")
    .exclude("SPARK-40213: ascii for Latin-1 Supplement characters")
    .exclude("ascii for string")
-    .exclude("string for ascii")
    .exclude("base64/unbase64 for string")
    .exclude("encode/decode for string")
    .exclude("Levenshtein distance")

From b6776b6960fe4ea0cc2a524d5c254ad64e87a508 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=98=B3=E9=98=B3?=
Date: Fri, 28 Jun 2024 08:25:28 +0800
Subject: [PATCH 4/4] [CORE] Rename isTransformable API to maybeTransformable (#6233)

---
 .../extension/columnar/OffloadSingleNode.scala   | 11 ++++-------
 .../extension/columnar/TransformHintRule.scala   | 16 +++++-----------
 .../rewrite/RewriteSparkPlanRulesManager.scala   |  2 +-
 3 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/OffloadSingleNode.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/OffloadSingleNode.scala
index 8cd2a5fb67bd..75da28e30d39 100644
--- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/OffloadSingleNode.scala
+++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/OffloadSingleNode.scala
@@ -213,18 +213,15 @@ case class OffloadFilter() extends OffloadSingleNode with LogLevelUtil {
     // Push down the left conditions in Filter into FileSourceScan.
     val newChild: SparkPlan = filter.child match {
       case scan @ (_: FileSourceScanExec | _: BatchScanExec) =>
-        if (TransformHints.isTransformable(scan)) {
+        if (TransformHints.maybeTransformable(scan)) {
           val newScan = FilterHandler.pushFilterToScan(filter.condition, scan)
           newScan match {
             case ts: TransformSupport if ts.doValidate().isValid => ts
-            // TODO remove the call
-            case _ => replace.doReplace(scan)
+            case _ => scan
           }
-        } else {
-          replace.doReplace(scan)
-        }
-      case _ => replace.doReplace(filter.child)
+        } else scan
+      case _ => filter.child
     }
     logDebug(s"Columnar Processing for ${filter.getClass} is currently supported.")
     BackendsApiManager.getSparkPlanExecApiInstance
       .genFilterExecTransformer(filter.condition, newChild)
diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/TransformHintRule.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/TransformHintRule.scala
index d32cf2d22eb4..aa7aab759ef8 100644
--- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/TransformHintRule.scala
+++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/TransformHintRule.scala
@@ -79,18 +79,12 @@ object TransformHints {
   }
 
   /**
-   * NOTE: To be deprecated. Do not create new usages of this method.
-   *
-   * Since it's usually not safe to consider a plan "transformable" during validation phase. Another
-   * validation rule could turn "transformable" to "non-transformable" before implementing the plan
-   * within Gluten transformers.
+   * If true, it implies the plan may be transformable during the validation phase, but this is not
+   * guaranteed, since another validation rule could still turn it "non-transformable" before the
+   * plan is implemented within Gluten transformers. If false, the plan node is guaranteed to fall
+   * back to a vanilla Spark plan node when implemented.
    */
-  def isTransformable(plan: SparkPlan): Boolean = {
-    getHintOption(plan) match {
-      case None => true
-      case _ => false
-    }
-  }
+  def maybeTransformable(plan: SparkPlan): Boolean = !isNotTransformable(plan)
 
   def tag(plan: SparkPlan, hint: TransformHint): Unit = {
     val mergedHint = getHintOption(plan)
diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteSparkPlanRulesManager.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteSparkPlanRulesManager.scala
index ac663314bead..8706e5618f6b 100644
--- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteSparkPlanRulesManager.scala
+++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/rewrite/RewriteSparkPlanRulesManager.scala
@@ -49,7 +49,7 @@ class RewriteSparkPlanRulesManager private (rewriteRules: Seq[RewriteSingleNode]
   extends Rule[SparkPlan] {
 
   private def mayNeedRewrite(plan: SparkPlan): Boolean = {
-    TransformHints.isTransformable(plan) && {
+    TransformHints.maybeTransformable(plan) && {
       plan match {
         case _: SortExec => true
         case _: TakeOrderedAndProjectExec => true