diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala index f765a75d2f7d..826a0deb5b29 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala @@ -925,4 +925,12 @@ class CHSparkPlanExecApi extends SparkPlanExecApi with Logging { limit, mode, child) + + override def genStringSplitTransformer( + substraitExprName: String, + srcExpr: ExpressionTransformer, + regexExpr: ExpressionTransformer, + limitExpr: ExpressionTransformer, + original: StringSplit): ExpressionTransformer = + CHStringSplitTransformer(substraitExprName, Seq(srcExpr, regexExpr, limitExpr), original) } diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHGenerateExecTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHGenerateExecTransformer.scala index 44cb0deca523..fc7da0a6ddee 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHGenerateExecTransformer.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHGenerateExecTransformer.scala @@ -76,6 +76,7 @@ case class CHGenerateExecTransformer( inputRel, generatorNode, requiredChildOutputNodes.asJava, + outer, context, context.nextOperatorId(this.nodeName)) } else { @@ -84,6 +85,7 @@ case class CHGenerateExecTransformer( generatorNode, requiredChildOutputNodes.asJava, getExtensionNodeForValidation, + outer, context, context.nextOperatorId(this.nodeName)) } diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/expression/CHExpressionTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/expression/CHExpressionTransformer.scala index 7b389ead0091..0851e6aa8a37 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/expression/CHExpressionTransformer.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/expression/CHExpressionTransformer.scala @@ -243,3 +243,12 @@ case class GetArrayItemTransformer( ConverterUtils.getTypeNode(getArrayItem.dataType, getArrayItem.nullable)) } } +case class CHStringSplitTransformer( + substraitExprName: String, + children: Seq[ExpressionTransformer], + original: Expression, + override val dataType: DataType = ArrayType(StringType, containsNull = true)) + extends ExpressionTransformer { + // In Spark: split return Array(String), while Array is nullable + // In CH: splitByXXX return Array(Nullable(String)) +} diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/GenerateExecTransformer.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/GenerateExecTransformer.scala index be76ba54ed72..a81d812d9cde 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/execution/GenerateExecTransformer.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/execution/GenerateExecTransformer.scala @@ -91,6 +91,7 @@ case class GenerateExecTransformer( generatorNode, requiredChildOutputNodes.asJava, getExtensionNode(validation), + outer, context, operatorId) } diff --git a/cpp-ch/local-engine/Parser/RelParsers/ProjectRelParser.cpp b/cpp-ch/local-engine/Parser/RelParsers/ProjectRelParser.cpp index 6fb1f3d961cc..59c29dc24f16 100644 --- a/cpp-ch/local-engine/Parser/RelParsers/ProjectRelParser.cpp +++ b/cpp-ch/local-engine/Parser/RelParsers/ProjectRelParser.cpp @@ -199,7 +199,7 @@ ProjectRelParser::parseGenerate(DB::QueryPlanPtr query_plan, const substrait::Re /// ARRAY JOIN NameSet array_joined_columns{findArrayJoinNode(splitted_actions_dags.array_join)->result_name}; - auto array_join_action = std::make_shared(array_joined_columns, false, getContext()); + auto array_join_action = std::make_shared(array_joined_columns, generate_rel.outer(), getContext()); auto array_join_step = std::make_unique(query_plan->getCurrentDataStream(), array_join_action); array_join_step->setStepDescription("ARRAY JOIN In Generate"); steps.emplace_back(array_join_step.get()); diff --git a/gluten-substrait/src/main/java/org/apache/gluten/substrait/rel/GenerateRelNode.java b/gluten-substrait/src/main/java/org/apache/gluten/substrait/rel/GenerateRelNode.java index dbe1f62ab10d..efd470891905 100644 --- a/gluten-substrait/src/main/java/org/apache/gluten/substrait/rel/GenerateRelNode.java +++ b/gluten-substrait/src/main/java/org/apache/gluten/substrait/rel/GenerateRelNode.java @@ -31,20 +31,24 @@ public class GenerateRelNode implements RelNode, Serializable { private final ExpressionNode generator; private final List childOutput; private final AdvancedExtensionNode extensionNode; + private final boolean outer; - GenerateRelNode(RelNode input, ExpressionNode generator, List childOutput) { - this(input, generator, childOutput, null); + GenerateRelNode( + RelNode input, ExpressionNode generator, List childOutput, boolean outer) { + this(input, generator, childOutput, null, outer); } GenerateRelNode( RelNode input, ExpressionNode generator, List childOutput, - AdvancedExtensionNode extensionNode) { + AdvancedExtensionNode extensionNode, + boolean outer) { this.input = input; this.generator = generator; this.childOutput = childOutput; this.extensionNode = extensionNode; + this.outer = outer; } @Override @@ -67,6 +71,8 @@ public Rel toProtobuf() { generateRelBuilder.addChildOutput(node.toProtobuf()); } + generateRelBuilder.setOuter(outer); + if (extensionNode != null) { generateRelBuilder.setAdvancedExtension(extensionNode.toProtobuf()); } diff --git a/gluten-substrait/src/main/java/org/apache/gluten/substrait/rel/RelBuilder.java b/gluten-substrait/src/main/java/org/apache/gluten/substrait/rel/RelBuilder.java index b784e3e7f273..def1dca0a028 100644 --- a/gluten-substrait/src/main/java/org/apache/gluten/substrait/rel/RelBuilder.java +++ b/gluten-substrait/src/main/java/org/apache/gluten/substrait/rel/RelBuilder.java @@ -299,10 +299,11 @@ public static RelNode makeGenerateRel( RelNode input, ExpressionNode generator, List childOutput, + boolean outer, SubstraitContext context, Long operatorId) { context.registerRelToOperator(operatorId); - return new GenerateRelNode(input, generator, childOutput); + return new GenerateRelNode(input, generator, childOutput, outer); } public static RelNode makeGenerateRel( @@ -310,9 +311,10 @@ public static RelNode makeGenerateRel( ExpressionNode generator, List childOutput, AdvancedExtensionNode extensionNode, + boolean outer, SubstraitContext context, Long operatorId) { context.registerRelToOperator(operatorId); - return new GenerateRelNode(input, generator, childOutput, extensionNode); + return new GenerateRelNode(input, generator, childOutput, extensionNode, outer); } } diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/SparkPlanExecApi.scala b/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/SparkPlanExecApi.scala index dd4150806cfc..71fef82722c6 100644 --- a/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/SparkPlanExecApi.scala +++ b/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/SparkPlanExecApi.scala @@ -693,4 +693,12 @@ trait SparkPlanExecApi { attributeSeq: Seq[Attribute]): ExpressionTransformer = { HiveUDFTransformer.replaceWithExpressionTransformer(expr, attributeSeq) } + + def genStringSplitTransformer( + substraitExprName: String, + srcExpr: ExpressionTransformer, + regexExpr: ExpressionTransformer, + limitExpr: ExpressionTransformer, + original: StringSplit): ExpressionTransformer = + GenericExpressionTransformer(substraitExprName, Seq(srcExpr, regexExpr, limitExpr), original) } diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala b/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala index 606cbd96e026..98a556c11736 100644 --- a/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala +++ b/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala @@ -689,6 +689,14 @@ object ExpressionConverter extends SQLConfHelper with Logging { timeAdd.children, timeAdd ) + case ss: StringSplit => + BackendsApiManager.getSparkPlanExecApiInstance.genStringSplitTransformer( + substraitExprName, + replaceWithExpressionTransformer0(ss.str, attributeSeq, expressionsMap), + replaceWithExpressionTransformer0(ss.regex, attributeSeq, expressionsMap), + replaceWithExpressionTransformer0(ss.limit, attributeSeq, expressionsMap), + ss + ) case expr => GenericExpressionTransformer( substraitExprName, diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala index dd14a18553b9..e7d573ca5e7d 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala @@ -105,4 +105,21 @@ class GlutenHiveSQLQueryCHSuite extends GlutenHiveSQLQuerySuiteBase { purge = false) } + testGluten("GLUTEN-7116: Support outer explode") { + sql("create table if not exists test_7116 (id int, name string)") + sql("insert into test_7116 values (1, 'a,b'), (2, null), (null, 'c,d'), (3, '')") + val query = + """ + |select id, col_name + |from test_7116 lateral view outer explode(split(name, ',')) as col_name + |""".stripMargin + val df = sql(query) + checkAnswer( + df, + Seq(Row(1, "a"), Row(1, "b"), Row(2, null), Row(null, "c"), Row(null, "d"), Row(3, ""))) + spark.sessionState.catalog.dropTable( + TableIdentifier("test_7116"), + ignoreIfNotExists = true, + purge = false) + } } diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala index dd14a18553b9..e7d573ca5e7d 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala @@ -105,4 +105,21 @@ class GlutenHiveSQLQueryCHSuite extends GlutenHiveSQLQuerySuiteBase { purge = false) } + testGluten("GLUTEN-7116: Support outer explode") { + sql("create table if not exists test_7116 (id int, name string)") + sql("insert into test_7116 values (1, 'a,b'), (2, null), (null, 'c,d'), (3, '')") + val query = + """ + |select id, col_name + |from test_7116 lateral view outer explode(split(name, ',')) as col_name + |""".stripMargin + val df = sql(query) + checkAnswer( + df, + Seq(Row(1, "a"), Row(1, "b"), Row(2, null), Row(null, "c"), Row(null, "d"), Row(3, ""))) + spark.sessionState.catalog.dropTable( + TableIdentifier("test_7116"), + ignoreIfNotExists = true, + purge = false) + } } diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala index dd14a18553b9..e7d573ca5e7d 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala @@ -105,4 +105,21 @@ class GlutenHiveSQLQueryCHSuite extends GlutenHiveSQLQuerySuiteBase { purge = false) } + testGluten("GLUTEN-7116: Support outer explode") { + sql("create table if not exists test_7116 (id int, name string)") + sql("insert into test_7116 values (1, 'a,b'), (2, null), (null, 'c,d'), (3, '')") + val query = + """ + |select id, col_name + |from test_7116 lateral view outer explode(split(name, ',')) as col_name + |""".stripMargin + val df = sql(query) + checkAnswer( + df, + Seq(Row(1, "a"), Row(1, "b"), Row(2, null), Row(null, "c"), Row(null, "d"), Row(3, ""))) + spark.sessionState.catalog.dropTable( + TableIdentifier("test_7116"), + ignoreIfNotExists = true, + purge = false) + } } diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala index dd14a18553b9..e7d573ca5e7d 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQueryCHSuite.scala @@ -105,4 +105,21 @@ class GlutenHiveSQLQueryCHSuite extends GlutenHiveSQLQuerySuiteBase { purge = false) } + testGluten("GLUTEN-7116: Support outer explode") { + sql("create table if not exists test_7116 (id int, name string)") + sql("insert into test_7116 values (1, 'a,b'), (2, null), (null, 'c,d'), (3, '')") + val query = + """ + |select id, col_name + |from test_7116 lateral view outer explode(split(name, ',')) as col_name + |""".stripMargin + val df = sql(query) + checkAnswer( + df, + Seq(Row(1, "a"), Row(1, "b"), Row(2, null), Row(null, "c"), Row(null, "d"), Row(3, ""))) + spark.sessionState.catalog.dropTable( + TableIdentifier("test_7116"), + ignoreIfNotExists = true, + purge = false) + } }