From f863d8c3a46a345e18e1f583bebf6131f3462a05 Mon Sep 17 00:00:00 2001 From: rui-mo Date: Fri, 23 Feb 2024 10:26:15 +0800 Subject: [PATCH] Fix split function --- .../velox/SparkPlanExecApiImpl.scala | 13 +------- .../expression/ExpressionTransformer.scala | 30 ------------------- .../execution/VeloxStringFunctionsSuite.scala | 3 +- ep/build-velox/src/get_velox.sh | 4 +-- .../backendsapi/SparkPlanExecApi.scala | 10 ------- .../expression/ExpressionConverter.scala | 8 ----- 6 files changed, 4 insertions(+), 64 deletions(-) diff --git a/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/SparkPlanExecApiImpl.scala b/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/SparkPlanExecApiImpl.scala index cd53b74560a49..e4e074acc3c5c 100644 --- a/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/SparkPlanExecApiImpl.scala +++ b/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/SparkPlanExecApiImpl.scala @@ -36,7 +36,7 @@ import org.apache.spark.sql.catalyst.{AggregateFunctionRewriteRule, FlushableHas import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec -import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Cast, CreateNamedStruct, ElementAt, Expression, ExpressionInfo, GetArrayItem, GetMapValue, GetStructField, If, IsNaN, Literal, Murmur3Hash, NamedExpression, NaNvl, Round, StringSplit, StringTrim} +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Cast, CreateNamedStruct, ElementAt, Expression, ExpressionInfo, GetArrayItem, GetMapValue, GetStructField, If, IsNaN, Literal, Murmur3Hash, NamedExpression, NaNvl, Round, StringTrim} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, HLLAdapter} import org.apache.spark.sql.catalyst.optimizer.BuildSide import org.apache.spark.sql.catalyst.plans.JoinType @@ -406,17 +406,6 @@ class SparkPlanExecApiImpl extends SparkPlanExecApi { DecimalRoundTransformer(substraitExprName, child, original) } - /** Generate StringSplit transformer. */ - override def genStringSplitTransformer( - substraitExprName: String, - srcExpr: ExpressionTransformer, - regexExpr: ExpressionTransformer, - limitExpr: ExpressionTransformer, - original: StringSplit): ExpressionTransformer = { - // In velox, split function just support tow args, not support limit arg for now - VeloxStringSplitTransformer(substraitExprName, srcExpr, regexExpr, limitExpr, original) - } - /** * Generate Alias transformer. * diff --git a/backends-velox/src/main/scala/io/glutenproject/expression/ExpressionTransformer.scala b/backends-velox/src/main/scala/io/glutenproject/expression/ExpressionTransformer.scala index 34db622835f08..2a01fb0f5a808 100644 --- a/backends-velox/src/main/scala/io/glutenproject/expression/ExpressionTransformer.scala +++ b/backends-velox/src/main/scala/io/glutenproject/expression/ExpressionTransformer.scala @@ -113,33 +113,3 @@ case class VeloxHashExpressionTransformer( ExpressionBuilder.makeScalarFunction(functionId, nodes, typeNode) } } - -case class VeloxStringSplitTransformer( - substraitExprName: String, - srcExpr: ExpressionTransformer, - regexExpr: ExpressionTransformer, - limitExpr: ExpressionTransformer, - original: StringSplit) - extends ExpressionTransformer { - - override def doTransform(args: java.lang.Object): ExpressionNode = { - if ( - !regexExpr.isInstanceOf[LiteralTransformer] || - !limitExpr.isInstanceOf[LiteralTransformer] - ) { - throw new UnsupportedOperationException( - "Gluten only supports literal input as limit/regex for split function.") - } - - val limit = limitExpr.doTransform(args).asInstanceOf[IntLiteralNode].getValue - val regex = regexExpr.doTransform(args).asInstanceOf[StringLiteralNode].getValue - if (limit > 0 || regex.length > 1) { - throw new UnsupportedOperationException( - s"$original supported single-length regex and negative limit, but given $limit and $regex") - } - - // TODO: split function support limit arg - GenericExpressionTransformer(substraitExprName, Seq(srcExpr, regexExpr), original) - .doTransform(args) - } -} diff --git a/backends-velox/src/test/scala/io/glutenproject/execution/VeloxStringFunctionsSuite.scala b/backends-velox/src/test/scala/io/glutenproject/execution/VeloxStringFunctionsSuite.scala index c306d70ac5190..a386cea4cfe62 100644 --- a/backends-velox/src/test/scala/io/glutenproject/execution/VeloxStringFunctionsSuite.scala +++ b/backends-velox/src/test/scala/io/glutenproject/execution/VeloxStringFunctionsSuite.scala @@ -523,12 +523,11 @@ class VeloxStringFunctionsSuite extends VeloxWholeStageTransformerSuite { s"from $LINEITEM_TABLE limit 5") { _ => } } - ignore("split") { + test("split") { runQueryAndCompare( s"select l_orderkey, l_comment, split(l_comment, ' ', 3) " + s"from $LINEITEM_TABLE limit 5") { _ => } - // todo incorrect results runQueryAndCompare( s"select l_orderkey, l_comment, split(l_comment, '[a]', 3) " + s"from $LINEITEM_TABLE limit 5") { _ => } diff --git a/ep/build-velox/src/get_velox.sh b/ep/build-velox/src/get_velox.sh index 1544225ad3ff8..1ab4dc404e5ec 100755 --- a/ep/build-velox/src/get_velox.sh +++ b/ep/build-velox/src/get_velox.sh @@ -16,8 +16,8 @@ set -exu -VELOX_REPO=https://github.com/oap-project/velox.git -VELOX_BRANCH=2024_02_22 +VELOX_REPO=https://github.com/rui-mo/velox.git +VELOX_BRANCH=test_split VELOX_HOME="" #Set on run gluten on HDFS diff --git a/gluten-core/src/main/scala/io/glutenproject/backendsapi/SparkPlanExecApi.scala b/gluten-core/src/main/scala/io/glutenproject/backendsapi/SparkPlanExecApi.scala index 7df4a14e178e0..f3163ccffea64 100644 --- a/gluten-core/src/main/scala/io/glutenproject/backendsapi/SparkPlanExecApi.scala +++ b/gluten-core/src/main/scala/io/glutenproject/backendsapi/SparkPlanExecApi.scala @@ -143,16 +143,6 @@ trait SparkPlanExecApi { original: Expression): ExpressionTransformer = AliasTransformer(substraitExprName, child, original) - /** Generate SplitTransformer. */ - def genStringSplitTransformer( - substraitExprName: String, - srcExpr: ExpressionTransformer, - regexExpr: ExpressionTransformer, - limitExpr: ExpressionTransformer, - original: StringSplit): ExpressionTransformer = { - GenericExpressionTransformer(substraitExprName, Seq(srcExpr, regexExpr, limitExpr), original) - } - def genRandTransformer( substraitExprName: String, explicitSeed: ExpressionTransformer, diff --git a/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionConverter.scala b/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionConverter.scala index f75d7fe74cdb2..6b1f62ef05420 100644 --- a/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionConverter.scala +++ b/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionConverter.scala @@ -353,14 +353,6 @@ object ExpressionConverter extends SQLConfHelper with Logging { replaceWithExpressionTransformerInternal(l.third, attributeSeq, expressionsMap), l ) - case s: StringSplit => - BackendsApiManager.getSparkPlanExecApiInstance.genStringSplitTransformer( - substraitExprName, - replaceWithExpressionTransformerInternal(s.str, attributeSeq, expressionsMap), - replaceWithExpressionTransformerInternal(s.regex, attributeSeq, expressionsMap), - replaceWithExpressionTransformerInternal(s.limit, attributeSeq, expressionsMap), - s - ) case r: RegExpReplace => RegExpReplaceTransformer( substraitExprName,