diff --git a/docs/Configuration.md b/docs/Configuration.md
index a80f0f716c5e..1985cc898190 100644
--- a/docs/Configuration.md
+++ b/docs/Configuration.md
@@ -9,54 +9,55 @@ nav_order: 3
There are many configurations that could impact the Gluten Plugin performance and can be fine-tuned in Spark.
You can add these configurations into spark-defaults.conf to enable or disable the settings.
-| Parameters | Description | Recommend Setting |
-|---------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------|
-| spark.driver.extraClassPath | To add Gluten Plugin jar file in Spark Driver | /path/to/jar_file |
-| spark.executor.extraClassPath | To add Gluten Plugin jar file in Spark Executor | /path/to/jar_file |
-| spark.executor.memory | To set up how much memory to be used for Spark Executor. | |
-| spark.memory.offHeap.size | To set up how much memory to be used for Java OffHeap.
Please notice Gluten Plugin will leverage this setting to allocate memory space for native usage even offHeap is disabled.
The value is based on your system and it is recommended to set it larger if you are facing Out of Memory issue in Gluten Plugin | 30G |
-| spark.sql.sources.useV1SourceList | Choose to use V1 source | avro |
-| spark.sql.join.preferSortMergeJoin | To turn off preferSortMergeJoin in Spark | false |
-| spark.plugins | To load Gluten's components by Spark's plug-in loader | com.intel.oap.GlutenPlugin |
-| spark.shuffle.manager | To turn on Gluten Columnar Shuffle Plugin | org.apache.spark.shuffle.sort.ColumnarShuffleManager |
-| spark.gluten.enabled | Enable Gluten, default is true. Just an experimental property. Recommend to enable/disable Gluten through the setting for `spark.plugins`. | true |
-| spark.gluten.sql.columnar.maxBatchSize | Number of rows to be processed in each batch. Default value is 4096. | 4096 |
-| spark.gluten.memory.isolation | (Experimental) Enable isolated memory mode. If true, Gluten controls the maximum off-heap memory can be used by each task to X, X = executor memory / max task slots. It's recommended to set true if Gluten serves concurrent queries within a single session, since not all memory Gluten allocated is guaranteed to be spillable. In the case, the feature should be enabled to avoid OOM. Note when true, setting spark.memory.storageFraction to a lower value is suggested since storage memory is considered non-usable by Gluten. | false |
-| spark.gluten.sql.columnar.scanOnly | When enabled, this config will overwrite all other operators' enabling, and only Scan and Filter pushdown will be offloaded to native. | false |
-| spark.gluten.sql.columnar.batchscan | Enable or Disable Columnar BatchScan, default is true | true |
-| spark.gluten.sql.columnar.hashagg | Enable or Disable Columnar Hash Aggregate, default is true | true |
-| spark.gluten.sql.columnar.project | Enable or Disable Columnar Project, default is true | true |
-| spark.gluten.sql.columnar.filter | Enable or Disable Columnar Filter, default is true | true |
-| spark.gluten.sql.columnar.sort | Enable or Disable Columnar Sort, default is true | true |
-| spark.gluten.sql.columnar.window | Enable or Disable Columnar Window, default is true | true |
-| spark.gluten.sql.columnar.shuffledHashJoin | Enable or Disable ShuffledHashJoin, default is true | true |
-| spark.gluten.sql.columnar.forceShuffledHashJoin | Force to use ShuffledHashJoin over SortMergeJoin, default is true | true |
-| spark.gluten.sql.columnar.sortMergeJoin | Enable or Disable Columnar Sort Merge Join, default is true | true |
-| spark.gluten.sql.columnar.union | Enable or Disable Columnar Union, default is true | true |
-| spark.gluten.sql.columnar.expand | Enable or Disable Columnar Expand, default is true | true |
-| spark.gluten.sql.columnar.generate | Enable or Disable Columnar Generate, default is true | true |
-| spark.gluten.sql.columnar.limit | Enable or Disable Columnar Limit, default is true | true |
-| spark.gluten.sql.columnar.tableCache | Enable or Disable Columnar Table Cache, default is false | true |
-| spark.gluten.sql.columnar.broadcastExchange | Enable or Disable Columnar Broadcast Exchange, default is true | true |
-| spark.gluten.sql.columnar.broadcastJoin | Enable or Disable Columnar BroadcastHashJoin, default is true | true |
-| spark.gluten.sql.columnar.shuffle.codec | Set up the codec to be used for Columnar Shuffle. If this configuration is not set, will check the value of spark.io.compression.codec. By default, Gluten use software compression. Valid options for software compression are lz4, zstd. Valid options for QAT and IAA is gzip. | lz4 |
-| spark.gluten.sql.columnar.shuffle.codecBackend | Enable using hardware accelerators for shuffle de/compression. Valid options are QAT and IAA. | |
-| spark.gluten.sql.columnar.shuffle.compressionMode | Setting different compression mode in shuffle, Valid options are buffer and rowvector, buffer option compress each buffer of RowVector individually into one pre-allocated large buffer, rowvector option first copies each buffer of RowVector to a large buffer and then compress the entire buffer in one go. | buffer |
-| spark.gluten.sql.columnar.shuffle.compression.threshold | If number of rows in a batch falls below this threshold, will copy all buffers into one buffer to compress. | 100 |
-| spark.gluten.sql.columnar.shuffle.realloc.threshold | Set the threshold to dynamically adjust the size of shuffle split buffers. The size of each split buffer is recalculated for each incoming batch of data. If the new size deviates from the current partition buffer size by a factor outside the range of [1 - threshold, 1 + threshold], the split buffer will be re-allocated using the newly calculated size | 0.25 |
-| spark.gluten.sql.columnar.numaBinding | Set up NUMABinding, default is false | true |
-| spark.gluten.sql.columnar.coreRange | Set up the core range for NUMABinding, only works when numaBinding set to true.
The setting is based on the number of cores in your system. Use 72 cores as an example. | 0-17,36-53 |18-35,54-71 |
-| spark.gluten.sql.native.bloomFilter | Enable or Disable native runtime bloom filter. | true |
-| spark.gluten.sql.columnar.wholeStage.fallback.threshold | Configure the threshold for whether whole stage will fall back in AQE supported case by counting the number of ColumnarToRow & vanilla leaf node | \>= 3 |
-| spark.gluten.sql.columnar.query.fallback.threshold | Configure the threshold for whether query will fall back by counting the number of ColumnarToRow & vanilla leaf node | \>= 1 |
-| spark.gluten.sql.columnar.maxBatchSize | Set the number of rows for the output batch | 4096 |
-| spark.gluten.shuffleWriter.bufferSize | Set the number of buffer rows for the shuffle writer | value of spark.gluten.sql.columnar.maxBatchSize |
-| spark.gluten.loadLibFromJar | Controls whether to load dynamic link library from a packed jar for gluten/cpp. Not applicable to static build and clickhouse backend. | false |
-| spark.gluten.sql.columnar.force.hashagg | Force to use hash agg to replace sort agg. | true |
-| spark.gluten.sql.columnar.vanillaReaders | Enable vanilla spark's vectorized reader. Please note it may bring perf. overhead due to extra data transition. We recommend to disable it if most queries can be fully offloaded to gluten. | false |
-| spark.gluten.sql.columnar.backend.velox.bloomFilter.expectedNumItems | The default number of expected items for the velox bloomfilter. | 1000000L |
-| spark.gluten.sql.columnar.backend.velox.bloomFilter.numBits | The default number of bits to use for the velox bloom filter. | 8388608L |
-| spark.gluten.sql.columnar.backend.velox.bloomFilter.maxNumBits | The max number of bits to use for the velox bloom filter. | 4194304L |
+| Parameters | Description | Recommend Setting |
+|----------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------|
+| spark.driver.extraClassPath | To add Gluten Plugin jar file in Spark Driver | /path/to/jar_file |
+| spark.executor.extraClassPath | To add Gluten Plugin jar file in Spark Executor | /path/to/jar_file |
+| spark.executor.memory | To set up how much memory to be used for Spark Executor. | |
+| spark.memory.offHeap.size | To set up how much memory to be used for Java off-heap. Please note that the Gluten Plugin will leverage this setting to allocate memory space for native usage even when off-heap is disabled. The value depends on your system; it is recommended to set it larger if you are facing an Out of Memory issue in the Gluten Plugin. | 30G |
+| spark.sql.sources.useV1SourceList | Choose to use V1 source | avro |
+| spark.sql.join.preferSortMergeJoin | To turn off preferSortMergeJoin in Spark | false |
+| spark.plugins | To load Gluten's components by Spark's plug-in loader | com.intel.oap.GlutenPlugin |
+| spark.shuffle.manager | To turn on Gluten Columnar Shuffle Plugin | org.apache.spark.shuffle.sort.ColumnarShuffleManager |
+| spark.gluten.enabled | Enable Gluten. Default is true. This is just an experimental property; it is recommended to enable or disable Gluten through the `spark.plugins` setting. | true |
+| spark.gluten.sql.columnar.maxBatchSize | Number of rows to be processed in each batch. Default value is 4096. | 4096 |
+| spark.gluten.memory.isolation | (Experimental) Enable isolated memory mode. If true, Gluten limits the maximum off-heap memory that can be used by each task to X, where X = executor memory / max task slots. It's recommended to set this to true if Gluten serves concurrent queries within a single session, since not all memory Gluten allocates is guaranteed to be spillable; in that case, the feature should be enabled to avoid OOM. Note that when true, setting spark.memory.storageFraction to a lower value is suggested, since storage memory is considered non-usable by Gluten. | false |
+| spark.gluten.sql.columnar.scanOnly | When enabled, this config overrides the enable settings of all other operators, and only Scan and Filter pushdown will be offloaded to native. | false |
+| spark.gluten.sql.columnar.batchscan | Enable or Disable Columnar BatchScan, default is true | true |
+| spark.gluten.sql.columnar.hashagg | Enable or Disable Columnar Hash Aggregate, default is true | true |
+| spark.gluten.sql.columnar.project | Enable or Disable Columnar Project, default is true | true |
+| spark.gluten.sql.columnar.filter | Enable or Disable Columnar Filter, default is true | true |
+| spark.gluten.sql.columnar.sort | Enable or Disable Columnar Sort, default is true | true |
+| spark.gluten.sql.columnar.window | Enable or Disable Columnar Window, default is true | true |
+| spark.gluten.sql.columnar.shuffledHashJoin | Enable or Disable ShuffledHashJoin, default is true | true |
+| spark.gluten.sql.columnar.forceShuffledHashJoin | Force to use ShuffledHashJoin over SortMergeJoin, default is true | true |
+| spark.gluten.sql.columnar.sortMergeJoin | Enable or Disable Columnar Sort Merge Join, default is true | true |
+| spark.gluten.sql.columnar.union | Enable or Disable Columnar Union, default is true | true |
+| spark.gluten.sql.columnar.expand | Enable or Disable Columnar Expand, default is true | true |
+| spark.gluten.sql.columnar.generate | Enable or Disable Columnar Generate, default is true | true |
+| spark.gluten.sql.columnar.limit | Enable or Disable Columnar Limit, default is true | true |
+| spark.gluten.sql.columnar.tableCache | Enable or Disable Columnar Table Cache, default is false | true |
+| spark.gluten.sql.columnar.broadcastExchange | Enable or Disable Columnar Broadcast Exchange, default is true | true |
+| spark.gluten.sql.columnar.broadcastJoin | Enable or Disable Columnar BroadcastHashJoin, default is true | true |
+| spark.gluten.sql.columnar.shuffle.codec | Set up the codec to be used for Columnar Shuffle. If this configuration is not set, the value of spark.io.compression.codec is checked instead. By default, Gluten uses software compression. Valid options for software compression are lz4 and zstd; the valid option for QAT and IAA is gzip. | lz4 |
+| spark.gluten.sql.columnar.shuffle.codecBackend | Enable using hardware accelerators for shuffle de/compression. Valid options are QAT and IAA. | |
+| spark.gluten.sql.columnar.shuffle.compressionMode | Set the compression mode for shuffle. Valid options are buffer and rowvector. The buffer option compresses each buffer of a RowVector individually into one pre-allocated large buffer; the rowvector option first copies each buffer of a RowVector into a large buffer and then compresses the entire buffer in one go. | buffer |
+| spark.gluten.sql.columnar.shuffle.compression.threshold | If the number of rows in a batch falls below this threshold, all buffers will be copied into one buffer before compression. | 100 |
+| spark.gluten.sql.columnar.shuffle.realloc.threshold | Set the threshold to dynamically adjust the size of shuffle split buffers. The size of each split buffer is recalculated for each incoming batch of data. If the new size deviates from the current partition buffer size by a factor outside the range of [1 - threshold, 1 + threshold], the split buffer will be re-allocated using the newly calculated size | 0.25 |
+| spark.gluten.sql.columnar.numaBinding | Set up NUMABinding, default is false | true |
+| spark.gluten.sql.columnar.coreRange | Set up the core range for NUMABinding; only works when numaBinding is set to true. The setting is based on the number of cores in your system; 72 cores are used as an example. | 0-17,36-53 \| 18-35,54-71 |
+| spark.gluten.sql.native.bloomFilter | Enable or Disable native runtime bloom filter. | true |
+| spark.gluten.sql.columnar.wholeStage.fallback.threshold | Configure the threshold for whether a whole stage will fall back in AQE-supported cases, by counting the number of ColumnarToRow and vanilla leaf nodes. | \>= 3 |
+| spark.gluten.sql.columnar.query.fallback.threshold | Configure the threshold for whether a query will fall back, by counting the number of ColumnarToRow and vanilla leaf nodes. | \>= 1 |
+| spark.gluten.sql.columnar.maxBatchSize | Set the number of rows for the output batch | 4096 |
+| spark.gluten.shuffleWriter.bufferSize | Set the number of buffer rows for the shuffle writer | value of spark.gluten.sql.columnar.maxBatchSize |
+| spark.gluten.loadLibFromJar | Controls whether to load the dynamic link library from a packed jar for gluten/cpp. Not applicable to static builds or the ClickHouse backend. | false |
+| spark.gluten.sql.columnar.force.hashagg | Force the use of hash aggregation to replace sort aggregation. | true |
+| spark.gluten.sql.columnar.vanillaReaders | Enable vanilla Spark's vectorized reader. Please note it may bring performance overhead due to extra data transitions. We recommend disabling it if most queries can be fully offloaded to Gluten. | false |
+| spark.gluten.sql.columnar.backend.velox.bloomFilter.expectedNumItems | The default number of expected items for the velox bloomfilter. | 1000000L |
+| spark.gluten.sql.columnar.backend.velox.bloomFilter.numBits | The default number of bits to use for the velox bloom filter. | 8388608L |
+| spark.gluten.sql.columnar.backend.velox.bloomFilter.maxNumBits | The max number of bits to use for the velox bloom filter. | 4194304L |
+| spark.gluten.expression.blacklist | A black list of expressions to skip transforming, with multiple values separated by commas. | |
Below is an example for spark-defaults.conf:
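As a companion to the table (separate from the doc's own spark-defaults.conf example, which lies outside this hunk), here is a minimal, hypothetical sketch of a session that applies the new `spark.gluten.expression.blacklist` entry. Only the property names come from the table above; the master, off-heap size, and blacklisted functions are placeholders, and the Gluten jar is assumed to already be on the driver and executor classpath.

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch: property names are from the table above; the values here
// (local master, 4g off-heap, the blacklisted functions) are placeholders.
val spark = SparkSession
  .builder()
  .master("local[*]")
  .config("spark.plugins", "com.intel.oap.GlutenPlugin")
  .config("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
  .config("spark.memory.offHeap.enabled", "true")
  .config("spark.memory.offHeap.size", "4g")
  // New in this patch: expressions listed here (by Substrait name) are dropped from
  // Gluten's supported-expression map, so the enclosing operator falls back to vanilla Spark.
  .config("spark.gluten.expression.blacklist", "regexp_replace,regexp_extract")
  .getOrCreate()

// A query using a blacklisted expression still runs, just without native offload
// of the corresponding project/filter.
spark.sql("SELECT regexp_replace('2023-01-01', '-', '/')").show()
```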
diff --git a/gluten-core/src/main/scala/io/glutenproject/execution/BasicPhysicalOperatorTransformer.scala b/gluten-core/src/main/scala/io/glutenproject/execution/BasicPhysicalOperatorTransformer.scala
index 29654d99d709..4281038dd44f 100644
--- a/gluten-core/src/main/scala/io/glutenproject/execution/BasicPhysicalOperatorTransformer.scala
+++ b/gluten-core/src/main/scala/io/glutenproject/execution/BasicPhysicalOperatorTransformer.scala
@@ -213,10 +213,8 @@ case class ProjectExecTransformer private (projectList: Seq[NamedExpression], ch
input: RelNode,
validation: Boolean): RelNode = {
val args = context.registeredFunction
- val columnarProjExprs: Seq[ExpressionTransformer] = projectList.map(
- expr =>
- ExpressionConverter
- .replaceWithExpressionTransformer(expr, attributeSeq = originalInputAttributes))
+ val columnarProjExprs: Seq[ExpressionTransformer] = ExpressionConverter
+ .replaceWithExpressionTransformer(projectList, attributeSeq = originalInputAttributes)
val projExprNodeList = columnarProjExprs.map(_.doTransform(args)).asJava
val emitStartIndex = originalInputAttributes.size
if (!validation) {
diff --git a/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionConverter.scala b/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionConverter.scala
index 1132771dc807..d4b9d0d63590 100644
--- a/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionConverter.scala
+++ b/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionConverter.scala
@@ -45,48 +45,70 @@ trait Transformable extends Expression {
}
object ExpressionConverter extends SQLConfHelper with Logging {
- def replacePythonUDFWithExpressionTransformer(
- udf: PythonUDF,
+
+ def replaceWithExpressionTransformer(
+ exprs: Seq[Expression],
+ attributeSeq: Seq[Attribute]): Seq[ExpressionTransformer] = {
+ val expressionsMap = ExpressionMappings.expressionsMap
+ exprs.map {
+ expr => replaceWithExpressionTransformerInternal(expr, attributeSeq, expressionsMap)
+ }.toSeq
+ }
+
+ def replaceWithExpressionTransformer(
+ expr: Expression,
attributeSeq: Seq[Attribute]): ExpressionTransformer = {
+ val expressionsMap = ExpressionMappings.expressionsMap
+ replaceWithExpressionTransformerInternal(expr, attributeSeq, expressionsMap)
+ }
+
+ private def replacePythonUDFWithExpressionTransformer(
+ udf: PythonUDF,
+ attributeSeq: Seq[Attribute],
+ expressionsMap: Map[Class[_], String]): ExpressionTransformer = {
val substraitExprName = UDFMappings.pythonUDFMap.get(udf.name)
substraitExprName match {
case Some(name) =>
GenericExpressionTransformer(
name,
- udf.children.map(replaceWithExpressionTransformer(_, attributeSeq)),
+ udf.children.map(
+ replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap)),
udf)
case _ =>
throw new UnsupportedOperationException(s"Not supported python udf: $udf.")
}
}
- def replaceScalaUDFWithExpressionTransformer(
+ private def replaceScalaUDFWithExpressionTransformer(
udf: ScalaUDF,
- attributeSeq: Seq[Attribute]): ExpressionTransformer = {
+ attributeSeq: Seq[Attribute],
+ expressionsMap: Map[Class[_], String]): ExpressionTransformer = {
val substraitExprName = UDFMappings.scalaUDFMap.get(udf.udfName.get)
substraitExprName match {
case Some(name) =>
GenericExpressionTransformer(
name,
- udf.children.map(replaceWithExpressionTransformer(_, attributeSeq)),
+ udf.children.map(
+ replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap)),
udf)
case _ =>
throw new UnsupportedOperationException(s"Not supported scala udf: $udf.")
}
}
- def replaceWithExpressionTransformer(
+ private def replaceWithExpressionTransformerInternal(
expr: Expression,
- attributeSeq: Seq[Attribute]): ExpressionTransformer = {
+ attributeSeq: Seq[Attribute],
+ expressionsMap: Map[Class[_], String]): ExpressionTransformer = {
logDebug(
s"replaceWithExpressionTransformer expr: $expr class: ${expr.getClass} " +
s"name: ${expr.prettyName}")
expr match {
case p: PythonUDF =>
- return replacePythonUDFWithExpressionTransformer(p, attributeSeq)
+ return replacePythonUDFWithExpressionTransformer(p, attributeSeq, expressionsMap)
case s: ScalaUDF =>
- return replaceScalaUDFWithExpressionTransformer(s, attributeSeq)
+ return replaceScalaUDFWithExpressionTransformer(s, attributeSeq, expressionsMap)
case _ if HiveSimpleUDFTransformer.isHiveSimpleUDF(expr) =>
return HiveSimpleUDFTransformer.replaceWithExpressionTransformer(expr, attributeSeq)
case _ =>
@@ -94,10 +116,11 @@ object ExpressionConverter extends SQLConfHelper with Logging {
TestStats.addExpressionClassName(expr.getClass.getName)
// Check whether Gluten supports this expression
- val substraitExprName = ExpressionMappings.expressionsMap.getOrElse(expr.getClass, "")
- if (substraitExprName.isEmpty) {
+ val substraitExprNameOpt = expressionsMap.get(expr.getClass)
+ if (substraitExprNameOpt.isEmpty) {
throw new UnsupportedOperationException(s"Not supported: $expr. ${expr.getClass}")
}
+ val substraitExprName = substraitExprNameOpt.get
// Check whether each backend supports this expression
if (!BackendsApiManager.getValidatorApiInstance.doExprValidate(substraitExprName, expr)) {
@@ -111,39 +134,43 @@ object ExpressionConverter extends SQLConfHelper with Logging {
ExpressionMappings.expressionExtensionTransformer
.replaceWithExtensionExpressionTransformer(substraitExprName, extendedExpr, attributeSeq)
case c: CreateArray =>
- val children = c.children.map(replaceWithExpressionTransformer(_, attributeSeq))
+ val children =
+ c.children.map(replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap))
CreateArrayTransformer(substraitExprName, children, true, c)
case g: GetArrayItem =>
GetArrayItemTransformer(
substraitExprName,
- replaceWithExpressionTransformer(g.left, attributeSeq),
- replaceWithExpressionTransformer(g.right, attributeSeq),
+ replaceWithExpressionTransformerInternal(g.left, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(g.right, attributeSeq, expressionsMap),
g.failOnError,
- g)
+ g
+ )
case c: CreateMap =>
- val children = c.children.map(replaceWithExpressionTransformer(_, attributeSeq))
+ val children =
+ c.children.map(replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap))
CreateMapTransformer(substraitExprName, children, c.useStringTypeWhenEmpty, c)
case g: GetMapValue =>
BackendsApiManager.getSparkPlanExecApiInstance.genGetMapValueTransformer(
substraitExprName,
- replaceWithExpressionTransformer(g.child, attributeSeq),
- replaceWithExpressionTransformer(g.key, attributeSeq),
- g)
+ replaceWithExpressionTransformerInternal(g.child, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(g.key, attributeSeq, expressionsMap),
+ g
+ )
case e: Explode =>
ExplodeTransformer(
substraitExprName,
- replaceWithExpressionTransformer(e.child, attributeSeq),
+ replaceWithExpressionTransformerInternal(e.child, attributeSeq, expressionsMap),
e)
case p: PosExplode =>
PosExplodeTransformer(
substraitExprName,
- replaceWithExpressionTransformer(p.child, attributeSeq),
+ replaceWithExpressionTransformerInternal(p.child, attributeSeq, expressionsMap),
p,
attributeSeq)
case a: Alias =>
BackendsApiManager.getSparkPlanExecApiInstance.genAliasTransformer(
substraitExprName,
- replaceWithExpressionTransformer(a.child, attributeSeq),
+ replaceWithExpressionTransformerInternal(a.child, attributeSeq, expressionsMap),
a)
case a: AttributeReference =>
if (attributeSeq == null) {
@@ -175,52 +202,54 @@ object ExpressionConverter extends SQLConfHelper with Logging {
case f: FromUnixTime =>
FromUnixTimeTransformer(
substraitExprName,
- replaceWithExpressionTransformer(f.sec, attributeSeq),
- replaceWithExpressionTransformer(f.format, attributeSeq),
+ replaceWithExpressionTransformerInternal(f.sec, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(f.format, attributeSeq, expressionsMap),
f.timeZoneId,
- f)
+ f
+ )
case d: DateDiff =>
DateDiffTransformer(
substraitExprName,
- replaceWithExpressionTransformer(d.endDate, attributeSeq),
- replaceWithExpressionTransformer(d.startDate, attributeSeq),
- d)
+ replaceWithExpressionTransformerInternal(d.endDate, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(d.startDate, attributeSeq, expressionsMap),
+ d
+ )
case t: ToUnixTimestamp =>
BackendsApiManager.getSparkPlanExecApiInstance.genUnixTimestampTransformer(
substraitExprName,
- replaceWithExpressionTransformer(t.timeExp, attributeSeq),
- replaceWithExpressionTransformer(t.format, attributeSeq),
+ replaceWithExpressionTransformerInternal(t.timeExp, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(t.format, attributeSeq, expressionsMap),
t
)
case u: UnixTimestamp =>
BackendsApiManager.getSparkPlanExecApiInstance.genUnixTimestampTransformer(
substraitExprName,
- replaceWithExpressionTransformer(u.timeExp, attributeSeq),
- replaceWithExpressionTransformer(u.format, attributeSeq),
+ replaceWithExpressionTransformerInternal(u.timeExp, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(u.format, attributeSeq, expressionsMap),
ToUnixTimestamp(u.timeExp, u.format, u.timeZoneId, u.failOnError)
)
case t: TruncTimestamp =>
BackendsApiManager.getSparkPlanExecApiInstance.genTruncTimestampTransformer(
substraitExprName,
- replaceWithExpressionTransformer(t.format, attributeSeq),
- replaceWithExpressionTransformer(t.timestamp, attributeSeq),
+ replaceWithExpressionTransformerInternal(t.format, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(t.timestamp, attributeSeq, expressionsMap),
t.timeZoneId,
t
)
case m: MonthsBetween =>
MonthsBetweenTransformer(
substraitExprName,
- replaceWithExpressionTransformer(m.date1, attributeSeq),
- replaceWithExpressionTransformer(m.date2, attributeSeq),
- replaceWithExpressionTransformer(m.roundOff, attributeSeq),
+ replaceWithExpressionTransformerInternal(m.date1, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(m.date2, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(m.roundOff, attributeSeq, expressionsMap),
m.timeZoneId,
m
)
case i: If =>
IfTransformer(
- replaceWithExpressionTransformer(i.predicate, attributeSeq),
- replaceWithExpressionTransformer(i.trueValue, attributeSeq),
- replaceWithExpressionTransformer(i.falseValue, attributeSeq),
+ replaceWithExpressionTransformerInternal(i.predicate, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(i.trueValue, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(i.falseValue, attributeSeq, expressionsMap),
i
)
case cw: CaseWhen =>
@@ -229,27 +258,27 @@ object ExpressionConverter extends SQLConfHelper with Logging {
expr =>
{
(
- replaceWithExpressionTransformer(expr._1, attributeSeq),
- replaceWithExpressionTransformer(expr._2, attributeSeq))
+ replaceWithExpressionTransformerInternal(expr._1, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(expr._2, attributeSeq, expressionsMap))
}
},
cw.elseValue.map {
expr =>
{
- replaceWithExpressionTransformer(expr, attributeSeq)
+ replaceWithExpressionTransformerInternal(expr, attributeSeq, expressionsMap)
}
},
cw
)
case i: In =>
InTransformer(
- replaceWithExpressionTransformer(i.value, attributeSeq),
+ replaceWithExpressionTransformerInternal(i.value, attributeSeq, expressionsMap),
i.list,
i.value.dataType,
i)
case i: InSet =>
InSetTransformer(
- replaceWithExpressionTransformer(i.child, attributeSeq),
+ replaceWithExpressionTransformerInternal(i.child, attributeSeq, expressionsMap),
i.hset,
i.child.dataType,
i)
@@ -260,7 +289,7 @@ object ExpressionConverter extends SQLConfHelper with Logging {
val newCast =
BackendsApiManager.getSparkPlanExecApiInstance.genCastWithNewChild(c)
CastTransformer(
- replaceWithExpressionTransformer(newCast.child, attributeSeq),
+ replaceWithExpressionTransformerInternal(newCast.child, attributeSeq, expressionsMap),
newCast.dataType,
newCast.timeZoneId,
newCast)
@@ -272,86 +301,93 @@ object ExpressionConverter extends SQLConfHelper with Logging {
}
String2TrimExpressionTransformer(
substraitExprName,
- trimStr.map(replaceWithExpressionTransformer(_, attributeSeq)),
- replaceWithExpressionTransformer(srcStr, attributeSeq),
- s)
+ trimStr.map(replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap)),
+ replaceWithExpressionTransformerInternal(srcStr, attributeSeq, expressionsMap),
+ s
+ )
case m: HashExpression[_] =>
BackendsApiManager.getSparkPlanExecApiInstance.genHashExpressionTransformer(
substraitExprName,
- m.children.map(expr => replaceWithExpressionTransformer(expr, attributeSeq)),
+ m.children.map(
+ expr => replaceWithExpressionTransformerInternal(expr, attributeSeq, expressionsMap)),
m)
case getStructField: GetStructField =>
// Different backends may have different result.
BackendsApiManager.getSparkPlanExecApiInstance.genGetStructFieldTransformer(
substraitExprName,
- replaceWithExpressionTransformer(getStructField.child, attributeSeq),
+ replaceWithExpressionTransformerInternal(
+ getStructField.child,
+ attributeSeq,
+ expressionsMap),
getStructField.ordinal,
getStructField)
case t: StringTranslate =>
BackendsApiManager.getSparkPlanExecApiInstance.genStringTranslateTransformer(
substraitExprName,
- replaceWithExpressionTransformer(t.srcExpr, attributeSeq),
- replaceWithExpressionTransformer(t.matchingExpr, attributeSeq),
- replaceWithExpressionTransformer(t.replaceExpr, attributeSeq),
+ replaceWithExpressionTransformerInternal(t.srcExpr, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(t.matchingExpr, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(t.replaceExpr, attributeSeq, expressionsMap),
t
)
case l: StringLocate =>
BackendsApiManager.getSparkPlanExecApiInstance.genStringLocateTransformer(
substraitExprName,
- replaceWithExpressionTransformer(l.first, attributeSeq),
- replaceWithExpressionTransformer(l.second, attributeSeq),
- replaceWithExpressionTransformer(l.third, attributeSeq),
+ replaceWithExpressionTransformerInternal(l.first, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(l.second, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(l.third, attributeSeq, expressionsMap),
l
)
case s: StringSplit =>
BackendsApiManager.getSparkPlanExecApiInstance.genStringSplitTransformer(
substraitExprName,
- replaceWithExpressionTransformer(s.str, attributeSeq),
- replaceWithExpressionTransformer(s.regex, attributeSeq),
- replaceWithExpressionTransformer(s.limit, attributeSeq),
+ replaceWithExpressionTransformerInternal(s.str, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(s.regex, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(s.limit, attributeSeq, expressionsMap),
s
)
case r: RegExpReplace =>
RegExpReplaceTransformer(
substraitExprName,
- replaceWithExpressionTransformer(r.subject, attributeSeq),
- replaceWithExpressionTransformer(r.regexp, attributeSeq),
- replaceWithExpressionTransformer(r.rep, attributeSeq),
- replaceWithExpressionTransformer(r.pos, attributeSeq),
+ replaceWithExpressionTransformerInternal(r.subject, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(r.regexp, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(r.rep, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(r.pos, attributeSeq, expressionsMap),
r
)
case equal: EqualNullSafe =>
BackendsApiManager.getSparkPlanExecApiInstance.genEqualNullSafeTransformer(
substraitExprName,
- replaceWithExpressionTransformer(equal.left, attributeSeq),
- replaceWithExpressionTransformer(equal.right, attributeSeq),
+ replaceWithExpressionTransformerInternal(equal.left, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(equal.right, attributeSeq, expressionsMap),
equal
)
case md5: Md5 =>
BackendsApiManager.getSparkPlanExecApiInstance.genMd5Transformer(
substraitExprName,
- replaceWithExpressionTransformer(md5.child, attributeSeq),
+ replaceWithExpressionTransformerInternal(md5.child, attributeSeq, expressionsMap),
md5)
case sha1: Sha1 =>
BackendsApiManager.getSparkPlanExecApiInstance.genSha1Transformer(
substraitExprName,
- replaceWithExpressionTransformer(sha1.child, attributeSeq),
+ replaceWithExpressionTransformerInternal(sha1.child, attributeSeq, expressionsMap),
sha1)
case sha2: Sha2 =>
BackendsApiManager.getSparkPlanExecApiInstance.genSha2Transformer(
substraitExprName,
- replaceWithExpressionTransformer(sha2.left, attributeSeq),
- replaceWithExpressionTransformer(sha2.right, attributeSeq),
- sha2)
+ replaceWithExpressionTransformerInternal(sha2.left, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(sha2.right, attributeSeq, expressionsMap),
+ sha2
+ )
case size: Size =>
BackendsApiManager.getSparkPlanExecApiInstance.genSizeExpressionTransformer(
substraitExprName,
- replaceWithExpressionTransformer(size.child, attributeSeq),
+ replaceWithExpressionTransformerInternal(size.child, attributeSeq, expressionsMap),
size)
case namedStruct: CreateNamedStruct =>
BackendsApiManager.getSparkPlanExecApiInstance.genNamedStructTransformer(
substraitExprName,
- namedStruct.children.map(replaceWithExpressionTransformer(_, attributeSeq)),
+ namedStruct.children.map(
+ replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap)),
namedStruct,
attributeSeq)
case namedLambdaVariable: NamedLambdaVariable =>
@@ -365,49 +401,58 @@ object ExpressionConverter extends SQLConfHelper with Logging {
case lambdaFunction: LambdaFunction =>
LambdaFunctionTransformer(
substraitExprName,
- function = replaceWithExpressionTransformer(lambdaFunction.function, attributeSeq),
- arguments =
- lambdaFunction.arguments.map(replaceWithExpressionTransformer(_, attributeSeq)),
+ function = replaceWithExpressionTransformerInternal(
+ lambdaFunction.function,
+ attributeSeq,
+ expressionsMap),
+ arguments = lambdaFunction.arguments.map(
+ replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap)),
hidden = false,
original = lambdaFunction
)
case j: JsonTuple =>
- val children = j.children.map(replaceWithExpressionTransformer(_, attributeSeq))
+ val children =
+ j.children.map(replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap))
JsonTupleExpressionTransformer(substraitExprName, children, j)
case l: Like =>
LikeTransformer(
substraitExprName,
- replaceWithExpressionTransformer(l.left, attributeSeq),
- replaceWithExpressionTransformer(l.right, attributeSeq),
- l)
+ replaceWithExpressionTransformerInternal(l.left, attributeSeq, expressionsMap),
+ replaceWithExpressionTransformerInternal(l.right, attributeSeq, expressionsMap),
+ l
+ )
case c: CheckOverflow =>
CheckOverflowTransformer(
substraitExprName,
- replaceWithExpressionTransformer(c.child, attributeSeq),
+ replaceWithExpressionTransformerInternal(c.child, attributeSeq, expressionsMap),
c)
case m: MakeDecimal =>
MakeDecimalTransformer(
substraitExprName,
- replaceWithExpressionTransformer(m.child, attributeSeq),
+ replaceWithExpressionTransformerInternal(m.child, attributeSeq, expressionsMap),
m)
case rand: Rand =>
BackendsApiManager.getSparkPlanExecApiInstance.genRandTransformer(
substraitExprName,
- replaceWithExpressionTransformer(rand.child, attributeSeq),
+ replaceWithExpressionTransformerInternal(rand.child, attributeSeq, expressionsMap),
rand)
case _: KnownFloatingPointNormalized | _: NormalizeNaNAndZero | _: PromotePrecision =>
ChildTransformer(
- replaceWithExpressionTransformer(expr.children.head, attributeSeq)
+ replaceWithExpressionTransformerInternal(expr.children.head, attributeSeq, expressionsMap)
)
case _: GetDateField | _: GetTimeField =>
ExtractDateTransformer(
substraitExprName,
- replaceWithExpressionTransformer(expr.children.head, attributeSeq),
+ replaceWithExpressionTransformerInternal(
+ expr.children.head,
+ attributeSeq,
+ expressionsMap),
expr)
case _: StringToMap =>
BackendsApiManager.getSparkPlanExecApiInstance.genStringToMapTransformer(
substraitExprName,
- expr.children.map(replaceWithExpressionTransformer(_, attributeSeq)),
+ expr.children.map(
+ replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap)),
expr)
case b: BinaryArithmetic if DecimalArithmeticUtil.isDecimalArithmetic(b) =>
// PrecisionLoss=true: velox support / ch not support
@@ -426,8 +471,9 @@ object ExpressionConverter extends SQLConfHelper with Logging {
val (left, right) = DecimalArithmeticUtil.rescaleCastForDecimal(
DecimalArithmeticUtil.removeCastForDecimal(rescaleBinary.left),
DecimalArithmeticUtil.removeCastForDecimal(rescaleBinary.right))
- val leftChild = replaceWithExpressionTransformer(left, attributeSeq)
- val rightChild = replaceWithExpressionTransformer(right, attributeSeq)
+ val leftChild = replaceWithExpressionTransformerInternal(left, attributeSeq, expressionsMap)
+ val rightChild =
+ replaceWithExpressionTransformerInternal(right, attributeSeq, expressionsMap)
val resultType = DecimalArithmeticUtil.getResultTypeForOperation(
DecimalArithmeticUtil.getOperationType(b),
@@ -445,12 +491,14 @@ object ExpressionConverter extends SQLConfHelper with Logging {
resultType,
b)
case e: Transformable =>
- val childrenTransformers = e.children.map(replaceWithExpressionTransformer(_, attributeSeq))
+ val childrenTransformers =
+ e.children.map(replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap))
e.getTransformer(childrenTransformers)
case expr =>
GenericExpressionTransformer(
substraitExprName,
- expr.children.map(replaceWithExpressionTransformer(_, attributeSeq)),
+ expr.children.map(
+ replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap)),
expr
)
}
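Net effect of the ExpressionConverter change: the (possibly blacklist-filtered) expressions map is resolved once per public `replaceWithExpressionTransformer` call and then threaded through the recursion, instead of being re-read from `ExpressionMappings` for every child expression, which also gives all expressions in one call a consistent snapshot of the map. A toy, self-contained analogue of the pattern (hypothetical names, not Gluten APIs):

```scala
// `resolveMap()` stands in for ExpressionMappings.expressionsMap, which this patch
// turns into a def that re-reads the blacklist conf and filters on every call.
def resolveMap(): Map[String, Int] =
  Map("a" -> 1, "b" -> 2, "c" -> 3) // pretend this is expensive (conf lookup + filtering)

// Old shape: the map is resolved once per expression.
def convertOld(exprs: Seq[String]): Seq[Int] =
  exprs.map(e => resolveMap()(e))

// New shape: resolve once per batch, then pass the snapshot down an internal helper,
// mirroring replaceWithExpressionTransformer -> replaceWithExpressionTransformerInternal.
def convertNew(exprs: Seq[String]): Seq[Int] = {
  val snapshot = resolveMap()
  def convertInternal(e: String, m: Map[String, Int]): Int = m(e)
  exprs.map(convertInternal(_, snapshot))
}
```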
diff --git a/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionMappings.scala b/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionMappings.scala
index 705ec28bf364..215da9b55e56 100644
--- a/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionMappings.scala
+++ b/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionMappings.scala
@@ -16,6 +16,7 @@
*/
package io.glutenproject.expression
+import io.glutenproject.GlutenConfig
import io.glutenproject.backendsapi.BackendsApiManager
import io.glutenproject.expression.ExpressionNames._
import io.glutenproject.extension.ExpressionExtensionTrait
@@ -277,9 +278,16 @@ object ExpressionMappings {
Sig[NthValue](NTH_VALUE)
)
- def expressionsMap: Map[Class[_], String] =
- defaultExpressionsMap ++
+ def expressionsMap: Map[Class[_], String] = {
+ val blacklist = GlutenConfig.getConf.expressionBlacklist
+ val supportedExprs = defaultExpressionsMap ++
expressionExtensionTransformer.extensionExpressionsMapping
+ if (blacklist.isEmpty) {
+ supportedExprs
+ } else {
+ supportedExprs.filterNot(kv => blacklist.contains(kv._2))
+ }
+ }
private lazy val defaultExpressionsMap: Map[Class[_], String] = {
(SCALAR_SIGS ++ AGGREGATE_SIGS ++ WINDOW_SIGS ++
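Because the filter runs over the map's values, blacklist entries are Gluten's Substrait function names (for example `add` for `+`, as exercised by the new test suite below), not Spark expression class names. A toy sketch of the semantics with a hypothetical map:

```scala
// Keys play the role of Spark expression classes, values the Substrait names.
val supported = Map("Add" -> "add", "RegExpReplace" -> "regexp_replace", "Abs" -> "abs")
val blacklist = Set("regexp_replace", "add") // parsed value of spark.gluten.expression.blacklist

val effective = supported.filterNot { case (_, substraitName) => blacklist.contains(substraitName) }
// effective == Map("Abs" -> "abs"): blacklisted names simply disappear from the map,
// so ExpressionConverter later treats those expressions as unsupported and falls back.
```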
diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/hive/HiveSimpleUDFTransformer.scala b/gluten-core/src/main/scala/org/apache/spark/sql/hive/HiveSimpleUDFTransformer.scala
index 77100e5999d0..1672648b308c 100644
--- a/gluten-core/src/main/scala/org/apache/spark/sql/hive/HiveSimpleUDFTransformer.scala
+++ b/gluten-core/src/main/scala/org/apache/spark/sql/hive/HiveSimpleUDFTransformer.scala
@@ -44,7 +44,7 @@ object HiveSimpleUDFTransformer {
case Some(name) =>
GenericExpressionTransformer(
name,
- udf.children.map(ExpressionConverter.replaceWithExpressionTransformer(_, attributeSeq)),
+ ExpressionConverter.replaceWithExpressionTransformer(udf.children, attributeSeq),
udf)
case _ =>
throw new UnsupportedOperationException(
diff --git a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/clickhouse/ClickHouseTestSettings.scala
index 2a26440eba3b..a51e7a424980 100644
--- a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/clickhouse/ClickHouseTestSettings.scala
@@ -1797,6 +1797,7 @@ class ClickHouseTestSettings extends BackendTestSettings {
"SELECT structFieldSimple.key, arrayFieldSimple[1] FROM tableWithSchema a where int_Field=1")
.exclude("SELECT structFieldComplex.Value.`value_(2)` FROM tableWithSchema")
enableSuite[SparkFunctionStatistics]
+ enableSuite[GlutenExpressionMappingSuite]
override def getSQLQueryTestSettings: SQLQueryTestSettings = ClickHouseSQLQueryTestSettings
}
diff --git a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
index 48a162468460..9948393609ca 100644
--- a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
@@ -19,7 +19,7 @@ package io.glutenproject.utils.velox
import io.glutenproject.utils.{BackendTestSettings, SQLQueryTestSettings}
import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.expressions.{GlutenArithmeticExpressionSuite, GlutenBitwiseExpressionsSuite, GlutenCastSuite, GlutenCollectionExpressionsSuite, GlutenComplexTypeSuite, GlutenConditionalExpressionSuite, GlutenDateExpressionsSuite, GlutenDecimalExpressionSuite, GlutenHashExpressionsSuite, GlutenIntervalExpressionsSuite, GlutenLiteralExpressionSuite, GlutenMathExpressionsSuite, GlutenMiscExpressionsSuite, GlutenNondeterministicSuite, GlutenNullExpressionsSuite, GlutenPredicateSuite, GlutenRandomSuite, GlutenRegexpExpressionsSuite, GlutenSortOrderExpressionsSuite, GlutenStringExpressionsSuite}
+import org.apache.spark.sql.catalyst.expressions.{GlutenArithmeticExpressionSuite, GlutenBitwiseExpressionsSuite, GlutenCastSuite, GlutenCollectionExpressionsSuite, GlutenComplexTypeSuite, GlutenConditionalExpressionSuite, GlutenDateExpressionsSuite, GlutenDecimalExpressionSuite, GlutenExpressionMappingSuite, GlutenHashExpressionsSuite, GlutenIntervalExpressionsSuite, GlutenLiteralExpressionSuite, GlutenMathExpressionsSuite, GlutenMiscExpressionsSuite, GlutenNondeterministicSuite, GlutenNullExpressionsSuite, GlutenPredicateSuite, GlutenRandomSuite, GlutenRegexpExpressionsSuite, GlutenSortOrderExpressionsSuite, GlutenStringExpressionsSuite}
import org.apache.spark.sql.connector.{GlutenDataSourceV2DataFrameSessionCatalogSuite, GlutenDataSourceV2DataFrameSuite, GlutenDataSourceV2FunctionSuite, GlutenDataSourceV2SQLSessionCatalogSuite, GlutenDataSourceV2SQLSuiteV1Filter, GlutenDataSourceV2SQLSuiteV2Filter, GlutenDataSourceV2Suite, GlutenDeleteFromTableSuite, GlutenFileDataSourceV2FallBackSuite, GlutenKeyGroupedPartitioningSuite, GlutenLocalScanSuite, GlutenMetadataColumnSuite, GlutenSupportsCatalogOptionsSuite, GlutenTableCapabilityCheckSuite, GlutenWriteDistributionAndOrderingSuite}
import org.apache.spark.sql.errors.{GlutenQueryCompilationErrorsDSv2Suite, GlutenQueryCompilationErrorsSuite, GlutenQueryExecutionErrorsSuite, GlutenQueryParsingErrorsSuite}
import org.apache.spark.sql.execution.{FallbackStrategiesSuite, GlutenBroadcastExchangeSuite, GlutenCoalesceShufflePartitionsSuite, GlutenExchangeSuite, GlutenReplaceHashWithSortAggSuite, GlutenReuseExchangeAndSubquerySuite, GlutenSameResultSuite, GlutenSortSuite, GlutenSQLWindowFunctionSuite, GlutenTakeOrderedAndProjectSuite}
@@ -1200,6 +1200,7 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenFallbackSuite]
enableSuite[GlutenHiveSQLQuerySuite]
enableSuite[GlutenCollapseProjectExecTransformerSuite]
+ enableSuite[GlutenExpressionMappingSuite]
override def getSQLQueryTestSettings: SQLQueryTestSettings = VeloxSQLQueryTestSettings
}
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenExpressionMappingSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenExpressionMappingSuite.scala
new file mode 100644
index 000000000000..35eed5980f52
--- /dev/null
+++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenExpressionMappingSuite.scala
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.catalyst.expressions
+
+import io.glutenproject.GlutenConfig
+import io.glutenproject.execution.ProjectExecTransformer
+import io.glutenproject.expression.ExpressionMappings
+
+import org.apache.spark.sql.{GlutenSQLTestsTrait, Row}
+import org.apache.spark.sql.execution.ProjectExec
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
+
+class GlutenExpressionMappingSuite extends GlutenSQLTestsTrait with AdaptiveSparkPlanHelper {
+
+ test("test expression blacklist") {
+ val names = ExpressionMappings.expressionsMap.values.toSet
+ assert(names.contains("regexp_replace"))
+ assert(names.contains("regexp_extract"))
+
+ withSQLConf(GlutenConfig.EXPRESSION_BLACK_LIST.key -> "regexp_replace,regexp_extract,add") {
+ val names = ExpressionMappings.expressionsMap.values.toSet
+ assert(!names.contains("regexp_replace"))
+ assert(!names.contains("regexp_extract"))
+ assert(names.contains("regexp_extract_all"))
+ assert(!names.contains("add"))
+ spark.sql("CREATE TABLE t USING PARQUET AS SELECT 1 as c")
+ withTable("t") {
+ val df = spark.sql("SELECT c + 1 FROM t")
+ checkAnswer(df, Row(2))
+ assert(find(df.queryExecution.executedPlan)(_.isInstanceOf[ProjectExecTransformer]).isEmpty)
+ assert(find(df.queryExecution.executedPlan)(_.isInstanceOf[ProjectExec]).isDefined)
+ }
+ }
+ }
+}
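Outside the test harness, the same behaviour can be checked from a spark-shell by inspecting the physical plan. A rough sketch, assuming a Gluten-enabled session as in the configuration example above (table name and query are arbitrary):

```scala
// Blacklist the Substrait name "add", then look for the vanilla project in the plan.
spark.conf.set("spark.gluten.expression.blacklist", "add")

spark.sql("CREATE TABLE t USING PARQUET AS SELECT 1 AS c")
val df = spark.sql("SELECT c + 1 FROM t")
df.explain()
// With "add" blacklisted, the plan should show a vanilla ProjectExec rather than a
// ProjectExecTransformer, matching the assertions in GlutenExpressionMappingSuite.
```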
diff --git a/shims/common/src/main/scala/io/glutenproject/GlutenConfig.scala b/shims/common/src/main/scala/io/glutenproject/GlutenConfig.scala
index 27bd9389d4a5..03a34a04745f 100644
--- a/shims/common/src/main/scala/io/glutenproject/GlutenConfig.scala
+++ b/shims/common/src/main/scala/io/glutenproject/GlutenConfig.scala
@@ -262,6 +262,15 @@ class GlutenConfig(conf: SQLConf) extends Logging {
def extendedExpressionTransformer: String = conf.getConf(EXTENDED_EXPRESSION_TRAN_CONF)
+ def expressionBlacklist: Set[String] = {
+ val blacklist = conf.getConf(EXPRESSION_BLACK_LIST)
+ if (blacklist.isDefined) {
+      blacklist.get.toLowerCase(Locale.ROOT).split(",").map(_.trim).toSet
+ } else {
+ Set.empty
+ }
+ }
+
def printStackOnValidationFailure: Boolean =
conf.getConf(VALIDATION_PRINT_FAILURE_STACK_)
@@ -1252,6 +1261,12 @@ object GlutenConfig {
.stringConf
.createWithDefaultString("")
+ val EXPRESSION_BLACK_LIST =
+ buildConf("spark.gluten.expression.blacklist")
+      .doc("A black list of expressions to skip transforming, with multiple values separated by commas.")
+ .stringConf
+ .createOptional
+
val FALLBACK_REPORTER_ENABLED =
buildConf("spark.gluten.sql.columnar.fallbackReporter")
.doc("When true, enable fallback reporter rule to print fallback reason")