Merge branch 'main' into patch-1
zuston authored Aug 26, 2024
2 parents 82632a8 + d4d7241 commit c570556
Showing 34 changed files with 308 additions and 95 deletions.

@@ -524,6 +524,8 @@ class CHSparkPlanExecApi extends SparkPlanExecApi with Logging {
         wrapChild(r2c)
       case union: ColumnarUnionExec =>
         wrapChild(union)
+      case ordered: TakeOrderedAndProjectExecTransformer =>
+        wrapChild(ordered)
       case other =>
         throw new GlutenNotSupportException(
           s"Not supported operator ${other.nodeName} for BroadcastRelation")

@@ -34,15 +34,12 @@ import com.google.protobuf.{Any, StringValue}
 import io.substrait.proto.JoinRel
 
 object JoinTypeTransform {
-  def toNativeJoinType(joinType: JoinType): JoinType = {
-    joinType match {
-      case ExistenceJoin(_) =>
-        LeftSemi
-      case _ =>
-        joinType
-    }
-  }
 
+  // ExistenceJoin is introduced in #SPARK-14781. It returns all rows from the left table with
+  // a new column to indicate whether the row is matched in the right table.
+  // Indeed, the ExistenceJoin is transformed into a left any join in CH.
+  // We don't have left any join in substrait, so left semi join is used instead,
+  // and isExistenceJoin is set to true to indicate that it is an existence join.
   def toSubstraitJoinType(sparkJoin: JoinType, buildRight: Boolean): JoinRel.JoinType =
     sparkJoin match {
       case _: InnerLike =>
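
As a concrete aside on the comment above: ExistenceJoin typically arises when an EXISTS/IN predicate sits inside a disjunction and cannot be rewritten to a plain semi join. A minimal sketch of such a query shape (table and column names are hypothetical):

    // Hypothetical query shape; t1/t2 and their columns are illustrative only.
    // The EXISTS inside an OR cannot be planned as a plain LEFT SEMI join, so
    // Spark plans an ExistenceJoin (SPARK-14781) that appends a boolean match
    // flag to each left-side row; CH runs it as a left any join, carried over
    // substrait as LEFT SEMI with isExistenceJoin = true.
    spark.sql("""
      SELECT t1.id
      FROM t1
      WHERE t1.flag = 1
         OR EXISTS (SELECT 1 FROM t2 WHERE t2.id = t1.id)
    """).show()
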
@@ -104,7 +101,7 @@ case class CHShuffledHashJoinExecTransformer(
   override protected def doValidateInternal(): ValidationResult = {
     val shouldFallback =
       CHJoinValidateUtil.shouldFallback(
-        ShuffleHashJoinStrategy(finalJoinType),
+        ShuffleHashJoinStrategy(joinType),
         left.outputSet,
         right.outputSet,
         condition)
@@ -113,7 +110,6 @@
     }
     super.doValidateInternal()
   }
-  private val finalJoinType = JoinTypeTransform.toNativeJoinType(joinType)
 
   override def genJoinParameters(): Any = {
     val (isBHJ, isNullAwareAntiJoin, buildHashTableId): (Int, Int, String) = (0, 0, "")
@@ -226,7 +222,7 @@ case class CHBroadcastHashJoinExecTransformer(
   override protected def doValidateInternal(): ValidationResult = {
     val shouldFallback =
       CHJoinValidateUtil.shouldFallback(
-        BroadcastHashJoinStrategy(finalJoinType),
+        BroadcastHashJoinStrategy(joinType),
         left.outputSet,
         right.outputSet,
         condition)
@@ -255,7 +251,7 @@ case class CHBroadcastHashJoinExecTransformer(
     val context =
       BroadCastHashJoinContext(
         buildKeyExprs,
-        finalJoinType,
+        joinType,
         buildSide == BuildRight,
         isMixedCondition(condition),
         joinType.isInstanceOf[ExistenceJoin],
@@ -278,12 +274,6 @@ case class CHBroadcastHashJoinExecTransformer(
     res
   }
 
-  // ExistenceJoin is introduced in #SPARK-14781. It returns all rows from the left table with
-  // a new column to indecate whether the row is matched in the right table.
-  // Indeed, the ExistenceJoin is transformed into left any join in CH.
-  // We don't have left any join in substrait, so use left semi join instead.
-  // and isExistenceJoin is set to true to indicate that it is an existence join.
-  private val finalJoinType = JoinTypeTransform.toNativeJoinType(joinType)
   override protected lazy val substraitJoinType: JoinRel.JoinType = {
     JoinTypeTransform.toSubstraitJoinType(joinType, buildSide == BuildRight)
   }

@@ -29,13 +29,10 @@ case class BroadcastHashJoinStrategy(joinType: JoinType) extends JoinStrategy {}
 case class SortMergeJoinStrategy(joinType: JoinType) extends JoinStrategy {}
 
 /**
- * The logic here is that if it is not an equi-join spark will create BNLJ, which will fallback, if
- * it is an equi-join, spark will create BroadcastHashJoin or ShuffleHashJoin, for these join types,
- * we need to filter For cases that cannot be handled by the backend, 1 there are at least two
- * different tables column and Literal in the condition Or condition for comparison, for example: (a
- * join b on a.a1 = b.b1 and (a.a2 > 1 or b.b2 < 2) ) 2 tow join key for inequality comparison (!= ,
- * > , <), for example: (a join b on a.a1 > b.b1) There will be a fallback for Nullaware Jion For
- * Existence Join which is just an optimization of exist subquery, it will also fallback
+ * BroadcastHashJoinStrategy and ShuffleHashJoinStrategy are relatively complete. They support
+ * left/right/inner/full/anti/semi joins and existence joins, and also support join conditions
+ * with columns from both sides, e.g. (a join b on a.a1 = b.b1 and a.a2 > 1 and b.b2 < 2).
+ * SortMergeJoinStrategy is not fully supported for all cases in CH.
 */
 
 object CHJoinValidateUtil extends Logging {
@@ -52,33 +49,24 @@ object CHJoinValidateUtil extends Logging {
       leftOutputSet: AttributeSet,
       rightOutputSet: AttributeSet,
       condition: Option[Expression]): Boolean = {
-    var shouldFallback = false
-    val joinType = joinStrategy.joinType
 
-    if (!joinType.isInstanceOf[ExistenceJoin] && joinType.sql.contains("INNER")) {
-      shouldFallback = false;
-    } else if (
+    val hasMixedFilterCondition =
       condition.isDefined && hasTwoTableColumn(leftOutputSet, rightOutputSet, condition.get)
-    ) {
-      shouldFallback = joinStrategy match {
-        case BroadcastHashJoinStrategy(joinTy) =>
-          joinTy.sql.contains("SEMI") || joinTy.sql.contains("ANTI")
-        case SortMergeJoinStrategy(_) => true
-        case ShuffleHashJoinStrategy(joinTy) =>
-          joinTy.sql.contains("SEMI") || joinTy.sql.contains("ANTI")
-        case UnknownJoinStrategy(joinTy) =>
-          joinTy.sql.contains("SEMI") || joinTy.sql.contains("ANTI")
-      }
-    } else {
-      shouldFallback = joinStrategy match {
-        case SortMergeJoinStrategy(joinTy) =>
-          joinTy.sql.contains("SEMI") || joinTy.sql.contains("ANTI") || joinTy.toString.contains(
-            "ExistenceJoin")
-        case _ => false
-      }
+    val shouldFallback = joinStrategy match {
+      case SortMergeJoinStrategy(joinType) =>
+        if (!joinType.isInstanceOf[ExistenceJoin] && joinType.sql.contains("INNER")) {
+          false
+        } else {
+          joinType.sql.contains("SEMI") || joinType.sql.contains("ANTI") || joinType.toString
+            .contains("ExistenceJoin") || hasMixedFilterCondition
+        }
+      case UnknownJoinStrategy(joinType) =>
+        throw new IllegalArgumentException(s"Unknown join type $joinStrategy")
+      case _ => false
     }
 
     if (shouldFallback) {
-      logError(s"Fallback for join type $joinType")
+      logError(s"Fallback for join type $joinStrategy")
     }
     shouldFallback
   }
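
To make the new control flow concrete, here is a minimal sketch of how the validators above exercise this utility; the operands are illustrative, not taken from the diff:

    // Hypothetical call, mirroring doValidateInternal in the join transformers.
    // A LEFT SEMI sort-merge join falls back, since LeftSemi.sql contains "SEMI".
    val fallback = CHJoinValidateUtil.shouldFallback(
      SortMergeJoinStrategy(LeftSemi),
      left.outputSet,
      right.outputSet,
      condition = None)
    // fallback == true. Broadcast and shuffled hash joins now match the
    // `case _ => false` branch, so they stay native regardless of join type.
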
@@ -119,7 +119,7 @@ class GlutenClickHouseColumnarMemorySortShuffleSuite
   }
 
   test("TPCH Q21") {
-    runTPCHQuery(21, noFallBack = false) { df => }
+    runTPCHQuery(21) { df => }
   }
 
   test("TPCH Q22") {

@@ -163,7 +163,7 @@ class GlutenClickHouseColumnarShuffleAQESuite
   }
 
   test("TPCH Q21") {
-    runTPCHQuery(21, noFallBack = false) { df => }
+    runTPCHQuery(21) { df => }
   }
 
   test("TPCH Q22") {

@@ -168,7 +168,7 @@ class GlutenClickHouseDSV2ColumnarShuffleSuite extends GlutenClickHouseTPCHAbstr
   }
 
   test("TPCH Q21") {
-    runTPCHQuery(21, noFallBack = false) { df => }
+    runTPCHQuery(21) { df => }
   }
 
   test("TPCH Q22") {

@@ -126,7 +126,7 @@ class GlutenClickHouseDSV2Suite extends GlutenClickHouseTPCHAbstractSuite {
   }
 
   test("TPCH Q21") {
-    runTPCHQuery(21, noFallBack = false) { df => }
+    runTPCHQuery(21) { df => }
   }
 
   test("TPCH Q22") {

@@ -343,7 +343,7 @@ class GlutenClickHouseDecimalSuite
     decimalTPCHTables.foreach {
       dt =>
         {
-          val fallBack = (sql_num == 16 || sql_num == 21)
+          val fallBack = (sql_num == 16)
           val compareResult = !dt._2.contains(sql_num)
           val native = if (fallBack) "fallback" else "native"
           val compare = if (compareResult) "compare" else "noCompare"

@@ -62,11 +62,7 @@ abstract class GlutenClickHouseTPCDSAbstractSuite
     })
 
   protected def fallbackSets(isAqe: Boolean): Set[Int] = {
-    val more = if (isSparkVersionGE("3.5")) Set(44, 67, 70) else Set.empty[Int]
-
-    // q16 smj + left semi + not condition
-    // Q94 BroadcastHashJoin, LeftSemi, NOT condition
-    Set(16, 94) | more
+    if (isSparkVersionGE("3.5")) Set(44, 67, 70) else Set.empty[Int]
   }
   protected def excludedTpcdsQueries: Set[String] = Set(
     "q66" // inconsistent results

@@ -171,7 +171,7 @@ class GlutenClickHouseTPCHNullableColumnarShuffleSuite extends GlutenClickHouseT
   }
 
   test("TPCH Q21") {
-    runTPCHQuery(21, noFallBack = false) { df => }
+    runTPCHQuery(21) { df => }
   }
 
   test("TPCH Q22") {

@@ -174,7 +174,7 @@ class GlutenClickHouseTPCHNullableSuite extends GlutenClickHouseTPCHAbstractSuit
   }
 
   test("TPCH Q21") {
-    runTPCHQuery(21, noFallBack = false) { df => }
+    runTPCHQuery(21) { df => }
   }
 
   test("TPCH Q22") {

@@ -175,7 +175,7 @@ class GlutenClickHouseTPCHSuite extends GlutenClickHouseTPCHAbstractSuite {
   }
 
   test("TPCH Q21") {
-    runTPCHQuery(21, noFallBack = false) { df => }
+    runTPCHQuery(21) { df => }
   }
 
   test("TPCH Q22") {

@@ -239,6 +239,6 @@ class GlutenClickHouseTPCDSParquetColumnarShuffleAQESuite
         | LIMIT 100 ;
         |""".stripMargin
     // There are some BroadcastHashJoin with NOT condition
-    compareResultsAgainstVanillaSpark(sql, true, { df => }, false)
+    compareResultsAgainstVanillaSpark(sql, true, { df => })
   }
 }

@@ -31,9 +31,10 @@ class GlutenClickHouseTPCDSParquetGraceHashJoinSuite extends GlutenClickHouseTPC
       .set("spark.io.compression.codec", "snappy")
       .set("spark.sql.shuffle.partitions", "5")
       .set("spark.sql.autoBroadcastJoinThreshold", "10MB")
-      .set("spark.memory.offHeap.size", "8g")
+      .set("spark.memory.offHeap.size", "6g")
       .set("spark.gluten.sql.columnar.backend.ch.runtime_settings.join_algorithm", "grace_hash")
       .set("spark.gluten.sql.columnar.backend.ch.runtime_settings.max_bytes_in_join", "314572800")
+      .setMaster("local[2]")
   }
 
   executeTPCDSTest(false)

@@ -49,9 +49,10 @@ class GlutenClickHouseTPCDSParquetSortMergeJoinSuite extends GlutenClickHouseTPC
       .set("spark.shuffle.manager", "sort")
       .set("spark.io.compression.codec", "snappy")
       .set("spark.sql.shuffle.partitions", "5")
-      .set("spark.sql.autoBroadcastJoinThreshold", "10MB")
-      .set("spark.memory.offHeap.size", "8g")
+      .set("spark.sql.autoBroadcastJoinThreshold", "-1")
+      .set("spark.memory.offHeap.size", "6g")
       .set("spark.gluten.sql.columnar.forceShuffledHashJoin", "false")
+      .setMaster("local[2]")
   }
 
   executeTPCDSTest(false)

@@ -336,5 +336,22 @@ class GlutenClickHouseTPCDSParquetSuite extends GlutenClickHouseTPCDSAbstractSui
     compareResultsAgainstVanillaSpark(sql5, compareResult = true, _ => {})
   }
 
+  test("TakeOrderedAndProjectExecTransformer in broadcastRelation") {
+    val q =
+      """
+        | with dd as (
+        |  select d_date_sk, count(*) as cn
+        |  from date_dim
+        |  where d_date_sk is not null
+        |  group by d_date_sk
+        |  order by cn desc
+        |  limit 10)
+        | select count(ss.ss_sold_date_sk)
+        | from store_sales ss, dd
+        | where ss_sold_date_sk=dd.d_date_sk+1
+        |""".stripMargin
+    runQueryAndCompare(q)(checkGlutenOperatorMatch[TakeOrderedAndProjectExecTransformer])
+  }
+
 }
 // scalastyle:on line.size.limit

@@ -275,7 +275,7 @@ class GlutenClickHouseTPCHColumnarShuffleParquetAQESuite
   }
 
   test("TPCH Q21") {
-    runTPCHQuery(21, noFallBack = false) {
+    runTPCHQuery(21) {
       df =>
         val plans = collect(df.queryExecution.executedPlan) {
           case scanExec: BasicScanExecTransformer => scanExec

@@ -45,7 +45,6 @@ class GlutenClickHouseTPCHParquetAQEConcurrentSuite
       .set("spark.shuffle.manager", "sort")
       .set("spark.io.compression.codec", "snappy")
       .set("spark.sql.shuffle.partitions", "5")
-      .set("spark.sql.autoBroadcastJoinThreshold", "10MB")
       .set("spark.sql.adaptive.enabled", "true")
       .set("spark.sql.autoBroadcastJoinThreshold", "-1")
   }
@@ -82,5 +81,4 @@ class GlutenClickHouseTPCHParquetAQEConcurrentSuite
     queries.map(queryId => runTPCHQuery(queryId) { df => })
 
   }
-
 }

@@ -209,7 +209,7 @@ class GlutenClickHouseTPCHParquetAQESuite
   }
 
   test("TPCH Q21") {
-    runTPCHQuery(21, noFallBack = false) { df => }
+    runTPCHQuery(21) { df => }
   }
 
   test("TPCH Q22") {

@@ -335,7 +335,7 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr
   }
 
   test("TPCH Q21") {
-    runTPCHQuery(21, noFallBack = false) { df => }
+    runTPCHQuery(21) { df => }
   }
 
   test("GLUTEN-2115: Fix wrong number of records shuffle written") {

7 changes: 7 additions & 0 deletions backends-velox/pom.xml

@@ -140,6 +140,13 @@
       <artifactId>spark-core_${scala.binary.version}</artifactId>
       <type>test-jar</type>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-hive_${scala.binary.version}</artifactId>
+      <version>${spark.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-sql_${scala.binary.version}</artifactId>

@@ -47,7 +47,7 @@ private object VeloxRuleApi {
     // Regular Spark rules.
     injector.injectOptimizerRule(CollectRewriteRule.apply)
     injector.injectOptimizerRule(HLLRewriteRule.apply)
-    UDFResolver.getFunctionSignatures.foreach(injector.injectFunction)
+    UDFResolver.getFunctionSignatures().foreach(injector.injectFunction)
     injector.injectPostHocResolutionRule(ArrowConvertorRule.apply)
   }

@@ -50,6 +50,7 @@ import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.python.ArrowEvalPythonExec
 import org.apache.spark.sql.execution.utils.ExecUtil
 import org.apache.spark.sql.expression.{UDFExpression, UserDefinedAggregateFunction}
+import org.apache.spark.sql.hive.VeloxHiveUDFTransformer
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.vectorized.ColumnarBatch
@@ -819,4 +820,10 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi {
       case other => other
     }
   }
+
+  override def genHiveUDFTransformer(
+      expr: Expression,
+      attributeSeq: Seq[Attribute]): ExpressionTransformer = {
+    VeloxHiveUDFTransformer.replaceWithExpressionTransformer(expr, attributeSeq)
+  }
 }
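
For context, a hedged sketch of what the new hook enables at the query level; the UDF class, function, and table names below are made up for illustration:

    // Hypothetical usage: with genHiveUDFTransformer wired into the exec API,
    // a Hive UDF appearing in a query can be replaced by a native
    // ExpressionTransformer instead of forcing a row-based fallback.
    spark.sql("CREATE TEMPORARY FUNCTION my_upper AS 'com.example.MyUpper'")
    spark.sql("SELECT my_upper(name) FROM people").show()
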