[VL] Set s.g.s.c.b.v.coalesceBatchesBeforeShuffle=true by default #6056
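For context: the abbreviated key in the title presumably expands to spark.gluten.sql.columnar.backend.velox.coalesceBatchesBeforeShuffle (an assumption; the PR title only gives the abbreviation). With this change the flag is on by default, so a minimal sketch of opting back out at session startup looks like this:

    import org.apache.spark.sql.SparkSession

    // Assumed config key (see the hedge above); verify against GlutenConfig
    // before relying on it. After this PR the flag defaults to true, so it
    // only needs to be set explicitly to opt out.
    val spark = SparkSession
      .builder()
      .appName("coalesce-batches-opt-out")
      .config("spark.gluten.sql.columnar.backend.velox.coalesceBatchesBeforeShuffle", "false")
      .getOrCreate()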

Merged · 10 commits · Jun 18, 2024
Changes from all commits
@@ -16,7 +16,7 @@
  */
 package org.apache.gluten.extension
 
-import org.apache.gluten.execution.{FlushableHashAggregateExecTransformer, HashAggregateExecTransformer, ProjectExecTransformer, RegularHashAggregateExecTransformer}
+import org.apache.gluten.execution._
 
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.expressions.aggregate.{Partial, PartialMerge}
@@ -30,74 +30,55 @@ import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike
  * optimizations such as flushing and abandoning.
  */
 case class FlushableHashAggregateRule(session: SparkSession) extends Rule[SparkPlan] {
+  import FlushableHashAggregateRule._
   override def apply(plan: SparkPlan): SparkPlan = plan.transformUp {
-    case shuffle: ShuffleExchangeLike =>
+    case s: ShuffleExchangeLike =>
       // If an exchange follows a hash aggregate in which all functions are in partial mode,
       // then it's safe to convert the hash aggregate to flushable hash aggregate.
-      shuffle.child match {
-        case HashAggPropagatedToShuffle(proj, agg) =>
-          shuffle.withNewChildren(
-            Seq(proj.withNewChildren(Seq(FlushableHashAggregateExecTransformer(
-              agg.requiredChildDistributionExpressions,
-              agg.groupingExpressions,
-              agg.aggregateExpressions,
-              agg.aggregateAttributes,
-              agg.initialInputBufferOffset,
-              agg.resultExpressions,
-              agg.child
-            )))))
-        case HashAggWithShuffle(agg) =>
-          shuffle.withNewChildren(
-            Seq(FlushableHashAggregateExecTransformer(
-              agg.requiredChildDistributionExpressions,
-              agg.groupingExpressions,
-              agg.aggregateExpressions,
-              agg.aggregateAttributes,
-              agg.initialInputBufferOffset,
-              agg.resultExpressions,
-              agg.child
-            )))
-        case _ =>
-          shuffle
-      }
+      val out = s.withNewChildren(
+        List(
+          replaceEligibleAggregates(s.child) {
+            agg =>
+              FlushableHashAggregateExecTransformer(
+                agg.requiredChildDistributionExpressions,
+                agg.groupingExpressions,
+                agg.aggregateExpressions,
+                agg.aggregateAttributes,
+                agg.initialInputBufferOffset,
+                agg.resultExpressions,
+                agg.child
+              )
+          }
+        )
+      )
+      out
   }
-}
 
-object HashAggPropagatedToShuffle {
-  def unapply(
-      plan: SparkPlan): Option[(ProjectExecTransformer, RegularHashAggregateExecTransformer)] = {
-    if (!plan.isInstanceOf[ProjectExecTransformer]) {
-      return None
-    }
-    val proj = plan.asInstanceOf[ProjectExecTransformer]
-    val child = proj.child
-    if (!child.isInstanceOf[RegularHashAggregateExecTransformer]) {
-      return None
-    }
-    val agg = child.asInstanceOf[RegularHashAggregateExecTransformer]
-    if (!agg.aggregateExpressions.forall(p => p.mode == Partial || p.mode == PartialMerge)) {
-      return None
-    }
-    if (FlushableHashAggregateRule.isAggInputAlreadyDistributedWithAggKeys(agg)) {
-      return None
-    }
-    Some((proj, agg))
-  }
-}
-
-object HashAggWithShuffle {
-  def unapply(plan: SparkPlan): Option[RegularHashAggregateExecTransformer] = {
-    if (!plan.isInstanceOf[RegularHashAggregateExecTransformer]) {
-      return None
-    }
-    val agg = plan.asInstanceOf[RegularHashAggregateExecTransformer]
-    if (!agg.aggregateExpressions.forall(p => p.mode == Partial || p.mode == PartialMerge)) {
-      return None
-    }
-    if (FlushableHashAggregateRule.isAggInputAlreadyDistributedWithAggKeys(agg)) {
-      return None
-    }
-    Some(agg)
+  private def replaceEligibleAggregates(plan: SparkPlan)(
+      func: RegularHashAggregateExecTransformer => SparkPlan): SparkPlan = {
+    def transformDown: SparkPlan => SparkPlan = {
+      case agg: RegularHashAggregateExecTransformer
+          if !agg.aggregateExpressions.forall(p => p.mode == Partial || p.mode == PartialMerge) =>
+        // Not an intermediate agg. Skip.
+        agg
+      case agg: RegularHashAggregateExecTransformer
+          if isAggInputAlreadyDistributedWithAggKeys(agg) =>
+        // Data already grouped by aggregate keys. Skip.
+        agg
+      case agg: RegularHashAggregateExecTransformer =>
+        func(agg)
+      case p if !canPropagate(p) => p
+      case other => other.withNewChildren(other.children.map(transformDown))
+    }
+
+    val out = transformDown(plan)
+    out
+  }
+
+  private def canPropagate(plan: SparkPlan): Boolean = plan match {
+    case _: ProjectExecTransformer => true
+    case _: VeloxAppendBatchesExec => true
+    case _ => false
   }
 }
@@ -112,7 +93,8 @@ object FlushableHashAggregateRule {
    * only on a single partition among the whole cluster. Spark's planner may use this information to
    * perform optimizations like doing "partial_count(a, b, c)" directly on the output data.
    */
-  def isAggInputAlreadyDistributedWithAggKeys(agg: HashAggregateExecTransformer): Boolean = {
+  private def isAggInputAlreadyDistributedWithAggKeys(
+      agg: HashAggregateExecTransformer): Boolean = {
     if (agg.groupingExpressions.isEmpty) {
       // Empty grouping set () should not be satisfied by any partitioning patterns.
       // E.g.,
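To see why the rule was reworked: the old extractors HashAggPropagatedToShuffle and HashAggWithShuffle only matched a partial-mode aggregate sitting directly under the shuffle, or under a single project. With coalesceBatchesBeforeShuffle=true a VeloxAppendBatchesExec node now sits in between, so replaceEligibleAggregates instead walks down through any chain of operators that canPropagate accepts. A minimal, self-contained sketch of that descent, using hypothetical toy Plan types in place of SparkPlan (the real rule's extra eligibility checks, such as skipping an aggregate whose input is already distributed by the grouping keys, are omitted here):

    // Toy stand-ins for the SparkPlan nodes involved; all names in this
    // sketch are illustrative, not Gluten's real API.
    object FlushableRuleSketch extends App {
      sealed trait Plan
      case class Scan(table: String) extends Plan
      case class Project(child: Plan) extends Plan
      case class AppendBatches(child: Plan) extends Plan // ~ VeloxAppendBatchesExec
      case class PartialAgg(child: Plan) extends Plan // ~ partial-mode hash aggregate
      case class FlushableAgg(child: Plan) extends Plan // ~ FlushableHashAggregateExecTransformer

      // ~ replaceEligibleAggregates: convert the first eligible aggregate found,
      // descending only through the operators the real rule's canPropagate
      // accepts (ProjectExecTransformer and VeloxAppendBatchesExec).
      def replaceEligible(p: Plan): Plan = p match {
        case PartialAgg(c)    => FlushableAgg(c)
        case Project(c)       => Project(replaceEligible(c))
        case AppendBatches(c) => AppendBatches(replaceEligible(c))
        case other            => other // not propagatable: stop, leave subtree unchanged
      }

      // The shape introduced by coalesceBatchesBeforeShuffle=true: the extra
      // AppendBatches node below the shuffle no longer hides the aggregate.
      println(replaceEligible(AppendBatches(Project(PartialAgg(Scan("t"))))))
      // prints: AppendBatches(Project(FlushableAgg(Scan(t))))
    }

The sketch hard-codes the propagatable operators in the match; the real rule factors them into a separate canPropagate method so new pass-through operators can be added in one place.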