Commit

Revert "Provided interface to inject ResourceProfile for Columnar stage"
This reverts commit 228429d.
zjuwangg committed Dec 13, 2024
1 parent 228429d commit 8716bc4
Showing 6 changed files with 95 additions and 137 deletions.
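
For readers skimming the revert: the removed feature let a caller attach a Spark ResourceProfile to a columnar stage so that the stage's RDDs could use stage-level scheduling. A minimal sketch of the deleted hook, reconstructed from the removed lines below (the trait and the if-block match the diff; the surrounding plumbing is elided):

import org.apache.spark.resource.ResourceProfile

/** Mixin removed by this revert: lets callers inject a ResourceProfile into an exec node. */
trait WithResourceProfileSupport {
  private var resourceProfile: Option[ResourceProfile] = None

  def withResourceProfile(resourceProfile: ResourceProfile): Unit = {
    this.resourceProfile = Some(resourceProfile)
  }

  def getResourceProfile: Option[ResourceProfile] = resourceProfile
}

// In the removed code paths, the RDD produced by doExecuteColumnar was then tagged with it:
//   if (getResourceProfile.isDefined) {
//     finalRdd = finalRdd.withResources(getResourceProfile.get)
//   }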
@@ -32,7 +32,6 @@ import org.apache.gluten.vectorized.CHColumnarBatchSerializer
import org.apache.spark.ShuffleDependency
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.resource.ResourceProfile
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.{GenShuffleWriterParameters, GlutenShuffleWriterWrapper, HashPartitioningWrapper}
import org.apache.spark.shuffle.utils.CHShuffleUtil
@@ -470,8 +469,7 @@ class CHSparkPlanExecApi extends SparkPlanExecApi with Logging {
mode: BroadcastMode,
child: SparkPlan,
numOutputRows: SQLMetric,
dataSize: SQLMetric,
resourceProfile: Option[ResourceProfile] = None): BuildSideRelation = {
dataSize: SQLMetric): BuildSideRelation = {

val (buildKeys, isNullAware) = mode match {
case mode1: HashedRelationBroadcastMode =>
@@ -29,7 +29,6 @@ import org.apache.gluten.vectorized.{ColumnarBatchSerializer, ColumnarBatchSeria
import org.apache.spark.{ShuffleDependency, SparkException}
import org.apache.spark.api.python.{ColumnarArrowEvalPythonExec, PullOutArrowEvalPythonPreProjectHelper}
import org.apache.spark.rdd.RDD
import org.apache.spark.resource.ResourceProfile
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.{GenShuffleWriterParameters, GlutenShuffleWriterWrapper}
import org.apache.spark.shuffle.utils.ShuffleUtil
@@ -621,23 +620,12 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi {
mode: BroadcastMode,
child: SparkPlan,
numOutputRows: SQLMetric,
dataSize: SQLMetric,
resourceProfile: Option[ResourceProfile] = None): BuildSideRelation = {
val serialized: Array[ColumnarBatchSerializeResult] =
if (resourceProfile.isDefined) {
child
.executeColumnar()
.withResources(resourceProfile.get)
.mapPartitions(itr => Iterator(BroadcastUtils.serializeStream(itr)))
.filter(_.getNumRows != 0)
.collect
} else {
child
.executeColumnar()
.mapPartitions(itr => Iterator(BroadcastUtils.serializeStream(itr)))
.filter(_.getNumRows != 0)
.collect
}
dataSize: SQLMetric): BuildSideRelation = {
val serialized: Array[ColumnarBatchSerializeResult] = child
.executeColumnar()
.mapPartitions(itr => Iterator(BroadcastUtils.serializeStream(itr)))
.filter(_.getNumRows != 0)
.collect
val rawSize = serialized.map(_.getSerialized.length).sum
if (rawSize >= BroadcastExchangeExec.MAX_BROADCAST_TABLE_BYTES) {
throw new SparkException(
@@ -24,7 +24,6 @@ import org.apache.gluten.substrait.expression.{ExpressionBuilder, ExpressionNode

import org.apache.spark.ShuffleDependency
import org.apache.spark.rdd.RDD
import org.apache.spark.resource.ResourceProfile
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.{GenShuffleWriterParameters, GlutenShuffleWriterWrapper}
import org.apache.spark.sql.catalyst.catalog.BucketSpec
@@ -358,8 +357,7 @@ trait SparkPlanExecApi {
mode: BroadcastMode,
child: SparkPlan,
numOutputRows: SQLMetric,
dataSize: SQLMetric,
resourceProfile: Option[ResourceProfile] = None): BuildSideRelation
dataSize: SQLMetric): BuildSideRelation

def doCanonicalizeForBroadcastMode(mode: BroadcastMode): BroadcastMode = {
mode.canonicalized
@@ -33,7 +33,6 @@ import org.apache.gluten.utils.SubstraitPlanPrinterUtil

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.resource.ResourceProfile
import org.apache.spark.softaffinity.SoftAffinity
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder}
@@ -195,22 +194,10 @@ trait UnaryTransformSupport extends TransformSupport with UnaryExecNode {
}
}

/** Base interface for a query plan that can be used to set ResourceProfile. */
trait WithResourceProfileSupport {
private var resourceProfile: Option[ResourceProfile] = None

def withResourceProfile(resourceProfile: ResourceProfile): Unit = {
this.resourceProfile = Some(resourceProfile)
}

def getResourceProfile: Option[ResourceProfile] = resourceProfile
}

case class WholeStageTransformer(child: SparkPlan, materializeInput: Boolean = false)(
val transformStageId: Int
) extends WholeStageTransformerGenerateTreeStringShim
with UnaryTransformSupport
with WithResourceProfileSupport {
with UnaryTransformSupport {

def stageId: Int = transformStageId

@@ -379,99 +366,93 @@ case class WholeStageTransformer(child: SparkPlan, materializeInput: Boolean = f
// Check if BatchScan exists.
val basicScanExecTransformers = findAllScanTransformers()

    var finalRdd =
      if (basicScanExecTransformers.nonEmpty) {

        /**
         * If containing scan exec transformer this "whole stage" generates a RDD which itself takes
         * care of SCAN there won't be any other RDD for SCAN. As a result, genFirstStageIterator
         * rather than genFinalStageIterator will be invoked
         */
        val allScanPartitions = basicScanExecTransformers.map(_.getPartitions.toIndexedSeq)
        val allScanSplitInfos =
          getSplitInfosFromPartitions(basicScanExecTransformers, allScanPartitions)
        if (GlutenConfig.getConf.enableHdfsViewfs) {
          allScanSplitInfos.foreach {
            splitInfos =>
              splitInfos.foreach {
                case splitInfo: LocalFilesNode =>
                  val paths = splitInfo.getPaths.asScala
                  if (paths.nonEmpty && paths.head.startsWith("viewfs")) {
                    // Convert the viewfs path into hdfs
                    val newPaths = paths.map {
                      viewfsPath =>
                        val viewPath = new Path(viewfsPath)
                        val viewFileSystem =
                          FileSystem.get(viewPath.toUri, serializableHadoopConf.value)
                        viewFileSystem.resolvePath(viewPath).toString
                    }
                    splitInfo.setPaths(newPaths.asJava)
                  }
              }
          }
        }

        val inputPartitions =
          BackendsApiManager.getIteratorApiInstance.genPartitions(
            wsCtx,
            allScanSplitInfos,
            basicScanExecTransformers)

        val rdd = new GlutenWholeStageColumnarRDD(
          sparkContext,
          inputPartitions,
          inputRDDs,
          pipelineTime,
          leafInputMetricsUpdater(),
          BackendsApiManager.getMetricsApiInstance.metricsUpdatingFunction(
            child,
            wsCtx.substraitContext.registeredRelMap,
            wsCtx.substraitContext.registeredJoinParams,
            wsCtx.substraitContext.registeredAggregationParams
          )
        )
        (0 until allScanPartitions.head.size).foreach(
          i => {
            val currentPartitions = allScanPartitions.map(_(i))
            currentPartitions.indices.foreach(
              i =>
                currentPartitions(i) match {
                  case f: FilePartition =>
                    SoftAffinity.updateFilePartitionLocations(f, rdd.id)
                  case _ =>
                })
          })
        rdd
      } else {

        /**
         * the whole stage contains NO BasicScanExecTransformer. this the default case for:
         *   1. SCAN with clickhouse backend (check
         *      ColumnarCollapseTransformStages#separateScanRDD()) 2. test case where query plan is
         *      constructed from simple dataframes (e.g. GlutenDataFrameAggregateSuite) in these
         *      cases, separate RDDs takes care of SCAN as a result, genFinalStageIterator rather
         *      than genFirstStageIterator will be invoked
         */
        new WholeStageZippedPartitionsRDD(
          sparkContext,
          inputRDDs,
          numaBindingInfo,
          sparkConf,
          wsCtx,
          pipelineTime,
          BackendsApiManager.getMetricsApiInstance.metricsUpdatingFunction(
            child,
            wsCtx.substraitContext.registeredRelMap,
            wsCtx.substraitContext.registeredJoinParams,
            wsCtx.substraitContext.registeredAggregationParams
          ),
          materializeInput
        )
      }
    if (getResourceProfile.isDefined) {
      finalRdd = finalRdd.withResources(getResourceProfile.get)
    }
    finalRdd
    if (basicScanExecTransformers.nonEmpty) {

      /**
       * If containing scan exec transformer this "whole stage" generates a RDD which itself takes
       * care of SCAN there won't be any other RDD for SCAN. As a result, genFirstStageIterator
       * rather than genFinalStageIterator will be invoked
       */
      val allScanPartitions = basicScanExecTransformers.map(_.getPartitions.toIndexedSeq)
      val allScanSplitInfos =
        getSplitInfosFromPartitions(basicScanExecTransformers, allScanPartitions)
      if (GlutenConfig.getConf.enableHdfsViewfs) {
        allScanSplitInfos.foreach {
          splitInfos =>
            splitInfos.foreach {
              case splitInfo: LocalFilesNode =>
                val paths = splitInfo.getPaths.asScala
                if (paths.nonEmpty && paths.head.startsWith("viewfs")) {
                  // Convert the viewfs path into hdfs
                  val newPaths = paths.map {
                    viewfsPath =>
                      val viewPath = new Path(viewfsPath)
                      val viewFileSystem =
                        FileSystem.get(viewPath.toUri, serializableHadoopConf.value)
                      viewFileSystem.resolvePath(viewPath).toString
                  }
                  splitInfo.setPaths(newPaths.asJava)
                }
            }
        }
      }

      val inputPartitions =
        BackendsApiManager.getIteratorApiInstance.genPartitions(
          wsCtx,
          allScanSplitInfos,
          basicScanExecTransformers)

      val rdd = new GlutenWholeStageColumnarRDD(
        sparkContext,
        inputPartitions,
        inputRDDs,
        pipelineTime,
        leafInputMetricsUpdater(),
        BackendsApiManager.getMetricsApiInstance.metricsUpdatingFunction(
          child,
          wsCtx.substraitContext.registeredRelMap,
          wsCtx.substraitContext.registeredJoinParams,
          wsCtx.substraitContext.registeredAggregationParams
        )
      )
      (0 until allScanPartitions.head.size).foreach(
        i => {
          val currentPartitions = allScanPartitions.map(_(i))
          currentPartitions.indices.foreach(
            i =>
              currentPartitions(i) match {
                case f: FilePartition =>
                  SoftAffinity.updateFilePartitionLocations(f, rdd.id)
                case _ =>
              })
        })
      rdd
    } else {

      /**
       * the whole stage contains NO BasicScanExecTransformer. this the default case for:
       *   1. SCAN with clickhouse backend (check ColumnarCollapseTransformStages#separateScanRDD())
       *   2. test case where query plan is constructed from simple dataframes (e.g.
       *      GlutenDataFrameAggregateSuite) in these cases, separate RDDs takes care of SCAN as a
       *      result, genFinalStageIterator rather than genFirstStageIterator will be invoked
       */
      new WholeStageZippedPartitionsRDD(
        sparkContext,
        inputRDDs,
        numaBindingInfo,
        sparkConf,
        wsCtx,
        pipelineTime,
        BackendsApiManager.getMetricsApiInstance.metricsUpdatingFunction(
          child,
          wsCtx.substraitContext.registeredRelMap,
          wsCtx.substraitContext.registeredJoinParams,
          wsCtx.substraitContext.registeredAggregationParams
        ),
        materializeInput
      )
    }
}

override def metricsUpdater(): MetricsUpdater = {
@@ -17,7 +17,7 @@
package org.apache.spark.sql.execution

import org.apache.gluten.backendsapi.BackendsApiManager
import org.apache.gluten.execution.{ValidatablePlan, WithResourceProfileSupport}
import org.apache.gluten.execution.ValidatablePlan
import org.apache.gluten.extension.columnar.transition.Convention
import org.apache.gluten.metrics.GlutenTimeMetric
import org.apache.gluten.sql.shims.SparkShimLoader
@@ -42,8 +42,7 @@ import scala.util.control.NonFatal

case class ColumnarBroadcastExchangeExec(mode: BroadcastMode, child: SparkPlan)
extends BroadcastExchangeLike
with ValidatablePlan
with WithResourceProfileSupport {
with ValidatablePlan {

// Note: "metrics" is made transient to avoid sending driver-side metrics to tasks.
@transient override lazy val metrics: Map[String, SQLMetric] =
@@ -76,8 +75,7 @@ case class ColumnarBroadcastExchangeExec(mode: BroadcastMode, child: SparkPlan)
mode,
child,
longMetric("numOutputRows"),
longMetric("dataSize"),
getResourceProfile)
longMetric("dataSize"))
}

val broadcasted = GlutenTimeMetric.millis(longMetric("broadcastTime")) {
@@ -18,7 +18,7 @@ package org.apache.spark.sql.execution

import org.apache.gluten.GlutenConfig
import org.apache.gluten.backendsapi.BackendsApiManager
import org.apache.gluten.execution.{ValidatablePlan, WithResourceProfileSupport}
import org.apache.gluten.execution.ValidatablePlan
import org.apache.gluten.extension.ValidationResult
import org.apache.gluten.extension.columnar.transition.Convention
import org.apache.gluten.sql.shims.SparkShimLoader
Expand Down Expand Up @@ -47,8 +47,7 @@ case class ColumnarShuffleExchangeExec(
projectOutputAttributes: Seq[Attribute],
advisoryPartitionSize: Option[Long] = None)
extends ShuffleExchangeLike
with ValidatablePlan
with WithResourceProfileSupport {
with ValidatablePlan {
private[sql] lazy val writeMetrics =
SQLShuffleWriteMetricsReporter.createShuffleWriteMetrics(sparkContext)

@@ -162,10 +161,6 @@
override def doExecuteColumnar(): RDD[ColumnarBatch] = {
if (cachedShuffleRDD == null) {
cachedShuffleRDD = new ShuffledColumnarBatchRDD(columnarShuffleDependency, readMetrics)
if (getResourceProfile.isDefined) {
log.info(s"Set resource profile for $child to ${getResourceProfile.get}")
cachedShuffleRDD.withResources(getResourceProfile.get)
}
}
cachedShuffleRDD
}
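
For context, the resourceProfile parameter deleted above carried a standard Spark stage-level scheduling profile. A hedged sketch of how such a profile is built and applied with stock Spark APIs (the resource amounts are illustrative and not taken from this commit):

import org.apache.spark.resource.{ExecutorResourceRequests, ResourceProfile, ResourceProfileBuilder, TaskResourceRequests}

// Illustrative only: request larger executors for a heavy columnar stage.
def columnarStageProfile(): ResourceProfile = {
  val executorReqs = new ExecutorResourceRequests()
    .cores(4) // illustrative values, not from this commit
    .memory("8g")
    .memoryOverhead("2g")
  val taskReqs = new TaskResourceRequests().cpus(1)
  new ResourceProfileBuilder()
    .require(executorReqs)
    .require(taskReqs)
    .build()
}

// rdd.withResources(columnarStageProfile()) pins the profile on every stage computed
// from that RDD (stage-level scheduling; normally requires dynamic allocation to be enabled).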