
Commit

add bnlj
loneylee committed Jul 8, 2024
1 parent 8300f3b commit ebb67fa
Showing 21 changed files with 1,112 additions and 98 deletions.
@@ -348,16 +348,33 @@ class CHMetricsApi extends MetricsApi with Logging with LogLevelUtil {
metrics: Map[String, SQLMetric]): MetricsUpdater = new HashJoinMetricsUpdater(metrics)

override def genNestedLoopJoinTransformerMetrics(
- sparkContext: SparkContext): Map[String, SQLMetric] = {
- throw new UnsupportedOperationException(
-   s"NestedLoopJoinTransformer metrics update is not supported in CH backend")
- }
+ sparkContext: SparkContext): Map[String, SQLMetric] = Map(
+ "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
+ "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"),
+ "outputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"),
+ "numInputRows" -> SQLMetrics.createMetric(sparkContext, "number of input rows"),
+ "inputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of input bytes"),
+ "extraTime" -> SQLMetrics.createTimingMetric(sparkContext, "extra operators time"),
+ "inputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for data"),
+ "outputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for output"),
+ "streamPreProjectionTime" ->
+   SQLMetrics.createTimingMetric(sparkContext, "time of stream side preProjection"),
+ "buildPreProjectionTime" ->
+   SQLMetrics.createTimingMetric(sparkContext, "time of build side preProjection"),
+ "postProjectTime" ->
+   SQLMetrics.createTimingMetric(sparkContext, "time of postProjection"),
+ "probeTime" ->
+   SQLMetrics.createTimingMetric(sparkContext, "time of probe"),
+ "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "time"),
+ "fillingRightJoinSideTime" -> SQLMetrics.createTimingMetric(
+   sparkContext,
+   "filling right join side time"),
+ "conditionTime" -> SQLMetrics.createTimingMetric(sparkContext, "join condition time")
+ )

override def genNestedLoopJoinTransformerMetricsUpdater(
- metrics: Map[String, SQLMetric]): MetricsUpdater = {
- throw new UnsupportedOperationException(
-   s"NestedLoopJoinTransformer metrics update is not supported in CH backend")
- }
+ metrics: Map[String, SQLMetric]): MetricsUpdater = new BroadcastNestedLoopJoinMetricsUpdater(
+   metrics)

override def genSampleTransformerMetrics(sparkContext: SparkContext): Map[String, SQLMetric] = {
throw new UnsupportedOperationException(
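Note: the entries above are built with Spark's SQLMetrics factories. As a minimal illustration (not part of this commit, and assuming a local SparkSession), this is how such a metric is created and incremented, which is what the new BroadcastNestedLoopJoinMetricsUpdater further down does with values reported by the native engine:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.metric.SQLMetrics

object MetricSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("metric-sketch").getOrCreate()
    val sc = spark.sparkContext
    // Same factory calls as in the map above; the name is what shows up in the SQL UI.
    val numOutputRows = SQLMetrics.createMetric(sc, "number of output rows")
    val probeTime = SQLMetrics.createTimingMetric(sc, "time of probe")
    numOutputRows += 42 // SQLMetric supports += as shorthand for add()
    probeTime += 7
    println(s"rows=${numOutputRows.value}, probe=${probeTime.value}")
    spark.stop()
  }
}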
@@ -373,8 +373,13 @@ class CHSparkPlanExecApi extends SparkPlanExecApi {
buildSide: BuildSide,
joinType: JoinType,
condition: Option[Expression]): BroadcastNestedLoopJoinExecTransformer =
- throw new GlutenNotSupportException(
-   "BroadcastNestedLoopJoinExecTransformer is not supported in ch backend.")
+ CHBroadcastNestedLoopJoinExecTransformer(
+   left,
+   right,
+   buildSide,
+   joinType,
+   condition
+ )
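For context, Spark plans a BroadcastNestedLoopJoin when a join has no usable equi-join keys and one side can be broadcast; that is the plan shape this transformer now offloads instead of raising GlutenNotSupportException. A small, illustrative example (plain Spark, no Gluten required to see the plan):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.broadcast

object BnljPlanSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("bnlj-sketch").getOrCreate()
    import spark.implicits._
    val small = Seq(1, 5, 10).toDF("threshold")
    val big = spark.range(0, 1000).toDF("value")
    // A non-equi condition leaves no keys to hash on, so with a broadcast hint
    // Spark plans a BroadcastNestedLoopJoin.
    big.join(broadcast(small), $"value" > $"threshold").explain()
    spark.stop()
  }
}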

override def genSampleExecTransformer(
lowerBound: Double,
@@ -460,16 +465,23 @@ class CHSparkPlanExecApi extends SparkPlanExecApi {
child: SparkPlan,
numOutputRows: SQLMetric,
dataSize: SQLMetric): BuildSideRelation = {
- val hashedRelationBroadcastMode = mode.asInstanceOf[HashedRelationBroadcastMode]
+ val buildKeys: Seq[Expression] = mode match {
+   case mode1: HashedRelationBroadcastMode =>
+     mode1.key
+   case _ =>
+     // IdentityBroadcastMode
+     Seq.empty
+ }

val (newChild, newOutput, newBuildKeys) =
if (
-   hashedRelationBroadcastMode.key
+   buildKeys
.forall(k => k.isInstanceOf[AttributeReference] || k.isInstanceOf[BoundReference])
) {
(child, child.output, Seq.empty[Expression])
} else {
// pre projection in case of expression join keys
-   val buildKeys = hashedRelationBroadcastMode.key
val appendedProjections = new ArrayBuffer[NamedExpression]()
val preProjectionBuildKeys = buildKeys.zipWithIndex.map {
case (e, idx) =>
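The match above is needed because only hash joins broadcast with HashedRelationBroadcastMode; a broadcast built for a nested loop join uses IdentityBroadcastMode and carries no join keys, so buildKeys is simply empty. A minimal sketch of that distinction (illustrative only, e.g. for a spark-shell; broadcastKeys is a hypothetical helper, not code from this commit):

import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression}
import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, IdentityBroadcastMode}
import org.apache.spark.sql.execution.joins.HashedRelationBroadcastMode
import org.apache.spark.sql.types.IntegerType

// Hypothetical helper mirroring the match above.
def broadcastKeys(mode: BroadcastMode): Seq[Expression] = mode match {
  case m: HashedRelationBroadcastMode => m.key
  case _ => Seq.empty // IdentityBroadcastMode and anything else: no keys
}

val key = AttributeReference("id", IntegerType)()
assert(broadcastKeys(HashedRelationBroadcastMode(Seq(key))) == Seq(key))
assert(broadcastKeys(IdentityBroadcastMode).isEmpty)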
@@ -0,0 +1,143 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gluten.execution

import org.apache.gluten.backendsapi.BackendsApiManager

import org.apache.spark.rdd.RDD
import org.apache.spark.rpc.GlutenDriverEndpoint
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.optimizer.BuildSide
import org.apache.spark.sql.catalyst.plans.JoinType
import org.apache.spark.sql.execution.{SparkPlan, SQLExecution}
import org.apache.spark.sql.execution.joins.{BuildSideRelation, HashJoin}
import org.apache.spark.sql.types._
import org.apache.spark.sql.vectorized.ColumnarBatch

import com.google.protobuf.{Any, StringValue}

case class CHBroadcastNestedLoopJoinExecTransformer(
left: SparkPlan,
right: SparkPlan,
buildSide: BuildSide,
joinType: JoinType,
condition: Option[Expression])
extends BroadcastNestedLoopJoinExecTransformer(
left,
right,
buildSide,
joinType,
condition
) {
// Unique ID for the built broadcast table
lazy val buildBroadcastTableId: String = "BuiltBroadcastTable-" + buildPlan.id

lazy val (buildKeyExprs, streamedKeyExprs) = {
require(
leftKeys.length == rightKeys.length &&
leftKeys
.map(_.dataType)
.zip(rightKeys.map(_.dataType))
.forall(types => sameType(types._1, types._2)),
"Join keys from the two sides should have the same length and types"
)
// Spark has an improvement which patches integer join keys to a Long value.
// However, that improvement adds an extra projection before the hash join in Velox,
// so disabling it as below helps avoid the extra projection.
val (lkeys, rkeys) = if (BackendsApiManager.getSettings.enableJoinKeysRewrite()) {
(HashJoin.rewriteKeyExpr(leftKeys), HashJoin.rewriteKeyExpr(rightKeys))
} else {
(leftKeys, rightKeys)
}
if (needSwitchChildren) {
(lkeys, rkeys)
} else {
(rkeys, lkeys)
}
}

override def columnarInputRDDs: Seq[RDD[ColumnarBatch]] = {
val streamedRDD = getColumnarInputRDDs(streamedPlan)
val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
if (executionId != null) {
GlutenDriverEndpoint.collectResources(executionId, buildBroadcastTableId)
} else {
logWarning(
s"Cannot trace broadcast table data $buildBroadcastTableId" +
s" because the execution id is null." +
s" It will only be cleaned up after the expiration time.")
}
val broadcast = buildPlan.executeBroadcast[BuildSideRelation]()
val context =
BroadCastHashJoinContext(Seq.empty, joinType, false, buildPlan.output, buildBroadcastTableId)
val broadcastRDD = CHBroadcastBuildSideRDD(sparkContext, broadcast, context)
// FIXME: Do we have to make the build side an RDD?
streamedRDD :+ broadcastRDD
}

override protected def withNewChildrenInternal(
newLeft: SparkPlan,
newRight: SparkPlan): CHBroadcastNestedLoopJoinExecTransformer =
copy(left = newLeft, right = newRight)

def isMixedCondition(cond: Option[Expression]): Boolean = {
val res = if (cond.isDefined) {
val leftOutputSet = left.outputSet
val rightOutputSet = right.outputSet
val allReferences = cond.get.references
!(allReferences.subsetOf(leftOutputSet) || allReferences.subsetOf(rightOutputSet))
} else {
false
}
res
}

def sameType(from: DataType, to: DataType): Boolean = {
(from, to) match {
case (ArrayType(fromElement, _), ArrayType(toElement, _)) =>
sameType(fromElement, toElement)

case (MapType(fromKey, fromValue, _), MapType(toKey, toValue, _)) =>
sameType(fromKey, toKey) &&
sameType(fromValue, toValue)

case (StructType(fromFields), StructType(toFields)) =>
fromFields.length == toFields.length &&
fromFields.zip(toFields).forall {
case (l, r) =>
l.name.equalsIgnoreCase(r.name) &&
sameType(l.dataType, r.dataType)
}

case (fromDataType, toDataType) => fromDataType == toDataType
}
}

override def genJoinParameters(): Any = {
val joinParametersStr = new StringBuffer("JoinParameters:")
joinParametersStr
.append("buildHashTableId=")
.append(buildBroadcastTableId)
.append("\n")
val message = StringValue
.newBuilder()
.setValue(joinParametersStr.toString)
.build()
BackendsApiManager.getTransformerApiInstance.packPBMessage(message)
}

}
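For intuition on why sameType is used instead of plain type equality: StructField and ArrayType are case classes, so field-name case and nullability flags participate in ==, while sameType deliberately ignores both. A small illustration (assumes spark-catalyst on the classpath, e.g. a spark-shell):

import org.apache.spark.sql.types._

val a = StructType(Seq(StructField("ID", ArrayType(IntegerType, containsNull = false))))
val b = StructType(Seq(StructField("id", ArrayType(IntegerType, containsNull = true))))

// Plain equality sees two different types...
println(a == b) // false
// ...whereas sameType above compares field names case-insensitively and skips
// nullability of array/map elements, so it treats these schemas as compatible.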
@@ -0,0 +1,123 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gluten.metrics

import org.apache.spark.internal.Logging
import org.apache.spark.sql.execution.metric.SQLMetric

class BroadcastNestedLoopJoinMetricsUpdater(val metrics: Map[String, SQLMetric])
extends MetricsUpdater
with Logging {

override def updateNativeMetrics(opMetrics: IOperatorMetrics): Unit = {
try {
if (opMetrics != null) {
val operatorMetrics = opMetrics.asInstanceOf[OperatorMetrics]
if (!operatorMetrics.metricsList.isEmpty && operatorMetrics.joinParams != null) {
val joinParams = operatorMetrics.joinParams
var currentIdx = operatorMetrics.metricsList.size() - 1
var totalTime = 0L
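// The metrics list is consumed from its last entry backwards: each stage that is
// present (build-side pre-projection, stream-side pre-projection, the join itself,
// then the post projection) consumes one entry and decrements currentIdx.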

// build side pre projection
if (joinParams.buildPreProjectionNeeded) {
metrics("buildPreProjectionTime") +=
(operatorMetrics.metricsList.get(currentIdx).time / 1000L).toLong
metrics("outputVectors") += operatorMetrics.metricsList.get(currentIdx).outputVectors
totalTime += operatorMetrics.metricsList.get(currentIdx).time
currentIdx -= 1
}

// stream side pre projection
if (joinParams.streamPreProjectionNeeded) {
metrics("streamPreProjectionTime") +=
(operatorMetrics.metricsList.get(currentIdx).time / 1000L).toLong
metrics("outputVectors") += operatorMetrics.metricsList.get(currentIdx).outputVectors
totalTime += operatorMetrics.metricsList.get(currentIdx).time
currentIdx -= 1
}

// update fillingRightJoinSideTime
MetricsUtil
.getAllProcessorList(operatorMetrics.metricsList.get(currentIdx))
.foreach(
processor => {
if (processor.name.equalsIgnoreCase("FillingRightJoinSide")) {
metrics("fillingRightJoinSideTime") += (processor.time / 1000L).toLong
}
})

// joining
val joinMetricsData = operatorMetrics.metricsList.get(currentIdx)
metrics("outputVectors") += joinMetricsData.outputVectors
metrics("inputWaitTime") += (joinMetricsData.inputWaitTime / 1000L).toLong
metrics("outputWaitTime") += (joinMetricsData.outputWaitTime / 1000L).toLong
totalTime += joinMetricsData.time

MetricsUtil
.getAllProcessorList(joinMetricsData)
.foreach(
processor => {
if (processor.name.equalsIgnoreCase("FillingRightJoinSide")) {
metrics("fillingRightJoinSideTime") += (processor.time / 1000L).toLong
}
if (processor.name.equalsIgnoreCase("FilterTransform")) {
metrics("conditionTime") += (processor.time / 1000L).toLong
}
if (processor.name.equalsIgnoreCase("JoiningTransform")) {
metrics("probeTime") += (processor.time / 1000L).toLong
}
if (
!BroadcastNestedLoopJoinMetricsUpdater.INCLUDING_PROCESSORS.contains(
processor.name)
) {
metrics("extraTime") += (processor.time / 1000L).toLong
}
if (
BroadcastNestedLoopJoinMetricsUpdater.CH_PLAN_NODE_NAME.contains(processor.name)
) {
metrics("numOutputRows") += processor.outputRows
metrics("outputBytes") += processor.outputBytes
metrics("numInputRows") += processor.inputRows
metrics("inputBytes") += processor.inputBytes
}
})

currentIdx -= 1

// post projection
if (joinParams.postProjectionNeeded) {
metrics("postProjectTime") +=
(operatorMetrics.metricsList.get(currentIdx).time / 1000L).toLong
metrics("outputVectors") += operatorMetrics.metricsList.get(currentIdx).outputVectors
totalTime += operatorMetrics.metricsList.get(currentIdx).time
currentIdx -= 1
}
metrics("totalTime") += (totalTime / 1000L).toLong
}
}
} catch {
case e: Exception =>
logError(s"Updating native metrics failed due to ${e.getCause}.")
throw e
}
}
}

object BroadcastNestedLoopJoinMetricsUpdater {
val INCLUDING_PROCESSORS = Array("JoiningTransform", "FillingRightJoinSide", "FilterTransform")
val CH_PLAN_NODE_NAME = Array("JoiningTransform")
}
@@ -57,16 +57,13 @@ abstract class GlutenClickHouseTPCDSAbstractSuite
}
val noFallBack = queryNum match {
case i
- if i == 10 || i == 16 || i == 28 || i == 35 || i == 45 || i == 77 ||
-   i == 88 || i == 90 || i == 94 =>
+ if i == 10 || i == 16 || i == 35 || i == 45 || i == 77 ||
+   i == 94 =>
// Q10 BroadcastHashJoin, ExistenceJoin
// Q16 ShuffledHashJoin, NOT condition
- // Q28 BroadcastNestedLoopJoin
// Q35 BroadcastHashJoin, ExistenceJoin
// Q45 BroadcastHashJoin, ExistenceJoin
// Q77 CartesianProduct
- // Q88 BroadcastNestedLoopJoin
- // Q90 BroadcastNestedLoopJoin
// Q94 BroadcastHashJoin, LeftSemi, NOT condition
(false, false)
case j if j == 38 || j == 87 =>
21 changes: 21 additions & 0 deletions cpp-ch/local-engine/Common/CHUtil.cpp
@@ -128,6 +128,27 @@ DB::Block BlockUtil::buildHeader(const DB::NamesAndTypesList & names_types_list)
return DB::Block(cols);
}

/// The column names may differ between the two blocks,
/// and the nullability may also differ; TPCDS Q1 is an example of the latter.
DB::ColumnWithTypeAndName
BlockUtil::convertColumnAsNecessary(const DB::ColumnWithTypeAndName & column, const DB::ColumnWithTypeAndName & sample_column)
{
if (sample_column.type->equals(*column.type))
return {column.column, column.type, sample_column.name};
else if (sample_column.type->isNullable() && !column.type->isNullable() && DB::removeNullable(sample_column.type)->equals(*column.type))
{
auto nullable_column = column;
DB::JoinCommon::convertColumnToNullable(nullable_column);
return {nullable_column.column, sample_column.type, sample_column.name};
}
else
throw DB::Exception(
DB::ErrorCodes::LOGICAL_ERROR,
"Columns have different types. original:{} expected:{}",
column.dumpStructure(),
sample_column.dumpStructure());
}

/**
 * There is a special case with which we need to be careful. In Spark, struct/map/list are always
 * wrapped in Nullable, but this should not happen in ClickHouse.
