[GLUTEN-7180][CH] Fix UT `Eliminate NAAJ when BuildSide is HashedRelationWithAllNullKeys` for the CH backend when AQE is on

Fix UT `Eliminate NAAJ when BuildSide is HashedRelationWithAllNullKeys` for the CH backend when AQE is on. Closes #7180.
apache · Sep 10, 2024 · 8406c33 · 8406c33
1 parent 4ce5162
commit 8406c33
Show file tree

Hide file tree

Showing 19 changed files with 275 additions and 68 deletions.
diff --git a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHNativeBlock.java b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHNativeBlock.java
@@ -71,6 +71,12 @@ public long totalBytes() {
 
   public native void nativeClose(long blockAddress);
 
+  /**
+   * Native entry point that produces the {@link BlockStats} of the block located at {@code
+   * blockAddress}, evaluated for the column at {@code columnPosition}.
+   *
+   * <p>NOTE(review): the implementation lives on the native side and is not visible here;
+   * presumably {@code columnPosition} must be a valid column index of the block - confirm.
+   */
+  public native BlockStats nativeBlockStats(long blockAddress, int columnPosition);
+
+  /**
+   * Returns the statistics of this block for the column at {@code columnPosition} by delegating
+   * to {@link #nativeBlockStats(long, int)} with this instance's {@code blockAddress}.
+   */
+  public BlockStats getBlockStats(int columnPosition) {
+    return nativeBlockStats(blockAddress, columnPosition);
+  }
+
   public void close() {
     if (blockAddress != 0) {
       nativeClose(blockAddress);

diff --git a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/StorageJoinBuilder.java b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/StorageJoinBuilder.java
@@ -48,7 +48,8 @@ private static native long nativeBuild(
       boolean hasMixedFiltCondition,
       boolean isExistenceJoin,
       byte[] namedStruct,
-      boolean isNullAwareAntiJoin);
+      boolean isNullAwareAntiJoin,
+      boolean hasNullKeyValues);
 
   private StorageJoinBuilder() {}
 
@@ -58,7 +59,8 @@ public static long build(
       long rowCount,
       BroadCastHashJoinContext broadCastContext,
       List<Expression> newBuildKeys,
-      List<Attribute> newOutput) {
+      List<Attribute> newOutput,
+      boolean hasNullKeyValues) {
     ConverterUtils$ converter = ConverterUtils$.MODULE$;
     List<Expression> keys;
     List<Attribute> output;
@@ -96,7 +98,8 @@ public static long build(
         broadCastContext.hasMixedFiltCondition(),
         broadCastContext.isExistenceJoin(),
         toNameStruct(output).toByteArray(),
-        broadCastContext.isNullAwareAntiJoin());
+        broadCastContext.isNullAwareAntiJoin(),
+        hasNullKeyValues);
   }
 
   /** create table named struct */

diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala
@@ -44,6 +44,7 @@ private object CHRuleApi {
   def injectSpark(injector: SparkInjector): Unit = {
     // Regular Spark rules.
     injector.injectQueryStagePrepRule(FallbackBroadcastHashJoinPrepQueryStage.apply)
+    injector.injectQueryStagePrepRule(spark => CHAQEPropagateEmptyRelation(spark))
     injector.injectParser(
       (spark, parserInterface) => new GlutenCacheFilesSqlParser(spark, parserInterface))
     injector.injectParser(

diff --git a/...ickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala b/...ickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala
@@ -474,12 +474,12 @@ class CHSparkPlanExecApi extends SparkPlanExecApi with Logging {
       numOutputRows: SQLMetric,
       dataSize: SQLMetric): BuildSideRelation = {
 
-    val buildKeys: Seq[Expression] = mode match {
+    val (buildKeys, isNullAware) = mode match {
       case mode1: HashedRelationBroadcastMode =>
-        mode1.key
+        (mode1.key, mode1.isNullAware)
       case _ =>
         // IdentityBroadcastMode
-        Seq.empty
+        (Seq.empty, false)
     }
 
     val (newChild, newOutput, newBuildKeys) =
@@ -532,8 +532,27 @@ class CHSparkPlanExecApi extends SparkPlanExecApi with Logging {
         }
         (newChild, (child.output ++ appendedProjections).map(_.toAttribute), preProjectionBuildKeys)
       }
+
+    // find the key index in the output
+    val keyColumnIndex = if (isNullAware) {
+      // Resolves the position of `key` within `output`: a bound reference already carries its
+      // ordinal, while a named expression is looked up by name and exprId. Fails fast when the
+      // key is absent, because `indexWhere` would otherwise yield -1 and that value would be
+      // used as the key column index.
+      def findKeyOrdinal(key: Expression, output: Seq[Attribute]): Int = {
+        def notFound(): Nothing =
+          throw new GlutenException(s"Cannot find $key in the child's output: $output")
+        key match {
+          case b: BoundReference => b.ordinal
+          case n: NamedExpression =>
+            val ordinal = output.indexWhere(o => o.name.equals(n.name) && o.exprId == n.exprId)
+            if (ordinal < 0) notFound() else ordinal
+          case _ => notFound()
+        }
+      }
+      if (newBuildKeys.isEmpty) {
+        findKeyOrdinal(buildKeys(0), newOutput)
+      } else {
+        findKeyOrdinal(newBuildKeys(0), newOutput)
+      }
+    } else {
+      0
+    }
     val countsAndBytes =
-      CHExecUtil.buildSideRDD(dataSize, newChild).collect
+      CHExecUtil.buildSideRDD(dataSize, newChild, isNullAware, keyColumnIndex).collect
 
     val batches = countsAndBytes.map(_._2)
     val totalBatchesSize = batches.map(_.length).sum
@@ -548,8 +567,15 @@ class CHSparkPlanExecApi extends SparkPlanExecApi with Logging {
           s" written bytes is correct.")
     }
     val rowCount = countsAndBytes.map(_._1).sum
+    val hasNullKeyValues = countsAndBytes.map(_._3).foldLeft[Boolean](false)((b, a) => { b || a })
     numOutputRows += rowCount
-    ClickHouseBuildSideRelation(mode, newOutput, batches.flatten, rowCount, newBuildKeys)
+    ClickHouseBuildSideRelation(
+      mode,
+      newOutput,
+      batches.flatten,
+      rowCount,
+      newBuildKeys,
+      hasNullKeyValues)
   }
 
   /** Define backend specfic expression mappings. */

diff --git a/...s-clickhouse/src/main/scala/org/apache/gluten/extension/CHAQEPropagateEmptyRelation.scala b/...s-clickhouse/src/main/scala/org/apache/gluten/extension/CHAQEPropagateEmptyRelation.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.extension
+
+import org.apache.gluten.utils.PhysicalPlanSelector
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.plans.LeftAnti
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.{ColumnarBroadcastExchangeExec, LocalTableScanExec, SparkPlan}
+import org.apache.spark.sql.execution.adaptive.BroadcastQueryStageExec
+import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
+import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, ClickHouseBuildSideRelation}
+
+/**
+ * Rule for the ClickHouse backend that eliminates a null-aware left anti broadcast hash join
+ * whose build side contains a null key.
+ *
+ * A null-aware anti join yields no rows once the build side holds a null key, so such a join is
+ * replaced by an empty [[LocalTableScanExec]] that keeps the join's output attributes. Every
+ * other plan node, and every join whose build side cannot be inspected, is left unchanged.
+ *
+ * @param session
+ *   the session handed to [[PhysicalPlanSelector]] to decide whether the rule applies
+ */
+case class CHAQEPropagateEmptyRelation(session: SparkSession) extends Rule[SparkPlan] {
+
+  def apply(plan: SparkPlan): SparkPlan = PhysicalPlanSelector.maybe(session, plan) {
+    plan.transform {
+      case bhj @ BroadcastHashJoinExec(_, _, joinType, _, _, _, right, isNullAwareAntiJoin)
+          if (joinType == LeftAnti) && isNullAwareAntiJoin =>
+        right match {
+          case BroadcastQueryStageExec(_, stagePlan: SparkPlan, _) =>
+            // Only a columnar broadcast exchange (direct or reused) carries a relation we can
+            // inspect; any other stage plan must leave the join as is instead of raising a
+            // MatchError.
+            val columnarBroadcast = stagePlan match {
+              case c: ColumnarBroadcastExchangeExec => Some(c)
+              case ReusedExchangeExec(_, c: ColumnarBroadcastExchangeExec) => Some(c)
+              case _ => None
+            }
+            // NOTE(review): `relationFuture.get()` blocks until the broadcast relation is
+            // available - confirm the stage is already materialized when this rule runs.
+            columnarBroadcast.map(_.relationFuture.get().value) match {
+              case Some(c: ClickHouseBuildSideRelation) if c.hasNullKeyValues =>
+                LocalTableScanExec(bhj.output, Seq.empty)
+              case _ => bhj
+            }
+          case _ => bhj
+        }
+    }
+  }
+}
diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/vectorized/BlockStats.java b/backends-clickhouse/src/main/scala/org/apache/gluten/vectorized/BlockStats.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.vectorized;
+
+/**
+ * Immutable pair of statistics describing one block: its record count and whether null key
+ * values were found.
+ *
+ * <p>NOTE(review): instances are returned by a native method ({@code
+ * CHNativeBlock.nativeBlockStats}), so the native side presumably depends on this constructor
+ * signature - confirm before changing it.
+ */
+public class BlockStats {
+  // Number of records in the block.
+  private final long blockRecordCount;
+  // Whether null key values were found in the block.
+  private final boolean hasNullKeyValues;
+
+  /**
+   * @param blockRecordCount number of records in the block
+   * @param hasNullKeyValues whether null key values were found in the block
+   */
+  public BlockStats(long blockRecordCount, boolean hasNullKeyValues) {
+    this.blockRecordCount = blockRecordCount;
+    this.hasNullKeyValues = hasNullKeyValues;
+  }
+
+  /** Returns the record count supplied at construction. */
+  public long getBlockRecordCount() {
+    return blockRecordCount;
+  }
+
+  /** Returns the null-key flag supplied at construction. */
+  public boolean isHasNullKeyValues() {
+    return hasNullKeyValues;
+  }
+}
diff --git a/...use/src/main/scala/org/apache/spark/sql/execution/joins/ClickHouseBuildSideRelation.scala b/...use/src/main/scala/org/apache/spark/sql/execution/joins/ClickHouseBuildSideRelation.scala
@@ -35,7 +35,8 @@ case class ClickHouseBuildSideRelation(
     output: Seq[Attribute],
     batches: Array[Byte],
     numOfRows: Long,
-    newBuildKeys: Seq[Expression] = Seq.empty)
+    newBuildKeys: Seq[Expression] = Seq.empty,
+    hasNullKeyValues: Boolean = false)
   extends BuildSideRelation
   with Logging {
 
@@ -58,7 +59,8 @@ case class ClickHouseBuildSideRelation(
           numOfRows,
           broadCastContext,
           newBuildKeys.asJava,
-          output.asJava)
+          output.asJava,
+          hasNullKeyValues)
         (hashTableData, this)
       } else {
         (StorageJoinBuilder.nativeCloneBuildHashTable(hashTableData), null)

diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/CHExecUtil.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/CHExecUtil.scala
@@ -58,31 +58,49 @@ object CHExecUtil extends Logging {
   def toBytes(
       dataSize: SQLMetric,
       iter: Iterator[ColumnarBatch],
+      isNullAware: Boolean = false,
+      keyColumnIndex: Int = 0,
       compressionCodec: Option[String] = Some("lz4"),
       compressionLevel: Option[Int] = None,
-      bufferSize: Int = 4 << 10): Iterator[(Int, Array[Byte])] = {
-    var count = 0
+      bufferSize: Int = 4 << 10): Iterator[(Long, Array[Byte], Boolean)] = {
+    var count = 0L
+    var hasNullKeyValues = false
     val bos = new ByteArrayOutputStream()
     val buffer = new Array[Byte](bufferSize) // 4K
     val level = compressionLevel.getOrElse(Int.MinValue)
     val blockOutputStream =
       compressionCodec
         .map(new BlockOutputStream(bos, buffer, dataSize, true, _, level, bufferSize))
         .getOrElse(new BlockOutputStream(bos, buffer, dataSize, false, "", level, bufferSize))
-    while (iter.hasNext) {
-      val batch = iter.next()
-      count += batch.numRows
-      blockOutputStream.write(batch)
+    if (isNullAware) {
+      while (iter.hasNext) {
+        val batch = iter.next()
+        val blockStats = CHNativeBlock.fromColumnarBatch(batch).getBlockStats(keyColumnIndex)
+        count += blockStats.getBlockRecordCount
+        hasNullKeyValues = hasNullKeyValues || blockStats.isHasNullKeyValues
+        blockOutputStream.write(batch)
+      }
+    } else {
+      while (iter.hasNext) {
+        val batch = iter.next()
+        count += batch.numRows()
+        blockOutputStream.write(batch)
+      }
     }
     blockOutputStream.flush()
     blockOutputStream.close()
-    Iterator((count, bos.toByteArray))
+    Iterator((count, bos.toByteArray, hasNullKeyValues))
   }
 
-  def buildSideRDD(dataSize: SQLMetric, newChild: SparkPlan): RDD[(Int, Array[Byte])] = {
+  def buildSideRDD(
+      dataSize: SQLMetric,
+      newChild: SparkPlan,
+      isNullAware: Boolean,
+      keyColumnIndex: Int
+  ): RDD[(Long, Array[Byte], Boolean)] = {
     newChild
       .executeColumnar()
-      .mapPartitionsInternal(iter => toBytes(dataSize, iter))
+      .mapPartitionsInternal(iter => toBytes(dataSize, iter, isNullAware, keyColumnIndex))
   }
 
   private def buildRangePartitionSampleRDD(

diff --git a/...est/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala b/...est/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
@@ -72,7 +72,7 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr
               salted_df = Some((salted_df match {
                 case Some(x) => x
                 case None => df
-              }).withColumn(c.name, when(rand() < 0.01, null).otherwise(col(c.name))))
+              }).withColumn(c.name, when(rand() < 0.5, null).otherwise(col(c.name))))
             }
 
             val currentSaltedTablePath = saltedTablesPath + "/" + tableName
@@ -226,6 +226,8 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr
           case scanExec: BasicScanExecTransformer => scanExec
         }
         assert(scanExec.size == 8)
     }
   }
 
@@ -2935,6 +2937,55 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr
       df => {
         checkBHJWithIsNullAwareAntiJoin(df)
       })
+
+    withSQLConf(("spark.sql.adaptive.enabled", "true")) {
+      def checkAQEBHJWithIsNullAwareAntiJoin(df: DataFrame, isNullAwareBhjCnt: Int = 1): Unit = {
+        val bhjs = collect(df.queryExecution.executedPlan) {
+          case bhj: CHBroadcastHashJoinExecTransformer if bhj.isNullAwareAntiJoin => true
+        }
+        assert(bhjs.size == isNullAwareBhjCnt)
+      }
+
+      val sql6 =
+        s"""
+           |select * from partsupp
+           |where
+           |ps_suppkey NOT IN (SELECT suppkey FROM VALUES (null), (6) sub(suppkey))
+           |""".stripMargin
+      compareResultsAgainstVanillaSpark(
+        sql6,
+        true,
+        df => {
+          checkAQEBHJWithIsNullAwareAntiJoin(df, 0)
+        })
+
+      val sql7 =
+        s"""
+           |select * from partsupp
+           |where
+           |cast(ps_suppkey AS INT) NOT IN (SELECT suppkey FROM VALUES (null), (6) sub(suppkey))
+           |""".stripMargin
+      compareResultsAgainstVanillaSpark(
+        sql7,
+        true,
+        df => {
+          checkAQEBHJWithIsNullAwareAntiJoin(df, 0)
+        })
+
+      val sql8 =
+        s"""
+           |select * from partsupp
+           |where
+           |ps_suppkey NOT IN (SELECT suppkey FROM VALUES (5), (6) sub(suppkey))
+           |""".stripMargin
+      compareResultsAgainstVanillaSpark(
+        sql8,
+        true,
+        df => {
+          checkAQEBHJWithIsNullAwareAntiJoin(df)
+        })
+    }
+
   }
 
   test("soundex") {

diff --git a/...house/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHHashBuildBenchmark.scala b/...house/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHHashBuildBenchmark.scala
@@ -58,7 +58,7 @@ object CHHashBuildBenchmark extends SqlBasedBenchmark with CHSqlBasedBenchmark w
                                  |select $scanSchema from parquet.`$parquetDir`
                                  |
                                  |""".stripMargin)
-    val rowCount: Int = chParquet.count().toInt
+    val rowCount = chParquet.count()
 
     val runs = Seq(1, 2, 4, 8, 16, 32, 64).reverse
       .map(num => rowCount / num)
@@ -79,13 +79,14 @@ object CHHashBuildBenchmark extends SqlBasedBenchmark with CHSqlBasedBenchmark w
             s"build hash table with $num rows with $iteration hash tables",
             executedCnt) {
             _ =>
-              for (i <- 0 until iteration) {
+              for (i <- 0L until iteration) {
                 val table = StorageJoinBuilder.build(
                   bytes,
                   num,
                   relation,
                   new util.ArrayList[Expression](),
-                  new util.ArrayList[Attribute]())
+                  new util.ArrayList[Attribute](),
+                  false)
                 StorageJoinBuilder.nativeCleanBuildHashTable("", table)
               }
           }
@@ -94,7 +95,7 @@ object CHHashBuildBenchmark extends SqlBasedBenchmark with CHSqlBasedBenchmark w
   }
 
   private def createBroadcastRelation(
-      child: SparkPlan): (Array[Byte], Int, BroadCastHashJoinContext) = {
+      child: SparkPlan): (Array[Byte], Long, BroadCastHashJoinContext) = {
     val dataSize = SQLMetrics.createSizeMetric(spark.sparkContext, "size of files read")
 
     val countsAndBytes = child