From 541cb549e54764df5f18d06bfe088e6c01c898cc Mon Sep 17 00:00:00 2001 From: "shuai.xu" Date: Fri, 23 Feb 2024 18:14:46 +0800 Subject: [PATCH] [GLUTEN-4452] [CH] fix may get wrong hash table when multi joins in a task (#4453) What changes were proposed in this pull request? This PR fixes a bug where a join may get the wrong hash table when there are multiple joins in a task (Fixes: #4452) How was this patch tested? This patch was tested by manual tests. --- .../joins/ClickHouseBuildSideRelation.scala | 1 + ...kHouseTPCDSParquetGraceHashJoinSuite.scala | 27 +++++++++++++++++++ .../Join/StorageJoinFromReadBuffer.cpp | 3 --- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/joins/ClickHouseBuildSideRelation.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/joins/ClickHouseBuildSideRelation.scala index b3114e9fd37b..affa564f3ba7 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/joins/ClickHouseBuildSideRelation.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/joins/ClickHouseBuildSideRelation.scala @@ -44,6 +44,7 @@ case class ClickHouseBuildSideRelation( override def asReadOnlyCopy(): ClickHouseBuildSideRelation = this private var hashTableData: Long = 0L + def buildHashTable( broadCastContext: BroadCastHashJoinContext): (Long, ClickHouseBuildSideRelation) = synchronized { diff --git a/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseTPCDSParquetGraceHashJoinSuite.scala b/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseTPCDSParquetGraceHashJoinSuite.scala index a5a1b7bef395..f9c148111efb 100644 --- a/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseTPCDSParquetGraceHashJoinSuite.scala +++ b/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseTPCDSParquetGraceHashJoinSuite.scala @@ -181,4 +181,31 @@ class 
GlutenClickHouseTPCDSParquetGraceHashJoinSuite extends GlutenClickHouseTPC } } + test("Gluten-4452: Fix get wrong hash table when multi joins in a task") { + val testSql = + """ + | SELECT ws_item_sk, ws_sold_date_sk, ws_ship_date_sk, + | t3.d_date_id as sold_date_id, t2.d_date_id as ship_date_id + | FROM ( + | SELECT ws_item_sk, ws_sold_date_sk, ws_ship_date_sk, t1.d_date_id + | FROM web_sales + | LEFT JOIN + | (SELECT d_date_id, d_date_sk from date_dim GROUP BY d_date_id, d_date_sk) t1 + | ON ws_sold_date_sk == t1.d_date_sk) t3 + | INNER JOIN + | (SELECT d_date_id, d_date_sk from date_dim GROUP BY d_date_id, d_date_sk) t2 + | ON ws_ship_date_sk == t2.d_date_sk + | LIMIT 100; + |""".stripMargin + compareResultsAgainstVanillaSpark( + testSql, + true, + df => { + val foundBroadcastHashJoinExpr = df.queryExecution.executedPlan.collect { + case f: CHBroadcastHashJoinExecTransformer => f + } + assert(foundBroadcastHashJoinExpr.size == 2) + } + ) + } } diff --git a/cpp-ch/local-engine/Join/StorageJoinFromReadBuffer.cpp b/cpp-ch/local-engine/Join/StorageJoinFromReadBuffer.cpp index 7852401f2eff..6d0021adbf40 100644 --- a/cpp-ch/local-engine/Join/StorageJoinFromReadBuffer.cpp +++ b/cpp-ch/local-engine/Join/StorageJoinFromReadBuffer.cpp @@ -90,9 +90,6 @@ StorageJoinFromReadBuffer::StorageJoinFromReadBuffer( DB::JoinPtr StorageJoinFromReadBuffer::getJoinLocked(std::shared_ptr analyzed_join, DB::ContextPtr /*context*/) const { - if (!analyzed_join->sameStrictnessAndKind(join_->getTableJoin().strictness(), join_->getTableJoin().kind())) - throw Exception(ErrorCodes::INCOMPATIBLE_TYPE_OF_JOIN, "Table {} has incompatible type of JOIN.", storage_metadata_.comment); - if ((analyzed_join->forceNullableRight() && !use_nulls_) || (!analyzed_join->forceNullableRight() && isLeftOrFull(analyzed_join->kind()) && use_nulls_)) throw Exception(