From 31e1b7449fba2ab6938d92f5cf013b2e82df5446 Mon Sep 17 00:00:00 2001 From: Kyligence Git Date: Sat, 30 Nov 2024 07:34:46 -0600 Subject: [PATCH] [GLUTEN-1632][CH]Daily Update Clickhouse Version (20241130) (#8112) * [GLUTEN-1632][CH]Daily Update Clickhouse Version (20241130) * Fix Build due to https://github.com/ClickHouse/ClickHouse/pull/71406 * Fix build due to https://github.com/ClickHouse/ClickHouse/pull/72460 --------- Co-authored-by: kyligence-git Co-authored-by: Chang Chen --- cpp-ch/clickhouse.version | 4 +- cpp-ch/local-engine/Common/AggregateUtil.cpp | 39 ++++++++++--------- cpp-ch/local-engine/Common/AggregateUtil.h | 2 +- .../Parser/RelParsers/CrossRelParser.cpp | 11 +++++- .../Parser/RelParsers/JoinRelParser.cpp | 18 +++++++-- cpp-ch/local-engine/tests/gtest_ch_join.cpp | 6 ++- 6 files changed, 52 insertions(+), 28 deletions(-) diff --git a/cpp-ch/clickhouse.version b/cpp-ch/clickhouse.version index edb13fdc5715..565220d7867d 100644 --- a/cpp-ch/clickhouse.version +++ b/cpp-ch/clickhouse.version @@ -1,3 +1,3 @@ CH_ORG=Kyligence -CH_BRANCH=rebase_ch/20241129 -CH_COMMIT=101ba3f944d1 +CH_BRANCH=rebase_ch/20241130 +CH_COMMIT=d5d38588bd3 diff --git a/cpp-ch/local-engine/Common/AggregateUtil.cpp b/cpp-ch/local-engine/Common/AggregateUtil.cpp index 0707d18aa01b..4a768eb17b6c 100644 --- a/cpp-ch/local-engine/Common/AggregateUtil.cpp +++ b/cpp-ch/local-engine/Common/AggregateUtil.cpp @@ -48,6 +48,9 @@ extern const SettingsUInt64 aggregation_in_order_max_block_bytes; extern const SettingsUInt64 group_by_two_level_threshold; extern const SettingsFloat min_hit_rate_to_use_consecutive_keys_optimization; extern const SettingsUInt64 max_block_size; +extern const SettingsBool compile_aggregate_expressions; +extern const SettingsUInt64 min_count_to_compile_aggregate_expression; +extern const SettingsBool enable_software_prefetch_in_aggregation; } template @@ -186,7 +189,7 @@ DB::Block AggregateDataBlockConverter::next() } DB::Aggregator::Params AggregatorParamsHelper::buildParams( - DB::ContextPtr context, + const DB::ContextPtr & context, const DB::Names & grouping_keys, const DB::AggregateDescriptions & agg_descriptions, Mode mode, @@ -194,7 +197,7 @@ DB::Aggregator::Params AggregatorParamsHelper::buildParams( { const auto & settings = context->getSettingsRef(); size_t max_rows_to_group_by = mode == Mode::PARTIAL_TO_FINISHED ? 0 : static_cast(settings[DB::Setting::max_rows_to_group_by]); - DB::OverflowMode group_by_overflow_mode = settings[DB::Setting::group_by_overflow_mode]; + size_t group_by_two_level_threshold = algorithm == Algorithm::GlutenGraceAggregate ? static_cast(settings[DB::Setting::group_by_two_level_threshold]) : 0; size_t group_by_two_level_threshold_bytes = algorithm == Algorithm::GlutenGraceAggregate @@ -207,39 +210,39 @@ DB::Aggregator::Params AggregatorParamsHelper::buildParams( ? false : (mode == Mode::PARTIAL_TO_FINISHED ? false : static_cast(settings[DB::Setting::empty_result_for_aggregation_by_empty_set])); DB::TemporaryDataOnDiskScopePtr tmp_data_scope = algorithm == Algorithm::GlutenGraceAggregate ? nullptr : context->getTempDataOnDisk(); - size_t max_threads = settings[DB::Setting::max_threads]; + size_t min_free_disk_space = algorithm == Algorithm::GlutenGraceAggregate ? 0 : static_cast(settings[DB::Setting::min_free_disk_space_for_temporary_data]); - bool compile_aggregate_expressions = mode == Mode::PARTIAL_TO_FINISHED ? false : true; - size_t min_count_to_compile_aggregate_expression = mode == Mode::PARTIAL_TO_FINISHED ? 0 : 3; + bool compile_aggregate_expressions = mode == Mode::PARTIAL_TO_FINISHED ? false : settings[DB::Setting::compile_aggregate_expressions]; + size_t min_count_to_compile_aggregate_expression = mode == Mode::PARTIAL_TO_FINISHED ? 0 : settings[DB::Setting::min_count_to_compile_aggregate_expression]; size_t max_block_size = PODArrayUtil::adjustMemoryEfficientSize(settings[DB::Setting::max_block_size]); - bool enable_prefetch = mode == Mode::PARTIAL_TO_FINISHED ? false : true; + bool enable_prefetch = mode != Mode::PARTIAL_TO_FINISHED; bool only_merge = mode == Mode::PARTIAL_TO_FINISHED; bool optimize_group_by_constant_keys = mode == Mode::PARTIAL_TO_FINISHED ? false : settings[DB::Setting::optimize_group_by_constant_keys]; - double min_hit_rate_to_use_consecutive_keys_optimization = settings[DB::Setting::min_hit_rate_to_use_consecutive_keys_optimization]; + + DB::Settings aggregate_settings{settings}; + aggregate_settings[DB::Setting::max_rows_to_group_by] = max_rows_to_group_by; + aggregate_settings[DB::Setting::max_bytes_before_external_group_by] = max_bytes_before_external_group_by; + aggregate_settings[DB::Setting::min_free_disk_space_for_temporary_data] = min_free_disk_space; + aggregate_settings[DB::Setting::compile_aggregate_expressions] = compile_aggregate_expressions; + aggregate_settings[DB::Setting::min_count_to_compile_aggregate_expression] = min_count_to_compile_aggregate_expression; + aggregate_settings[DB::Setting::max_block_size] = max_block_size; + aggregate_settings[DB::Setting::enable_software_prefetch_in_aggregation] = enable_prefetch; + aggregate_settings[DB::Setting::optimize_group_by_constant_keys] = optimize_group_by_constant_keys; DB::Aggregator::Params params( + aggregate_settings, grouping_keys, agg_descriptions, false, - max_rows_to_group_by, - group_by_overflow_mode, group_by_two_level_threshold, group_by_two_level_threshold_bytes, - max_bytes_before_external_group_by, empty_result_for_aggregation_by_empty_set, tmp_data_scope, - max_threads, - min_free_disk_space, - compile_aggregate_expressions, - min_count_to_compile_aggregate_expression, - max_block_size, - enable_prefetch, only_merge, - optimize_group_by_constant_keys, - min_hit_rate_to_use_consecutive_keys_optimization, {}); + return params; } diff --git a/cpp-ch/local-engine/Common/AggregateUtil.h b/cpp-ch/local-engine/Common/AggregateUtil.h index 380e1ea35539..8fd36987ac8c 100644 --- a/cpp-ch/local-engine/Common/AggregateUtil.h +++ b/cpp-ch/local-engine/Common/AggregateUtil.h @@ -71,7 +71,7 @@ class AggregatorParamsHelper // for using grace aggregating, never enable ch spill, otherwise there will be data lost. static DB::Aggregator::Params buildParams( - DB::ContextPtr context, + const DB::ContextPtr & context, const DB::Names & grouping_keys, const DB::AggregateDescriptions & agg_descriptions, Mode mode, diff --git a/cpp-ch/local-engine/Parser/RelParsers/CrossRelParser.cpp b/cpp-ch/local-engine/Parser/RelParsers/CrossRelParser.cpp index 5a6f229744fc..ae0a50d2d3f2 100644 --- a/cpp-ch/local-engine/Parser/RelParsers/CrossRelParser.cpp +++ b/cpp-ch/local-engine/Parser/RelParsers/CrossRelParser.cpp @@ -207,7 +207,9 @@ DB::QueryPlanPtr CrossRelParser::parseJoin(const substrait::CrossRel & join, DB: context->getSettingsRef()[Setting::max_block_size], context->getSettingsRef()[Setting::min_joined_block_size_bytes], 1, - false); + /* required_output_ = */ NameSet{}, + false, + /* use_new_analyzer_ = */ false); join_step->setStepDescription("CROSS_JOIN"); steps.emplace_back(join_step.get()); std::vector plans; @@ -254,7 +256,12 @@ void CrossRelParser::addConvertStep(TableJoin & table_join, DB::QueryPlan & left NameSet left_columns_set; for (const auto & col : left.getCurrentHeader().getNames()) left_columns_set.emplace(col); - table_join.setColumnsFromJoinedTable(right.getCurrentHeader().getNamesAndTypesList(), left_columns_set, getUniqueName("right") + "."); + + table_join.setColumnsFromJoinedTable( + right.getCurrentHeader().getNamesAndTypesList(), + left_columns_set, + getUniqueName("right") + ".", + left.getCurrentHeader().getNamesAndTypesList()); // fix right table key duplicate NamesWithAliases right_table_alias; diff --git a/cpp-ch/local-engine/Parser/RelParsers/JoinRelParser.cpp b/cpp-ch/local-engine/Parser/RelParsers/JoinRelParser.cpp index 7493471697f1..6a5f9bc9378e 100644 --- a/cpp-ch/local-engine/Parser/RelParsers/JoinRelParser.cpp +++ b/cpp-ch/local-engine/Parser/RelParsers/JoinRelParser.cpp @@ -322,7 +322,9 @@ DB::QueryPlanPtr JoinRelParser::parseJoin(const substrait::JoinRel & join, DB::Q context->getSettingsRef()[Setting::max_block_size], context->getSettingsRef()[Setting::min_joined_block_size_bytes], 1, - false); + /* required_output_ = */ NameSet{}, + false, + /* use_new_analyzer_ = */ false); join_step->setStepDescription("SORT_MERGE_JOIN"); steps.emplace_back(join_step.get()); @@ -390,7 +392,11 @@ void JoinRelParser::addConvertStep(TableJoin & table_join, DB::QueryPlan & left, NameSet left_columns_set; for (const auto & col : left.getCurrentHeader().getNames()) left_columns_set.emplace(col); - table_join.setColumnsFromJoinedTable(right.getCurrentHeader().getNamesAndTypesList(), left_columns_set, getUniqueName("right") + "."); + table_join.setColumnsFromJoinedTable( + right.getCurrentHeader().getNamesAndTypesList(), + left_columns_set, + getUniqueName("right") + ".", + left.getCurrentHeader().getNamesAndTypesList()); // fix right table key duplicate NamesWithAliases right_table_alias; @@ -787,7 +793,9 @@ DB::QueryPlanPtr JoinRelParser::buildMultiOnClauseHashJoin( context->getSettingsRef()[Setting::max_block_size], context->getSettingsRef()[Setting::min_joined_block_size_bytes], 1, - false); + /* required_output_ = */ NameSet{}, + false, + /* use_new_analyzer_ = */ false); join_step->setStepDescription("Multi join on clause hash join"); steps.emplace_back(join_step.get()); std::vector plans; @@ -827,7 +835,9 @@ DB::QueryPlanPtr JoinRelParser::buildSingleOnClauseHashJoin( context->getSettingsRef()[Setting::max_block_size], context->getSettingsRef()[Setting::min_joined_block_size_bytes], 1, - false); + /* required_output_ = */ NameSet{}, + false, + /* use_new_analyzer_ = */ false); join_step->setStepDescription("HASH_JOIN"); steps.emplace_back(join_step.get()); diff --git a/cpp-ch/local-engine/tests/gtest_ch_join.cpp b/cpp-ch/local-engine/tests/gtest_ch_join.cpp index 02d43124745e..5df5eaff8c43 100644 --- a/cpp-ch/local-engine/tests/gtest_ch_join.cpp +++ b/cpp-ch/local-engine/tests/gtest_ch_join.cpp @@ -97,6 +97,10 @@ TEST(TestJoin, simple) for (const auto & column : join->columnsFromJoinedTable()) join->addJoinedColumn(column); + auto columns_from_left_table = left_plan.getCurrentHeader().getNamesAndTypesList(); + for (auto & column_from_joined_table : columns_from_left_table) + join->setUsedColumn(column_from_joined_table, JoinTableSide::Left); + auto left_keys = left.getNamesAndTypesList(); join->addJoinedColumnsAndCorrectTypes(left_keys, true); std::cerr << "after join:\n"; @@ -123,7 +127,7 @@ TEST(TestJoin, simple) auto hash_join = std::make_shared(join, right_plan.getCurrentHeader()); QueryPlanStepPtr join_step - = std::make_unique(left_plan.getCurrentHeader(), right_plan.getCurrentHeader(), hash_join, 8192, 8192, 1, false); + = std::make_unique(left_plan.getCurrentHeader(), right_plan.getCurrentHeader(), hash_join, 8192, 8192, 1, NameSet{}, false, false); std::cerr << "join step:" << join_step->getOutputHeader().dumpStructure() << std::endl;