From 8901bb168f335028db7621a4f868e6d1eb78c760 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 2 Oct 2023 10:17:59 -0400 Subject: [PATCH 1/7] Improve documentation for bounded_order_preserving_variants config --- datafusion/common/src/config.rs | 10 ++++++---- .../enforce_distribution.rs | 10 +++++----- .../replace_with_order_preserving_variants.rs | 4 ++-- datafusion/execution/src/config.rs | 20 +++++++++++-------- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 261c2bf435a4..fe6282000c43 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -453,11 +453,13 @@ config_namespace! { /// ``` pub repartition_sorts: bool, default = true - /// When true, DataFusion will opportunistically remove sorts by replacing - /// `RepartitionExec` with `SortPreservingRepartitionExec`, and + /// When true, DataFusion will opportunistically remove sorts when the data is already sorted, + /// replacing `RepartitionExec` with `SortPreservingRepartitionExec`, and /// `CoalescePartitionsExec` with `SortPreservingMergeExec`, - /// even when the query is bounded. - pub bounded_order_preserving_variants: bool, default = false + /// + /// When false, DataFusion will prefer to maximize the parallelism using + /// `Repartition/Coalesce` and resort the data subsequently with `SortExec` + pub prefer_exising_sort: bool, default = false /// When set to true, the logical plan optimizer will produce warning /// messages if any optimization rules produce errors and then proceed to the next diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index b3fb41ea100f..3feb498052a5 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -1228,7 +1228,7 @@ fn ensure_distribution( // - it is desired according to config // - when plan is unbounded let order_preserving_variants_desirable = - is_unbounded || config.optimizer.bounded_order_preserving_variants; + is_unbounded || config.optimizer.prefer_exising_sort; if dist_context.plan.children().is_empty() { return Ok(Transformed::No(dist_context)); @@ -2085,7 +2085,7 @@ mod tests { config.optimizer.enable_round_robin_repartition = false; config.optimizer.repartition_file_scans = false; config.optimizer.repartition_file_min_size = 1024; - config.optimizer.bounded_order_preserving_variants = + config.optimizer.prefer_exising_sort = bounded_order_preserving_variants; ensure_distribution(distribution_context, &config).map(|item| item.into().plan) } @@ -2124,7 +2124,7 @@ mod tests { config.execution.target_partitions = $TARGET_PARTITIONS; config.optimizer.repartition_file_scans = $REPARTITION_FILE_SCANS; config.optimizer.repartition_file_min_size = $REPARTITION_FILE_MIN_SIZE; - config.optimizer.bounded_order_preserving_variants = $BOUNDED_ORDER_PRESERVING_VARIANTS; + config.optimizer.prefer_exising_sort = $BOUNDED_ORDER_PRESERVING_VARIANTS; // NOTE: These tests verify the joint `EnforceDistribution` + `EnforceSorting` cascade // because they were written prior to the separation of `BasicEnforcement` into @@ -4516,7 +4516,7 @@ mod tests { let mut config = ConfigOptions::new(); config.execution.target_partitions = 10; config.optimizer.enable_round_robin_repartition = true; - config.optimizer.bounded_order_preserving_variants = false; + config.optimizer.prefer_exising_sort = false; let distribution_plan = EnforceDistribution::new().optimize(physical_plan, &config)?; assert_plan_txt!(expected, distribution_plan); @@ -4558,7 +4558,7 @@ mod tests { let mut config = ConfigOptions::new(); config.execution.target_partitions = 10; config.optimizer.enable_round_robin_repartition = true; - config.optimizer.bounded_order_preserving_variants = false; + config.optimizer.prefer_exising_sort = false; let distribution_plan = EnforceDistribution::new().optimize(physical_plan, &config)?; assert_plan_txt!(expected, distribution_plan); diff --git a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs index b0ae199a2da4..0f1eedd8c454 100644 --- a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs +++ b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs @@ -251,7 +251,7 @@ pub(crate) fn replace_with_order_preserving_variants( // any case, as doing so helps fix the pipeline. // Also do the replacement if opted-in via config options. let use_order_preserving_variant = - config.optimizer.bounded_order_preserving_variants || unbounded_output(plan); + config.optimizer.prefer_exising_sort || unbounded_output(plan); let updated_sort_input = get_updated_plan( exec_tree, is_spr_better || use_order_preserving_variant, @@ -336,7 +336,7 @@ mod tests { // Run the rule top-down // let optimized_physical_plan = physical_plan.transform_down(&replace_repartition_execs)?; - let config = SessionConfig::new().with_bounded_order_preserving_variants($ALLOW_BOUNDED); + let config = SessionConfig::new().with_prefer_existing_sort($ALLOW_BOUNDED); let plan_with_pipeline_fixer = OrderPreservationContext::new(physical_plan); let parallel = plan_with_pipeline_fixer.transform_up(&|plan_with_pipeline_fixer| replace_with_order_preserving_variants(plan_with_pipeline_fixer, false, false, config.options()))?; let optimized_physical_plan = parallel.plan; diff --git a/datafusion/execution/src/config.rs b/datafusion/execution/src/config.rs index 44fcc2ab49b4..68095724d0f2 100644 --- a/datafusion/execution/src/config.rs +++ b/datafusion/execution/src/config.rs @@ -145,10 +145,12 @@ impl SessionConfig { self.options.optimizer.repartition_sorts } - /// Remove sorts by replacing with order-preserving variants of operators, - /// even when query is bounded? - pub fn bounded_order_preserving_variants(&self) -> bool { - self.options.optimizer.bounded_order_preserving_variants + /// Prefer existing sort (true) or maximize parallelism (false). See + /// [prefer_existing_sort] for more details + /// + /// [prefer_existing_sort]: datafusion_common::config::OptimizerOptions::prefer_existing_sort + pub fn prefer_existing_sort(&self) -> bool { + self.options.optimizer.prefer_exising_sort } /// Are statistics collected during execution? @@ -221,10 +223,12 @@ impl SessionConfig { self } - /// Enables or disables the use of order-preserving variants of `CoalescePartitions` - /// and `RepartitionExec` operators, even when the query is bounded - pub fn with_bounded_order_preserving_variants(mut self, enabled: bool) -> Self { - self.options.optimizer.bounded_order_preserving_variants = enabled; + /// Prefer existing sort (true) or maximize parallelism (false). See + /// [prefer_existing_sort] for more details + /// + /// [prefer_existing_sort]: datafusion_common::config::OptimizerOptions::prefer_existing_sort + pub fn with_prefer_existing_sort(mut self, enabled: bool) -> Self { + self.options.optimizer.prefer_exising_sort = enabled; self } From 33e51083000ca1dfbe5a79a6387733991d8a5cf2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 2 Oct 2023 10:28:09 -0400 Subject: [PATCH 2/7] update docs --- docs/source/user-guide/configs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 638ac5a36b83..d52caab363b2 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -87,7 +87,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.optimizer.repartition_file_scans | true | When set to `true`, file groups will be repartitioned to achieve maximum parallelism. Currently Parquet and CSV formats are supported. If set to `true`, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false`, different files will be read in parallel, but repartitioning won't happen within a single file. | | datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | | datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` | -| datafusion.optimizer.bounded_order_preserving_variants | false | When true, DataFusion will opportunistically remove sorts by replacing `RepartitionExec` with `SortPreservingRepartitionExec`, and `CoalescePartitionsExec` with `SortPreservingMergeExec`, even when the query is bounded. | +| datafusion.optimizer.prefer_exising_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, replacing `RepartitionExec` with `SortPreservingRepartitionExec`, and `CoalescePartitionsExec` with `SortPreservingMergeExec`, When false, DataFusion will prefer to maximize the parallelism using `Repartition/Coalesce` and resort the data subsequently with `SortExec` | | datafusion.optimizer.skip_failed_rules | false | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail | | datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | | datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | From 00b96bbf9ace1180e5adbfa9ce2b86b386de9397 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 2 Oct 2023 10:28:21 -0400 Subject: [PATCH 3/7] fmt --- datafusion/core/src/physical_optimizer/enforce_distribution.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index 3feb498052a5..8628bd03c3e4 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -2085,8 +2085,7 @@ mod tests { config.optimizer.enable_round_robin_repartition = false; config.optimizer.repartition_file_scans = false; config.optimizer.repartition_file_min_size = 1024; - config.optimizer.prefer_exising_sort = - bounded_order_preserving_variants; + config.optimizer.prefer_exising_sort = bounded_order_preserving_variants; ensure_distribution(distribution_context, &config).map(|item| item.into().plan) } From 177c529ee662fec7009575f80773da7851e1afaa Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 2 Oct 2023 11:05:16 -0400 Subject: [PATCH 4/7] update config --- datafusion/sqllogictest/test_files/information_schema.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 12aa9089a0c9..518777ca950c 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -183,12 +183,12 @@ datafusion.explain.logical_plan_only false datafusion.explain.physical_plan_only false datafusion.explain.show_statistics false datafusion.optimizer.allow_symmetric_joins_without_pruning true -datafusion.optimizer.bounded_order_preserving_variants false datafusion.optimizer.enable_round_robin_repartition true datafusion.optimizer.enable_topk_aggregation true datafusion.optimizer.filter_null_join_keys false datafusion.optimizer.hash_join_single_partition_threshold 1048576 datafusion.optimizer.max_passes 3 +datafusion.optimizer.prefer_exising_sort false datafusion.optimizer.prefer_hash_join true datafusion.optimizer.repartition_aggregations true datafusion.optimizer.repartition_file_min_size 10485760 From d99d88486d1d03399a27fbb584f2d7e4df8bcb89 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 2 Oct 2023 11:19:33 -0400 Subject: [PATCH 5/7] fix typo :facepalm --- datafusion/common/src/config.rs | 2 +- .../src/physical_optimizer/enforce_distribution.rs | 10 +++++----- .../replace_with_order_preserving_variants.rs | 2 +- datafusion/execution/src/config.rs | 4 ++-- .../sqllogictest/test_files/information_schema.slt | 2 +- docs/source/user-guide/configs.md | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index fe6282000c43..88aad08f7d8b 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -459,7 +459,7 @@ config_namespace! { /// /// When false, DataFusion will prefer to maximize the parallelism using /// `Repartition/Coalesce` and resort the data subsequently with `SortExec` - pub prefer_exising_sort: bool, default = false + pub prefer_existing_sort: bool, default = false /// When set to true, the logical plan optimizer will produce warning /// messages if any optimization rules produce errors and then proceed to the next diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index 8628bd03c3e4..3463f3a31376 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -1228,7 +1228,7 @@ fn ensure_distribution( // - it is desired according to config // - when plan is unbounded let order_preserving_variants_desirable = - is_unbounded || config.optimizer.prefer_exising_sort; + is_unbounded || config.optimizer.prefer_existing_sort; if dist_context.plan.children().is_empty() { return Ok(Transformed::No(dist_context)); @@ -2085,7 +2085,7 @@ mod tests { config.optimizer.enable_round_robin_repartition = false; config.optimizer.repartition_file_scans = false; config.optimizer.repartition_file_min_size = 1024; - config.optimizer.prefer_exising_sort = bounded_order_preserving_variants; + config.optimizer.prefer_existing_sort = bounded_order_preserving_variants; ensure_distribution(distribution_context, &config).map(|item| item.into().plan) } @@ -2123,7 +2123,7 @@ mod tests { config.execution.target_partitions = $TARGET_PARTITIONS; config.optimizer.repartition_file_scans = $REPARTITION_FILE_SCANS; config.optimizer.repartition_file_min_size = $REPARTITION_FILE_MIN_SIZE; - config.optimizer.prefer_exising_sort = $BOUNDED_ORDER_PRESERVING_VARIANTS; + config.optimizer.prefer_existing_sort = $BOUNDED_ORDER_PRESERVING_VARIANTS; // NOTE: These tests verify the joint `EnforceDistribution` + `EnforceSorting` cascade // because they were written prior to the separation of `BasicEnforcement` into @@ -4515,7 +4515,7 @@ mod tests { let mut config = ConfigOptions::new(); config.execution.target_partitions = 10; config.optimizer.enable_round_robin_repartition = true; - config.optimizer.prefer_exising_sort = false; + config.optimizer.prefer_existing_sort = false; let distribution_plan = EnforceDistribution::new().optimize(physical_plan, &config)?; assert_plan_txt!(expected, distribution_plan); @@ -4557,7 +4557,7 @@ mod tests { let mut config = ConfigOptions::new(); config.execution.target_partitions = 10; config.optimizer.enable_round_robin_repartition = true; - config.optimizer.prefer_exising_sort = false; + config.optimizer.prefer_existing_sort = false; let distribution_plan = EnforceDistribution::new().optimize(physical_plan, &config)?; assert_plan_txt!(expected, distribution_plan); diff --git a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs index 0f1eedd8c454..cb3b6c3d0741 100644 --- a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs +++ b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs @@ -251,7 +251,7 @@ pub(crate) fn replace_with_order_preserving_variants( // any case, as doing so helps fix the pipeline. // Also do the replacement if opted-in via config options. let use_order_preserving_variant = - config.optimizer.prefer_exising_sort || unbounded_output(plan); + config.optimizer.prefer_existing_sort || unbounded_output(plan); let updated_sort_input = get_updated_plan( exec_tree, is_spr_better || use_order_preserving_variant, diff --git a/datafusion/execution/src/config.rs b/datafusion/execution/src/config.rs index 68095724d0f2..cfcc205b5625 100644 --- a/datafusion/execution/src/config.rs +++ b/datafusion/execution/src/config.rs @@ -150,7 +150,7 @@ impl SessionConfig { /// /// [prefer_existing_sort]: datafusion_common::config::OptimizerOptions::prefer_existing_sort pub fn prefer_existing_sort(&self) -> bool { - self.options.optimizer.prefer_exising_sort + self.options.optimizer.prefer_existing_sort } /// Are statistics collected during execution? @@ -228,7 +228,7 @@ impl SessionConfig { /// /// [prefer_existing_sort]: datafusion_common::config::OptimizerOptions::prefer_existing_sort pub fn with_prefer_existing_sort(mut self, enabled: bool) -> Self { - self.options.optimizer.prefer_exising_sort = enabled; + self.options.optimizer.prefer_existing_sort = enabled; self } diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 518777ca950c..74c1296fa4ef 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -188,7 +188,7 @@ datafusion.optimizer.enable_topk_aggregation true datafusion.optimizer.filter_null_join_keys false datafusion.optimizer.hash_join_single_partition_threshold 1048576 datafusion.optimizer.max_passes 3 -datafusion.optimizer.prefer_exising_sort false +datafusion.optimizer.prefer_existing_sort false datafusion.optimizer.prefer_hash_join true datafusion.optimizer.repartition_aggregations true datafusion.optimizer.repartition_file_min_size 10485760 diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index d52caab363b2..42c97ca35ba5 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -87,7 +87,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.optimizer.repartition_file_scans | true | When set to `true`, file groups will be repartitioned to achieve maximum parallelism. Currently Parquet and CSV formats are supported. If set to `true`, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false`, different files will be read in parallel, but repartitioning won't happen within a single file. | | datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | | datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` | -| datafusion.optimizer.prefer_exising_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, replacing `RepartitionExec` with `SortPreservingRepartitionExec`, and `CoalescePartitionsExec` with `SortPreservingMergeExec`, When false, DataFusion will prefer to maximize the parallelism using `Repartition/Coalesce` and resort the data subsequently with `SortExec` | +| datafusion.optimizer.prefer_existing_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, replacing `RepartitionExec` with `SortPreservingRepartitionExec`, and `CoalescePartitionsExec` with `SortPreservingMergeExec`, When false, DataFusion will prefer to maximize the parallelism using `Repartition/Coalesce` and resort the data subsequently with `SortExec` | | datafusion.optimizer.skip_failed_rules | false | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail | | datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | | datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | From 47303f6159577ed9936aed82cff795a3ae9bce19 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 2 Oct 2023 11:26:08 -0400 Subject: [PATCH 6/7] prettier --- docs/source/user-guide/configs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 42c97ca35ba5..347554a74505 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -87,7 +87,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.optimizer.repartition_file_scans | true | When set to `true`, file groups will be repartitioned to achieve maximum parallelism. Currently Parquet and CSV formats are supported. If set to `true`, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false`, different files will be read in parallel, but repartitioning won't happen within a single file. | | datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | | datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` | -| datafusion.optimizer.prefer_existing_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, replacing `RepartitionExec` with `SortPreservingRepartitionExec`, and `CoalescePartitionsExec` with `SortPreservingMergeExec`, When false, DataFusion will prefer to maximize the parallelism using `Repartition/Coalesce` and resort the data subsequently with `SortExec` | +| datafusion.optimizer.prefer_existing_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, replacing `RepartitionExec` with `SortPreservingRepartitionExec`, and `CoalescePartitionsExec` with `SortPreservingMergeExec`, When false, DataFusion will prefer to maximize the parallelism using `Repartition/Coalesce` and resort the data subsequently with `SortExec` | | datafusion.optimizer.skip_failed_rules | false | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail | | datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | | datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | From 35c674852bbe4ce9dafa47975b4387ef41b5c295 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 2 Oct 2023 14:01:56 -0400 Subject: [PATCH 7/7] Reword for clarity --- datafusion/common/src/config.rs | 8 ++++---- docs/source/user-guide/configs.md | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 88aad08f7d8b..281da1f69e69 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -454,11 +454,11 @@ config_namespace! { pub repartition_sorts: bool, default = true /// When true, DataFusion will opportunistically remove sorts when the data is already sorted, - /// replacing `RepartitionExec` with `SortPreservingRepartitionExec`, and - /// `CoalescePartitionsExec` with `SortPreservingMergeExec`, + /// (i.e. setting `preserve_order` to true on `RepartitionExec` and + /// using `SortPreservingMergeExec`) /// - /// When false, DataFusion will prefer to maximize the parallelism using - /// `Repartition/Coalesce` and resort the data subsequently with `SortExec` + /// When false, DataFusion will maximize plan parallelism using + /// `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`. pub prefer_existing_sort: bool, default = false /// When set to true, the logical plan optimizer will produce warning diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 347554a74505..9eb0862de9c1 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -87,7 +87,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.optimizer.repartition_file_scans | true | When set to `true`, file groups will be repartitioned to achieve maximum parallelism. Currently Parquet and CSV formats are supported. If set to `true`, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false`, different files will be read in parallel, but repartitioning won't happen within a single file. | | datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | | datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` | -| datafusion.optimizer.prefer_existing_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, replacing `RepartitionExec` with `SortPreservingRepartitionExec`, and `CoalescePartitionsExec` with `SortPreservingMergeExec`, When false, DataFusion will prefer to maximize the parallelism using `Repartition/Coalesce` and resort the data subsequently with `SortExec` | +| datafusion.optimizer.prefer_existing_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, (i.e. setting `preserve_order` to true on `RepartitionExec` and using `SortPreservingMergeExec`) When false, DataFusion will maximize plan parallelism using `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`. | | datafusion.optimizer.skip_failed_rules | false | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail | | datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | | datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys |