From f3423e1883f9f05ecefc88a64fceb68755c1acf9 Mon Sep 17 00:00:00 2001 From: jefffffyang Date: Wed, 13 Mar 2024 15:42:58 +0800 Subject: [PATCH 1/5] support unnest as subexpression and multiple unnests in projection --- datafusion/expr/src/expr_schema.rs | 20 ++++- datafusion/sql/src/select.rs | 64 +++++++++---- datafusion/sqllogictest/test_files/unnest.slt | 89 +++++++++++++++++-- 3 files changed, 148 insertions(+), 25 deletions(-) diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 70ffa5064a52..1d83fbe8c0e0 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -28,7 +28,8 @@ use crate::{utils, LogicalPlan, Projection, Subquery}; use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field}; use datafusion_common::{ - internal_err, plan_datafusion_err, plan_err, Column, DFField, ExprSchema, Result, + internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFField, + ExprSchema, Result, }; use std::collections::HashMap; use std::sync::Arc; @@ -113,7 +114,22 @@ impl ExprSchemable for Expr { .iter() .map(|e| e.get_type(schema)) .collect::>>()?; - Ok(arg_data_types[0].clone()) + let arg_data_type = arg_data_types[0].clone(); + // Unnest's output type is the inner type of the list + match arg_data_type{ + DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) =>{ + Ok(field.data_type().clone()) + } + DataType::Struct(_) => { + not_impl_err!("unnest() does not support struct yet") + } + DataType::Null => { + not_impl_err!("unnest() does not support null yet") + } + _ => { + plan_err!("unnest() can only be applied to array, struct and null") + } + } } Expr::ScalarFunction(ScalarFunction { func_def, args }) => { let arg_data_types = args diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index c5c80bd6acf0..c09bfe94f7f9 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -24,6 +24,7 @@ use crate::utils::{ resolve_columns, resolve_positions_to_exprs, }; +use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{not_impl_err, plan_err, DataFusionError, Result}; use datafusion_common::{Column, UnnestOptions}; use datafusion_expr::expr::{Alias, Unnest}; @@ -276,25 +277,51 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Ok(plan) } - // Try converting Expr::Unnest to LogicalPlan::Unnest if possible, otherwise do the final projection + /// Try converting Expr(Unnest(Expr)) to Projection/Unnest/Projection pub(super) fn try_process_unnest( &self, input: LogicalPlan, select_exprs: Vec, ) -> Result { let mut unnest_columns = vec![]; - // Map unnest expressions to their argument - let projection_exprs = select_exprs + let mut inner_projection_exprs = vec![]; + + let outer_projection_exprs = select_exprs .into_iter() .map(|expr| { - if let Expr::Unnest(Unnest { ref exprs }) = expr { + let Transformed { + data: transformed_expr, + transformed, + tnr: _, + } = expr.transform_up_mut(&mut |expr: Expr| { let column_name = expr.display_name()?; - unnest_columns.push(column_name.clone()); - // Add alias for the argument expression, to avoid naming conflicts with other expressions - // in the select list. For example: `select unnest(col1), col1 from t`. - Ok(exprs[0].clone().alias(column_name)) + if let Expr::Unnest(Unnest { ref exprs }) = expr { + unnest_columns.push(column_name.clone()); + // Add alias for the argument expression, to avoid naming conflicts with other expressions + // in the select list. For example: `select unnest(col1), col1 from t`. + inner_projection_exprs + .push(exprs[0].clone().alias(column_name.clone())); + Ok(Transformed::yes(Expr::Column(Column::from_name( + column_name, + )))) + } else { + Ok(Transformed::no(expr)) + } + })?; + + if !transformed { + if matches!(&transformed_expr, Expr::Column(_)) { + inner_projection_exprs.push(transformed_expr.clone()); + Ok(transformed_expr) + } else { + // We need to evaluate the expr in the inner projection, + // outer projection just select its name + let column_name = transformed_expr.display_name()?; + inner_projection_exprs.push(transformed_expr); + Ok(Expr::Column(Column::from_name(column_name))) + } } else { - Ok(expr) + Ok(transformed_expr) } }) .collect::>>()?; @@ -302,19 +329,20 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Do the final projection if unnest_columns.is_empty() { LogicalPlanBuilder::from(input) - .project(projection_exprs)? + .project(inner_projection_exprs)? .build() } else { - if unnest_columns.len() > 1 { - return not_impl_err!("Only support single unnest expression for now"); - } - let unnest_column = unnest_columns.pop().unwrap(); // Set preserve_nulls to false to ensure compatibility with DuckDB and PostgreSQL let unnest_options = UnnestOptions::new().with_preserve_nulls(false); - LogicalPlanBuilder::from(input) - .project(projection_exprs)? - .unnest_column_with_options(unnest_column, unnest_options)? - .build() + + let mut builder = + LogicalPlanBuilder::from(input).project(inner_projection_exprs)?; + + for unnest_column in unnest_columns { + builder = builder + .unnest_column_with_options(unnest_column, unnest_options.clone())?; + } + builder.project(outer_projection_exprs)?.build() } } diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index f60f715242cf..ceeca8ea02c2 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -135,11 +135,6 @@ select array_remove(column1, 4), unnest(column2), column3 * 10 from unnest_table query error DataFusion error: Error during planning: unnest\(\) can only be applied to array, struct and null select unnest(column3) from unnest_table; -## Unnest multiple columns -query error DataFusion error: This feature is not implemented: Only support single unnest expression for now -select unnest(column1), unnest(column2) from unnest_table; - - ## Unnest scalar in select list query error DataFusion error: Error during planning: unnest\(\) can only be applied to array, struct and null select unnest(1); @@ -254,5 +249,89 @@ select * from unnest([1,2,(select sum(column3) from unnest_table)]); 2 10 +## Unnest is the sub-expression of other expression +query II +select unnest(column1), column3 as a from unnest_table; +---- +1 1 +2 1 +3 1 +4 2 +5 2 +6 3 +12 NULL + +query BI +select unnest(column1) is not null, column3 from unnest_table; +---- +true 1 +true 1 +true 1 +true 2 +true 2 +true 3 +true NULL + +query II +select -unnest(column1) as a, column3 from unnest_table; +---- +-1 1 +-2 1 +-3 1 +-4 2 +-5 2 +-6 3 +-12 NULL + +query II +select unnest(array_remove(column1, 3)) as a, column3 from unnest_table; +---- +1 1 +2 1 +4 2 +5 2 +6 3 +12 NULL + +query II +select unnest(array_remove(column1, 3)) as c1, column3 from unnest_table order by c1 desc, column3; +---- +12 NULL +6 3 +5 2 +4 2 +2 1 +1 1 + +query II +select unnest(array_remove(column1, 3)) - 1 as c1, column3 from unnest_table; +---- +0 1 +1 1 +3 2 +4 2 +5 3 +11 NULL + + +## Unnest multiple columns +query II +select unnest(column1), unnest(column2) from unnest_table; +---- +1 7 +2 7 +3 7 +4 8 +4 9 +4 10 +5 8 +5 9 +5 10 +6 11 +6 12 +12 NULL +12 42 +12 NULL + statement ok drop table unnest_table; From 01c60b4140f1f3257558fca3719bd552c94983e5 Mon Sep 17 00:00:00 2001 From: jefffffyang Date: Wed, 13 Mar 2024 16:28:40 +0800 Subject: [PATCH 2/5] update alias unnest --- datafusion/sqllogictest/test_files/unnest.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index ceeca8ea02c2..93973f375ccf 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -251,7 +251,7 @@ select * from unnest([1,2,(select sum(column3) from unnest_table)]); ## Unnest is the sub-expression of other expression query II -select unnest(column1), column3 as a from unnest_table; +select unnest(column1) as a, column3 from unnest_table; ---- 1 1 2 1 From d3b9fa5580c72fe30fb69accb7cd93f936b34296 Mon Sep 17 00:00:00 2001 From: jefffffyang Date: Wed, 13 Mar 2024 17:00:43 +0800 Subject: [PATCH 3/5] remove unnest multiple columns --- datafusion/sql/src/select.rs | 18 ++++++++--------- datafusion/sqllogictest/test_files/unnest.slt | 20 ------------------- 2 files changed, 9 insertions(+), 29 deletions(-) diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index c09bfe94f7f9..6d067b50be99 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -332,17 +332,17 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .project(inner_projection_exprs)? .build() } else { + if unnest_columns.len() > 1 { + return not_impl_err!("Only support single unnest expression for now"); + } + let unnest_column = unnest_columns.pop().unwrap(); // Set preserve_nulls to false to ensure compatibility with DuckDB and PostgreSQL let unnest_options = UnnestOptions::new().with_preserve_nulls(false); - - let mut builder = - LogicalPlanBuilder::from(input).project(inner_projection_exprs)?; - - for unnest_column in unnest_columns { - builder = builder - .unnest_column_with_options(unnest_column, unnest_options.clone())?; - } - builder.project(outer_projection_exprs)?.build() + LogicalPlanBuilder::from(input) + .project(inner_projection_exprs)? + .unnest_column_with_options(unnest_column, unnest_options)? + .project(outer_projection_exprs)? + .build() } } diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 93973f375ccf..7a8622eed3cd 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -313,25 +313,5 @@ select unnest(array_remove(column1, 3)) - 1 as c1, column3 from unnest_table; 5 3 11 NULL - -## Unnest multiple columns -query II -select unnest(column1), unnest(column2) from unnest_table; ----- -1 7 -2 7 -3 7 -4 8 -4 9 -4 10 -5 8 -5 9 -5 10 -6 11 -6 12 -12 NULL -12 42 -12 NULL - statement ok drop table unnest_table; From 69eb93b4c2deae09cd8fe753cbc71afcb2955115 Mon Sep 17 00:00:00 2001 From: jefffffyang Date: Wed, 13 Mar 2024 17:53:39 +0800 Subject: [PATCH 4/5] rename mulitple unnest columns to multiple unnest functions in selection --- datafusion/sqllogictest/test_files/unnest.slt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 7a8622eed3cd..5c178bb392b1 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -135,6 +135,10 @@ select array_remove(column1, 4), unnest(column2), column3 * 10 from unnest_table query error DataFusion error: Error during planning: unnest\(\) can only be applied to array, struct and null select unnest(column3) from unnest_table; +## Multiple unnest functions in selection +query error DataFusion error: This feature is not implemented: Only support single unnest expression for now +select unnest(column1), unnest(column2) from unnest_table; + ## Unnest scalar in select list query error DataFusion error: Error during planning: unnest\(\) can only be applied to array, struct and null select unnest(1); From 6cbec520aac1fa655e8f221fa8a4018c5d18caa8 Mon Sep 17 00:00:00 2001 From: yjy <1731939194@qq.com> Date: Wed, 13 Mar 2024 21:57:26 +0800 Subject: [PATCH 5/5] fix: move column name to if branch --- datafusion/sql/src/select.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 6d067b50be99..1bfd60a8ce1a 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -294,8 +294,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { transformed, tnr: _, } = expr.transform_up_mut(&mut |expr: Expr| { - let column_name = expr.display_name()?; if let Expr::Unnest(Unnest { ref exprs }) = expr { + let column_name = expr.display_name()?; unnest_columns.push(column_name.clone()); // Add alias for the argument expression, to avoid naming conflicts with other expressions // in the select list. For example: `select unnest(col1), col1 from t`.