From 448f140ca01039f0d799318a46a97d8bbb239dc6 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Wed, 4 Dec 2024 13:13:38 +0100 Subject: [PATCH] Allow ColumnarValue to array conversion with less copying `ColumnarValue::into_array` takes ownership of the columnar value and so requires copying data if the call site doesn't own the value. This does not matter in many cases, since the `into_array` internally will copy data even more. It does matter however for the cases when to-array is called with desired array length of 1, which may happen when UDF implementation attempts to normalize arguments into arrays without expanding them. This pattern can be seen in regexp functions, but can be useful in downstream projects too. --- datafusion/expr-common/src/columnar_value.rs | 20 ++++++++++++++++++- datafusion/functions-nested/src/array_has.rs | 4 ++-- datafusion/functions/src/regex/regexpcount.rs | 2 +- datafusion/functions/src/regex/regexplike.rs | 2 +- datafusion/functions/src/regex/regexpmatch.rs | 2 +- .../functions/src/regex/regexpreplace.rs | 4 ++-- datafusion/functions/src/utils.rs | 4 ++-- 7 files changed, 28 insertions(+), 10 deletions(-) diff --git a/datafusion/expr-common/src/columnar_value.rs b/datafusion/expr-common/src/columnar_value.rs index 4b9454ed739d..3b17e606544d 100644 --- a/datafusion/expr-common/src/columnar_value.rs +++ b/datafusion/expr-common/src/columnar_value.rs @@ -129,6 +129,24 @@ impl ColumnarValue { }) } + /// Convert a columnar value into an Arrow [`ArrayRef`] with the specified + /// number of rows. [`Self::Scalar`] is converted by repeating the same + /// scalar multiple times which is not as efficient as handling the scalar + /// directly. + /// + /// See [`Self::values_to_arrays`] to convert multiple columnar values into + /// arrays of the same length. + /// + /// # Errors + /// + /// Errors if `self` is a Scalar that fails to be converted into an array of size + pub fn to_array(&self, num_rows: usize) -> Result { + Ok(match self { + ColumnarValue::Array(array) => Arc::clone(array), + ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows)?, + }) + } + /// Null columnar values are implemented as a null array in order to pass batch /// num_rows pub fn create_null_array(num_rows: usize) -> Self { @@ -176,7 +194,7 @@ impl ColumnarValue { let args = args .iter() - .map(|arg| arg.clone().into_array(inferred_length)) + .map(|arg| arg.to_array(inferred_length)) .collect::>>()?; Ok(args) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index c71314d8263f..499b07dafccf 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -106,7 +106,7 @@ impl ScalarUDFImpl for ArrayHas { match &args[1] { ColumnarValue::Array(array_needle) => { // the needle is already an array, convert the haystack to an array of the same length - let haystack = args[0].to_owned().into_array(array_needle.len())?; + let haystack = args[0].to_array(array_needle.len())?; let array = array_has_inner_for_array(&haystack, array_needle)?; Ok(ColumnarValue::Array(array)) } @@ -118,7 +118,7 @@ impl ScalarUDFImpl for ArrayHas { } // since the needle is a scalar, convert it to an array of size 1 - let haystack = args[0].to_owned().into_array(1)?; + let haystack = args[0].to_array(1)?; let needle = scalar_needle.to_array_of_size(1)?; let needle = Scalar::new(needle); let array = array_has_inner_for_scalar(&haystack, &needle)?; diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs index a667d70e7bb2..8f06c75b2fe9 100644 --- a/datafusion/functions/src/regex/regexpcount.rs +++ b/datafusion/functions/src/regex/regexpcount.rs @@ -97,7 +97,7 @@ impl ScalarUDFImpl for RegexpCountFunc { let inferred_length = len.unwrap_or(1); let args = args .iter() - .map(|arg| arg.clone().into_array(inferred_length)) + .map(|arg| arg.to_array(inferred_length)) .collect::>>()?; let result = regexp_count_func(&args); diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index adbd6ef94d84..49e57776c7b8 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -147,7 +147,7 @@ impl ScalarUDFImpl for RegexpLikeFunc { let inferred_length = len.unwrap_or(1); let args = args .iter() - .map(|arg| arg.clone().into_array(inferred_length)) + .map(|arg| arg.to_array(inferred_length)) .collect::>>()?; let result = regexp_like(&args); diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs index 93178d23de4f..8362ef2f406c 100644 --- a/datafusion/functions/src/regex/regexpmatch.rs +++ b/datafusion/functions/src/regex/regexpmatch.rs @@ -99,7 +99,7 @@ impl ScalarUDFImpl for RegexpMatchFunc { let inferred_length = len.unwrap_or(1); let args = args .iter() - .map(|arg| arg.clone().into_array(inferred_length)) + .map(|arg| arg.to_array(inferred_length)) .collect::>>()?; let result = regexp_match_func(&args); diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index 3f289e7c1511..af02fa493483 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -575,7 +575,7 @@ pub fn specialize_regexp_replace( Hint::AcceptsSingular => 1, Hint::Pad => inferred_length, }; - arg.clone().into_array(expansion_len) + arg.to_array(expansion_len) }) .collect::>>()?; _regexp_replace_static_pattern_replace::(&args) @@ -586,7 +586,7 @@ pub fn specialize_regexp_replace( (_, _, _, _) => { let args = args .iter() - .map(|arg| arg.clone().into_array(inferred_length)) + .map(|arg| arg.to_array(inferred_length)) .collect::>>()?; match args[0].data_type() { diff --git a/datafusion/functions/src/utils.rs b/datafusion/functions/src/utils.rs index 8b473500416b..53f607492266 100644 --- a/datafusion/functions/src/utils.rs +++ b/datafusion/functions/src/utils.rs @@ -105,7 +105,7 @@ where Hint::AcceptsSingular => 1, Hint::Pad => inferred_length, }; - arg.clone().into_array(expansion_len) + arg.to_array(expansion_len) }) .collect::>>()?; @@ -152,7 +152,7 @@ pub mod test { let result = func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{args: $ARGS, number_rows: cardinality, return_type: &return_type}); assert_eq!(result.is_ok(), true, "function returned an error: {}", result.unwrap_err()); - let result = result.unwrap().clone().into_array(cardinality).expect("Failed to convert to array"); + let result = result.unwrap().to_array(cardinality).expect("Failed to convert to array"); let result = result.as_any().downcast_ref::<$ARRAY_TYPE>().expect("Failed to convert to type"); // value is correct