From 3ce7c916ffeefddde13936aa7ee70dc2fbcc9f47 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sat, 10 Aug 2024 01:37:13 +0800 Subject: [PATCH 1/6] add utf8view support for generic_trim --- datafusion/functions/src/string/common.rs | 80 +++++++++++++++++++++-- 1 file changed, 76 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index d36bd5cecc47..c49cf7dae5c3 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -19,13 +19,13 @@ use std::fmt::{Display, Formatter}; use std::sync::Arc; use arrow::array::{ - new_null_array, Array, ArrayDataBuilder, ArrayRef, GenericStringArray, + new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayRef, GenericStringArray, GenericStringBuilder, OffsetSizeTrait, StringArray, }; use arrow::buffer::{Buffer, MutableBuffer, NullBuffer}; use arrow::datatypes::DataType; -use datafusion_common::cast::as_generic_string_array; +use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::Result; use datafusion_common::{exec_err, ScalarValue}; use datafusion_expr::ColumnarValue; @@ -49,6 +49,7 @@ impl Display for TrimType { pub(crate) fn general_trim( args: &[ArrayRef], trim_type: TrimType, + string_view_pat: bool, ) -> Result { let func = match trim_type { TrimType::Left => |input, pattern: &str| { @@ -68,6 +69,74 @@ pub(crate) fn general_trim( }, }; + if string_view_pat { + string_view_trim::(trim_type, func, args) + } else { + string_trim::(trim_type, func, args) + } +} + +// removing 'a will cause compiler complaining lifetime of `func` +fn string_view_trim<'a, T: OffsetSizeTrait>( + trim_type: TrimType, + func: fn(&'a str, &'a str) -> &'a str, + args: &'a [ArrayRef], +) -> Result { + let string_array = as_string_view_array(&args[0])?; + + match args.len() { + 1 => { + let result = string_array + .iter() + .map(|string| string.map(|string: &str| func(string, " "))) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) + } + 2 => { + let characters_array = as_string_view_array(&args[1])?; + + if characters_array.len() == 1 { + if characters_array.is_null(0) { + return Ok(new_null_array( + // The schema is expecting utf8 as null + &DataType::Utf8, + string_array.len(), + )); + } + + let characters = characters_array.value(0); + let result = string_array + .iter() + .map(|item| item.map(|string| func(string, characters))) + .collect::>(); + return Ok(Arc::new(result) as ArrayRef); + } + + let result = string_array + .iter() + .zip(characters_array.iter()) + .map(|(string, characters)| match (string, characters) { + (Some(string), Some(characters)) => Some(func(string, characters)), + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) + } + other => { + exec_err!( + "{trim_type} was called with {other} arguments. It requires at least 1 and at most 2." + ) + } + } +} + +fn string_trim<'a, T: OffsetSizeTrait>( + trim_type: TrimType, + func: fn(&'a str, &'a str) -> &'a str, + args: &'a [ArrayRef], +) -> Result { let string_array = as_generic_string_array::(&args[0])?; match args.len() { @@ -84,7 +153,10 @@ pub(crate) fn general_trim( if characters_array.len() == 1 { if characters_array.is_null(0) { - return Ok(new_null_array(args[0].data_type(), args[0].len())); + return Ok(new_null_array( + string_array.data_type(), + string_array.len(), + )); } let characters = characters_array.value(0); @@ -109,7 +181,7 @@ pub(crate) fn general_trim( other => { exec_err!( "{trim_type} was called with {other} arguments. It requires at least 1 and at most 2." - ) + ) } } } From 357b1d5b95f4a2576239d57bf5116c72adf99718 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sat, 10 Aug 2024 01:37:32 +0800 Subject: [PATCH 2/6] add utf8view support for BTRIM --- datafusion/functions/src/string/btrim.rs | 26 +++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs index 349928d09664..877580541807 100644 --- a/datafusion/functions/src/string/btrim.rs +++ b/datafusion/functions/src/string/btrim.rs @@ -15,10 +15,9 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{ArrayRef, OffsetSizeTrait}; -use std::any::Any; - +use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; use arrow::datatypes::DataType; +use std::any::Any; use datafusion_common::{exec_err, Result}; use datafusion_expr::function::Hint; @@ -32,7 +31,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; /// Returns the longest string with leading and trailing characters removed. If the characters are not specified, whitespace is removed. /// btrim('xyxtrimyyx', 'xyz') = 'trim' fn btrim(args: &[ArrayRef]) -> Result { - general_trim::(args, TrimType::Both) + let use_string_view = args[0].data_type() == &DataType::Utf8View; + general_trim::(args, TrimType::Both, use_string_view) } #[derive(Debug)] @@ -52,7 +52,16 @@ impl BTrimFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])], + vec![ + // Planner attempts coercion to the target type starting with the most preferred candidate. + // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. + // If that fails, it proceeds to `(Utf8, Utf8)`. + Exact(vec![Utf8View, Utf8View]), + // Exact(vec![Utf8, Utf8View]), + Exact(vec![Utf8, Utf8]), + Exact(vec![Utf8View]), + Exact(vec![Utf8]), + ], Volatility::Immutable, ), aliases: vec![String::from("trim")], @@ -79,7 +88,7 @@ impl ScalarUDFImpl for BTrimFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { - DataType::Utf8 => make_scalar_function( + DataType::Utf8 | DataType::Utf8View => make_scalar_function( btrim::, vec![Hint::Pad, Hint::AcceptsSingular], )(args), @@ -87,7 +96,10 @@ impl ScalarUDFImpl for BTrimFunc { btrim::, vec![Hint::Pad, Hint::AcceptsSingular], )(args), - other => exec_err!("Unsupported data type {other:?} for function btrim"), + other => exec_err!( + "Unsupported data type {other:?} for function btrim,\ + expected for Utf8, LargeUtf8 or Utf8View." + ), } } From b17f55e78d97639012c874038a10381440b21045 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sat, 10 Aug 2024 01:39:17 +0800 Subject: [PATCH 3/6] stop LTRIM and RTRIM from complaining generic_trim missing args --- datafusion/functions/src/string/ltrim.rs | 2 +- datafusion/functions/src/string/rtrim.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index de14bbaa2bcf..6a9fafdd9299 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -32,7 +32,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; /// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed. /// ltrim('zzzytest', 'xyz') = 'test' fn ltrim(args: &[ArrayRef]) -> Result { - general_trim::(args, TrimType::Left) + general_trim::(args, TrimType::Left, false) } #[derive(Debug)] diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs index 2d29b50cb173..50b626e3df0e 100644 --- a/datafusion/functions/src/string/rtrim.rs +++ b/datafusion/functions/src/string/rtrim.rs @@ -32,7 +32,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; /// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed. /// rtrim('testxxzx', 'xyz') = 'test' fn rtrim(args: &[ArrayRef]) -> Result { - general_trim::(args, TrimType::Right) + general_trim::(args, TrimType::Right, false) } #[derive(Debug)] From eee77f4a2939fe374a41d08541a9b8ab95728d68 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sat, 10 Aug 2024 01:48:46 +0800 Subject: [PATCH 4/6] add tests to cover utf8view support of BTRIM --- .../sqllogictest/test_files/string_view.slt | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index e7166690580f..93025046f4a8 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -519,15 +519,50 @@ SELECT 228 0 NULL ## Ensure no casts for BTRIM +# Test BTRIM with Utf8View input +query TT +EXPLAIN SELECT + BTRIM(column1_utf8view) AS l +FROM test; +---- +logical_plan +01)Projection: btrim(test.column1_utf8view) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test BTRIM with Utf8View input and Utf8View pattern query TT EXPLAIN SELECT BTRIM(column1_utf8view, 'foo') AS l FROM test; ---- logical_plan -01)Projection: btrim(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS l +01)Projection: btrim(test.column1_utf8view, Utf8View("foo")) AS l +02)--TableScan: test projection=[column1_utf8view] + +# Test BTRIM with Utf8View bytes longer than 12 +query TT +EXPLAIN SELECT + BTRIM(column1_utf8view, 'this is longer than 12') AS l +FROM test; +---- +logical_plan +01)Projection: btrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l 02)--TableScan: test projection=[column1_utf8view] +# Test BTRIM outputs +query TTT +SELECT + BTRIM(column1_utf8view, 'foo') AS l1, + BTRIM(column1_utf8view, 'A') AS l2, + BTRIM(column1_utf8view) AS l3, + BTRIM(column1_utf8view, NULL) AS l4 +FROM test; +---- +Andrew ndrew Andrew NULL +Xiangpeng Xiangpeng Xiangpeng NULL +Raphael Raphael Raphael NULL +NULL NULL NULL NULL + ## Ensure no casts for CHARACTER_LENGTH query TT EXPLAIN SELECT From 1ef7eab4a0d895bc401deda266cb97dedf7163f6 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sat, 10 Aug 2024 01:51:13 +0800 Subject: [PATCH 5/6] fix typo and tiny err --- datafusion/functions/src/string/common.rs | 4 ++-- datafusion/sqllogictest/test_files/string_view.slt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index c49cf7dae5c3..60c5c77dd8f2 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -49,7 +49,7 @@ impl Display for TrimType { pub(crate) fn general_trim( args: &[ArrayRef], trim_type: TrimType, - string_view_pat: bool, + use_string_view: bool, ) -> Result { let func = match trim_type { TrimType::Left => |input, pattern: &str| { @@ -69,7 +69,7 @@ pub(crate) fn general_trim( }, }; - if string_view_pat { + if use_string_view { string_view_trim::(trim_type, func, args) } else { string_trim::(trim_type, func, args) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 93025046f4a8..46a1459e006a 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -550,7 +550,7 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view] # Test BTRIM outputs -query TTT +query TTTT SELECT BTRIM(column1_utf8view, 'foo') AS l1, BTRIM(column1_utf8view, 'A') AS l2, From 9cff1c70ac31e4248c2ed97e5fdec2f58ccee6a1 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sat, 10 Aug 2024 02:16:14 +0800 Subject: [PATCH 6/6] remove useless imports --- datafusion/functions/src/string/btrim.rs | 2 +- datafusion/functions/src/string/common.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs index 877580541807..86470dd7a646 100644 --- a/datafusion/functions/src/string/btrim.rs +++ b/datafusion/functions/src/string/btrim.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; +use arrow::array::{ArrayRef, OffsetSizeTrait}; use arrow::datatypes::DataType; use std::any::Any; diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 60c5c77dd8f2..7037c1d1c3c3 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -19,7 +19,7 @@ use std::fmt::{Display, Formatter}; use std::sync::Arc; use arrow::array::{ - new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayRef, GenericStringArray, + new_null_array, Array, ArrayDataBuilder, ArrayRef, GenericStringArray, GenericStringBuilder, OffsetSizeTrait, StringArray, }; use arrow::buffer::{Buffer, MutableBuffer, NullBuffer};