From cb24d4aed78e4672de1fcd13e9ca591591e9adf3 Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Fri, 13 Dec 2024 11:06:57 +0700 Subject: [PATCH 1/6] Support unicode character for 'initcap' function Signed-off-by: Tai Le Manh --- datafusion/functions/benches/initcap.rs | 4 +- datafusion/functions/src/string/mod.rs | 7 -- .../src/{string => unicode}/initcap.rs | 99 +++++++++++++------ datafusion/functions/src/unicode/mod.rs | 7 ++ 4 files changed, 80 insertions(+), 37 deletions(-) rename datafusion/functions/src/{string => unicode}/initcap.rs (71%) diff --git a/datafusion/functions/benches/initcap.rs b/datafusion/functions/benches/initcap.rs index c88b6b513980..97c76831b33c 100644 --- a/datafusion/functions/benches/initcap.rs +++ b/datafusion/functions/benches/initcap.rs @@ -24,7 +24,7 @@ use arrow::util::bench_util::{ }; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; -use datafusion_functions::string; +use datafusion_functions::unicode; use std::sync::Arc; fn create_args( @@ -46,7 +46,7 @@ fn create_args( } fn criterion_benchmark(c: &mut Criterion) { - let initcap = string::initcap(); + let initcap = unicode::initcap(); for size in [1024, 4096] { let args = create_args::(size, 8, true); c.bench_function( diff --git a/datafusion/functions/src/string/mod.rs b/datafusion/functions/src/string/mod.rs index f156f070d960..c43aaeccbefe 100644 --- a/datafusion/functions/src/string/mod.rs +++ b/datafusion/functions/src/string/mod.rs @@ -30,7 +30,6 @@ pub mod concat; pub mod concat_ws; pub mod contains; pub mod ends_with; -pub mod initcap; pub mod levenshtein; pub mod lower; pub mod ltrim; @@ -52,7 +51,6 @@ make_udf_function!(chr::ChrFunc, chr); make_udf_function!(concat::ConcatFunc, concat); make_udf_function!(concat_ws::ConcatWsFunc, concat_ws); make_udf_function!(ends_with::EndsWithFunc, ends_with); -make_udf_function!(initcap::InitcapFunc, initcap); make_udf_function!(levenshtein::LevenshteinFunc, levenshtein); make_udf_function!(ltrim::LtrimFunc, ltrim); make_udf_function!(lower::LowerFunc, lower); @@ -94,10 +92,6 @@ pub mod expr_fn { ends_with, "Returns true if the `string` ends with the `suffix`, false otherwise.", string suffix - ),( - initcap, - "Converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase", - string ),( levenshtein, "Returns the Levenshtein distance between the two given strings", @@ -177,7 +171,6 @@ pub fn functions() -> Vec> { concat(), concat_ws(), ends_with(), - initcap(), levenshtein(), lower(), ltrim(), diff --git a/datafusion/functions/src/string/initcap.rs b/datafusion/functions/src/unicode/initcap.rs similarity index 71% rename from datafusion/functions/src/string/initcap.rs rename to datafusion/functions/src/unicode/initcap.rs index 2780dcaeeb83..6f3e0f8c25df 100644 --- a/datafusion/functions/src/string/initcap.rs +++ b/datafusion/functions/src/unicode/initcap.rs @@ -18,7 +18,9 @@ use std::any::Any; use std::sync::{Arc, OnceLock}; -use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray}; +use arrow::array::{ + Array, ArrayRef, GenericStringBuilder, OffsetSizeTrait, StringViewBuilder, +}; use arrow::datatypes::DataType; use crate::utils::{make_scalar_function, utf8_to_str_type}; @@ -74,7 +76,7 @@ impl ScalarUDFImpl for InitcapFunc { DataType::LargeUtf8 => make_scalar_function(initcap::, vec![])(args), DataType::Utf8View => make_scalar_function(initcap_utf8view, vec![])(args), other => { - exec_err!("Unsupported data type {other:?} for function initcap") + exec_err!("Unsupported data type {other:?} for function `initcap`") } } } @@ -90,9 +92,8 @@ fn get_initcap_doc() -> &'static Documentation { DOCUMENTATION.get_or_init(|| { Documentation::builder( DOC_SECTION_STRING, - "Capitalizes the first character in each word in the ASCII input string. \ - Words are delimited by non-alphanumeric characters.\n\n\ - Note this function does not support UTF-8 characters.", + "Capitalizes the first character in each word in the input string. \ + Words are delimited by non-alphanumeric characters.", "initcap(str)", ) .with_sql_example( @@ -123,24 +124,36 @@ fn get_initcap_doc() -> &'static Documentation { fn initcap(args: &[ArrayRef]) -> Result { let string_array = as_generic_string_array::(&args[0])?; - // first map is the iterator, second is for the `Option<_>` - let result = string_array - .iter() - .map(initcap_string) - .collect::>(); + let mut builder = GenericStringBuilder::::with_capacity( + string_array.len(), + string_array.value_data().len(), + ); - Ok(Arc::new(result) as ArrayRef) + string_array.iter().for_each(|str| match str { + Some(s) => { + let initcap_str = initcap_string(Some(s)).unwrap(); + builder.append_value(initcap_str); + } + None => builder.append_null(), + }); + + Ok(Arc::new(builder.finish()) as ArrayRef) } fn initcap_utf8view(args: &[ArrayRef]) -> Result { let string_view_array = as_string_view_array(&args[0])?; - let result = string_view_array - .iter() - .map(initcap_string) - .collect::(); + let mut builder = StringViewBuilder::with_capacity(string_view_array.len()); + + string_view_array.iter().for_each(|str| match str { + Some(s) => { + let initcap_str = initcap_string(Some(s)).unwrap(); + builder.append_value(initcap_str); + } + None => builder.append_null(), + }); - Ok(Arc::new(result) as ArrayRef) + Ok(Arc::new(builder.finish()) as ArrayRef) } fn initcap_string(input: Option<&str>) -> Option { @@ -149,13 +162,16 @@ fn initcap_string(input: Option<&str>) -> Option { let mut prev_is_alphanumeric = false; for c in s.chars() { - let transformed = if prev_is_alphanumeric { - c.to_ascii_lowercase() + if prev_is_alphanumeric { + for lc in c.to_lowercase() { + result.push(lc); + } } else { - c.to_ascii_uppercase() - }; - result.push(transformed); - prev_is_alphanumeric = c.is_ascii_alphanumeric(); + for uc in c.to_uppercase() { + result.push(uc); + } + } + prev_is_alphanumeric = c.is_alphanumeric(); } result @@ -164,9 +180,9 @@ fn initcap_string(input: Option<&str>) -> Option { #[cfg(test)] mod tests { - use crate::string::initcap::InitcapFunc; + use crate::unicode::initcap::InitcapFunc; use crate::utils::test::test_function; - use arrow::array::{Array, StringArray}; + use arrow::array::{Array, StringArray, StringViewArray}; use arrow::datatypes::DataType::Utf8; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; @@ -181,6 +197,19 @@ mod tests { Utf8, StringArray ); + test_function!( + InitcapFunc::new(), + vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "êm ả ñandú árbol олег иванович íslensku þjóðarinnar ελληνική" + .to_string() + )))], + Ok(Some( + "Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική" + )), + &str, + Utf8, + StringArray + ); test_function!( InitcapFunc::new(), vec![ColumnarValue::Scalar(ScalarValue::from(""))], @@ -205,6 +234,7 @@ mod tests { Utf8, StringArray ); + test_function!( InitcapFunc::new(), vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some( @@ -213,7 +243,7 @@ mod tests { Ok(Some("Hi Thomas")), &str, Utf8, - StringArray + StringViewArray ); test_function!( InitcapFunc::new(), @@ -223,7 +253,20 @@ mod tests { Ok(Some("Hi Thomas With M0re Than 12 Chars")), &str, Utf8, - StringArray + StringViewArray + ); + test_function!( + InitcapFunc::new(), + vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + "êm ả ñandú árbol олег иванович íslensku þjóðarinnar ελληνική" + .to_string() + )))], + Ok(Some( + "Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική" + )), + &str, + Utf8, + StringViewArray ); test_function!( InitcapFunc::new(), @@ -233,7 +276,7 @@ mod tests { Ok(Some("")), &str, Utf8, - StringArray + StringViewArray ); test_function!( InitcapFunc::new(), @@ -241,7 +284,7 @@ mod tests { Ok(None), &str, Utf8, - StringArray + StringViewArray ); Ok(()) diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs index f31ece9196d8..7deb84188d2f 100644 --- a/datafusion/functions/src/unicode/mod.rs +++ b/datafusion/functions/src/unicode/mod.rs @@ -23,6 +23,7 @@ use datafusion_expr::ScalarUDF; pub mod character_length; pub mod find_in_set; +pub mod initcap; pub mod left; pub mod lpad; pub mod reverse; @@ -36,6 +37,7 @@ pub mod translate; // create UDFs make_udf_function!(character_length::CharacterLengthFunc, character_length); make_udf_function!(find_in_set::FindInSetFunc, find_in_set); +make_udf_function!(initcap::InitcapFunc, initcap); make_udf_function!(left::LeftFunc, left); make_udf_function!(lpad::LPadFunc, lpad); make_udf_function!(right::RightFunc, right); @@ -94,6 +96,10 @@ pub mod expr_fn { left, "returns the first `n` characters in the `string`", string n + ),( + initcap, + "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase", + string n ),( find_in_set, "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings", @@ -126,6 +132,7 @@ pub fn functions() -> Vec> { vec![ character_length(), find_in_set(), + initcap(), left(), lpad(), reverse(), From 7ef87e973a06c333cf315f9b82a106f05abacd4b Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Fri, 13 Dec 2024 21:47:55 +0700 Subject: [PATCH 2/6] Update unit tests --- datafusion/functions/src/unicode/initcap.rs | 4 ++-- datafusion/functions/src/unicode/mod.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/unicode/initcap.rs b/datafusion/functions/src/unicode/initcap.rs index 6f3e0f8c25df..dd00ae10f380 100644 --- a/datafusion/functions/src/unicode/initcap.rs +++ b/datafusion/functions/src/unicode/initcap.rs @@ -200,7 +200,7 @@ mod tests { test_function!( InitcapFunc::new(), vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some( - "êm ả ñandú árbol олег иванович íslensku þjóðarinnar ελληνική" + "êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ" .to_string() )))], Ok(Some( @@ -258,7 +258,7 @@ mod tests { test_function!( InitcapFunc::new(), vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some( - "êm ả ñandú árbol олег иванович íslensku þjóðarinnar ελληνική" + "êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ" .to_string() )))], Ok(Some( diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs index 7deb84188d2f..e8e3eb3f4e75 100644 --- a/datafusion/functions/src/unicode/mod.rs +++ b/datafusion/functions/src/unicode/mod.rs @@ -99,7 +99,7 @@ pub mod expr_fn { ),( initcap, "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase", - string n + string ),( find_in_set, "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings", From 8323c48ec53963b570116e43186a49b19d44144b Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Fri, 13 Dec 2024 21:59:07 +0700 Subject: [PATCH 3/6] Fix clippy warning --- datafusion/functions/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 575e8484a92f..4192c747c217 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -211,4 +211,4 @@ required-features = ["math_expressions"] [[bench]] harness = false name = "initcap" -required-features = ["string_expressions"] +required-features = ["unicode_expressions"] From 23089aa4c3b618e55b62c7ce549213987cbf2561 Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Fri, 13 Dec 2024 22:16:45 +0700 Subject: [PATCH 4/6] Update sqllogictests - initcap --- datafusion/sqllogictest/test_files/string/string_query.slt.part | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 80fcc0102887..2414e5864c99 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -460,7 +460,7 @@ Andrew Datafusion📊🔥 Xiangpeng Datafusion数据融合 Raphael Datafusionдатафусион Under_Score Un Iść Core -Percent Pan Tadeusz Ma Iść W KąT +Percent Pan Tadeusz Ma Iść W Kąt (empty) (empty) (empty) (empty) % (empty) From 810977cdf5a27b7b87f229dc9683c963dfdec302 Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Fri, 13 Dec 2024 22:40:24 +0700 Subject: [PATCH 5/6] Update scalar_functions.md docs --- docs/source/user-guide/sql/scalar_functions.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 4e74cfc54ae5..208d18f0e5ab 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1046,9 +1046,7 @@ find_in_set(str, strlist) ### `initcap` -Capitalizes the first character in each word in the ASCII input string. Words are delimited by non-alphanumeric characters. - -Note this function does not support UTF-8 characters. +Capitalizes the first character in each word in the input string. Words are delimited by non-alphanumeric characters. ``` initcap(str) From 1064aa06d3e27831d79ee01246cb1eddfe6d493b Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Sun, 22 Dec 2024 11:04:03 +0700 Subject: [PATCH 6/6] Add suggestions change Signed-off-by: Tai Le Manh --- datafusion/functions/src/unicode/initcap.rs | 39 ++++++++++++--------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/datafusion/functions/src/unicode/initcap.rs b/datafusion/functions/src/unicode/initcap.rs index dd00ae10f380..e9f966b95868 100644 --- a/datafusion/functions/src/unicode/initcap.rs +++ b/datafusion/functions/src/unicode/initcap.rs @@ -131,7 +131,7 @@ fn initcap(args: &[ArrayRef]) -> Result { string_array.iter().for_each(|str| match str { Some(s) => { - let initcap_str = initcap_string(Some(s)).unwrap(); + let initcap_str = initcap_string(s); builder.append_value(initcap_str); } None => builder.append_null(), @@ -147,7 +147,7 @@ fn initcap_utf8view(args: &[ArrayRef]) -> Result { string_view_array.iter().for_each(|str| match str { Some(s) => { - let initcap_str = initcap_string(Some(s)).unwrap(); + let initcap_str = initcap_string(s); builder.append_value(initcap_str); } None => builder.append_null(), @@ -156,26 +156,31 @@ fn initcap_utf8view(args: &[ArrayRef]) -> Result { Ok(Arc::new(builder.finish()) as ArrayRef) } -fn initcap_string(input: Option<&str>) -> Option { - input.map(|s| { - let mut result = String::with_capacity(s.len()); - let mut prev_is_alphanumeric = false; +fn initcap_string(input: &str) -> String { + let mut result = String::with_capacity(input.len()); + let mut prev_is_alphanumeric = false; - for c in s.chars() { + if input.is_ascii() { + for c in input.chars() { if prev_is_alphanumeric { - for lc in c.to_lowercase() { - result.push(lc); - } + result.push(c.to_ascii_lowercase()); } else { - for uc in c.to_uppercase() { - result.push(uc); - } + result.push(c.to_ascii_uppercase()); + }; + prev_is_alphanumeric = c.is_ascii_alphanumeric(); + } + } else { + for c in input.chars() { + if prev_is_alphanumeric { + result.extend(c.to_lowercase()); + } else { + result.extend(c.to_uppercase()); } prev_is_alphanumeric = c.is_alphanumeric(); } + } - result - }) + result } #[cfg(test)] @@ -258,11 +263,11 @@ mod tests { test_function!( InitcapFunc::new(), vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some( - "êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ" + "đẸp đẼ êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ" .to_string() )))], Ok(Some( - "Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική" + "Đẹp Đẽ Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική" )), &str, Utf8,