From 07130163541b26d76a28bc56d931f1a65c5f7c82 Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Sun, 22 Dec 2024 18:10:35 +0700 Subject: [PATCH] Support unicode character for `initcap` function (#13752) * Support unicode character for 'initcap' function Signed-off-by: Tai Le Manh * Update unit tests * Fix clippy warning * Update sqllogictests - initcap * Update scalar_functions.md docs * Add suggestions change Signed-off-by: Tai Le Manh --------- Signed-off-by: Tai Le Manh --- datafusion/functions/Cargo.toml | 2 +- datafusion/functions/benches/initcap.rs | 4 +- datafusion/functions/src/string/mod.rs | 7 -- .../src/{string => unicode}/initcap.rs | 114 +++++++++++++----- datafusion/functions/src/unicode/mod.rs | 7 ++ .../test_files/string/string_query.slt.part | 2 +- .../source/user-guide/sql/scalar_functions.md | 4 +- 7 files changed, 93 insertions(+), 47 deletions(-) rename datafusion/functions/src/{string => unicode}/initcap.rs (68%) diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index de72c7ee946b3..fd986c4be41cc 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -212,4 +212,4 @@ required-features = ["math_expressions"] [[bench]] harness = false name = "initcap" -required-features = ["string_expressions"] +required-features = ["unicode_expressions"] diff --git a/datafusion/functions/benches/initcap.rs b/datafusion/functions/benches/initcap.rs index c88b6b513980c..97c76831b33c8 100644 --- a/datafusion/functions/benches/initcap.rs +++ b/datafusion/functions/benches/initcap.rs @@ -24,7 +24,7 @@ use arrow::util::bench_util::{ }; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; -use datafusion_functions::string; +use datafusion_functions::unicode; use std::sync::Arc; fn create_args( @@ -46,7 +46,7 @@ fn create_args( } fn criterion_benchmark(c: &mut Criterion) { - let initcap = string::initcap(); + let initcap = unicode::initcap(); for size in [1024, 4096] { let args = create_args::(size, 8, true); c.bench_function( diff --git a/datafusion/functions/src/string/mod.rs b/datafusion/functions/src/string/mod.rs index f156f070d960c..c43aaeccbefeb 100644 --- a/datafusion/functions/src/string/mod.rs +++ b/datafusion/functions/src/string/mod.rs @@ -30,7 +30,6 @@ pub mod concat; pub mod concat_ws; pub mod contains; pub mod ends_with; -pub mod initcap; pub mod levenshtein; pub mod lower; pub mod ltrim; @@ -52,7 +51,6 @@ make_udf_function!(chr::ChrFunc, chr); make_udf_function!(concat::ConcatFunc, concat); make_udf_function!(concat_ws::ConcatWsFunc, concat_ws); make_udf_function!(ends_with::EndsWithFunc, ends_with); -make_udf_function!(initcap::InitcapFunc, initcap); make_udf_function!(levenshtein::LevenshteinFunc, levenshtein); make_udf_function!(ltrim::LtrimFunc, ltrim); make_udf_function!(lower::LowerFunc, lower); @@ -94,10 +92,6 @@ pub mod expr_fn { ends_with, "Returns true if the `string` ends with the `suffix`, false otherwise.", string suffix - ),( - initcap, - "Converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase", - string ),( levenshtein, "Returns the Levenshtein distance between the two given strings", @@ -177,7 +171,6 @@ pub fn functions() -> Vec> { concat(), concat_ws(), ends_with(), - initcap(), levenshtein(), lower(), ltrim(), diff --git a/datafusion/functions/src/string/initcap.rs b/datafusion/functions/src/unicode/initcap.rs similarity index 68% rename from datafusion/functions/src/string/initcap.rs rename to datafusion/functions/src/unicode/initcap.rs index 2780dcaeeb834..e9f966b958683 100644 --- a/datafusion/functions/src/string/initcap.rs +++ b/datafusion/functions/src/unicode/initcap.rs @@ -18,7 +18,9 @@ use std::any::Any; use std::sync::{Arc, OnceLock}; -use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray}; +use arrow::array::{ + Array, ArrayRef, GenericStringBuilder, OffsetSizeTrait, StringViewBuilder, +}; use arrow::datatypes::DataType; use crate::utils::{make_scalar_function, utf8_to_str_type}; @@ -74,7 +76,7 @@ impl ScalarUDFImpl for InitcapFunc { DataType::LargeUtf8 => make_scalar_function(initcap::, vec![])(args), DataType::Utf8View => make_scalar_function(initcap_utf8view, vec![])(args), other => { - exec_err!("Unsupported data type {other:?} for function initcap") + exec_err!("Unsupported data type {other:?} for function `initcap`") } } } @@ -90,9 +92,8 @@ fn get_initcap_doc() -> &'static Documentation { DOCUMENTATION.get_or_init(|| { Documentation::builder( DOC_SECTION_STRING, - "Capitalizes the first character in each word in the ASCII input string. \ - Words are delimited by non-alphanumeric characters.\n\n\ - Note this function does not support UTF-8 characters.", + "Capitalizes the first character in each word in the input string. \ + Words are delimited by non-alphanumeric characters.", "initcap(str)", ) .with_sql_example( @@ -123,50 +124,70 @@ fn get_initcap_doc() -> &'static Documentation { fn initcap(args: &[ArrayRef]) -> Result { let string_array = as_generic_string_array::(&args[0])?; - // first map is the iterator, second is for the `Option<_>` - let result = string_array - .iter() - .map(initcap_string) - .collect::>(); + let mut builder = GenericStringBuilder::::with_capacity( + string_array.len(), + string_array.value_data().len(), + ); - Ok(Arc::new(result) as ArrayRef) + string_array.iter().for_each(|str| match str { + Some(s) => { + let initcap_str = initcap_string(s); + builder.append_value(initcap_str); + } + None => builder.append_null(), + }); + + Ok(Arc::new(builder.finish()) as ArrayRef) } fn initcap_utf8view(args: &[ArrayRef]) -> Result { let string_view_array = as_string_view_array(&args[0])?; - let result = string_view_array - .iter() - .map(initcap_string) - .collect::(); + let mut builder = StringViewBuilder::with_capacity(string_view_array.len()); + + string_view_array.iter().for_each(|str| match str { + Some(s) => { + let initcap_str = initcap_string(s); + builder.append_value(initcap_str); + } + None => builder.append_null(), + }); - Ok(Arc::new(result) as ArrayRef) + Ok(Arc::new(builder.finish()) as ArrayRef) } -fn initcap_string(input: Option<&str>) -> Option { - input.map(|s| { - let mut result = String::with_capacity(s.len()); - let mut prev_is_alphanumeric = false; +fn initcap_string(input: &str) -> String { + let mut result = String::with_capacity(input.len()); + let mut prev_is_alphanumeric = false; - for c in s.chars() { - let transformed = if prev_is_alphanumeric { - c.to_ascii_lowercase() + if input.is_ascii() { + for c in input.chars() { + if prev_is_alphanumeric { + result.push(c.to_ascii_lowercase()); } else { - c.to_ascii_uppercase() + result.push(c.to_ascii_uppercase()); }; - result.push(transformed); prev_is_alphanumeric = c.is_ascii_alphanumeric(); } + } else { + for c in input.chars() { + if prev_is_alphanumeric { + result.extend(c.to_lowercase()); + } else { + result.extend(c.to_uppercase()); + } + prev_is_alphanumeric = c.is_alphanumeric(); + } + } - result - }) + result } #[cfg(test)] mod tests { - use crate::string::initcap::InitcapFunc; + use crate::unicode::initcap::InitcapFunc; use crate::utils::test::test_function; - use arrow::array::{Array, StringArray}; + use arrow::array::{Array, StringArray, StringViewArray}; use arrow::datatypes::DataType::Utf8; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; @@ -181,6 +202,19 @@ mod tests { Utf8, StringArray ); + test_function!( + InitcapFunc::new(), + vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ" + .to_string() + )))], + Ok(Some( + "Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική" + )), + &str, + Utf8, + StringArray + ); test_function!( InitcapFunc::new(), vec![ColumnarValue::Scalar(ScalarValue::from(""))], @@ -205,6 +239,7 @@ mod tests { Utf8, StringArray ); + test_function!( InitcapFunc::new(), vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some( @@ -213,7 +248,7 @@ mod tests { Ok(Some("Hi Thomas")), &str, Utf8, - StringArray + StringViewArray ); test_function!( InitcapFunc::new(), @@ -223,7 +258,20 @@ mod tests { Ok(Some("Hi Thomas With M0re Than 12 Chars")), &str, Utf8, - StringArray + StringViewArray + ); + test_function!( + InitcapFunc::new(), + vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + "đẸp đẼ êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ" + .to_string() + )))], + Ok(Some( + "Đẹp Đẽ Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική" + )), + &str, + Utf8, + StringViewArray ); test_function!( InitcapFunc::new(), @@ -233,7 +281,7 @@ mod tests { Ok(Some("")), &str, Utf8, - StringArray + StringViewArray ); test_function!( InitcapFunc::new(), @@ -241,7 +289,7 @@ mod tests { Ok(None), &str, Utf8, - StringArray + StringViewArray ); Ok(()) diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs index f31ece9196d86..e8e3eb3f4e758 100644 --- a/datafusion/functions/src/unicode/mod.rs +++ b/datafusion/functions/src/unicode/mod.rs @@ -23,6 +23,7 @@ use datafusion_expr::ScalarUDF; pub mod character_length; pub mod find_in_set; +pub mod initcap; pub mod left; pub mod lpad; pub mod reverse; @@ -36,6 +37,7 @@ pub mod translate; // create UDFs make_udf_function!(character_length::CharacterLengthFunc, character_length); make_udf_function!(find_in_set::FindInSetFunc, find_in_set); +make_udf_function!(initcap::InitcapFunc, initcap); make_udf_function!(left::LeftFunc, left); make_udf_function!(lpad::LPadFunc, lpad); make_udf_function!(right::RightFunc, right); @@ -94,6 +96,10 @@ pub mod expr_fn { left, "returns the first `n` characters in the `string`", string n + ),( + initcap, + "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase", + string ),( find_in_set, "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings", @@ -126,6 +132,7 @@ pub fn functions() -> Vec> { vec![ character_length(), find_in_set(), + initcap(), left(), lpad(), reverse(), diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 80fcc0102887a..2414e5864c998 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -460,7 +460,7 @@ Andrew Datafusion📊🔥 Xiangpeng Datafusion数据融合 Raphael Datafusionдатафусион Under_Score Un Iść Core -Percent Pan Tadeusz Ma Iść W KąT +Percent Pan Tadeusz Ma Iść W Kąt (empty) (empty) (empty) (empty) % (empty) diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 2e4147f96e0fc..be4f5e56b3af3 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1070,9 +1070,7 @@ find_in_set(str, strlist) ### `initcap` -Capitalizes the first character in each word in the ASCII input string. Words are delimited by non-alphanumeric characters. - -Note this function does not support UTF-8 characters. +Capitalizes the first character in each word in the input string. Words are delimited by non-alphanumeric characters. ``` initcap(str)