Skip to content

Commit

Permalink
Support Unicode characters in the `initcap` function (apache#13752)
Browse files Browse the repository at this point in the history
* Support Unicode characters in the `initcap` function

Signed-off-by: Tai Le Manh <[email protected]>

* Update unit tests

* Fix clippy warning

* Update sqllogictests - initcap

* Update scalar_functions.md docs

* Apply suggested changes

Signed-off-by: Tai Le Manh <[email protected]>

---------

Signed-off-by: Tai Le Manh <[email protected]>
  • Loading branch information
tlm365 authored and zhuqi-lucas committed Dec 23, 2024
1 parent 2d2ec62 commit 0713016
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 47 deletions.
2 changes: 1 addition & 1 deletion datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -212,4 +212,4 @@ required-features = ["math_expressions"]
[[bench]]
harness = false
name = "initcap"
required-features = ["string_expressions"]
required-features = ["unicode_expressions"]
4 changes: 2 additions & 2 deletions datafusion/functions/benches/initcap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use arrow::util::bench_util::{
};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::string;
use datafusion_functions::unicode;
use std::sync::Arc;

fn create_args<O: OffsetSizeTrait>(
Expand All @@ -46,7 +46,7 @@ fn create_args<O: OffsetSizeTrait>(
}

fn criterion_benchmark(c: &mut Criterion) {
let initcap = string::initcap();
let initcap = unicode::initcap();
for size in [1024, 4096] {
let args = create_args::<i32>(size, 8, true);
c.bench_function(
Expand Down
7 changes: 0 additions & 7 deletions datafusion/functions/src/string/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ pub mod concat;
pub mod concat_ws;
pub mod contains;
pub mod ends_with;
pub mod initcap;
pub mod levenshtein;
pub mod lower;
pub mod ltrim;
Expand All @@ -52,7 +51,6 @@ make_udf_function!(chr::ChrFunc, chr);
make_udf_function!(concat::ConcatFunc, concat);
make_udf_function!(concat_ws::ConcatWsFunc, concat_ws);
make_udf_function!(ends_with::EndsWithFunc, ends_with);
make_udf_function!(initcap::InitcapFunc, initcap);
make_udf_function!(levenshtein::LevenshteinFunc, levenshtein);
make_udf_function!(ltrim::LtrimFunc, ltrim);
make_udf_function!(lower::LowerFunc, lower);
Expand Down Expand Up @@ -94,10 +92,6 @@ pub mod expr_fn {
ends_with,
"Returns true if the `string` ends with the `suffix`, false otherwise.",
string suffix
),(
initcap,
"Converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase",
string
),(
levenshtein,
"Returns the Levenshtein distance between the two given strings",
Expand Down Expand Up @@ -177,7 +171,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
concat(),
concat_ws(),
ends_with(),
initcap(),
levenshtein(),
lower(),
ltrim(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
use std::any::Any;
use std::sync::{Arc, OnceLock};

use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray};
use arrow::array::{
Array, ArrayRef, GenericStringBuilder, OffsetSizeTrait, StringViewBuilder,
};
use arrow::datatypes::DataType;

use crate::utils::{make_scalar_function, utf8_to_str_type};
Expand Down Expand Up @@ -74,7 +76,7 @@ impl ScalarUDFImpl for InitcapFunc {
DataType::LargeUtf8 => make_scalar_function(initcap::<i64>, vec![])(args),
DataType::Utf8View => make_scalar_function(initcap_utf8view, vec![])(args),
other => {
exec_err!("Unsupported data type {other:?} for function initcap")
exec_err!("Unsupported data type {other:?} for function `initcap`")
}
}
}
Expand All @@ -90,9 +92,8 @@ fn get_initcap_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder(
DOC_SECTION_STRING,
"Capitalizes the first character in each word in the ASCII input string. \
Words are delimited by non-alphanumeric characters.\n\n\
Note this function does not support UTF-8 characters.",
"Capitalizes the first character in each word in the input string. \
Words are delimited by non-alphanumeric characters.",
"initcap(str)",
)
.with_sql_example(
Expand Down Expand Up @@ -123,50 +124,70 @@ fn get_initcap_doc() -> &'static Documentation {
fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let string_array = as_generic_string_array::<T>(&args[0])?;

// first map is the iterator, second is for the `Option<_>`
let result = string_array
.iter()
.map(initcap_string)
.collect::<GenericStringArray<T>>();
let mut builder = GenericStringBuilder::<T>::with_capacity(
string_array.len(),
string_array.value_data().len(),
);

Ok(Arc::new(result) as ArrayRef)
string_array.iter().for_each(|str| match str {
Some(s) => {
let initcap_str = initcap_string(s);
builder.append_value(initcap_str);
}
None => builder.append_null(),
});

Ok(Arc::new(builder.finish()) as ArrayRef)
}

fn initcap_utf8view(args: &[ArrayRef]) -> Result<ArrayRef> {
let string_view_array = as_string_view_array(&args[0])?;

let result = string_view_array
.iter()
.map(initcap_string)
.collect::<StringArray>();
let mut builder = StringViewBuilder::with_capacity(string_view_array.len());

string_view_array.iter().for_each(|str| match str {
Some(s) => {
let initcap_str = initcap_string(s);
builder.append_value(initcap_str);
}
None => builder.append_null(),
});

Ok(Arc::new(result) as ArrayRef)
Ok(Arc::new(builder.finish()) as ArrayRef)
}

fn initcap_string(input: Option<&str>) -> Option<String> {
input.map(|s| {
let mut result = String::with_capacity(s.len());
let mut prev_is_alphanumeric = false;
fn initcap_string(input: &str) -> String {
let mut result = String::with_capacity(input.len());
let mut prev_is_alphanumeric = false;

for c in s.chars() {
let transformed = if prev_is_alphanumeric {
c.to_ascii_lowercase()
if input.is_ascii() {
for c in input.chars() {
if prev_is_alphanumeric {
result.push(c.to_ascii_lowercase());
} else {
c.to_ascii_uppercase()
result.push(c.to_ascii_uppercase());
};
result.push(transformed);
prev_is_alphanumeric = c.is_ascii_alphanumeric();
}
} else {
for c in input.chars() {
if prev_is_alphanumeric {
result.extend(c.to_lowercase());
} else {
result.extend(c.to_uppercase());
}
prev_is_alphanumeric = c.is_alphanumeric();
}
}

result
})
result
}

#[cfg(test)]
mod tests {
use crate::string::initcap::InitcapFunc;
use crate::unicode::initcap::InitcapFunc;
use crate::utils::test::test_function;
use arrow::array::{Array, StringArray};
use arrow::array::{Array, StringArray, StringViewArray};
use arrow::datatypes::DataType::Utf8;
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
Expand All @@ -181,6 +202,19 @@ mod tests {
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some(
"êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
.to_string()
)))],
Ok(Some(
"Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
)),
&str,
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::from(""))],
Expand All @@ -205,6 +239,7 @@ mod tests {
Utf8,
StringArray
);

test_function!(
InitcapFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
Expand All @@ -213,7 +248,7 @@ mod tests {
Ok(Some("Hi Thomas")),
&str,
Utf8,
StringArray
StringViewArray
);
test_function!(
InitcapFunc::new(),
Expand All @@ -223,7 +258,20 @@ mod tests {
Ok(Some("Hi Thomas With M0re Than 12 Chars")),
&str,
Utf8,
StringArray
StringViewArray
);
test_function!(
InitcapFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
"đẸp đẼ êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
.to_string()
)))],
Ok(Some(
"Đẹp Đẽ Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
)),
&str,
Utf8,
StringViewArray
);
test_function!(
InitcapFunc::new(),
Expand All @@ -233,15 +281,15 @@ mod tests {
Ok(Some("")),
&str,
Utf8,
StringArray
StringViewArray
);
test_function!(
InitcapFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::Utf8View(None))],
Ok(None),
&str,
Utf8,
StringArray
StringViewArray
);

Ok(())
Expand Down
7 changes: 7 additions & 0 deletions datafusion/functions/src/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use datafusion_expr::ScalarUDF;

pub mod character_length;
pub mod find_in_set;
pub mod initcap;
pub mod left;
pub mod lpad;
pub mod reverse;
Expand All @@ -36,6 +37,7 @@ pub mod translate;
// create UDFs
make_udf_function!(character_length::CharacterLengthFunc, character_length);
make_udf_function!(find_in_set::FindInSetFunc, find_in_set);
make_udf_function!(initcap::InitcapFunc, initcap);
make_udf_function!(left::LeftFunc, left);
make_udf_function!(lpad::LPadFunc, lpad);
make_udf_function!(right::RightFunc, right);
Expand Down Expand Up @@ -94,6 +96,10 @@ pub mod expr_fn {
left,
"returns the first `n` characters in the `string`",
string n
),(
initcap,
"converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase",
string
),(
find_in_set,
"Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings",
Expand Down Expand Up @@ -126,6 +132,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
vec![
character_length(),
find_in_set(),
initcap(),
left(),
lpad(),
reverse(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ Andrew Datafusion📊🔥
Xiangpeng Datafusion数据融合
Raphael Datafusionдатафусион
Under_Score Un Iść Core
Percent Pan Tadeusz Ma Iść W KąT
Percent Pan Tadeusz Ma Iść W Kąt
(empty) (empty)
(empty) (empty)
% (empty)
Expand Down
4 changes: 1 addition & 3 deletions docs/source/user-guide/sql/scalar_functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -1070,9 +1070,7 @@ find_in_set(str, strlist)

### `initcap`

Capitalizes the first character in each word in the ASCII input string. Words are delimited by non-alphanumeric characters.

Note this function does not support UTF-8 characters.
Capitalizes the first character in each word in the input string. Words are delimited by non-alphanumeric characters.

```
initcap(str)
Expand Down

0 comments on commit 0713016

Please sign in to comment.