From 5adad9a61143c010ca0632df98e3cfc397b299a5 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Thu, 8 Aug 2024 21:33:18 +0800 Subject: [PATCH] Update INITCAP scalar function to support Utf8View --- datafusion/functions/src/string/initcap.rs | 81 ++++++++++++++----- .../sqllogictest/test_files/string_view.slt | 35 ++++++++ 2 files changed, 97 insertions(+), 19 deletions(-) diff --git a/datafusion/functions/src/string/initcap.rs b/datafusion/functions/src/string/initcap.rs index 864179d130fd..5d2733c3dc56 100644 --- a/datafusion/functions/src/string/initcap.rs +++ b/datafusion/functions/src/string/initcap.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; use arrow::datatypes::DataType; -use datafusion_common::cast::as_generic_string_array; +use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::{exec_err, Result}; use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; @@ -45,7 +45,7 @@ impl InitcapFunc { Self { signature: Signature::uniform( 1, - vec![Utf8, LargeUtf8], + vec![Utf8, LargeUtf8, Utf8View], Volatility::Immutable, ), } @@ -73,6 +73,9 @@ impl ScalarUDFImpl for InitcapFunc { match args[0].data_type() { DataType::Utf8 => make_scalar_function(initcap::, vec![])(args), DataType::LargeUtf8 => make_scalar_function(initcap::, vec![])(args), + DataType::Utf8View => { + make_scalar_function(initcap_utf8view::, vec![])(args) + } other => { exec_err!("Unsupported data type {other:?} for function initcap") } @@ -88,28 +91,40 @@ fn initcap(args: &[ArrayRef]) -> Result { // first map is the iterator, second is for the `Option<_>` let result = string_array .iter() - .map(|string| { - string.map(|string: &str| { - let mut char_vector = Vec::::new(); - let mut previous_character_letter_or_number = false; - for c in string.chars() { - if previous_character_letter_or_number { - char_vector.push(c.to_ascii_lowercase()); - } else { - char_vector.push(c.to_ascii_uppercase()); - } - previous_character_letter_or_number = c.is_ascii_uppercase() - || c.is_ascii_lowercase() - || c.is_ascii_digit(); - } - char_vector.iter().collect::() - }) - }) + .map(initcap_string) .collect::>(); Ok(Arc::new(result) as ArrayRef) } +fn initcap_utf8view(args: &[ArrayRef]) -> Result { + let string_view_array = as_string_view_array(&args[0])?; + + let result = string_view_array + .iter() + .map(initcap_string) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} + +fn initcap_string(string: Option<&str>) -> Option { + string.map(|string: &str| { + let mut char_vector = Vec::::new(); + let mut previous_character_letter_or_number = false; + for c in string.chars() { + if previous_character_letter_or_number { + char_vector.push(c.to_ascii_lowercase()); + } else { + char_vector.push(c.to_ascii_uppercase()); + } + previous_character_letter_or_number = + c.is_ascii_uppercase() || c.is_ascii_lowercase() || c.is_ascii_digit(); + } + char_vector.iter().collect::() + }) +} + #[cfg(test)] mod tests { use crate::string::initcap::InitcapFunc; @@ -153,6 +168,34 @@ mod tests { Utf8, StringArray ); + test_function!( + InitcapFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + "hi THOMAS".to_string() + )))], + Ok(Some("Hi Thomas")), + &str, + Utf8, + StringArray + ); + test_function!( + InitcapFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + "".to_string() + )))], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + test_function!( + InitcapFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(None))], + Ok(None), + &str, + Utf8, + StringArray + ); Ok(()) } diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 4d3f72b1e8d4..7849db021fdb 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -425,6 +425,41 @@ logical_plan 01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4 02)--TableScan: test projection=[column1_utf8view] +### Initcap + +# Create a table with lowercase strings +statement ok +CREATE TABLE test_lowercase AS SELECT + lower(column1_utf8) as column1_utf8_lower, + lower(column1_large_utf8) as column1_large_utf8_lower, + lower(column1_utf8view) as column1_utf8view_lower +FROM test; + +# Test INITCAP with utf8view, utf8, and largeutf8 +# Should not cast anything +query TT +EXPLAIN SELECT + INITCAP(column1_utf8view_lower) as c1, + INITCAP(column1_utf8_lower) as c2, + INITCAP(column1_large_utf8_lower) as c3 +FROM test_lowercase; +---- +logical_plan +01)Projection: initcap(test_lowercase.column1_utf8view_lower) AS c1, initcap(test_lowercase.column1_utf8_lower) AS c2, initcap(test_lowercase.column1_large_utf8_lower) AS c3 +02)--TableScan: test_lowercase projection=[column1_utf8_lower, column1_large_utf8_lower, column1_utf8view_lower] + +query TTT +SELECT + INITCAP(column1_utf8view_lower) as c1, + INITCAP(column1_utf8_lower) as c2, + INITCAP(column1_large_utf8_lower) as c3 +FROM test_lowercase; +---- +Andrew Andrew Andrew +Xiangpeng Xiangpeng Xiangpeng +Raphael Raphael Raphael +NULL NULL NULL + statement ok drop table test;