From b4a51f97f697c3caf39f748cf1d8736164e44fc0 Mon Sep 17 00:00:00 2001 From: Chojan Shang Date: Sat, 10 Aug 2024 08:59:56 +0800 Subject: [PATCH 1/3] Implement native support StringView for Levenshtein Signed-off-by: Chojan Shang --- .../functions/src/string/levenshtein.rs | 23 ++++++++++++++++--- .../sqllogictest/test_files/string_view.slt | 6 ++--- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/datafusion/functions/src/string/levenshtein.rs b/datafusion/functions/src/string/levenshtein.rs index 3edf6de8c863..ab5005b06f5f 100644 --- a/datafusion/functions/src/string/levenshtein.rs +++ b/datafusion/functions/src/string/levenshtein.rs @@ -42,10 +42,13 @@ impl Default for LevenshteinFunc { impl LevenshteinFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])], + vec![ + Exact(vec![DataType::Utf8View, DataType::Utf8View]), + Exact(vec![DataType::Utf8, DataType::Utf8]), + Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), + ], Volatility::Immutable, ), } @@ -71,6 +74,7 @@ impl ScalarUDFImpl for LevenshteinFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { match args[0].data_type() { + DataType::Utf8View => make_scalar_function(levenshtein::<i32>, vec![])(args), DataType::Utf8 => make_scalar_function(levenshtein::<i32>, vec![])(args), DataType::LargeUtf8 => make_scalar_function(levenshtein::<i64>, vec![])(args), other => { @@ -92,6 +96,19 @@ pub fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { let str1_array = as_generic_string_array::<T>(&args[0])?; let str2_array = as_generic_string_array::<T>(&args[1])?; match args[0].data_type() { + DataType::Utf8View => { + let result = str1_array + .iter() + .zip(str2_array.iter()) + .map(|(string1, string2)| match (string1, string2) { + (Some(string1), Some(string2)) => { + Some(datafusion_strsim::levenshtein(string1, string2) as i32) + } + _ => None, + }) + .collect::<Int32Array>(); + Ok(Arc::new(result) as ArrayRef) + } DataType::Utf8 => {
let result = str1_array .iter() @@ -120,7 +137,7 @@ pub fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { } other => { exec_err!( - "levenshtein was called with {other} datatype arguments. It requires Utf8 or LargeUtf8." + "levenshtein was called with {other} datatype arguments. It requires Utf8View, Utf8 or LargeUtf8." ) } } diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index e7166690580f..a06148095ac6 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -599,7 +599,6 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for LEVENSHTEIN -## TODO https://github.com/apache/datafusion/issues/11854 query TT EXPLAIN SELECT levenshtein(column1_utf8view, 'foo') as c1, @@ -607,9 +606,8 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: levenshtein(__common_expr_1, Utf8("foo")) AS c1, levenshtein(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c2 -02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view -03)----TableScan: test projection=[column1_utf8view, column2_utf8view] +01)Projection: levenshtein(test.column1_utf8view, Utf8View("foo")) AS c1, levenshtein(test.column1_utf8view, test.column2_utf8view) AS c2 +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for LOWER ## TODO https://github.com/apache/datafusion/issues/11855 From 8a14b4083a3311c90d6d4c493a9ba226c81ebd15 Mon Sep 17 00:00:00 2001 From: Chojan Shang Date: Sat, 10 Aug 2024 09:44:57 +0800 Subject: [PATCH 2/3] Remove useless code Signed-off-by: Chojan Shang --- datafusion/functions/src/string/levenshtein.rs | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/datafusion/functions/src/string/levenshtein.rs b/datafusion/functions/src/string/levenshtein.rs index ab5005b06f5f..632956622961 100644 ---
a/datafusion/functions/src/string/levenshtein.rs +++ b/datafusion/functions/src/string/levenshtein.rs @@ -74,8 +74,7 @@ impl ScalarUDFImpl for LevenshteinFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { match args[0].data_type() { - DataType::Utf8View => make_scalar_function(levenshtein::<i32>, vec![])(args), - DataType::Utf8 => make_scalar_function(levenshtein::<i32>, vec![])(args), + DataType::Utf8View | DataType::Utf8 => make_scalar_function(levenshtein::<i32>, vec![])(args), DataType::LargeUtf8 => make_scalar_function(levenshtein::<i64>, vec![])(args), other => { exec_err!("Unsupported data type {other:?} for function levenshtein") @@ -96,20 +95,7 @@ pub fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { let str1_array = as_generic_string_array::<T>(&args[0])?; let str2_array = as_generic_string_array::<T>(&args[1])?; match args[0].data_type() { - DataType::Utf8View => { - let result = str1_array - .iter() - .zip(str2_array.iter()) - .map(|(string1, string2)| match (string1, string2) { - (Some(string1), Some(string2)) => { - Some(datafusion_strsim::levenshtein(string1, string2) as i32) - } - _ => None, - }) - .collect::<Int32Array>(); - Ok(Arc::new(result) as ArrayRef) - } - DataType::Utf8 => { + DataType::Utf8View | DataType::Utf8 => { let result = str1_array .iter() .zip(str2_array.iter()) From 222e6443a2f150e63fc4d5ac90f88bcc908733eb Mon Sep 17 00:00:00 2001 From: Chojan Shang Date: Sat, 10 Aug 2024 10:44:43 +0800 Subject: [PATCH 3/3] Minor fix Signed-off-by: Chojan Shang --- .../functions/src/string/levenshtein.rs | 30 +++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/datafusion/functions/src/string/levenshtein.rs b/datafusion/functions/src/string/levenshtein.rs index 632956622961..430c402a50c5 100644 --- a/datafusion/functions/src/string/levenshtein.rs +++ b/datafusion/functions/src/string/levenshtein.rs @@ -22,7 +22,7 @@ use arrow::array::{ArrayRef, Int32Array, Int64Array, OffsetSizeTrait}; use arrow::datatypes::DataType; use
crate::utils::{make_scalar_function, utf8_to_int_type}; -use datafusion_common::cast::as_generic_string_array; +use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::utils::datafusion_strsim; use datafusion_common::{exec_err, Result}; use datafusion_expr::ColumnarValue; @@ -74,7 +74,9 @@ impl ScalarUDFImpl for LevenshteinFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { match args[0].data_type() { - DataType::Utf8View | DataType::Utf8 => make_scalar_function(levenshtein::<i32>, vec![])(args), + DataType::Utf8View | DataType::Utf8 => { + make_scalar_function(levenshtein::<i32>, vec![])(args) + } DataType::LargeUtf8 => make_scalar_function(levenshtein::<i64>, vec![])(args), other => { exec_err!("Unsupported data type {other:?} for function levenshtein") @@ -92,10 +94,26 @@ pub fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { args.len() ); } - let str1_array = as_generic_string_array::<T>(&args[0])?; - let str2_array = as_generic_string_array::<T>(&args[1])?; + match args[0].data_type() { - DataType::Utf8View | DataType::Utf8 => { + DataType::Utf8View => { + let str1_array = as_string_view_array(&args[0])?; + let str2_array = as_string_view_array(&args[1])?; + let result = str1_array + .iter() + .zip(str2_array.iter()) + .map(|(string1, string2)| match (string1, string2) { + (Some(string1), Some(string2)) => { + Some(datafusion_strsim::levenshtein(string1, string2) as i32) + } + _ => None, + }) + .collect::<Int32Array>(); + Ok(Arc::new(result) as ArrayRef) + } + DataType::Utf8 => { + let str1_array = as_generic_string_array::<T>(&args[0])?; + let str2_array = as_generic_string_array::<T>(&args[1])?; let result = str1_array .iter() .zip(str2_array.iter()) @@ -109,6 +127,8 @@ pub fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { Ok(Arc::new(result) as ArrayRef) } DataType::LargeUtf8 => { + let str1_array = as_generic_string_array::<T>(&args[0])?; + let str2_array = as_generic_string_array::<T>(&args[1])?; let result = str1_array .iter() .zip(str2_array.iter())