-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: support Utf8View
type in starts_with
function
#11787
Changes from 11 commits
1bf07f1
2c1dda9
65ba700
f38b363
96c0552
f203553
2a15df3
221dc76
a80e1bc
f06f342
5806155
5509314
82ce1fa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,10 +18,10 @@ | |
use std::any::Any; | ||
use std::sync::Arc; | ||
|
||
use arrow::array::{ArrayRef, OffsetSizeTrait}; | ||
use arrow::array::ArrayRef; | ||
use arrow::datatypes::DataType; | ||
|
||
use datafusion_common::{cast::as_generic_string_array, internal_err, Result}; | ||
use datafusion_common::{internal_err, Result}; | ||
use datafusion_expr::ColumnarValue; | ||
use datafusion_expr::TypeSignature::*; | ||
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; | ||
|
@@ -30,12 +30,8 @@ use crate::utils::make_scalar_function; | |
|
||
/// Returns true if string starts with prefix. | ||
/// starts_with('alphabet', 'alph') = 't' | ||
pub fn starts_with<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { | ||
let left = as_generic_string_array::<T>(&args[0])?; | ||
let right = as_generic_string_array::<T>(&args[1])?; | ||
|
||
let result = arrow::compute::kernels::comparison::starts_with(left, right)?; | ||
|
||
pub fn starts_with(args: &[ArrayRef]) -> Result<ArrayRef> { | ||
let result = arrow::compute::kernels::comparison::starts_with(&args[0], &args[1])?; | ||
Ok(Arc::new(result) as ArrayRef) | ||
} | ||
|
||
|
@@ -52,14 +48,15 @@ impl Default for StartsWithFunc { | |
|
||
impl StartsWithFunc { | ||
pub fn new() -> Self { | ||
use DataType::*; | ||
Self { | ||
signature: Signature::one_of( | ||
vec![ | ||
Exact(vec![Utf8, Utf8]), | ||
Exact(vec![Utf8, LargeUtf8]), | ||
Exact(vec![LargeUtf8, Utf8]), | ||
Exact(vec![LargeUtf8, LargeUtf8]), | ||
// Planner attempts coercion to the target type starting with the most preferred candidate. | ||
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. | ||
// If that fails, it proceeds to `(Utf8, Utf8)`. | ||
Exact(vec![DataType::Utf8View, DataType::Utf8View]), | ||
tshauck marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Exact(vec![DataType::Utf8, DataType::Utf8]), | ||
Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), | ||
], | ||
Volatility::Immutable, | ||
), | ||
|
@@ -81,18 +78,73 @@ impl ScalarUDFImpl for StartsWithFunc { | |
} | ||
|
||
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { | ||
use DataType::*; | ||
|
||
Ok(Boolean) | ||
Ok(DataType::Boolean) | ||
} | ||
|
||
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { | ||
match args[0].data_type() { | ||
DataType::Utf8 => make_scalar_function(starts_with::<i32>, vec![])(args), | ||
DataType::LargeUtf8 => { | ||
return make_scalar_function(starts_with::<i64>, vec![])(args); | ||
DataType::Utf8View | DataType::Utf8 | DataType::LargeUtf8 => { | ||
make_scalar_function(starts_with, vec![])(args) | ||
} | ||
_ => internal_err!("Unsupported data type"), | ||
_ => internal_err!("Unsupported data types for starts_with. Expected Utf8, LargeUtf8 or Utf8View")?, | ||
} | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use crate::utils::test::test_function; | ||
use arrow::array::{Array, BooleanArray}; | ||
use arrow::datatypes::DataType::Boolean; | ||
use datafusion_common::{Result, ScalarValue}; | ||
use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; | ||
|
||
use super::*; | ||
|
||
#[test] | ||
fn test_functions() -> Result<()> { | ||
// Generate test cases for starts_with | ||
let test_cases = vec![ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure if this is the best way to test this change. It tests the function but doesn't a view wouldn't be coerced. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have tests for the coercion in #11753 -- you beat me to filing the ticket ❤️ -- I plan to make an epic / start filing items for the various other functions There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I recommend using a slt test (example of sql test above) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to throw it out there, if you make the top level epic, I'd be happy to work though that PR and make individual issues for the problematic functions. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Deal! I filed #11790 What I recommend doing is starting with a single ticket and making it a complete template / description with the idea being that that someone that is not familar DataFusion or Utf8View can work on it Then when we file tickets for the rest of the functions we can reuse the same description over again. This
I took a shot at updating #11786 with background for use a template. Let me know what you think There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds good to me. I'll have a go at replicating one of the other functions following #11786, and then if we're happy with it, I can grind through the others. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds good -- thank you. What I have found works well in the past is to do like 10 at a time (if you make more than that the number of parallel threads / work I found overwhelmed our ability to review them) |
||
(Some("alphabet"), Some("alph"), Some(true)), | ||
(Some("alphabet"), Some("bet"), Some(false)), | ||
( | ||
Some("somewhat large string"), | ||
Some("somewhat large"), | ||
Some(true), | ||
), | ||
(Some("somewhat large string"), Some("large"), Some(false)), | ||
] | ||
.into_iter() | ||
.flat_map(|(a, b, c)| { | ||
let utf_8_args = vec![ | ||
ColumnarValue::Scalar(ScalarValue::Utf8(a.map(|s| s.to_string()))), | ||
ColumnarValue::Scalar(ScalarValue::Utf8(b.map(|s| s.to_string()))), | ||
]; | ||
|
||
let large_utf_8_args = vec![ | ||
ColumnarValue::Scalar(ScalarValue::LargeUtf8(a.map(|s| s.to_string()))), | ||
ColumnarValue::Scalar(ScalarValue::LargeUtf8(b.map(|s| s.to_string()))), | ||
]; | ||
|
||
let utf_8_view_args = vec![ | ||
ColumnarValue::Scalar(ScalarValue::Utf8View(a.map(|s| s.to_string()))), | ||
ColumnarValue::Scalar(ScalarValue::Utf8View(b.map(|s| s.to_string()))), | ||
]; | ||
|
||
vec![(utf_8_args, c), (large_utf_8_args, c), (utf_8_view_args, c)] | ||
}); | ||
|
||
for (args, expected) in test_cases { | ||
test_function!( | ||
StartsWithFunc::new(), | ||
&args, | ||
Ok(expected), | ||
bool, | ||
Boolean, | ||
BooleanArray | ||
); | ||
} | ||
|
||
Ok(()) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍