From ba959bb11cb86d27650a82560b3a200782e17453 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 31 Dec 2024 16:54:36 +0200 Subject: [PATCH 1/9] add binary support in arrow-string --- arrow-array/src/array/byte_view_array.rs | 15 + .../src/array/fixed_size_binary_array.rs | 11 + arrow-array/src/array/mod.rs | 46 + arrow-string/src/like.rs | 1880 ++++++++++++++++- arrow-string/src/predicate.rs | 526 ++++- 5 files changed, 2333 insertions(+), 145 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 9d2d396a5266..6d4f5f3ea8e3 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -754,6 +754,21 @@ where pub type BinaryViewArray = GenericByteViewArray; impl BinaryViewArray { + + /// Returns true if all data within this array is ASCII + pub fn is_ascii(&self) -> bool { + // Alternative (but incorrect): directly check the underlying buffers + // (1) Our binary view might be sparse, i.e., a subset of the buffers, + // so even if the buffer is not ascii, we can still be ascii. + // (2) It is quite difficult to know the range of each buffer (unlike BinaryArray) + // This means that this operation is quite expensive, shall we cache the result? + // i.e. track `is_ascii` in the builder. + self.iter().all(|v| match v { + Some(v) => v.is_ascii(), + None => true, + }) + } + /// Convert the [`BinaryViewArray`] to [`StringViewArray`] /// If items not utf8 data, validate will fail and error returned. 
pub fn to_string_view(self) -> Result { diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 576b8012491b..a65a4d391364 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -59,6 +59,17 @@ pub struct FixedSizeBinaryArray { } impl FixedSizeBinaryArray { + + /// Returns true if all data within this array is ASCII + pub fn is_ascii(&self) -> bool { + // TODO - check if we can do similar to BinaryArray + // as this is expensive + self.iter().all(|v| match v { + Some(v) => v.is_ascii(), + None => true, + }) + } + /// Create a new [`FixedSizeBinaryArray`] with `size` element size, panicking on failure /// /// # Panics diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 23b3cb628aaf..f2d06448b617 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -620,6 +620,52 @@ impl<'a> StringArrayType<'a> for &'a StringViewArray { } } +/// A trait for Arrow Binary Arrays, currently the following types are supported: +/// - `BinaryArray` +/// - `LargeBinaryArray` +/// - `BinaryViewArray` +/// - `FixedSizeBinaryArray` +/// +/// This trait helps to abstract over the different types of binary arrays +/// so that we don't need to duplicate the implementation for each type. 
+pub trait BinaryArrayType<'a>: ArrayAccessor + Sized { + /// Returns true if all data within this binary array is ASCII + fn is_ascii(&self) -> bool; + + /// Constructs a new iterator + fn iter(&self) -> ArrayIter; +} + +impl<'a, O: OffsetSizeTrait> BinaryArrayType<'a> for &'a GenericBinaryArray { + fn is_ascii(&self) -> bool { + GenericBinaryArray::::is_ascii(self) + } + + fn iter(&self) -> ArrayIter { + GenericBinaryArray::::iter(self) + } +} + +impl<'a> BinaryArrayType<'a> for &'a BinaryViewArray { + fn is_ascii(&self) -> bool { + BinaryViewArray::is_ascii(self) + } + + fn iter(&self) -> ArrayIter { + BinaryViewArray::iter(self) + } +} + +impl<'a> BinaryArrayType<'a> for &'a FixedSizeBinaryArray { + fn is_ascii(&self) -> bool { + FixedSizeBinaryArray::is_ascii(self) + } + + fn iter(&self) -> ArrayIter { + FixedSizeBinaryArray::iter(self) + } +} + impl PartialEq for dyn Array + '_ { fn eq(&self, other: &Self) -> bool { self.to_data().eq(&other.to_data()) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index e30e09146c6d..02132ef08003 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -17,7 +17,7 @@ //! 
Provide SQL's LIKE operators for Arrow's string arrays -use crate::predicate::Predicate; +use crate::predicate::{BinaryPredicate, Predicate, PredicateImpl}; use arrow_array::cast::AsArray; use arrow_array::*; @@ -27,6 +27,7 @@ use arrow_select::take::take; use std::sync::Arc; pub use arrow_array::StringArrayType; +use arrow_schema::DataType::{LargeUtf8, Utf8, Utf8View}; #[derive(Debug)] enum Op { @@ -133,12 +134,12 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result { - apply::<&GenericStringArray>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) + apply::>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v, |a| a.is_ascii(), |a| a.as_bytes()) } (LargeUtf8, LargeUtf8) => { - apply::<&GenericStringArray>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) + apply::>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v, |a| a.is_ascii(), |a| a.as_bytes()) } - (Utf8View, Utf8View) => apply::<&StringViewArray>( + (Utf8View, Utf8View) => apply::( op, l.as_string_view(), l_s, @@ -146,6 +147,25 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result { + apply::>(op, l.as_binary(), l_s, l_v, r.as_binary(), r_s, r_v, |a| a.is_ascii(), |a| a) + } + (LargeBinary, LargeBinary) => { + apply::>(op, l.as_binary(), l_s, l_v, r.as_binary(), r_s, r_v, |a| a.is_ascii(), |a| a) + } + (BinaryView, BinaryView) => apply::( + op, + l.as_binary_view(), + l_s, + l_v, + r.as_binary_view(), + r_s, + r_v, + |a| a.is_ascii(), + |a| a ), (l_t, r_t) => Err(ArrowError::InvalidArgumentError(format!( "Invalid string operation: {l_t} {op} {r_t}" @@ -153,7 +173,7 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result + 'a>( +fn apply<'a, Predicate: PredicateImpl<'a> + 'a, T: ArrayAccessor + IntoIterator> + Sized + 'a>( op: Op, l: T, l_s: bool, @@ -161,6 +181,8 @@ fn apply<'a, T: StringArrayType<'a> + 'a>( r: T, r_s: bool, r_v: Option<&'a dyn AnyDictionaryArray>, + is_ascii: impl Fn(&T) -> bool, + as_bytes: impl Fn(&'a Predicate::UnsizedItem) -> &[u8], ) -> 
Result { let l_len = l_v.map(|l| l.len()).unwrap_or(l.len()); if r_s { @@ -172,50 +194,51 @@ fn apply<'a, T: StringArrayType<'a> + 'a>( if r.is_null(idx) { return Ok(BooleanArray::new_null(l_len)); } - op_scalar::(op, l, l_v, r.value(idx)) + op_scalar::(op, l, l_v, r.value(idx), is_ascii) } else { match (l_s, l_v, r_v) { (true, None, None) => { let v = l.is_valid(0).then(|| l.value(0)); - op_binary(op, std::iter::repeat(v), r.iter()) + op_binary::(op, std::iter::repeat(v), r.into_iter(), as_bytes) } (true, Some(l_v), None) => { let idx = l_v.is_valid(0).then(|| l_v.normalized_keys()[0]); let v = idx.and_then(|idx| l.is_valid(idx).then(|| l.value(idx))); - op_binary(op, std::iter::repeat(v), r.iter()) + op_binary::(op, std::iter::repeat(v), r.into_iter(), as_bytes) } (true, None, Some(r_v)) => { let v = l.is_valid(0).then(|| l.value(0)); - op_binary(op, std::iter::repeat(v), vectored_iter(r, r_v)) + op_binary::(op, std::iter::repeat(v), vectored_iter::(r, r_v), as_bytes) } (true, Some(l_v), Some(r_v)) => { let idx = l_v.is_valid(0).then(|| l_v.normalized_keys()[0]); let v = idx.and_then(|idx| l.is_valid(idx).then(|| l.value(idx))); - op_binary(op, std::iter::repeat(v), vectored_iter(r, r_v)) + op_binary::(op, std::iter::repeat(v), vectored_iter::(r, r_v), as_bytes) } - (false, None, None) => op_binary(op, l.iter(), r.iter()), - (false, Some(l_v), None) => op_binary(op, vectored_iter(l, l_v), r.iter()), - (false, None, Some(r_v)) => op_binary(op, l.iter(), vectored_iter(r, r_v)), + (false, None, None) => op_binary::(op, l.into_iter(), r.into_iter(), as_bytes), + (false, Some(l_v), None) => op_binary::(op, vectored_iter::(l, l_v), r.into_iter(), as_bytes), + (false, None, Some(r_v)) => op_binary::(op, l.into_iter(), vectored_iter::(r, r_v), as_bytes), (false, Some(l_v), Some(r_v)) => { - op_binary(op, vectored_iter(l, l_v), vectored_iter(r, r_v)) + op_binary::(op, vectored_iter::(l, l_v), vectored_iter::(r, r_v), as_bytes) } } } } #[inline(never)] -fn op_scalar<'a, T: 
StringArrayType<'a>>( +fn op_scalar<'a, Predicate: PredicateImpl<'a>, T: ArrayAccessor>( op: Op, l: T, l_v: Option<&dyn AnyDictionaryArray>, - r: &str, + r: &'a Predicate::UnsizedItem, + is_ascii: impl Fn(&T) -> bool, ) -> Result { let r = match op { Op::Like(neg) => Predicate::like(r)?.evaluate_array(l, neg), - Op::ILike(neg) => Predicate::ilike(r, l.is_ascii())?.evaluate_array(l, neg), + Op::ILike(neg) => Predicate::ilike(r, is_ascii(&l))?.evaluate_array(l, neg), Op::Contains => Predicate::contains(r).evaluate_array(l, false), - Op::StartsWith => Predicate::StartsWith(r).evaluate_array(l, false), - Op::EndsWith => Predicate::EndsWith(r).evaluate_array(l, false), + Op::StartsWith => Predicate::starts_with(r).evaluate_array(l, false), + Op::EndsWith => Predicate::ends_with(r).evaluate_array(l, false), }; Ok(match l_v { @@ -224,10 +247,10 @@ fn op_scalar<'a, T: StringArrayType<'a>>( }) } -fn vectored_iter<'a, T: StringArrayType<'a> + 'a>( +fn vectored_iter<'a, Predicate: PredicateImpl<'a> + 'a, T: ArrayAccessor + Sized + 'a>( a: T, a_v: &'a dyn AnyDictionaryArray, -) -> impl Iterator> + 'a { +) -> impl Iterator> + 'a { let nulls = a_v.nulls(); let keys = a_v.normalized_keys(); keys.into_iter().enumerate().map(move |(idx, key)| { @@ -239,35 +262,36 @@ fn vectored_iter<'a, T: StringArrayType<'a> + 'a>( } #[inline(never)] -fn op_binary<'a>( +fn op_binary<'a, Predicate: PredicateImpl<'a> + 'a>( op: Op, - l: impl Iterator>, - r: impl Iterator>, + l: impl Iterator>, + r: impl Iterator>, + as_bytes: impl Fn(&'a Predicate::UnsizedItem) -> &[u8], ) -> Result { match op { Op::Like(neg) => binary_predicate(l, r, neg, Predicate::like), Op::ILike(neg) => binary_predicate(l, r, neg, |s| Predicate::ilike(s, false)), - Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(str_contains(l?, r?))).collect()), + Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(str_contains(as_bytes(l?), as_bytes(r?)))).collect()), Op::StartsWith => Ok(l .zip(r) - .map(|(l, r)| 
Some(Predicate::StartsWith(r?).evaluate(l?))) + .map(|(l, r)| Some(Predicate::starts_with(r?).evaluate(l?))) .collect()), Op::EndsWith => Ok(l .zip(r) - .map(|(l, r)| Some(Predicate::EndsWith(r?).evaluate(l?))) + .map(|(l, r)| Some(Predicate::ends_with(r?).evaluate(l?))) .collect()), } } -fn str_contains(haystack: &str, needle: &str) -> bool { - memchr::memmem::find(haystack.as_bytes(), needle.as_bytes()).is_some() +fn str_contains(haystack: &[u8], needle: &[u8]) -> bool { + memchr::memmem::find(haystack, needle).is_some() } -fn binary_predicate<'a>( - l: impl Iterator>, - r: impl Iterator>, +fn binary_predicate<'a, Predicate: PredicateImpl<'a> + 'a>( + l: impl Iterator>, + r: impl Iterator>, neg: bool, - f: impl Fn(&'a str) -> Result, ArrowError>, + f: impl Fn(&'a Predicate::UnsizedItem) -> Result, ) -> Result { let mut previous = None; l.zip(r) @@ -275,7 +299,7 @@ fn binary_predicate<'a>( (Some(l), Some(r)) => { let p: &Predicate = match previous { Some((expr, ref predicate)) if expr == r => predicate, - _ => &previous.insert((r, f(r)?)).1, + _ => &previous.insert((r, f(&r)?)).1, }; Ok(Some(p.evaluate(l) != neg)) } @@ -291,6 +315,8 @@ fn make_scalar(data_type: &DataType, scalar: &str) -> Result Ok(Arc::new(StringArray::from_iter_values([scalar]))), DataType::LargeUtf8 => Ok(Arc::new(LargeStringArray::from_iter_values([scalar]))), DataType::Dictionary(_, v) => make_scalar(v.as_ref(), scalar), + DataType::Binary => Ok(Arc::new(BinaryArray::from_iter_values([scalar.as_bytes()]))), + DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter_values([scalar.as_bytes()]))), d => Err(ArrowError::InvalidArgumentError(format!( "Unsupported string scalar data type {d:?}", ))), @@ -398,8 +424,35 @@ legacy_kernels!( #[allow(deprecated)] mod tests { use super::*; - use arrow_array::types::Int8Type; + use arrow_array::types::{ArrowDictionaryKeyType, Int8Type}; use std::iter::zip; + use arrow_array::builder::BinaryDictionaryBuilder; + + + fn 
convert_binary_iterator_to_binary_dictionary<'a, K: ArrowDictionaryKeyType, I: IntoIterator>(iter: I) -> DictionaryArray { + let it = iter.into_iter(); + let (lower, _) = it.size_hint(); + let mut builder = BinaryDictionaryBuilder::with_capacity(lower, 256, 1024); + it.for_each(|i| { + builder + .append(i) + .expect("Unable to append a value to a dictionary array."); + }); + + builder.finish() + } + + fn convert_string_iterator_to_binary_dictionary<'a, K: ArrowDictionaryKeyType, I: IntoIterator>>(iter: I) -> DictionaryArray { + let it = iter.into_iter(); + let (lower, _) = it.size_hint(); + let mut builder = BinaryDictionaryBuilder::with_capacity(lower, 256, 1024); + it.for_each(|i| { + builder + .append_option(i.map(|i| i.as_bytes())) + }); + + builder.finish() + } /// Applying `op(left, right)`, both sides are arrays /// The macro tests four types of array implementations: @@ -432,6 +485,25 @@ mod tests { let right: DictionaryArray = $right.into_iter().collect(); let res = $op(&left, &right).unwrap(); assert_eq!(res, expected); + + let left_binary = $left.iter().map(|x| x.as_bytes()).collect::>(); + let right_binary = $right.iter().map(|x| x.as_bytes()).collect::>(); + let expected = BooleanArray::from($expected); + + let left = BinaryArray::from(left_binary.clone()); + let right = BinaryArray::from(right_binary.clone()); + let res = $op(&left, &right).unwrap(); + assert_eq!(res, expected); + + let left = LargeBinaryArray::from(left_binary.clone()); + let right = LargeBinaryArray::from(right_binary.clone()); + let res = $op(&left, &right).unwrap(); + assert_eq!(res, expected); + + let left: DictionaryArray = convert_binary_iterator_to_binary_dictionary(left_binary); + let right: DictionaryArray = convert_binary_iterator_to_binary_dictionary(right_binary); + let res = $op(&left, &right).unwrap(); + assert_eq!(res, expected); } }; } @@ -467,10 +539,45 @@ mod tests { let right: DictionaryArray = [$right].into_iter().collect(); let res = $op(&left, 
&Scalar::new(&right)).unwrap(); assert_eq!(res, expected); + + let left_binary = $left.iter().map(|x| x.as_bytes()).collect::>(); + let right_binary = $right.as_bytes(); + let expected = BooleanArray::from($expected); + + let left = BinaryArray::from(left_binary.clone()); + let right = BinaryArray::from_iter_values([right_binary]); + let res = $op(&left, &Scalar::new(&right)).unwrap(); + assert_eq!(res, expected); + + let left = LargeBinaryArray::from(left_binary.clone()); + let right = LargeBinaryArray::from_iter_values([right_binary]); + let res = $op(&left, &Scalar::new(&right)).unwrap(); + assert_eq!(res, expected); + + let left: DictionaryArray = convert_binary_iterator_to_binary_dictionary(left_binary); + let right: DictionaryArray = convert_binary_iterator_to_binary_dictionary([right_binary]); + let res = $op(&left, &Scalar::new(&right)).unwrap(); + assert_eq!(res, expected); } }; } + + trait IntoBinaryScalar { + fn into_binary_scalar(self) -> Scalar; + fn into_large_binary_scalar(self) -> Scalar; + } + + impl IntoBinaryScalar for &str { + fn into_binary_scalar(self) -> Scalar { + Scalar::new(make_scalar(&DataType::Binary, self).unwrap()) + } + + fn into_large_binary_scalar(self) -> Scalar { + Scalar::new(make_scalar(&DataType::LargeBinary, self).unwrap()) + } + } + test_utf8!( test_utf8_array_like, vec![ @@ -2887,6 +2994,1709 @@ mod tests { } } + + #[test] + fn test_binary_dict_like_kernels() { + let data = vec![ + Some("Earth"), + Some("Fire"), + Some("Water"), + Some("Air"), + None, + Some("Air"), + Some("bbbbb\nAir"), + ]; + + let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data); + + assert_eq!( + like(&dict_array, &"Air".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true), + Some(false), + ]), + ); + + assert_eq!( + like(&dict_array, &"Air".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + 
Some(false), + Some(true), + None, + Some(true), + Some(false), + ]), + ); + + assert_eq!( + like(&dict_array, &"Wa%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + like(&dict_array, &"Wa%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + like(&dict_array, &"%r".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + like(&dict_array, &"%r".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + like(&dict_array, &"%i%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + like(&dict_array, &"%i%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + like(&dict_array, &"%a%r%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + like(&dict_array, &"%a%r%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + } + + #[test] + fn test_binary_dict_nlike_kernels() { + let data = vec![ + Some("Earth"), + Some("Fire"), + Some("Water"), + Some("Air"), + None, + Some("Air"), + Some("bbbbb\nAir"), + ]; + + let dict_array: DictionaryArray = 
convert_string_iterator_to_binary_dictionary(data); + + assert_eq!( + nlike(&dict_array, &"Air".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false), + Some(true), + ]), + ); + + assert_eq!( + nlike(&dict_array, &"Air".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false), + Some(true), + ]), + ); + + assert_eq!( + nlike(&dict_array, &"Wa%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + nlike(&dict_array, &"Wa%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + nlike(&dict_array, &"%r".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + nlike(&dict_array, &"%r".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + nlike(&dict_array, &"%i%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + nlike(&dict_array, &"%i%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + nlike(&dict_array, &"%a%r%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + nlike(&dict_array, 
&"%a%r%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + } + + #[test] + fn test_binary_dict_ilike_kernels() { + let data = vec![ + Some("Earth"), + Some("Fire"), + Some("Water"), + Some("Air"), + None, + Some("Air"), + Some("bbbbb\nAir"), + ]; + + let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data); + + assert_eq!( + ilike(&dict_array, &"air".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true), + Some(false), + ]), + ); + + assert_eq!( + ilike(&dict_array, &"air".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true), + Some(false), + ]), + ); + + assert_eq!( + ilike(&dict_array, &"wa%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + ilike(&dict_array, &"wa%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + ilike(&dict_array, &"%R".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + ilike(&dict_array, &"%R".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + ilike(&dict_array, &"%I%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + ilike(&dict_array, &"%I%".into_binary_scalar()).unwrap(), + 
BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + ilike(&dict_array, &"%A%r%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + ilike(&dict_array, &"%A%r%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + } + + #[test] + fn test_binary_dict_nilike_kernels() { + let data = vec![ + Some("Earth"), + Some("Fire"), + Some("Water"), + Some("Air"), + None, + Some("Air"), + Some("bbbbb\nAir"), + ]; + + let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data); + + assert_eq!( + nilike(&dict_array, &"air".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false), + Some(true), + ]), + ); + + assert_eq!( + nilike(&dict_array, &"air".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false), + Some(true), + ]), + ); + + assert_eq!( + nilike(&dict_array, &"wa%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + nilike(&dict_array, &"wa%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + nilike(&dict_array, &"%R".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + nilike(&dict_array, &"%R".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + 
Some(true), + Some(false), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + nilike(&dict_array, &"%I%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + nilike(&dict_array, &"%I%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + nilike(&dict_array, &"%A%r%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + nilike(&dict_array, &"%A%r%".into_binary_scalar()).unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + } + + #[test] + fn binary_null_like_pattern() { + // Different patterns have different execution code paths + for pattern in &[ + "", // can execute as equality check + "_", // can execute as length check + "%", // can execute as starts_with("") or non-null check + "a%", // can execute as starts_with("a") + "%a", // can execute as ends_with("") + "a%b", // can execute as starts_with("a") && ends_with("b") + "%a%", // can_execute as contains("a") + "%a%b_c_d%e", // can_execute as regular expression + ] { + // These tests focus on the null handling, but are case-insensitive + for like_f in [like, ilike, nlike, nilike] { + let a = Scalar::new(BinaryArray::new_null(1)); + let b = BinaryArray::new_scalar(pattern); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = Scalar::new(BinaryArray::new_null(1)); + let b = BinaryArray::from_iter_values([pattern]); + let r = like_f(&a, &b).unwrap(); + 
assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = BinaryArray::new_null(1); + let b = BinaryArray::from_iter_values([pattern]); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = BinaryArray::new_null(1); + let b = BinaryArray::new_scalar(pattern); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + } + } + } + + #[test] + fn binary_like_scalar_null() { + for like_f in [like, ilike, nlike, nilike] { + let a = BinaryArray::new_scalar("a"); + let b = Scalar::new(BinaryArray::new_null(1)); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = BinaryArray::from_iter_values(["a"]); + let b = Scalar::new(BinaryArray::new_null(1)); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = BinaryArray::from_iter_values(["a"]); + let b = BinaryArray::new_null(1); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = BinaryArray::new_scalar("a"); + let b = BinaryArray::new_null(1); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + } + } + + #[test] + fn binary_like_escape() { + // (value, pattern, expected) + let test_cases = vec![ + // Empty pattern + (r"", r"", true), + (r"\", r"", false), + // Sole (dangling) escape (some engines consider this invalid pattern) + (r"", r"\", false), + (r"\", r"\", true), + (r"\\", r"\", false), + (r"a", r"\", false), + (r"\a", 
r"\", false), + (r"\\a", r"\", false), + // Sole escape + (r"", r"\\", false), + (r"\", r"\\", true), + (r"\\", r"\\", false), + (r"a", r"\\", false), + (r"\a", r"\\", false), + (r"\\a", r"\\", false), + // Sole escape and dangling escape + (r"", r"\\\", false), + (r"\", r"\\\", false), + (r"\\", r"\\\", true), + (r"\\\", r"\\\", false), + (r"\\\\", r"\\\", false), + (r"a", r"\\\", false), + (r"\a", r"\\\", false), + (r"\\a", r"\\\", false), + // Sole two escapes + (r"", r"\\\\", false), + (r"\", r"\\\\", false), + (r"\\", r"\\\\", true), + (r"\\\", r"\\\\", false), + (r"\\\\", r"\\\\", false), + (r"\\\\\", r"\\\\", false), + (r"a", r"\\\\", false), + (r"\a", r"\\\\", false), + (r"\\a", r"\\\\", false), + // Escaped non-wildcard + (r"", r"\a", false), + (r"\", r"\a", false), + (r"\\", r"\a", false), + (r"a", r"\a", true), + (r"\a", r"\a", false), + (r"\\a", r"\a", false), + // Escaped _ wildcard + (r"", r"\_", false), + (r"\", r"\_", false), + (r"\\", r"\_", false), + (r"a", r"\_", false), + (r"_", r"\_", true), + (r"%", r"\_", false), + (r"\a", r"\_", false), + (r"\\a", r"\_", false), + (r"\_", r"\_", false), + (r"\\_", r"\_", false), + // Escaped % wildcard + (r"", r"\%", false), + (r"\", r"\%", false), + (r"\\", r"\%", false), + (r"a", r"\%", false), + (r"_", r"\%", false), + (r"%", r"\%", true), + (r"\a", r"\%", false), + (r"\\a", r"\%", false), + (r"\%", r"\%", false), + (r"\\%", r"\%", false), + // Escape and non-wildcard + (r"", r"\\a", false), + (r"\", r"\\a", false), + (r"\\", r"\\a", false), + (r"a", r"\\a", false), + (r"\a", r"\\a", true), + (r"\\a", r"\\a", false), + (r"\\\a", r"\\a", false), + // Escape and _ wildcard + (r"", r"\\_", false), + (r"\", r"\\_", false), + (r"\\", r"\\_", true), + (r"a", r"\\_", false), + (r"_", r"\\_", false), + (r"%", r"\\_", false), + (r"\a", r"\\_", true), + (r"\\a", r"\\_", false), + (r"\_", r"\\_", true), + (r"\\_", r"\\_", false), + (r"\\\_", r"\\_", false), + // Escape and % wildcard + (r"", r"\\%", false), + (r"\", 
r"\\%", true), + (r"\\", r"\\%", true), + (r"a", r"\\%", false), + (r"ab", r"\\%", false), + (r"a%", r"\\%", false), + (r"_", r"\\%", false), + (r"%", r"\\%", false), + (r"\a", r"\\%", true), + (r"\\a", r"\\%", true), + (r"\%", r"\\%", true), + (r"\\%", r"\\%", true), + (r"\\\%", r"\\%", true), + // %... pattern with dangling wildcard + (r"\", r"%\", true), + (r"\\", r"%\", true), + (r"%\", r"%\", true), + (r"%\\", r"%\", true), + (r"abc\", r"%\", true), + (r"abc", r"%\", false), + // %... pattern with wildcard + (r"\", r"%\\", true), + (r"\\", r"%\\", true), + (r"%\\", r"%\\", true), + (r"%\\\", r"%\\", true), + (r"abc\", r"%\\", true), + (r"abc", r"%\\", false), + // %... pattern including escaped non-wildcard + (r"ac", r"%a\c", true), + (r"xyzac", r"%a\c", true), + (r"abc", r"%a\c", false), + (r"a\c", r"%a\c", false), + (r"%a\c", r"%a\c", false), + // %... pattern including escape + (r"\", r"%a\\c", false), + (r"\\", r"%a\\c", false), + (r"ac", r"%a\\c", false), + (r"a\c", r"%a\\c", true), + (r"a\\c", r"%a\\c", false), + (r"abc", r"%a\\c", false), + (r"xyza\c", r"%a\\c", true), + (r"xyza\\c", r"%a\\c", false), + (r"%a\\c", r"%a\\c", false), + // ...% pattern with wildcard + (r"\", r"\\%", true), + (r"\\", r"\\%", true), + (r"\\%", r"\\%", true), + (r"\\\%", r"\\%", true), + (r"\abc", r"\\%", true), + (r"a", r"\\%", false), + (r"abc", r"\\%", false), + // ...% pattern including escaped non-wildcard + (r"ac", r"a\c%", true), + (r"acxyz", r"a\c%", true), + (r"abc", r"a\c%", false), + (r"a\c", r"a\c%", false), + (r"a\c%", r"a\c%", false), + (r"a\\c%", r"a\c%", false), + // ...% pattern including escape + (r"ac", r"a\\c%", false), + (r"a\c", r"a\\c%", true), + (r"a\cxyz", r"a\\c%", true), + (r"a\\c", r"a\\c%", false), + (r"a\\cxyz", r"a\\c%", false), + (r"abc", r"a\\c%", false), + (r"abcxyz", r"a\\c%", false), + (r"a\\c%", r"a\\c%", false), + // %...% pattern including escaped non-wildcard + (r"ac", r"%a\c%", true), + (r"xyzacxyz", r"%a\c%", true), + (r"abc", 
r"%a\c%", false), + (r"a\c", r"%a\c%", false), + (r"xyza\cxyz", r"%a\c%", false), + (r"%a\c%", r"%a\c%", false), + (r"%a\\c%", r"%a\c%", false), + // %...% pattern including escape + (r"ac", r"%a\\c%", false), + (r"a\c", r"%a\\c%", true), + (r"xyza\cxyz", r"%a\\c%", true), + (r"a\\c", r"%a\\c%", false), + (r"xyza\\cxyz", r"%a\\c%", false), + (r"abc", r"%a\\c%", false), + (r"xyzabcxyz", r"%a\\c%", false), + (r"%a\\c%", r"%a\\c%", false), + // Odd (7) backslashes and % wildcard + (r"\\%", r"\\\\\\\%", false), + (r"\\\", r"\\\\\\\%", false), + (r"\\\%", r"\\\\\\\%", true), + (r"\\\\", r"\\\\\\\%", false), + (r"\\\\%", r"\\\\\\\%", false), + (r"\\\\\\\%", r"\\\\\\\%", false), + // Odd (7) backslashes and _ wildcard + (r"\\\", r"\\\\\\\_", false), + (r"\\\\", r"\\\\\\\_", false), + (r"\\\_", r"\\\\\\\_", true), + (r"\\\\", r"\\\\\\\_", false), + (r"\\\a", r"\\\\\\\_", false), + (r"\\\\_", r"\\\\\\\_", false), + (r"\\\\\\\_", r"\\\\\\\_", false), + // Even (8) backslashes and % wildcard + (r"\\\", r"\\\\\\\\%", false), + (r"\\\\", r"\\\\\\\\%", true), + (r"\\\\\", r"\\\\\\\\%", true), + (r"\\\\xyz", r"\\\\\\\\%", true), + (r"\\\\\\\\%", r"\\\\\\\\%", true), + // Even (8) backslashes and _ wildcard + (r"\\\", r"\\\\\\\\_", false), + (r"\\\\", r"\\\\\\\\_", false), + (r"\\\\\", r"\\\\\\\\_", true), + (r"\\\\a", r"\\\\\\\\_", true), + (r"\\\\\a", r"\\\\\\\\_", false), + (r"\\\\ab", r"\\\\\\\\_", false), + (r"\\\\\\\\_", r"\\\\\\\\_", false), + ]; + + for (value, pattern, expected) in test_cases { + let unexpected = BooleanArray::from(vec![!expected]); + let expected = BooleanArray::from(vec![expected]); + + for binary_type in [DataType::Binary, DataType::LargeBinary] { + for ((value_datum, value_type), (pattern_datum, pattern_type)) in zip( + make_binary_datums(value.as_bytes(), &binary_type), + make_binary_datums(pattern.as_bytes(), &binary_type), + ) { + let value_datum = value_datum.as_ref(); + let pattern_datum = pattern_datum.as_ref(); + assert_eq!( + like(value_datum, 
pattern_datum).unwrap(), + expected, + "{value_type:?} «{value}» like {pattern_type:?} «{pattern}»" + ); + assert_eq!( + ilike(value_datum, pattern_datum).unwrap(), + expected, + "{value_type:?} «{value}» ilike {pattern_type:?} «{pattern}»" + ); + assert_eq!( + nlike(value_datum, pattern_datum).unwrap(), + unexpected, + "{value_type:?} «{value}» nlike {pattern_type:?} «{pattern}»" + ); + assert_eq!( + nilike(value_datum, pattern_datum).unwrap(), + unexpected, + "{value_type:?} «{value}» nilike {pattern_type:?} «{pattern}»" + ); + } + } + } + } + + #[test] + fn binary_like_escape_many() { + // (value, pattern, expected) + let test_cases = vec![ + (r"", r"", true), + (r"\", r"", false), + (r"\\", r"", false), + (r"\\\", r"", false), + (r"\\\\", r"", false), + (r"a", r"", false), + (r"\a", r"", false), + (r"\\a", r"", false), + (r"%", r"", false), + (r"\%", r"", false), + (r"\\%", r"", false), + (r"%%", r"", false), + (r"\%%", r"", false), + (r"\\%%", r"", false), + (r"_", r"", false), + (r"\_", r"", false), + (r"\\_", r"", false), + (r"__", r"", false), + (r"\__", r"", false), + (r"\\__", r"", false), + (r"abc", r"", false), + (r"a_c", r"", false), + (r"a\bc", r"", false), + (r"a\_c", r"", false), + (r"%abc", r"", false), + (r"\%abc", r"", false), + (r"a\\_c%", r"", false), + (r"", r"\", false), + (r"\", r"\", true), + (r"\\", r"\", false), + (r"\\\", r"\", false), + (r"\\\\", r"\", false), + (r"a", r"\", false), + (r"\a", r"\", false), + (r"\\a", r"\", false), + (r"%", r"\", false), + (r"\%", r"\", false), + (r"\\%", r"\", false), + (r"%%", r"\", false), + (r"\%%", r"\", false), + (r"\\%%", r"\", false), + (r"_", r"\", false), + (r"\_", r"\", false), + (r"\\_", r"\", false), + (r"__", r"\", false), + (r"\__", r"\", false), + (r"\\__", r"\", false), + (r"abc", r"\", false), + (r"a_c", r"\", false), + (r"a\bc", r"\", false), + (r"a\_c", r"\", false), + (r"%abc", r"\", false), + (r"\%abc", r"\", false), + (r"a\\_c%", r"\", false), + (r"", r"\\", false), + (r"\", r"\\", 
true), + (r"\\", r"\\", false), + (r"\\\", r"\\", false), + (r"\\\\", r"\\", false), + (r"a", r"\\", false), + (r"\a", r"\\", false), + (r"\\a", r"\\", false), + (r"%", r"\\", false), + (r"\%", r"\\", false), + (r"\\%", r"\\", false), + (r"%%", r"\\", false), + (r"\%%", r"\\", false), + (r"\\%%", r"\\", false), + (r"_", r"\\", false), + (r"\_", r"\\", false), + (r"\\_", r"\\", false), + (r"__", r"\\", false), + (r"\__", r"\\", false), + (r"\\__", r"\\", false), + (r"abc", r"\\", false), + (r"a_c", r"\\", false), + (r"a\bc", r"\\", false), + (r"a\_c", r"\\", false), + (r"%abc", r"\\", false), + (r"\%abc", r"\\", false), + (r"a\\_c%", r"\\", false), + (r"", r"\\\", false), + (r"\", r"\\\", false), + (r"\\", r"\\\", true), + (r"\\\", r"\\\", false), + (r"\\\\", r"\\\", false), + (r"a", r"\\\", false), + (r"\a", r"\\\", false), + (r"\\a", r"\\\", false), + (r"%", r"\\\", false), + (r"\%", r"\\\", false), + (r"\\%", r"\\\", false), + (r"%%", r"\\\", false), + (r"\%%", r"\\\", false), + (r"\\%%", r"\\\", false), + (r"_", r"\\\", false), + (r"\_", r"\\\", false), + (r"\\_", r"\\\", false), + (r"__", r"\\\", false), + (r"\__", r"\\\", false), + (r"\\__", r"\\\", false), + (r"abc", r"\\\", false), + (r"a_c", r"\\\", false), + (r"a\bc", r"\\\", false), + (r"a\_c", r"\\\", false), + (r"%abc", r"\\\", false), + (r"\%abc", r"\\\", false), + (r"a\\_c%", r"\\\", false), + (r"", r"\\\\", false), + (r"\", r"\\\\", false), + (r"\\", r"\\\\", true), + (r"\\\", r"\\\\", false), + (r"\\\\", r"\\\\", false), + (r"a", r"\\\\", false), + (r"\a", r"\\\\", false), + (r"\\a", r"\\\\", false), + (r"%", r"\\\\", false), + (r"\%", r"\\\\", false), + (r"\\%", r"\\\\", false), + (r"%%", r"\\\\", false), + (r"\%%", r"\\\\", false), + (r"\\%%", r"\\\\", false), + (r"_", r"\\\\", false), + (r"\_", r"\\\\", false), + (r"\\_", r"\\\\", false), + (r"__", r"\\\\", false), + (r"\__", r"\\\\", false), + (r"\\__", r"\\\\", false), + (r"abc", r"\\\\", false), + (r"a_c", r"\\\\", false), + (r"a\bc", r"\\\\", 
false), + (r"a\_c", r"\\\\", false), + (r"%abc", r"\\\\", false), + (r"\%abc", r"\\\\", false), + (r"a\\_c%", r"\\\\", false), + (r"", r"a", false), + (r"\", r"a", false), + (r"\\", r"a", false), + (r"\\\", r"a", false), + (r"\\\\", r"a", false), + (r"a", r"a", true), + (r"\a", r"a", false), + (r"\\a", r"a", false), + (r"%", r"a", false), + (r"\%", r"a", false), + (r"\\%", r"a", false), + (r"%%", r"a", false), + (r"\%%", r"a", false), + (r"\\%%", r"a", false), + (r"_", r"a", false), + (r"\_", r"a", false), + (r"\\_", r"a", false), + (r"__", r"a", false), + (r"\__", r"a", false), + (r"\\__", r"a", false), + (r"abc", r"a", false), + (r"a_c", r"a", false), + (r"a\bc", r"a", false), + (r"a\_c", r"a", false), + (r"%abc", r"a", false), + (r"\%abc", r"a", false), + (r"a\\_c%", r"a", false), + (r"", r"\a", false), + (r"\", r"\a", false), + (r"\\", r"\a", false), + (r"\\\", r"\a", false), + (r"\\\\", r"\a", false), + (r"a", r"\a", true), + (r"\a", r"\a", false), + (r"\\a", r"\a", false), + (r"%", r"\a", false), + (r"\%", r"\a", false), + (r"\\%", r"\a", false), + (r"%%", r"\a", false), + (r"\%%", r"\a", false), + (r"\\%%", r"\a", false), + (r"_", r"\a", false), + (r"\_", r"\a", false), + (r"\\_", r"\a", false), + (r"__", r"\a", false), + (r"\__", r"\a", false), + (r"\\__", r"\a", false), + (r"abc", r"\a", false), + (r"a_c", r"\a", false), + (r"a\bc", r"\a", false), + (r"a\_c", r"\a", false), + (r"%abc", r"\a", false), + (r"\%abc", r"\a", false), + (r"a\\_c%", r"\a", false), + (r"", r"\\a", false), + (r"\", r"\\a", false), + (r"\\", r"\\a", false), + (r"\\\", r"\\a", false), + (r"\\\\", r"\\a", false), + (r"a", r"\\a", false), + (r"\a", r"\\a", true), + (r"\\a", r"\\a", false), + (r"%", r"\\a", false), + (r"\%", r"\\a", false), + (r"\\%", r"\\a", false), + (r"%%", r"\\a", false), + (r"\%%", r"\\a", false), + (r"\\%%", r"\\a", false), + (r"_", r"\\a", false), + (r"\_", r"\\a", false), + (r"\\_", r"\\a", false), + (r"__", r"\\a", false), + (r"\__", r"\\a", false), + (r"\\__", 
r"\\a", false), + (r"abc", r"\\a", false), + (r"a_c", r"\\a", false), + (r"a\bc", r"\\a", false), + (r"a\_c", r"\\a", false), + (r"%abc", r"\\a", false), + (r"\%abc", r"\\a", false), + (r"a\\_c%", r"\\a", false), + (r"", r"%", true), + (r"\", r"%", true), + (r"\\", r"%", true), + (r"\\\", r"%", true), + (r"\\\\", r"%", true), + (r"a", r"%", true), + (r"\a", r"%", true), + (r"\\a", r"%", true), + (r"%", r"%", true), + (r"\%", r"%", true), + (r"\\%", r"%", true), + (r"%%", r"%", true), + (r"\%%", r"%", true), + (r"\\%%", r"%", true), + (r"_", r"%", true), + (r"\_", r"%", true), + (r"\\_", r"%", true), + (r"__", r"%", true), + (r"\__", r"%", true), + (r"\\__", r"%", true), + (r"abc", r"%", true), + (r"a_c", r"%", true), + (r"a\bc", r"%", true), + (r"a\_c", r"%", true), + (r"%abc", r"%", true), + (r"\%abc", r"%", true), + (r"a\\_c%", r"%", true), + (r"", r"\%", false), + (r"\", r"\%", false), + (r"\\", r"\%", false), + (r"\\\", r"\%", false), + (r"\\\\", r"\%", false), + (r"a", r"\%", false), + (r"\a", r"\%", false), + (r"\\a", r"\%", false), + (r"%", r"\%", true), + (r"\%", r"\%", false), + (r"\\%", r"\%", false), + (r"%%", r"\%", false), + (r"\%%", r"\%", false), + (r"\\%%", r"\%", false), + (r"_", r"\%", false), + (r"\_", r"\%", false), + (r"\\_", r"\%", false), + (r"__", r"\%", false), + (r"\__", r"\%", false), + (r"\\__", r"\%", false), + (r"abc", r"\%", false), + (r"a_c", r"\%", false), + (r"a\bc", r"\%", false), + (r"a\_c", r"\%", false), + (r"%abc", r"\%", false), + (r"\%abc", r"\%", false), + (r"a\\_c%", r"\%", false), + (r"", r"\\%", false), + (r"\", r"\\%", true), + (r"\\", r"\\%", true), + (r"\\\", r"\\%", true), + (r"\\\\", r"\\%", true), + (r"a", r"\\%", false), + (r"\a", r"\\%", true), + (r"\\a", r"\\%", true), + (r"%", r"\\%", false), + (r"\%", r"\\%", true), + (r"\\%", r"\\%", true), + (r"%%", r"\\%", false), + (r"\%%", r"\\%", true), + (r"\\%%", r"\\%", true), + (r"_", r"\\%", false), + (r"\_", r"\\%", true), + (r"\\_", r"\\%", true), + (r"__", 
r"\\%", false), + (r"\__", r"\\%", true), + (r"\\__", r"\\%", true), + (r"abc", r"\\%", false), + (r"a_c", r"\\%", false), + (r"a\bc", r"\\%", false), + (r"a\_c", r"\\%", false), + (r"%abc", r"\\%", false), + (r"\%abc", r"\\%", true), + (r"a\\_c%", r"\\%", false), + (r"", r"%%", true), + (r"\", r"%%", true), + (r"\\", r"%%", true), + (r"\\\", r"%%", true), + (r"\\\\", r"%%", true), + (r"a", r"%%", true), + (r"\a", r"%%", true), + (r"\\a", r"%%", true), + (r"%", r"%%", true), + (r"\%", r"%%", true), + (r"\\%", r"%%", true), + (r"%%", r"%%", true), + (r"\%%", r"%%", true), + (r"\\%%", r"%%", true), + (r"_", r"%%", true), + (r"\_", r"%%", true), + (r"\\_", r"%%", true), + (r"__", r"%%", true), + (r"\__", r"%%", true), + (r"\\__", r"%%", true), + (r"abc", r"%%", true), + (r"a_c", r"%%", true), + (r"a\bc", r"%%", true), + (r"a\_c", r"%%", true), + (r"%abc", r"%%", true), + (r"\%abc", r"%%", true), + (r"a\\_c%", r"%%", true), + (r"", r"\%%", false), + (r"\", r"\%%", false), + (r"\\", r"\%%", false), + (r"\\\", r"\%%", false), + (r"\\\\", r"\%%", false), + (r"a", r"\%%", false), + (r"\a", r"\%%", false), + (r"\\a", r"\%%", false), + (r"%", r"\%%", true), + (r"\%", r"\%%", false), + (r"\\%", r"\%%", false), + (r"%%", r"\%%", true), + (r"\%%", r"\%%", false), + (r"\\%%", r"\%%", false), + (r"_", r"\%%", false), + (r"\_", r"\%%", false), + (r"\\_", r"\%%", false), + (r"__", r"\%%", false), + (r"\__", r"\%%", false), + (r"\\__", r"\%%", false), + (r"abc", r"\%%", false), + (r"a_c", r"\%%", false), + (r"a\bc", r"\%%", false), + (r"a\_c", r"\%%", false), + (r"%abc", r"\%%", true), + (r"\%abc", r"\%%", false), + (r"a\\_c%", r"\%%", false), + (r"", r"\\%%", false), + (r"\", r"\\%%", true), + (r"\\", r"\\%%", true), + (r"\\\", r"\\%%", true), + (r"\\\\", r"\\%%", true), + (r"a", r"\\%%", false), + (r"\a", r"\\%%", true), + (r"\\a", r"\\%%", true), + (r"%", r"\\%%", false), + (r"\%", r"\\%%", true), + (r"\\%", r"\\%%", true), + (r"%%", r"\\%%", false), + (r"\%%", r"\\%%", true), + 
(r"\\%%", r"\\%%", true), + (r"_", r"\\%%", false), + (r"\_", r"\\%%", true), + (r"\\_", r"\\%%", true), + (r"__", r"\\%%", false), + (r"\__", r"\\%%", true), + (r"\\__", r"\\%%", true), + (r"abc", r"\\%%", false), + (r"a_c", r"\\%%", false), + (r"a\bc", r"\\%%", false), + (r"a\_c", r"\\%%", false), + (r"%abc", r"\\%%", false), + (r"\%abc", r"\\%%", true), + (r"a\\_c%", r"\\%%", false), + (r"", r"_", false), + (r"\", r"_", true), + (r"\\", r"_", false), + (r"\\\", r"_", false), + (r"\\\\", r"_", false), + (r"a", r"_", true), + (r"\a", r"_", false), + (r"\\a", r"_", false), + (r"%", r"_", true), + (r"\%", r"_", false), + (r"\\%", r"_", false), + (r"%%", r"_", false), + (r"\%%", r"_", false), + (r"\\%%", r"_", false), + (r"_", r"_", true), + (r"\_", r"_", false), + (r"\\_", r"_", false), + (r"__", r"_", false), + (r"\__", r"_", false), + (r"\\__", r"_", false), + (r"abc", r"_", false), + (r"a_c", r"_", false), + (r"a\bc", r"_", false), + (r"a\_c", r"_", false), + (r"%abc", r"_", false), + (r"\%abc", r"_", false), + (r"a\\_c%", r"_", false), + (r"", r"\_", false), + (r"\", r"\_", false), + (r"\\", r"\_", false), + (r"\\\", r"\_", false), + (r"\\\\", r"\_", false), + (r"a", r"\_", false), + (r"\a", r"\_", false), + (r"\\a", r"\_", false), + (r"%", r"\_", false), + (r"\%", r"\_", false), + (r"\\%", r"\_", false), + (r"%%", r"\_", false), + (r"\%%", r"\_", false), + (r"\\%%", r"\_", false), + (r"_", r"\_", true), + (r"\_", r"\_", false), + (r"\\_", r"\_", false), + (r"__", r"\_", false), + (r"\__", r"\_", false), + (r"\\__", r"\_", false), + (r"abc", r"\_", false), + (r"a_c", r"\_", false), + (r"a\bc", r"\_", false), + (r"a\_c", r"\_", false), + (r"%abc", r"\_", false), + (r"\%abc", r"\_", false), + (r"a\\_c%", r"\_", false), + (r"", r"\\_", false), + (r"\", r"\\_", false), + (r"\\", r"\\_", true), + (r"\\\", r"\\_", false), + (r"\\\\", r"\\_", false), + (r"a", r"\\_", false), + (r"\a", r"\\_", true), + (r"\\a", r"\\_", false), + (r"%", r"\\_", false), + (r"\%", r"\\_", 
true), + (r"\\%", r"\\_", false), + (r"%%", r"\\_", false), + (r"\%%", r"\\_", false), + (r"\\%%", r"\\_", false), + (r"_", r"\\_", false), + (r"\_", r"\\_", true), + (r"\\_", r"\\_", false), + (r"__", r"\\_", false), + (r"\__", r"\\_", false), + (r"\\__", r"\\_", false), + (r"abc", r"\\_", false), + (r"a_c", r"\\_", false), + (r"a\bc", r"\\_", false), + (r"a\_c", r"\\_", false), + (r"%abc", r"\\_", false), + (r"\%abc", r"\\_", false), + (r"a\\_c%", r"\\_", false), + (r"", r"__", false), + (r"\", r"__", false), + (r"\\", r"__", true), + (r"\\\", r"__", false), + (r"\\\\", r"__", false), + (r"a", r"__", false), + (r"\a", r"__", true), + (r"\\a", r"__", false), + (r"%", r"__", false), + (r"\%", r"__", true), + (r"\\%", r"__", false), + (r"%%", r"__", true), + (r"\%%", r"__", false), + (r"\\%%", r"__", false), + (r"_", r"__", false), + (r"\_", r"__", true), + (r"\\_", r"__", false), + (r"__", r"__", true), + (r"\__", r"__", false), + (r"\\__", r"__", false), + (r"abc", r"__", false), + (r"a_c", r"__", false), + (r"a\bc", r"__", false), + (r"a\_c", r"__", false), + (r"%abc", r"__", false), + (r"\%abc", r"__", false), + (r"a\\_c%", r"__", false), + (r"", r"\__", false), + (r"\", r"\__", false), + (r"\\", r"\__", false), + (r"\\\", r"\__", false), + (r"\\\\", r"\__", false), + (r"a", r"\__", false), + (r"\a", r"\__", false), + (r"\\a", r"\__", false), + (r"%", r"\__", false), + (r"\%", r"\__", false), + (r"\\%", r"\__", false), + (r"%%", r"\__", false), + (r"\%%", r"\__", false), + (r"\\%%", r"\__", false), + (r"_", r"\__", false), + (r"\_", r"\__", false), + (r"\\_", r"\__", false), + (r"__", r"\__", true), + (r"\__", r"\__", false), + (r"\\__", r"\__", false), + (r"abc", r"\__", false), + (r"a_c", r"\__", false), + (r"a\bc", r"\__", false), + (r"a\_c", r"\__", false), + (r"%abc", r"\__", false), + (r"\%abc", r"\__", false), + (r"a\\_c%", r"\__", false), + (r"", r"\\__", false), + (r"\", r"\\__", false), + (r"\\", r"\\__", false), + (r"\\\", r"\\__", true), + (r"\\\\", 
r"\\__", false), + (r"a", r"\\__", false), + (r"\a", r"\\__", false), + (r"\\a", r"\\__", true), + (r"%", r"\\__", false), + (r"\%", r"\\__", false), + (r"\\%", r"\\__", true), + (r"%%", r"\\__", false), + (r"\%%", r"\\__", true), + (r"\\%%", r"\\__", false), + (r"_", r"\\__", false), + (r"\_", r"\\__", false), + (r"\\_", r"\\__", true), + (r"__", r"\\__", false), + (r"\__", r"\\__", true), + (r"\\__", r"\\__", false), + (r"abc", r"\\__", false), + (r"a_c", r"\\__", false), + (r"a\bc", r"\\__", false), + (r"a\_c", r"\\__", false), + (r"%abc", r"\\__", false), + (r"\%abc", r"\\__", false), + (r"a\\_c%", r"\\__", false), + (r"", r"abc", false), + (r"\", r"abc", false), + (r"\\", r"abc", false), + (r"\\\", r"abc", false), + (r"\\\\", r"abc", false), + (r"a", r"abc", false), + (r"\a", r"abc", false), + (r"\\a", r"abc", false), + (r"%", r"abc", false), + (r"\%", r"abc", false), + (r"\\%", r"abc", false), + (r"%%", r"abc", false), + (r"\%%", r"abc", false), + (r"\\%%", r"abc", false), + (r"_", r"abc", false), + (r"\_", r"abc", false), + (r"\\_", r"abc", false), + (r"__", r"abc", false), + (r"\__", r"abc", false), + (r"\\__", r"abc", false), + (r"abc", r"abc", true), + (r"a_c", r"abc", false), + (r"a\bc", r"abc", false), + (r"a\_c", r"abc", false), + (r"%abc", r"abc", false), + (r"\%abc", r"abc", false), + (r"a\\_c%", r"abc", false), + (r"", r"a_c", false), + (r"\", r"a_c", false), + (r"\\", r"a_c", false), + (r"\\\", r"a_c", false), + (r"\\\\", r"a_c", false), + (r"a", r"a_c", false), + (r"\a", r"a_c", false), + (r"\\a", r"a_c", false), + (r"%", r"a_c", false), + (r"\%", r"a_c", false), + (r"\\%", r"a_c", false), + (r"%%", r"a_c", false), + (r"\%%", r"a_c", false), + (r"\\%%", r"a_c", false), + (r"_", r"a_c", false), + (r"\_", r"a_c", false), + (r"\\_", r"a_c", false), + (r"__", r"a_c", false), + (r"\__", r"a_c", false), + (r"\\__", r"a_c", false), + (r"abc", r"a_c", true), + (r"a_c", r"a_c", true), + (r"a\bc", r"a_c", false), + (r"a\_c", r"a_c", false), + (r"%abc", 
r"a_c", false), + (r"\%abc", r"a_c", false), + (r"a\\_c%", r"a_c", false), + (r"", r"a\bc", false), + (r"\", r"a\bc", false), + (r"\\", r"a\bc", false), + (r"\\\", r"a\bc", false), + (r"\\\\", r"a\bc", false), + (r"a", r"a\bc", false), + (r"\a", r"a\bc", false), + (r"\\a", r"a\bc", false), + (r"%", r"a\bc", false), + (r"\%", r"a\bc", false), + (r"\\%", r"a\bc", false), + (r"%%", r"a\bc", false), + (r"\%%", r"a\bc", false), + (r"\\%%", r"a\bc", false), + (r"_", r"a\bc", false), + (r"\_", r"a\bc", false), + (r"\\_", r"a\bc", false), + (r"__", r"a\bc", false), + (r"\__", r"a\bc", false), + (r"\\__", r"a\bc", false), + (r"abc", r"a\bc", true), + (r"a_c", r"a\bc", false), + (r"a\bc", r"a\bc", false), + (r"a\_c", r"a\bc", false), + (r"%abc", r"a\bc", false), + (r"\%abc", r"a\bc", false), + (r"a\\_c%", r"a\bc", false), + (r"", r"a\_c", false), + (r"\", r"a\_c", false), + (r"\\", r"a\_c", false), + (r"\\\", r"a\_c", false), + (r"\\\\", r"a\_c", false), + (r"a", r"a\_c", false), + (r"\a", r"a\_c", false), + (r"\\a", r"a\_c", false), + (r"%", r"a\_c", false), + (r"\%", r"a\_c", false), + (r"\\%", r"a\_c", false), + (r"%%", r"a\_c", false), + (r"\%%", r"a\_c", false), + (r"\\%%", r"a\_c", false), + (r"_", r"a\_c", false), + (r"\_", r"a\_c", false), + (r"\\_", r"a\_c", false), + (r"__", r"a\_c", false), + (r"\__", r"a\_c", false), + (r"\\__", r"a\_c", false), + (r"abc", r"a\_c", false), + (r"a_c", r"a\_c", true), + (r"a\bc", r"a\_c", false), + (r"a\_c", r"a\_c", false), + (r"%abc", r"a\_c", false), + (r"\%abc", r"a\_c", false), + (r"a\\_c%", r"a\_c", false), + (r"", r"%abc", false), + (r"\", r"%abc", false), + (r"\\", r"%abc", false), + (r"\\\", r"%abc", false), + (r"\\\\", r"%abc", false), + (r"a", r"%abc", false), + (r"\a", r"%abc", false), + (r"\\a", r"%abc", false), + (r"%", r"%abc", false), + (r"\%", r"%abc", false), + (r"\\%", r"%abc", false), + (r"%%", r"%abc", false), + (r"\%%", r"%abc", false), + (r"\\%%", r"%abc", false), + (r"_", r"%abc", false), + (r"\_", r"%abc", 
false), + (r"\\_", r"%abc", false), + (r"__", r"%abc", false), + (r"\__", r"%abc", false), + (r"\\__", r"%abc", false), + (r"abc", r"%abc", true), + (r"a_c", r"%abc", false), + (r"a\bc", r"%abc", false), + (r"a\_c", r"%abc", false), + (r"%abc", r"%abc", true), + (r"\%abc", r"%abc", true), + (r"a\\_c%", r"%abc", false), + (r"", r"\%abc", false), + (r"\", r"\%abc", false), + (r"\\", r"\%abc", false), + (r"\\\", r"\%abc", false), + (r"\\\\", r"\%abc", false), + (r"a", r"\%abc", false), + (r"\a", r"\%abc", false), + (r"\\a", r"\%abc", false), + (r"%", r"\%abc", false), + (r"\%", r"\%abc", false), + (r"\\%", r"\%abc", false), + (r"%%", r"\%abc", false), + (r"\%%", r"\%abc", false), + (r"\\%%", r"\%abc", false), + (r"_", r"\%abc", false), + (r"\_", r"\%abc", false), + (r"\\_", r"\%abc", false), + (r"__", r"\%abc", false), + (r"\__", r"\%abc", false), + (r"\\__", r"\%abc", false), + (r"abc", r"\%abc", false), + (r"a_c", r"\%abc", false), + (r"a\bc", r"\%abc", false), + (r"a\_c", r"\%abc", false), + (r"%abc", r"\%abc", true), + (r"\%abc", r"\%abc", false), + (r"a\\_c%", r"\%abc", false), + (r"", r"a\\_c%", false), + (r"\", r"a\\_c%", false), + (r"\\", r"a\\_c%", false), + (r"\\\", r"a\\_c%", false), + (r"\\\\", r"a\\_c%", false), + (r"a", r"a\\_c%", false), + (r"\a", r"a\\_c%", false), + (r"\\a", r"a\\_c%", false), + (r"%", r"a\\_c%", false), + (r"\%", r"a\\_c%", false), + (r"\\%", r"a\\_c%", false), + (r"%%", r"a\\_c%", false), + (r"\%%", r"a\\_c%", false), + (r"\\%%", r"a\\_c%", false), + (r"_", r"a\\_c%", false), + (r"\_", r"a\\_c%", false), + (r"\\_", r"a\\_c%", false), + (r"__", r"a\\_c%", false), + (r"\__", r"a\\_c%", false), + (r"\\__", r"a\\_c%", false), + (r"abc", r"a\\_c%", false), + (r"a_c", r"a\\_c%", false), + (r"a\bc", r"a\\_c%", true), + (r"a\_c", r"a\\_c%", true), + (r"%abc", r"a\\_c%", false), + (r"\%abc", r"a\\_c%", false), + (r"a\\_c%", r"a\\_c%", false), + ]; + + let values = test_cases + .iter() + .map(|(value, _, _)| *value) + .collect::>(); + let 
patterns = test_cases + .iter() + .map(|(_, pattern, _)| *pattern) + .collect::>(); + let expected = BooleanArray::from( + test_cases + .iter() + .map(|(_, _, expected)| *expected) + .collect::>(), + ); + let unexpected = BooleanArray::from( + test_cases + .iter() + .map(|(_, _, expected)| !*expected) + .collect::>(), + ); + + for string_type in [DataType::Binary, DataType::LargeBinary] { + let values = make_binary_array(values.iter(), &string_type); + let patterns = make_binary_array(patterns.iter(), &string_type); + let (values, patterns) = (values.as_ref(), patterns.as_ref()); + + assert_eq!(like(&values, &patterns).unwrap(), expected,); + assert_eq!(ilike(&values, &patterns).unwrap(), expected,); + assert_eq!(nlike(&values, &patterns).unwrap(), unexpected,); + assert_eq!(nilike(&values, &patterns).unwrap(), unexpected,); + } + } + + fn make_binary_datums( + value: impl AsRef<[u8]>, + data_type: &DataType, + ) -> Vec<(Box, DatumType)> { + match data_type { + DataType::Binary => { + let array = BinaryArray::from_iter_values([value]); + vec![ + (Box::new(array.clone()), DatumType::Array), + (Box::new(Scalar::new(array)), DatumType::Scalar), + ] + } + DataType::LargeBinary => { + let array = LargeBinaryArray::from_iter_values([value]); + vec![ + (Box::new(array.clone()), DatumType::Array), + (Box::new(Scalar::new(array)), DatumType::Scalar), + ] + } + _ => unimplemented!(), + } + } + + fn make_binary_array( + values: impl IntoIterator>, + data_type: &DataType, + ) -> Box { + match data_type { + DataType::Binary => Box::new(BinaryArray::from_iter_values(values)), + DataType::LargeBinary => Box::new(LargeBinaryArray::from_iter_values(values)), + _ => unimplemented!("Unsupported data type: {:?}", data_type), + } + } + #[derive(Debug)] enum DatumType { Array, diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 8bbfe65bab44..034175b43de1 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -15,12 +15,13 @@ // 
specific language governing permissions and limitations // under the License. -use arrow_array::{Array, ArrayAccessor, BooleanArray, StringViewArray}; + +use arrow_array::{Array, ArrayAccessor, BinaryViewArray, BooleanArray, StringViewArray}; use arrow_buffer::BooleanBuffer; use arrow_schema::ArrowError; use memchr::memchr3; use memchr::memmem::Finder; -use regex::{Regex, RegexBuilder}; +use regex::{Regex, RegexBuilder, bytes::Regex as BinaryRegex, bytes::RegexBuilder as BinaryRegexBuilder}; use std::iter::zip; /// A string based predicate @@ -40,59 +41,127 @@ pub enum Predicate<'a> { Regex(Regex), } -impl<'a> Predicate<'a> { +/// A byte-slice (binary) based predicate +pub enum BinaryPredicate<'a> { + Eq(&'a [u8]), + Contains(Finder<'a>), + StartsWith(&'a [u8]), + EndsWith(&'a [u8]), + + /// Equality ignoring ASCII case + IEqAscii(&'a [u8]), + /// Starts with ignoring ASCII case + IStartsWithAscii(&'a [u8]), + /// Ends with ignoring ASCII case + IEndsWithAscii(&'a [u8]), + + Regex(BinaryRegex), +} + +pub trait PredicateImpl<'a>: Sized { + + type UnsizedItem: ?Sized + PartialEq; + type RegexType; + /// Create a predicate for the given like pattern - pub fn like(pattern: &'a str) -> Result { - if !contains_like_pattern(pattern) { + fn like(pattern: &'a Self::UnsizedItem) -> Result; + + fn contains(needle: &'a Self::UnsizedItem) -> Self; + + /// Create a predicate for the given ilike pattern + fn ilike(pattern: &'a Self::UnsizedItem, is_ascii: bool) -> Result; + + /// Create a predicate for the given starts_with pattern + fn starts_with(pattern: &'a Self::UnsizedItem) -> Self; + + /// Create a predicate for the given ends_with pattern + fn ends_with(pattern: &'a Self::UnsizedItem) -> Self; + + /// Evaluate this predicate against the given haystack + fn evaluate(&self, haystack: &'a Self::UnsizedItem) -> bool; + + /// Evaluate this predicate against the elements of `array` + /// + /// If `negate` is true the result of the predicate will be negated + #[inline(never)] + fn
evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray + where + T: ArrayAccessor, Self::UnsizedItem: 'i; + + /// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: + /// + /// 1. Replace `LIKE` multi-character wildcards `%` => `.*` (unless they're at the start or end of the pattern, + /// where the regex is just truncated - e.g. `%foo%` => `foo` rather than `^.*foo.*$`) + /// 2. Replace `LIKE` single-character wildcards `_` => `.` + /// 3. Escape regex meta characters to match them and not be evaluated as regex special chars. e.g. `.` => `\\.` + /// 4. Replace escaped `LIKE` wildcards removing the escape characters to be able to match it as a regex. e.g. `\\%` => `%` + fn regex_like(pattern: &'a Self::UnsizedItem, case_insensitive: bool) -> Result; +} + +impl<'a> PredicateImpl<'a> for Predicate<'a> { + type UnsizedItem = str; + type RegexType = Regex; + + /// Create a predicate for the given like pattern + fn like(pattern: &'a str) -> Result { + if !contains_like_pattern(pattern.as_bytes()) { Ok(Self::Eq(pattern)) - } else if pattern.ends_with('%') && !contains_like_pattern(&pattern[..pattern.len() - 1]) { + } else if pattern.ends_with('%') && !contains_like_pattern(&pattern[..pattern.len() - 1].as_bytes()) { Ok(Self::StartsWith(&pattern[..pattern.len() - 1])) - } else if pattern.starts_with('%') && !contains_like_pattern(&pattern[1..]) { + } else if pattern.starts_with('%') && !contains_like_pattern(&pattern[1..].as_bytes()) { Ok(Self::EndsWith(&pattern[1..])) } else if pattern.starts_with('%') && pattern.ends_with('%') - && !contains_like_pattern(&pattern[1..pattern.len() - 1]) + && !contains_like_pattern(&pattern[1..pattern.len() - 1].as_bytes()) { Ok(Self::contains(&pattern[1..pattern.len() - 1])) } else { - Ok(Self::Regex(regex_like(pattern, false)?)) + Ok(Self::Regex(Self::regex_like(pattern, false)?)) } } - pub fn contains(needle: &'a str) -> Self { + fn contains(needle: &'a str) -> Self { 
Self::Contains(Finder::new(needle.as_bytes())) } /// Create a predicate for the given ilike pattern - pub fn ilike(pattern: &'a str, is_ascii: bool) -> Result { + fn ilike(pattern: &'a str, is_ascii: bool) -> Result { if is_ascii && pattern.is_ascii() { - if !contains_like_pattern(pattern) { + if !contains_like_pattern(pattern.as_bytes()) { return Ok(Self::IEqAscii(pattern)); } else if pattern.ends_with('%') && !pattern.ends_with("\\%") - && !contains_like_pattern(&pattern[..pattern.len() - 1]) + && !contains_like_pattern(&pattern[..pattern.len() - 1].as_bytes()) { return Ok(Self::IStartsWithAscii(&pattern[..pattern.len() - 1])); - } else if pattern.starts_with('%') && !contains_like_pattern(&pattern[1..]) { + } else if pattern.starts_with('%') && !contains_like_pattern(&pattern[1..].as_bytes()) { return Ok(Self::IEndsWithAscii(&pattern[1..])); } } - Ok(Self::Regex(regex_like(pattern, true)?)) + Ok(Self::Regex(Self::regex_like(pattern, true)?)) + } + + fn starts_with(pattern: &'a str) -> Self { + Self::StartsWith(pattern) + } + + fn ends_with(pattern: &'a str) -> Self { + Self::EndsWith(pattern) } /// Evaluate this predicate against the given haystack - pub fn evaluate(&self, haystack: &str) -> bool { + fn evaluate(&self, haystack: &'a str) -> bool { match self { - Predicate::Eq(v) => *v == haystack, - Predicate::IEqAscii(v) => haystack.eq_ignore_ascii_case(v), - Predicate::Contains(finder) => finder.find(haystack.as_bytes()).is_some(), - Predicate::StartsWith(v) => starts_with(haystack, v, equals_kernel), - Predicate::IStartsWithAscii(v) => { - starts_with(haystack, v, equals_ignore_ascii_case_kernel) + Self::Eq(v) => *v == haystack, + Self::IEqAscii(v) => haystack.eq_ignore_ascii_case(v), + Self::Contains(finder) => finder.find(haystack.as_bytes()).is_some(), + Self::StartsWith(v) => starts_with(haystack.as_bytes(), v.as_bytes(), equals_kernel), + Self::IStartsWithAscii(v) => { + starts_with(haystack.as_bytes(), v.as_bytes(), equals_ignore_ascii_case_kernel) } - 
Predicate::EndsWith(v) => ends_with(haystack, v, equals_kernel), - Predicate::IEndsWithAscii(v) => ends_with(haystack, v, equals_ignore_ascii_case_kernel), - Predicate::Regex(v) => v.is_match(haystack), + Self::EndsWith(v) => ends_with(haystack.as_bytes(), v.as_bytes(), equals_kernel), + Self::IEndsWithAscii(v) => ends_with(haystack.as_bytes(), v.as_bytes(), equals_ignore_ascii_case_kernel), + Self::Regex(v) => v.is_match(haystack), } } @@ -100,21 +169,21 @@ impl<'a> Predicate<'a> { /// /// If `negate` is true the result of the predicate will be negated #[inline(never)] - pub fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray + fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray where T: ArrayAccessor, { match self { - Predicate::Eq(v) => BooleanArray::from_unary(array, |haystack| { + Self::Eq(v) => BooleanArray::from_unary(array, |haystack| { (haystack.len() == v.len() && haystack == *v) != negate }), - Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { + Self::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { haystack.eq_ignore_ascii_case(v) != negate }), - Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| { + Self::Contains(finder) => BooleanArray::from_unary(array, |haystack| { finder.find(haystack.as_bytes()).is_some() != negate }), - Predicate::StartsWith(v) => { + Self::StartsWith(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { let nulls = string_view_array.logical_nulls(); let values = BooleanBuffer::from( @@ -128,11 +197,11 @@ impl<'a> Predicate<'a> { BooleanArray::new(values, nulls) } else { BooleanArray::from_unary(array, |haystack| { - starts_with(haystack, v, equals_kernel) != negate + starts_with(haystack.as_bytes(), v.as_bytes(), equals_kernel) != negate }) } } - Predicate::IStartsWithAscii(v) => { + Self::IStartsWithAscii(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { let nulls = 
string_view_array.logical_nulls(); let values = BooleanBuffer::from( @@ -150,11 +219,11 @@ impl<'a> Predicate<'a> { BooleanArray::new(values, nulls) } else { BooleanArray::from_unary(array, |haystack| { - starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + starts_with(haystack.as_bytes(), v.as_bytes(), equals_ignore_ascii_case_kernel) != negate }) } } - Predicate::EndsWith(v) => { + Self::EndsWith(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { let nulls = string_view_array.logical_nulls(); let values = BooleanBuffer::from( @@ -168,11 +237,11 @@ impl<'a> Predicate<'a> { BooleanArray::new(values, nulls) } else { BooleanArray::from_unary(array, |haystack| { - ends_with(haystack, v, equals_kernel) != negate + ends_with(haystack.as_bytes(), v.as_bytes(), equals_kernel) != negate }) } } - Predicate::IEndsWithAscii(v) => { + Self::IEndsWithAscii(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { let nulls = string_view_array.logical_nulls(); let values = BooleanBuffer::from( @@ -188,17 +257,319 @@ impl<'a> Predicate<'a> { .collect::>(), ); BooleanArray::new(values, nulls) + } else { + BooleanArray::from_unary(array, |haystack| { + ends_with(haystack.as_bytes(), v.as_bytes(), equals_ignore_ascii_case_kernel) != negate + }) + } + } + Self::Regex(v) => { + BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate) + } + } + } + + /// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: + /// + /// 1. Replace `LIKE` multi-character wildcards `%` => `.*` (unless they're at the start or end of the pattern, + /// where the regex is just truncated - e.g. `%foo%` => `foo` rather than `^.*foo.*$`) + /// 2. Replace `LIKE` single-character wildcards `_` => `.` + /// 3. Escape regex meta characters to match them and not be evaluated as regex special chars. e.g. `.` => `\\.` + /// 4. 
Replace escaped `LIKE` wildcards removing the escape characters to be able to match it as a regex. e.g. `\\%` => `%` + fn regex_like(pattern: &str, case_insensitive: bool) -> Result { + let mut result = String::with_capacity(pattern.len() * 2); + let mut chars_iter = pattern.chars().peekable(); + match chars_iter.peek() { + // if the pattern starts with `%`, we avoid starting the regex with a slow but meaningless `^.*` + Some('%') => { + chars_iter.next(); + } + _ => result.push('^'), + }; + + while let Some(c) = chars_iter.next() { + match c { + '\\' => { + match chars_iter.peek() { + Some(&next) => { + if regex_syntax::is_meta_character(next) { + result.push('\\'); + } + result.push(next); + // Skipping the next char as it is already appended + chars_iter.next(); + } + None => { + // Trailing backslash in the pattern. E.g. PostgreSQL and Trino treat it as an error, but e.g. Snowflake treats it as a literal backslash + result.push('\\'); + result.push('\\'); + } + } + } + '%' => result.push_str(".*"), + '_' => result.push('.'), + c => { + if regex_syntax::is_meta_character(c) { + result.push('\\'); + } + result.push(c); + } + } + } + // instead of ending the regex with `.*$` and making it needlessly slow, we just end the regex + if result.ends_with(".*") { + result.pop(); + result.pop(); + } else { + result.push('$'); + } + RegexBuilder::new(&result) + .case_insensitive(case_insensitive) + .dot_matches_new_line(true) + .build() + .map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Unable to build regex from LIKE pattern: {e}" + )) + }) + } +} + +impl<'a> PredicateImpl<'a> for BinaryPredicate<'a> { + type UnsizedItem = [u8]; + type RegexType = BinaryRegex; + + /// Create a predicate for the given like pattern + fn like(pattern: &'a [u8]) -> Result { + if !contains_like_pattern(pattern) { + Ok(Self::Eq(pattern)) + } else if pattern.ends_with(b"%") && !contains_like_pattern(&pattern[..pattern.len() - 1]) { + Ok(Self::StartsWith(&pattern[..pattern.len() - 
1])) + } else if pattern.starts_with(b"%") && !contains_like_pattern(&pattern[1..]) { + Ok(Self::EndsWith(&pattern[1..])) + } else if pattern.starts_with(b"%") + && pattern.ends_with(b"%") + && !contains_like_pattern(&pattern[1..pattern.len() - 1]) + { + Ok(Self::contains(&pattern[1..pattern.len() - 1])) + } else { + Ok(Self::Regex(Self::regex_like(pattern, false)?)) + } + } + + fn contains(needle: &'a [u8]) -> Self { + Self::Contains(Finder::new(needle)) + } + + /// Create a predicate for the given ilike pattern + fn ilike(pattern: &'a [u8], is_ascii: bool) -> Result { + if is_ascii && pattern.is_ascii() { + if !contains_like_pattern(pattern) { + return Ok(Self::IEqAscii(pattern)); + } else if pattern.ends_with(b"%") + && !pattern.ends_with(b"\\%") + && !contains_like_pattern(&pattern[..pattern.len() - 1]) + { + return Ok(Self::IStartsWithAscii(&pattern[..pattern.len() - 1])); + } else if pattern.starts_with(b"%") && !contains_like_pattern(&pattern[1..]) { + return Ok(Self::IEndsWithAscii(&pattern[1..])); + } + } + Ok(Self::Regex(Self::regex_like(pattern, true)?)) + } + + fn starts_with(pattern: &'a [u8]) -> Self { + Self::StartsWith(pattern) + } + + fn ends_with(pattern: &'a [u8]) -> Self { + Self::EndsWith(pattern) + } + + /// Evaluate this predicate against the given haystack + fn evaluate(&self, haystack: &[u8]) -> bool { + match self { + Self::Eq(v) => *v == haystack, + Self::IEqAscii(v) => haystack.eq_ignore_ascii_case(v), + Self::Contains(finder) => finder.find(haystack).is_some(), + Self::StartsWith(v) => starts_with(haystack, v, equals_kernel), + Self::IStartsWithAscii(v) => { + starts_with(haystack, v, equals_ignore_ascii_case_kernel) + } + Self::EndsWith(v) => ends_with(haystack, v, equals_kernel), + Self::IEndsWithAscii(v) => ends_with(haystack, v, equals_ignore_ascii_case_kernel), + Self::Regex(v) => v.is_match(haystack), + } + } + + /// Evaluate this predicate against the elements of `array` + /// + /// If `negate` is true the result of the predicate 
will be negated + #[inline(never)] + fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray + where + T: ArrayAccessor, + { + match self { + Self::Eq(v) => BooleanArray::from_unary(array, |haystack| { + (haystack.len() == v.len() && haystack == *v) != negate + }), + Self::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { + haystack.eq_ignore_ascii_case(v) != negate + }), + Self::Contains(finder) => BooleanArray::from_unary(array, |haystack| { + finder.find(haystack).is_some() != negate + }), + Self::StartsWith(v) => { + if let Some(binary_view_array) = array.as_any().downcast_ref::() { + let nulls = binary_view_array.logical_nulls(); + let values = BooleanBuffer::from( + binary_view_array + .prefix_bytes_iter(v.len()) + .map(|haystack| { + equals_bytes(haystack, v, equals_kernel) != negate + }) + .collect::>(), + ); + BooleanArray::new(values, nulls) + } else { + BooleanArray::from_unary(array, |haystack| { + starts_with(haystack, v, equals_kernel) != negate + }) + } + } + Self::IStartsWithAscii(v) => { + if let Some(binary_view_array) = array.as_any().downcast_ref::() { + let nulls = binary_view_array.logical_nulls(); + let values = BooleanBuffer::from( + binary_view_array + .prefix_bytes_iter(v.len()) + .map(|haystack| { + equals_bytes( + haystack, + v, + equals_ignore_ascii_case_kernel, + ) != negate + }) + .collect::>(), + ); + BooleanArray::new(values, nulls) + } else { + BooleanArray::from_unary(array, |haystack| { + starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + }) + } + } + Self::EndsWith(v) => { + if let Some(binary_view_array) = array.as_any().downcast_ref::() { + let nulls = binary_view_array.logical_nulls(); + let values = BooleanBuffer::from( + binary_view_array + .suffix_bytes_iter(v.len()) + .map(|haystack| { + equals_bytes(haystack, v, equals_kernel) != negate + }) + .collect::>(), + ); + BooleanArray::new(values, nulls) + } else { + BooleanArray::from_unary(array, |haystack| { + ends_with(haystack, 
v, equals_kernel) != negate + }) + } + } + Self::IEndsWithAscii(v) => { + if let Some(binary_view_array) = array.as_any().downcast_ref::() { + let nulls = binary_view_array.logical_nulls(); + let values = BooleanBuffer::from( + binary_view_array + .suffix_bytes_iter(v.len()) + .map(|haystack| { + equals_bytes( + haystack, + v, + equals_ignore_ascii_case_kernel, + ) != negate + }) + .collect::>(), + ); + BooleanArray::new(values, nulls) } else { BooleanArray::from_unary(array, |haystack| { ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate }) } } - Predicate::Regex(v) => { + Self::Regex(v) => { BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate) } } } + + /// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: + /// + /// 1. Replace `LIKE` multi-character wildcards `%` => `.*` (unless they're at the start or end of the pattern, + /// where the regex is just truncated - e.g. `%foo%` => `foo` rather than `^.*foo.*$`) + /// 2. Replace `LIKE` single-character wildcards `_` => `.` + /// 3. Escape regex meta characters to match them and not be evaluated as regex special chars. e.g. `.` => `\\.` + /// 4. Replace escaped `LIKE` wildcards removing the escape characters to be able to match it as a regex. e.g. 
`\\%` => `%` + fn regex_like(pattern: &[u8], case_insensitive: bool) -> Result { + let mut result = String::with_capacity(pattern.len() * 2); + let mut chars_iter = pattern.iter().peekable(); + match chars_iter.peek() { + // if the pattern starts with `%`, we avoid starting the regex with a slow but meaningless `^.*` + Some(b'%') => { + chars_iter.next(); + } + _ => result.push('^'), + }; + + while let Some(b) = chars_iter.next() { + match b { + b'\\' => { + match chars_iter.peek() { + Some(&next) => { + if regex_syntax::is_meta_character(*next as char) { + result.push('\\'); + } + result.push(*next as char); + // Skipping the next char as it is already appended + chars_iter.next(); + } + None => { + // Trailing backslash in the pattern. E.g. PostgreSQL and Trino treat it as an error, but e.g. Snowflake treats it as a literal backslash + result.push('\\'); + result.push('\\'); + } + } + } + b'%' => result.push_str(".*"), + b'_' => result.push('.'), + b => { + if regex_syntax::is_meta_character(*b as char) { + result.push('\\'); + } + result.push(*b as char); + } + } + } + // instead of ending the regex with `.*$` and making it needlessly slow, we just end the regex + if result.ends_with(".*") { + result.pop(); + result.pop(); + } else { + result.push('$'); + } + BinaryRegexBuilder::new(&result) + .case_insensitive(case_insensitive) + .dot_matches_new_line(true) + .build() + .map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Unable to build regex from LIKE pattern: {e}" + )) + }) + } } fn equals_bytes(lhs: &[u8], rhs: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { @@ -207,22 +578,22 @@ fn equals_bytes(lhs: &[u8], rhs: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) -> b /// This is faster than `str::starts_with` for small strings. /// See for more details. 
-fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { +fn starts_with(haystack: &[u8], needle: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { if needle.len() > haystack.len() { false } else { - zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel) + zip(haystack, needle).all(byte_eq_kernel) } } /// This is faster than `str::ends_with` for small strings. /// See for more details. -fn ends_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { +fn ends_with(haystack: &[u8], needle: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { if needle.len() > haystack.len() { false } else { zip( - haystack.as_bytes().iter().rev(), - needle.as_bytes().iter().rev(), + haystack.iter().rev(), + needle.iter().rev(), ) .all(byte_eq_kernel) } @@ -236,73 +607,8 @@ fn equals_ignore_ascii_case_kernel((n, h): (&u8, &u8)) -> bool { n.eq_ignore_ascii_case(h) } -/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: -/// -/// 1. Replace `LIKE` multi-character wildcards `%` => `.*` (unless they're at the start or end of the pattern, -/// where the regex is just truncated - e.g. `%foo%` => `foo` rather than `^.*foo.*$`) -/// 2. Replace `LIKE` single-character wildcards `_` => `.` -/// 3. Escape regex meta characters to match them and not be evaluated as regex special chars. e.g. `.` => `\\.` -/// 4. Replace escaped `LIKE` wildcards removing the escape characters to be able to match it as a regex. e.g. 
`\\%` => `%` -fn regex_like(pattern: &str, case_insensitive: bool) -> Result { - let mut result = String::with_capacity(pattern.len() * 2); - let mut chars_iter = pattern.chars().peekable(); - match chars_iter.peek() { - // if the pattern starts with `%`, we avoid starting the regex with a slow but meaningless `^.*` - Some('%') => { - chars_iter.next(); - } - _ => result.push('^'), - }; - - while let Some(c) = chars_iter.next() { - match c { - '\\' => { - match chars_iter.peek() { - Some(&next) => { - if regex_syntax::is_meta_character(next) { - result.push('\\'); - } - result.push(next); - // Skipping the next char as it is already appended - chars_iter.next(); - } - None => { - // Trailing backslash in the pattern. E.g. PostgreSQL and Trino treat it as an error, but e.g. Snowflake treats it as a literal backslash - result.push('\\'); - result.push('\\'); - } - } - } - '%' => result.push_str(".*"), - '_' => result.push('.'), - c => { - if regex_syntax::is_meta_character(c) { - result.push('\\'); - } - result.push(c); - } - } - } - // instead of ending the regex with `.*$` and making it needlessly slow, we just end the regex - if result.ends_with(".*") { - result.pop(); - result.pop(); - } else { - result.push('$'); - } - RegexBuilder::new(&result) - .case_insensitive(case_insensitive) - .dot_matches_new_line(true) - .build() - .map_err(|e| { - ArrowError::InvalidArgumentError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - }) -} - -fn contains_like_pattern(pattern: &str) -> bool { - memchr3(b'%', b'_', b'\\', pattern.as_bytes()).is_some() +fn contains_like_pattern(pattern: &[u8]) -> bool { + memchr3(b'%', b'_', b'\\', pattern).is_some() } #[cfg(test)] @@ -333,7 +639,7 @@ mod tests { ]; for (like_pattern, expected_regexp) in test_cases { - let r = regex_like(like_pattern, false).unwrap(); + let r = Predicate::regex_like(like_pattern, false).unwrap(); assert_eq!(r.to_string(), expected_regexp); } } From 9dc694715c487178a2cc314fe2d7d2abecb12387 Mon 
Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 31 Dec 2024 17:27:59 +0200 Subject: [PATCH 2/9] cleanup --- arrow-string/src/like.rs | 3019 ++++++++++---------------------------- 1 file changed, 783 insertions(+), 2236 deletions(-) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 02132ef08003..34f249f101c1 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -113,6 +113,85 @@ pub fn contains(left: &dyn Datum, right: &dyn Datum) -> Result: PredicateImpl<'a, UnsizedItem = Self::UnsizedItem>; + + fn is_ascii(&self) -> bool; + + fn iter(&self) -> impl Iterator>; + + fn item_as_bytes(item: &Self::UnsizedItem) -> &[u8]; +} + +impl LikeSupportedArray for GenericStringArray { + type UnsizedItem = str; + type MatchingPredicate<'a> = Predicate<'a>; + + fn is_ascii(&self) -> bool { + self.is_ascii() + } + + fn iter(&self) -> impl Iterator> { + self.iter() + } + + fn item_as_bytes(item: &Self::UnsizedItem) -> &[u8] { + item.as_bytes() + } +} + +impl LikeSupportedArray for StringViewArray { + type UnsizedItem = str; + type MatchingPredicate<'a> = Predicate<'a>; + + fn is_ascii(&self) -> bool { + self.is_ascii() + } + + fn iter(&self) -> impl Iterator> { + self.iter() + } + + fn item_as_bytes(item: &Self::UnsizedItem) -> &[u8] { + item.as_bytes() + } +} + +impl LikeSupportedArray for GenericBinaryArray { + type UnsizedItem = [u8]; + type MatchingPredicate<'a> = BinaryPredicate<'a>; + + fn is_ascii(&self) -> bool { + self.is_ascii() + } + + fn iter(&self) -> impl Iterator> { + self.iter() + } + + fn item_as_bytes(item: &Self::UnsizedItem) -> &[u8] { + item + } +} + +impl LikeSupportedArray for BinaryViewArray { + type UnsizedItem = [u8]; + type MatchingPredicate<'a> = BinaryPredicate<'a>; + + fn is_ascii(&self) -> bool { + self.is_ascii() + } + + fn iter(&self) -> impl Iterator> { + self.iter() + } + + fn item_as_bytes(item: &Self::UnsizedItem) -> &[u8] { + item + } +} + fn like_op(op: Op, 
lhs: &dyn Datum, rhs: &dyn Datum) -> Result { use arrow_schema::DataType::*; let (l, l_s) = lhs.get(); @@ -134,12 +213,12 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result { - apply::>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v, |a| a.is_ascii(), |a| a.as_bytes()) + apply::>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) } (LargeUtf8, LargeUtf8) => { - apply::>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v, |a| a.is_ascii(), |a| a.as_bytes()) + apply::>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) } - (Utf8View, Utf8View) => apply::( + (Utf8View, Utf8View) => apply::( op, l.as_string_view(), l_s, @@ -147,16 +226,14 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result { - apply::>(op, l.as_binary(), l_s, l_v, r.as_binary(), r_s, r_v, |a| a.is_ascii(), |a| a) + apply::>(op, l.as_binary(), l_s, l_v, r.as_binary(), r_s, r_v) } (LargeBinary, LargeBinary) => { - apply::>(op, l.as_binary(), l_s, l_v, r.as_binary(), r_s, r_v, |a| a.is_ascii(), |a| a) + apply::>(op, l.as_binary(), l_s, l_v, r.as_binary(), r_s, r_v) } - (BinaryView, BinaryView) => apply::( + (BinaryView, BinaryView) => apply::( op, l.as_binary_view(), l_s, @@ -164,8 +241,6 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result Err(ArrowError::InvalidArgumentError(format!( "Invalid string operation: {l_t} {op} {r_t}" @@ -173,17 +248,15 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result + 'a, T: ArrayAccessor + IntoIterator> + Sized + 'a>( +fn apply<'a, T: LikeSupportedArray>( op: Op, - l: T, + l: &'a T, l_s: bool, l_v: Option<&'a dyn AnyDictionaryArray>, - r: T, + r: &'a T, r_s: bool, r_v: Option<&'a dyn AnyDictionaryArray>, - is_ascii: impl Fn(&T) -> bool, - as_bytes: impl Fn(&'a Predicate::UnsizedItem) -> &[u8], -) -> Result { +) -> Result where &'a T: ArrayAccessor { let l_len = l_v.map(|l| l.len()).unwrap_or(l.len()); if r_s { let idx = match r_v { @@ -194,51 +267,50 @@ fn apply<'a, Predicate: PredicateImpl<'a> + 'a, 
T: ArrayAccessor(op, l, l_v, r.value(idx), is_ascii) + op_scalar::(op, l, l_v, r.value(idx)) } else { match (l_s, l_v, r_v) { (true, None, None) => { let v = l.is_valid(0).then(|| l.value(0)); - op_binary::(op, std::iter::repeat(v), r.into_iter(), as_bytes) + op_binary::(op, std::iter::repeat(v), r.iter()) } (true, Some(l_v), None) => { let idx = l_v.is_valid(0).then(|| l_v.normalized_keys()[0]); let v = idx.and_then(|idx| l.is_valid(idx).then(|| l.value(idx))); - op_binary::(op, std::iter::repeat(v), r.into_iter(), as_bytes) + op_binary::(op, std::iter::repeat(v), r.iter()) } (true, None, Some(r_v)) => { let v = l.is_valid(0).then(|| l.value(0)); - op_binary::(op, std::iter::repeat(v), vectored_iter::(r, r_v), as_bytes) + op_binary::(op, std::iter::repeat(v), vectored_iter::(r, r_v)) } (true, Some(l_v), Some(r_v)) => { let idx = l_v.is_valid(0).then(|| l_v.normalized_keys()[0]); let v = idx.and_then(|idx| l.is_valid(idx).then(|| l.value(idx))); - op_binary::(op, std::iter::repeat(v), vectored_iter::(r, r_v), as_bytes) + op_binary::(op, std::iter::repeat(v), vectored_iter::(r, r_v)) } - (false, None, None) => op_binary::(op, l.into_iter(), r.into_iter(), as_bytes), - (false, Some(l_v), None) => op_binary::(op, vectored_iter::(l, l_v), r.into_iter(), as_bytes), - (false, None, Some(r_v)) => op_binary::(op, l.into_iter(), vectored_iter::(r, r_v), as_bytes), + (false, None, None) => op_binary::(op, l.iter(), r.iter()), + (false, Some(l_v), None) => op_binary::(op, vectored_iter::(l, l_v), r.iter()), + (false, None, Some(r_v)) => op_binary::(op, l.iter(), vectored_iter::(r, r_v)), (false, Some(l_v), Some(r_v)) => { - op_binary::(op, vectored_iter::(l, l_v), vectored_iter::(r, r_v), as_bytes) + op_binary::(op, vectored_iter::(l, l_v), vectored_iter::(r, r_v)) } } } } #[inline(never)] -fn op_scalar<'a, Predicate: PredicateImpl<'a>, T: ArrayAccessor>( +fn op_scalar<'a, T: LikeSupportedArray>( op: Op, - l: T, + l: &'a T, l_v: Option<&dyn AnyDictionaryArray>, - r: &'a 
Predicate::UnsizedItem, - is_ascii: impl Fn(&T) -> bool, -) -> Result { + r: &'a T::UnsizedItem, +) -> Result where &'a T: arrow_array::ArrayAccessor { let r = match op { - Op::Like(neg) => Predicate::like(r)?.evaluate_array(l, neg), - Op::ILike(neg) => Predicate::ilike(r, is_ascii(&l))?.evaluate_array(l, neg), - Op::Contains => Predicate::contains(r).evaluate_array(l, false), - Op::StartsWith => Predicate::starts_with(r).evaluate_array(l, false), - Op::EndsWith => Predicate::ends_with(r).evaluate_array(l, false), + Op::Like(neg) => T::MatchingPredicate::like(r)?.evaluate_array(l, neg), + Op::ILike(neg) => T::MatchingPredicate::ilike(r, l.is_ascii())?.evaluate_array(l, neg), + Op::Contains => T::MatchingPredicate::contains(r).evaluate_array(l, false), + Op::StartsWith => T::MatchingPredicate::starts_with(r).evaluate_array(l, false), + Op::EndsWith => T::MatchingPredicate::ends_with(r).evaluate_array(l, false), }; Ok(match l_v { @@ -247,10 +319,10 @@ fn op_scalar<'a, Predicate: PredicateImpl<'a>, T: ArrayAccessor + 'a, T: ArrayAccessor + Sized + 'a>( - a: T, +fn vectored_iter<'a, T: LikeSupportedArray>( + a: &'a T, a_v: &'a dyn AnyDictionaryArray, -) -> impl Iterator> + 'a { +) -> impl Iterator> + 'a where &'a T: arrow_array::ArrayAccessor + 'a { let nulls = a_v.nulls(); let keys = a_v.normalized_keys(); keys.into_iter().enumerate().map(move |(idx, key)| { @@ -262,23 +334,22 @@ fn vectored_iter<'a, Predicate: PredicateImpl<'a> + 'a, T: ArrayAccessor + 'a>( +fn op_binary<'a, T: LikeSupportedArray + 'a>( op: Op, - l: impl Iterator>, - r: impl Iterator>, - as_bytes: impl Fn(&'a Predicate::UnsizedItem) -> &[u8], + l: impl Iterator>, + r: impl Iterator>, ) -> Result { match op { - Op::Like(neg) => binary_predicate(l, r, neg, Predicate::like), - Op::ILike(neg) => binary_predicate(l, r, neg, |s| Predicate::ilike(s, false)), - Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(str_contains(as_bytes(l?), as_bytes(r?)))).collect()), + Op::Like(neg) => binary_predicate(l, r, neg, 
T::MatchingPredicate::like), + Op::ILike(neg) => binary_predicate(l, r, neg, |s| T::MatchingPredicate::ilike(s, false)), + Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(str_contains(T::item_as_bytes(l?), T::item_as_bytes(r?)))).collect()), Op::StartsWith => Ok(l .zip(r) - .map(|(l, r)| Some(Predicate::starts_with(r?).evaluate(l?))) + .map(|(l, r)| Some(T::MatchingPredicate::starts_with(r?).evaluate(l?))) .collect()), Op::EndsWith => Ok(l .zip(r) - .map(|(l, r)| Some(Predicate::ends_with(r?).evaluate(l?))) + .map(|(l, r)| Some(T::MatchingPredicate::ends_with(r?).evaluate(l?))) .collect()), } } @@ -1205,6 +1276,50 @@ mod tests { vec![true, false, true, true, true] ); + fn like_utf8_scalar(data: &Vec>, pattern: &str) -> Result { + let dict_array: DictionaryArray = data.clone().into_iter().collect(); + like_utf8_scalar_dyn(&dict_array, pattern) + } + + fn like_binary_scalar(data: &Vec>, pattern: &str) -> Result { + let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data.clone()); + + like(&dict_array, &pattern.into_binary_scalar()) + } + + fn nlike_utf8_scalar(data: &Vec>, pattern: &str) -> Result { + let dict_array: DictionaryArray = data.clone().into_iter().collect(); + nlike_utf8_scalar_dyn(&dict_array, pattern) + } + + fn nlike_binary_scalar(data: &Vec>, pattern: &str) -> Result { + let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data.clone()); + + nlike(&dict_array, &pattern.into_binary_scalar()) + } + + fn ilike_utf8_scalar(data: &Vec>, pattern: &str) -> Result { + let dict_array: DictionaryArray = data.clone().into_iter().collect(); + ilike_utf8_scalar_dyn(&dict_array, pattern) + } + + fn ilike_binary_scalar(data: &Vec>, pattern: &str) -> Result { + let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data.clone()); + + ilike(&dict_array, &pattern.into_binary_scalar()) + } + + fn nilike_utf8_scalar(data: &Vec>, pattern: &str) -> Result { + let dict_array: DictionaryArray = 
data.clone().into_iter().collect(); + nilike_utf8_scalar_dyn(&dict_array, pattern) + } + + fn nilike_binary_scalar(data: &Vec>, pattern: &str) -> Result { + let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data.clone()); + + nilike(&dict_array, &pattern.into_binary_scalar()) + } + #[test] fn test_dict_like_kernels() { let data = vec![ @@ -1217,137 +1332,137 @@ mod tests { Some("bbbbb\nAir"), ]; - let dict_array: DictionaryArray = data.into_iter().collect(); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "Air").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - Some(true), - None, - Some(true), - Some(false), - ]), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "Air").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - Some(true), - None, - Some(true), - Some(false), - ]), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "%r").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "%r").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), - 
BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); + for func in &[like_utf8_scalar, like_binary_scalar] { + assert_eq!( + func(&data, "Air").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "Air").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "Wa%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "Wa%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "%r").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "%r").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "%i%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "%i%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + 
Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "%a%r%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "%a%r%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + } } #[test] @@ -1362,137 +1477,139 @@ mod tests { Some("bbbbb\nAir"), ]; - let dict_array: DictionaryArray = data.into_iter().collect(); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "Air").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(true), - Some(false), - None, - Some(false), - Some(true), - ]), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "Air").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(true), - Some(false), - None, - Some(false), - Some(true), - ]), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "%r").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "%r").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - 
nlike_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); + for func in &[nlike_utf8_scalar, nlike_binary_scalar] { + assert_eq!( + func(&data, "Air").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "Air").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "Wa%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "Wa%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "%r").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "%r").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "%i%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "%i%").unwrap(), + BooleanArray::from(vec![ + Some(true), + 
Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "%a%r%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "%a%r%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + } } #[test] @@ -1507,137 +1624,137 @@ mod tests { Some("bbbbb\nAir"), ]; - let dict_array: DictionaryArray = data.into_iter().collect(); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - Some(true), - None, - Some(true), - Some(false), - ]), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - Some(true), - None, - Some(true), - Some(false), - ]), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - None, - Some(true), - 
Some(true), - ]), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(true), - None, - Some(true), - Some(true), - ]), - ); + for func in &[ilike_utf8_scalar, ilike_binary_scalar] { + assert_eq!( + func(&data, "air").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "air").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "wa%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "wa%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "%R").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "%R").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "%I%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "%I%").unwrap(), + 
BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "%A%r%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "%A%r%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + } } #[test] @@ -1652,137 +1769,139 @@ mod tests { Some("bbbbb\nAir"), ]; - let dict_array: DictionaryArray = data.into_iter().collect(); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(true), - Some(false), - None, - Some(false), - Some(true), - ]), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(true), - Some(false), - None, - Some(false), - Some(true), - ]), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - 
Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(false), - None, - Some(false), - Some(false), - ]), - ); + for func in &[nilike_utf8_scalar, nilike_binary_scalar] { + assert_eq!( + func(&data, "air").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "air").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "wa%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "wa%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true), + Some(true), + ]), + ); + + assert_eq!( + func(&data, "%R").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "%R").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "%I%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + 
assert_eq!( + func(&data, "%I%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "%A%r%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + + assert_eq!( + func(&data, "%A%r%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(false), + None, + Some(false), + Some(false), + ]), + ); + } } #[test] @@ -1827,6 +1946,34 @@ mod tests { assert_eq!(r.len(), 1, "With pattern {pattern}"); assert_eq!(r.null_count(), 1, "With pattern {pattern}"); assert!(r.is_null(0), "With pattern {pattern}"); + + let a = Scalar::new(BinaryArray::new_null(1)); + let b = BinaryArray::new_scalar(pattern); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = Scalar::new(BinaryArray::new_null(1)); + let b = BinaryArray::from_iter_values([pattern]); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = BinaryArray::new_null(1); + let b = BinaryArray::from_iter_values([pattern]); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); + + let a = BinaryArray::new_null(1); + let b = BinaryArray::new_scalar(pattern); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1, "With pattern {pattern}"); + assert_eq!(r.null_count(), 1, "With pattern {pattern}"); + assert!(r.is_null(0), "With pattern {pattern}"); } } } @@ -1907,6 +2054,34 @@ mod tests { assert_eq!(r.len(), 1); assert_eq!(r.null_count(), 
1); assert!(r.is_null(0)); + + let a = BinaryArray::new_scalar("a"); + let b = Scalar::new(BinaryArray::new_null(1)); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = BinaryArray::from_iter_values(["a"]); + let b = Scalar::new(BinaryArray::new_null(1)); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = BinaryArray::from_iter_values(["a"]); + let b = BinaryArray::new_null(1); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = BinaryArray::new_scalar("a"); + let b = BinaryArray::new_null(1); + let r = like_f(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); } } @@ -2181,6 +2356,36 @@ mod tests { ); } } + + for binary_type in [DataType::Binary, DataType::LargeBinary] { + for ((value_datum, value_type), (pattern_datum, pattern_type)) in zip( + make_binary_datums(value.as_bytes(), &binary_type), + make_binary_datums(pattern.as_bytes(), &binary_type), + ) { + let value_datum = value_datum.as_ref(); + let pattern_datum = pattern_datum.as_ref(); + assert_eq!( + like(value_datum, pattern_datum).unwrap(), + expected, + "{value_type:?} «{value}» like {pattern_type:?} «{pattern}»" + ); + assert_eq!( + ilike(value_datum, pattern_datum).unwrap(), + expected, + "{value_type:?} «{value}» ilike {pattern_type:?} «{pattern}»" + ); + assert_eq!( + nlike(value_datum, pattern_datum).unwrap(), + unexpected, + "{value_type:?} «{value}» nlike {pattern_type:?} «{pattern}»" + ); + assert_eq!( + nilike(value_datum, pattern_datum).unwrap(), + unexpected, + "{value_type:?} «{value}» nilike {pattern_type:?} «{pattern}»" + ); + } + } } } @@ -2950,6 +3155,17 @@ mod tests { assert_eq!(nlike(&values, &patterns).unwrap(), unexpected,); assert_eq!(nilike(&values, &patterns).unwrap(), unexpected,); } + + for 
binary_type in [DataType::Binary, DataType::LargeBinary] { + let values = make_binary_array(values.iter(), &binary_type); + let patterns = make_binary_array(patterns.iter(), &binary_type); + let (values, patterns) = (values.as_ref(), patterns.as_ref()); + + assert_eq!(like(&values, &patterns).unwrap(), expected,); + assert_eq!(ilike(&values, &patterns).unwrap(), expected,); + assert_eq!(nlike(&values, &patterns).unwrap(), unexpected,); + assert_eq!(nilike(&values, &patterns).unwrap(), unexpected,); + } } fn make_datums( @@ -2994,1675 +3210,6 @@ mod tests { } } - - #[test] - fn test_binary_dict_like_kernels() { - let data = vec![ - Some("Earth"), - Some("Fire"), - Some("Water"), - Some("Air"), - None, - Some("Air"), - Some("bbbbb\nAir"), - ]; - - let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data); - - assert_eq!( - like(&dict_array, &"Air".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - Some(true), - None, - Some(true), - Some(false), - ]), - ); - - assert_eq!( - like(&dict_array, &"Air".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - Some(true), - None, - Some(true), - Some(false), - ]), - ); - - assert_eq!( - like(&dict_array, &"Wa%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - like(&dict_array, &"Wa%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - like(&dict_array, &"%r".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - like(&dict_array, &"%r".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - 
Some(false), - Some(false), - Some(true), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - like(&dict_array, &"%i%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - like(&dict_array, &"%i%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - like(&dict_array, &"%a%r%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - like(&dict_array, &"%a%r%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - } - - #[test] - fn test_binary_dict_nlike_kernels() { - let data = vec![ - Some("Earth"), - Some("Fire"), - Some("Water"), - Some("Air"), - None, - Some("Air"), - Some("bbbbb\nAir"), - ]; - - let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data); - - assert_eq!( - nlike(&dict_array, &"Air".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(true), - Some(false), - None, - Some(false), - Some(true), - ]), - ); - - assert_eq!( - nlike(&dict_array, &"Air".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(true), - Some(false), - None, - Some(false), - Some(true), - ]), - ); - - assert_eq!( - nlike(&dict_array, &"Wa%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - nlike(&dict_array, &"Wa%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - 
Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - nlike(&dict_array, &"%r".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nlike(&dict_array, &"%r".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nlike(&dict_array, &"%i%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nlike(&dict_array, &"%i%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nlike(&dict_array, &"%a%r%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - nlike(&dict_array, &"%a%r%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - } - - #[test] - fn test_binary_dict_ilike_kernels() { - let data = vec![ - Some("Earth"), - Some("Fire"), - Some("Water"), - Some("Air"), - None, - Some("Air"), - Some("bbbbb\nAir"), - ]; - - let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data); - - assert_eq!( - ilike(&dict_array, &"air".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - Some(true), - None, - Some(true), - Some(false), - ]), - ); - - assert_eq!( - ilike(&dict_array, &"air".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - Some(true), - None, - Some(true), - 
Some(false), - ]), - ); - - assert_eq!( - ilike(&dict_array, &"wa%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - ilike(&dict_array, &"wa%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - ilike(&dict_array, &"%R".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - ilike(&dict_array, &"%R".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - ilike(&dict_array, &"%I%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - ilike(&dict_array, &"%I%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - ilike(&dict_array, &"%A%r%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - ilike(&dict_array, &"%A%r%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - } - - #[test] - fn test_binary_dict_nilike_kernels() { - let data = vec![ - Some("Earth"), - Some("Fire"), - Some("Water"), - Some("Air"), - None, - Some("Air"), - Some("bbbbb\nAir"), - ]; - - let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data); - - assert_eq!( - 
nilike(&dict_array, &"air".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(true), - Some(false), - None, - Some(false), - Some(true), - ]), - ); - - assert_eq!( - nilike(&dict_array, &"air".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(true), - Some(false), - None, - Some(false), - Some(true), - ]), - ); - - assert_eq!( - nilike(&dict_array, &"wa%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - nilike(&dict_array, &"wa%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]), - ); - - assert_eq!( - nilike(&dict_array, &"%R".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nilike(&dict_array, &"%R".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nilike(&dict_array, &"%I%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nilike(&dict_array, &"%I%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(true), - Some(false), - Some(true), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nilike(&dict_array, &"%A%r%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - - assert_eq!( - nilike(&dict_array, &"%A%r%".into_binary_scalar()).unwrap(), - BooleanArray::from(vec![ - Some(false), - 
Some(true), - Some(false), - Some(false), - None, - Some(false), - Some(false), - ]), - ); - } - - #[test] - fn binary_null_like_pattern() { - // Different patterns have different execution code paths - for pattern in &[ - "", // can execute as equality check - "_", // can execute as length check - "%", // can execute as starts_with("") or non-null check - "a%", // can execute as starts_with("a") - "%a", // can execute as ends_with("") - "a%b", // can execute as starts_with("a") && ends_with("b") - "%a%", // can_execute as contains("a") - "%a%b_c_d%e", // can_execute as regular expression - ] { - // These tests focus on the null handling, but are case-insensitive - for like_f in [like, ilike, nlike, nilike] { - let a = Scalar::new(BinaryArray::new_null(1)); - let b = BinaryArray::new_scalar(pattern); - let r = like_f(&a, &b).unwrap(); - assert_eq!(r.len(), 1, "With pattern {pattern}"); - assert_eq!(r.null_count(), 1, "With pattern {pattern}"); - assert!(r.is_null(0), "With pattern {pattern}"); - - let a = Scalar::new(BinaryArray::new_null(1)); - let b = BinaryArray::from_iter_values([pattern]); - let r = like_f(&a, &b).unwrap(); - assert_eq!(r.len(), 1, "With pattern {pattern}"); - assert_eq!(r.null_count(), 1, "With pattern {pattern}"); - assert!(r.is_null(0), "With pattern {pattern}"); - - let a = BinaryArray::new_null(1); - let b = BinaryArray::from_iter_values([pattern]); - let r = like_f(&a, &b).unwrap(); - assert_eq!(r.len(), 1, "With pattern {pattern}"); - assert_eq!(r.null_count(), 1, "With pattern {pattern}"); - assert!(r.is_null(0), "With pattern {pattern}"); - - let a = BinaryArray::new_null(1); - let b = BinaryArray::new_scalar(pattern); - let r = like_f(&a, &b).unwrap(); - assert_eq!(r.len(), 1, "With pattern {pattern}"); - assert_eq!(r.null_count(), 1, "With pattern {pattern}"); - assert!(r.is_null(0), "With pattern {pattern}"); - } - } - } - - #[test] - fn binary_like_scalar_null() { - for like_f in [like, ilike, nlike, nilike] { - let a = 
BinaryArray::new_scalar("a"); - let b = Scalar::new(BinaryArray::new_null(1)); - let r = like_f(&a, &b).unwrap(); - assert_eq!(r.len(), 1); - assert_eq!(r.null_count(), 1); - assert!(r.is_null(0)); - - let a = BinaryArray::from_iter_values(["a"]); - let b = Scalar::new(BinaryArray::new_null(1)); - let r = like_f(&a, &b).unwrap(); - assert_eq!(r.len(), 1); - assert_eq!(r.null_count(), 1); - assert!(r.is_null(0)); - - let a = BinaryArray::from_iter_values(["a"]); - let b = BinaryArray::new_null(1); - let r = like_f(&a, &b).unwrap(); - assert_eq!(r.len(), 1); - assert_eq!(r.null_count(), 1); - assert!(r.is_null(0)); - - let a = BinaryArray::new_scalar("a"); - let b = BinaryArray::new_null(1); - let r = like_f(&a, &b).unwrap(); - assert_eq!(r.len(), 1); - assert_eq!(r.null_count(), 1); - assert!(r.is_null(0)); - } - } - - #[test] - fn binary_like_escape() { - // (value, pattern, expected) - let test_cases = vec![ - // Empty pattern - (r"", r"", true), - (r"\", r"", false), - // Sole (dangling) escape (some engines consider this invalid pattern) - (r"", r"\", false), - (r"\", r"\", true), - (r"\\", r"\", false), - (r"a", r"\", false), - (r"\a", r"\", false), - (r"\\a", r"\", false), - // Sole escape - (r"", r"\\", false), - (r"\", r"\\", true), - (r"\\", r"\\", false), - (r"a", r"\\", false), - (r"\a", r"\\", false), - (r"\\a", r"\\", false), - // Sole escape and dangling escape - (r"", r"\\\", false), - (r"\", r"\\\", false), - (r"\\", r"\\\", true), - (r"\\\", r"\\\", false), - (r"\\\\", r"\\\", false), - (r"a", r"\\\", false), - (r"\a", r"\\\", false), - (r"\\a", r"\\\", false), - // Sole two escapes - (r"", r"\\\\", false), - (r"\", r"\\\\", false), - (r"\\", r"\\\\", true), - (r"\\\", r"\\\\", false), - (r"\\\\", r"\\\\", false), - (r"\\\\\", r"\\\\", false), - (r"a", r"\\\\", false), - (r"\a", r"\\\\", false), - (r"\\a", r"\\\\", false), - // Escaped non-wildcard - (r"", r"\a", false), - (r"\", r"\a", false), - (r"\\", r"\a", false), - (r"a", r"\a", true), - 
(r"\a", r"\a", false), - (r"\\a", r"\a", false), - // Escaped _ wildcard - (r"", r"\_", false), - (r"\", r"\_", false), - (r"\\", r"\_", false), - (r"a", r"\_", false), - (r"_", r"\_", true), - (r"%", r"\_", false), - (r"\a", r"\_", false), - (r"\\a", r"\_", false), - (r"\_", r"\_", false), - (r"\\_", r"\_", false), - // Escaped % wildcard - (r"", r"\%", false), - (r"\", r"\%", false), - (r"\\", r"\%", false), - (r"a", r"\%", false), - (r"_", r"\%", false), - (r"%", r"\%", true), - (r"\a", r"\%", false), - (r"\\a", r"\%", false), - (r"\%", r"\%", false), - (r"\\%", r"\%", false), - // Escape and non-wildcard - (r"", r"\\a", false), - (r"\", r"\\a", false), - (r"\\", r"\\a", false), - (r"a", r"\\a", false), - (r"\a", r"\\a", true), - (r"\\a", r"\\a", false), - (r"\\\a", r"\\a", false), - // Escape and _ wildcard - (r"", r"\\_", false), - (r"\", r"\\_", false), - (r"\\", r"\\_", true), - (r"a", r"\\_", false), - (r"_", r"\\_", false), - (r"%", r"\\_", false), - (r"\a", r"\\_", true), - (r"\\a", r"\\_", false), - (r"\_", r"\\_", true), - (r"\\_", r"\\_", false), - (r"\\\_", r"\\_", false), - // Escape and % wildcard - (r"", r"\\%", false), - (r"\", r"\\%", true), - (r"\\", r"\\%", true), - (r"a", r"\\%", false), - (r"ab", r"\\%", false), - (r"a%", r"\\%", false), - (r"_", r"\\%", false), - (r"%", r"\\%", false), - (r"\a", r"\\%", true), - (r"\\a", r"\\%", true), - (r"\%", r"\\%", true), - (r"\\%", r"\\%", true), - (r"\\\%", r"\\%", true), - // %... pattern with dangling wildcard - (r"\", r"%\", true), - (r"\\", r"%\", true), - (r"%\", r"%\", true), - (r"%\\", r"%\", true), - (r"abc\", r"%\", true), - (r"abc", r"%\", false), - // %... pattern with wildcard - (r"\", r"%\\", true), - (r"\\", r"%\\", true), - (r"%\\", r"%\\", true), - (r"%\\\", r"%\\", true), - (r"abc\", r"%\\", true), - (r"abc", r"%\\", false), - // %... 
pattern including escaped non-wildcard - (r"ac", r"%a\c", true), - (r"xyzac", r"%a\c", true), - (r"abc", r"%a\c", false), - (r"a\c", r"%a\c", false), - (r"%a\c", r"%a\c", false), - // %... pattern including escape - (r"\", r"%a\\c", false), - (r"\\", r"%a\\c", false), - (r"ac", r"%a\\c", false), - (r"a\c", r"%a\\c", true), - (r"a\\c", r"%a\\c", false), - (r"abc", r"%a\\c", false), - (r"xyza\c", r"%a\\c", true), - (r"xyza\\c", r"%a\\c", false), - (r"%a\\c", r"%a\\c", false), - // ...% pattern with wildcard - (r"\", r"\\%", true), - (r"\\", r"\\%", true), - (r"\\%", r"\\%", true), - (r"\\\%", r"\\%", true), - (r"\abc", r"\\%", true), - (r"a", r"\\%", false), - (r"abc", r"\\%", false), - // ...% pattern including escaped non-wildcard - (r"ac", r"a\c%", true), - (r"acxyz", r"a\c%", true), - (r"abc", r"a\c%", false), - (r"a\c", r"a\c%", false), - (r"a\c%", r"a\c%", false), - (r"a\\c%", r"a\c%", false), - // ...% pattern including escape - (r"ac", r"a\\c%", false), - (r"a\c", r"a\\c%", true), - (r"a\cxyz", r"a\\c%", true), - (r"a\\c", r"a\\c%", false), - (r"a\\cxyz", r"a\\c%", false), - (r"abc", r"a\\c%", false), - (r"abcxyz", r"a\\c%", false), - (r"a\\c%", r"a\\c%", false), - // %...% pattern including escaped non-wildcard - (r"ac", r"%a\c%", true), - (r"xyzacxyz", r"%a\c%", true), - (r"abc", r"%a\c%", false), - (r"a\c", r"%a\c%", false), - (r"xyza\cxyz", r"%a\c%", false), - (r"%a\c%", r"%a\c%", false), - (r"%a\\c%", r"%a\c%", false), - // %...% pattern including escape - (r"ac", r"%a\\c%", false), - (r"a\c", r"%a\\c%", true), - (r"xyza\cxyz", r"%a\\c%", true), - (r"a\\c", r"%a\\c%", false), - (r"xyza\\cxyz", r"%a\\c%", false), - (r"abc", r"%a\\c%", false), - (r"xyzabcxyz", r"%a\\c%", false), - (r"%a\\c%", r"%a\\c%", false), - // Odd (7) backslashes and % wildcard - (r"\\%", r"\\\\\\\%", false), - (r"\\\", r"\\\\\\\%", false), - (r"\\\%", r"\\\\\\\%", true), - (r"\\\\", r"\\\\\\\%", false), - (r"\\\\%", r"\\\\\\\%", false), - (r"\\\\\\\%", r"\\\\\\\%", false), - // Odd 
(7) backslashes and _ wildcard - (r"\\\", r"\\\\\\\_", false), - (r"\\\\", r"\\\\\\\_", false), - (r"\\\_", r"\\\\\\\_", true), - (r"\\\\", r"\\\\\\\_", false), - (r"\\\a", r"\\\\\\\_", false), - (r"\\\\_", r"\\\\\\\_", false), - (r"\\\\\\\_", r"\\\\\\\_", false), - // Even (8) backslashes and % wildcard - (r"\\\", r"\\\\\\\\%", false), - (r"\\\\", r"\\\\\\\\%", true), - (r"\\\\\", r"\\\\\\\\%", true), - (r"\\\\xyz", r"\\\\\\\\%", true), - (r"\\\\\\\\%", r"\\\\\\\\%", true), - // Even (8) backslashes and _ wildcard - (r"\\\", r"\\\\\\\\_", false), - (r"\\\\", r"\\\\\\\\_", false), - (r"\\\\\", r"\\\\\\\\_", true), - (r"\\\\a", r"\\\\\\\\_", true), - (r"\\\\\a", r"\\\\\\\\_", false), - (r"\\\\ab", r"\\\\\\\\_", false), - (r"\\\\\\\\_", r"\\\\\\\\_", false), - ]; - - for (value, pattern, expected) in test_cases { - let unexpected = BooleanArray::from(vec![!expected]); - let expected = BooleanArray::from(vec![expected]); - - for binary_type in [DataType::Binary, DataType::LargeBinary] { - for ((value_datum, value_type), (pattern_datum, pattern_type)) in zip( - make_binary_datums(value.as_bytes(), &binary_type), - make_binary_datums(pattern.as_bytes(), &binary_type), - ) { - let value_datum = value_datum.as_ref(); - let pattern_datum = pattern_datum.as_ref(); - assert_eq!( - like(value_datum, pattern_datum).unwrap(), - expected, - "{value_type:?} «{value}» like {pattern_type:?} «{pattern}»" - ); - assert_eq!( - ilike(value_datum, pattern_datum).unwrap(), - expected, - "{value_type:?} «{value}» ilike {pattern_type:?} «{pattern}»" - ); - assert_eq!( - nlike(value_datum, pattern_datum).unwrap(), - unexpected, - "{value_type:?} «{value}» nlike {pattern_type:?} «{pattern}»" - ); - assert_eq!( - nilike(value_datum, pattern_datum).unwrap(), - unexpected, - "{value_type:?} «{value}» nilike {pattern_type:?} «{pattern}»" - ); - } - } - } - } - - #[test] - fn binary_like_escape_many() { - // (value, pattern, expected) - let test_cases = vec![ - (r"", r"", true), - (r"\", r"", 
false), - (r"\\", r"", false), - (r"\\\", r"", false), - (r"\\\\", r"", false), - (r"a", r"", false), - (r"\a", r"", false), - (r"\\a", r"", false), - (r"%", r"", false), - (r"\%", r"", false), - (r"\\%", r"", false), - (r"%%", r"", false), - (r"\%%", r"", false), - (r"\\%%", r"", false), - (r"_", r"", false), - (r"\_", r"", false), - (r"\\_", r"", false), - (r"__", r"", false), - (r"\__", r"", false), - (r"\\__", r"", false), - (r"abc", r"", false), - (r"a_c", r"", false), - (r"a\bc", r"", false), - (r"a\_c", r"", false), - (r"%abc", r"", false), - (r"\%abc", r"", false), - (r"a\\_c%", r"", false), - (r"", r"\", false), - (r"\", r"\", true), - (r"\\", r"\", false), - (r"\\\", r"\", false), - (r"\\\\", r"\", false), - (r"a", r"\", false), - (r"\a", r"\", false), - (r"\\a", r"\", false), - (r"%", r"\", false), - (r"\%", r"\", false), - (r"\\%", r"\", false), - (r"%%", r"\", false), - (r"\%%", r"\", false), - (r"\\%%", r"\", false), - (r"_", r"\", false), - (r"\_", r"\", false), - (r"\\_", r"\", false), - (r"__", r"\", false), - (r"\__", r"\", false), - (r"\\__", r"\", false), - (r"abc", r"\", false), - (r"a_c", r"\", false), - (r"a\bc", r"\", false), - (r"a\_c", r"\", false), - (r"%abc", r"\", false), - (r"\%abc", r"\", false), - (r"a\\_c%", r"\", false), - (r"", r"\\", false), - (r"\", r"\\", true), - (r"\\", r"\\", false), - (r"\\\", r"\\", false), - (r"\\\\", r"\\", false), - (r"a", r"\\", false), - (r"\a", r"\\", false), - (r"\\a", r"\\", false), - (r"%", r"\\", false), - (r"\%", r"\\", false), - (r"\\%", r"\\", false), - (r"%%", r"\\", false), - (r"\%%", r"\\", false), - (r"\\%%", r"\\", false), - (r"_", r"\\", false), - (r"\_", r"\\", false), - (r"\\_", r"\\", false), - (r"__", r"\\", false), - (r"\__", r"\\", false), - (r"\\__", r"\\", false), - (r"abc", r"\\", false), - (r"a_c", r"\\", false), - (r"a\bc", r"\\", false), - (r"a\_c", r"\\", false), - (r"%abc", r"\\", false), - (r"\%abc", r"\\", false), - (r"a\\_c%", r"\\", false), - (r"", r"\\\", false), - 
(r"\", r"\\\", false), - (r"\\", r"\\\", true), - (r"\\\", r"\\\", false), - (r"\\\\", r"\\\", false), - (r"a", r"\\\", false), - (r"\a", r"\\\", false), - (r"\\a", r"\\\", false), - (r"%", r"\\\", false), - (r"\%", r"\\\", false), - (r"\\%", r"\\\", false), - (r"%%", r"\\\", false), - (r"\%%", r"\\\", false), - (r"\\%%", r"\\\", false), - (r"_", r"\\\", false), - (r"\_", r"\\\", false), - (r"\\_", r"\\\", false), - (r"__", r"\\\", false), - (r"\__", r"\\\", false), - (r"\\__", r"\\\", false), - (r"abc", r"\\\", false), - (r"a_c", r"\\\", false), - (r"a\bc", r"\\\", false), - (r"a\_c", r"\\\", false), - (r"%abc", r"\\\", false), - (r"\%abc", r"\\\", false), - (r"a\\_c%", r"\\\", false), - (r"", r"\\\\", false), - (r"\", r"\\\\", false), - (r"\\", r"\\\\", true), - (r"\\\", r"\\\\", false), - (r"\\\\", r"\\\\", false), - (r"a", r"\\\\", false), - (r"\a", r"\\\\", false), - (r"\\a", r"\\\\", false), - (r"%", r"\\\\", false), - (r"\%", r"\\\\", false), - (r"\\%", r"\\\\", false), - (r"%%", r"\\\\", false), - (r"\%%", r"\\\\", false), - (r"\\%%", r"\\\\", false), - (r"_", r"\\\\", false), - (r"\_", r"\\\\", false), - (r"\\_", r"\\\\", false), - (r"__", r"\\\\", false), - (r"\__", r"\\\\", false), - (r"\\__", r"\\\\", false), - (r"abc", r"\\\\", false), - (r"a_c", r"\\\\", false), - (r"a\bc", r"\\\\", false), - (r"a\_c", r"\\\\", false), - (r"%abc", r"\\\\", false), - (r"\%abc", r"\\\\", false), - (r"a\\_c%", r"\\\\", false), - (r"", r"a", false), - (r"\", r"a", false), - (r"\\", r"a", false), - (r"\\\", r"a", false), - (r"\\\\", r"a", false), - (r"a", r"a", true), - (r"\a", r"a", false), - (r"\\a", r"a", false), - (r"%", r"a", false), - (r"\%", r"a", false), - (r"\\%", r"a", false), - (r"%%", r"a", false), - (r"\%%", r"a", false), - (r"\\%%", r"a", false), - (r"_", r"a", false), - (r"\_", r"a", false), - (r"\\_", r"a", false), - (r"__", r"a", false), - (r"\__", r"a", false), - (r"\\__", r"a", false), - (r"abc", r"a", false), - (r"a_c", r"a", false), - (r"a\bc", r"a", 
false), - (r"a\_c", r"a", false), - (r"%abc", r"a", false), - (r"\%abc", r"a", false), - (r"a\\_c%", r"a", false), - (r"", r"\a", false), - (r"\", r"\a", false), - (r"\\", r"\a", false), - (r"\\\", r"\a", false), - (r"\\\\", r"\a", false), - (r"a", r"\a", true), - (r"\a", r"\a", false), - (r"\\a", r"\a", false), - (r"%", r"\a", false), - (r"\%", r"\a", false), - (r"\\%", r"\a", false), - (r"%%", r"\a", false), - (r"\%%", r"\a", false), - (r"\\%%", r"\a", false), - (r"_", r"\a", false), - (r"\_", r"\a", false), - (r"\\_", r"\a", false), - (r"__", r"\a", false), - (r"\__", r"\a", false), - (r"\\__", r"\a", false), - (r"abc", r"\a", false), - (r"a_c", r"\a", false), - (r"a\bc", r"\a", false), - (r"a\_c", r"\a", false), - (r"%abc", r"\a", false), - (r"\%abc", r"\a", false), - (r"a\\_c%", r"\a", false), - (r"", r"\\a", false), - (r"\", r"\\a", false), - (r"\\", r"\\a", false), - (r"\\\", r"\\a", false), - (r"\\\\", r"\\a", false), - (r"a", r"\\a", false), - (r"\a", r"\\a", true), - (r"\\a", r"\\a", false), - (r"%", r"\\a", false), - (r"\%", r"\\a", false), - (r"\\%", r"\\a", false), - (r"%%", r"\\a", false), - (r"\%%", r"\\a", false), - (r"\\%%", r"\\a", false), - (r"_", r"\\a", false), - (r"\_", r"\\a", false), - (r"\\_", r"\\a", false), - (r"__", r"\\a", false), - (r"\__", r"\\a", false), - (r"\\__", r"\\a", false), - (r"abc", r"\\a", false), - (r"a_c", r"\\a", false), - (r"a\bc", r"\\a", false), - (r"a\_c", r"\\a", false), - (r"%abc", r"\\a", false), - (r"\%abc", r"\\a", false), - (r"a\\_c%", r"\\a", false), - (r"", r"%", true), - (r"\", r"%", true), - (r"\\", r"%", true), - (r"\\\", r"%", true), - (r"\\\\", r"%", true), - (r"a", r"%", true), - (r"\a", r"%", true), - (r"\\a", r"%", true), - (r"%", r"%", true), - (r"\%", r"%", true), - (r"\\%", r"%", true), - (r"%%", r"%", true), - (r"\%%", r"%", true), - (r"\\%%", r"%", true), - (r"_", r"%", true), - (r"\_", r"%", true), - (r"\\_", r"%", true), - (r"__", r"%", true), - (r"\__", r"%", true), - (r"\\__", r"%", true), - 
(r"abc", r"%", true), - (r"a_c", r"%", true), - (r"a\bc", r"%", true), - (r"a\_c", r"%", true), - (r"%abc", r"%", true), - (r"\%abc", r"%", true), - (r"a\\_c%", r"%", true), - (r"", r"\%", false), - (r"\", r"\%", false), - (r"\\", r"\%", false), - (r"\\\", r"\%", false), - (r"\\\\", r"\%", false), - (r"a", r"\%", false), - (r"\a", r"\%", false), - (r"\\a", r"\%", false), - (r"%", r"\%", true), - (r"\%", r"\%", false), - (r"\\%", r"\%", false), - (r"%%", r"\%", false), - (r"\%%", r"\%", false), - (r"\\%%", r"\%", false), - (r"_", r"\%", false), - (r"\_", r"\%", false), - (r"\\_", r"\%", false), - (r"__", r"\%", false), - (r"\__", r"\%", false), - (r"\\__", r"\%", false), - (r"abc", r"\%", false), - (r"a_c", r"\%", false), - (r"a\bc", r"\%", false), - (r"a\_c", r"\%", false), - (r"%abc", r"\%", false), - (r"\%abc", r"\%", false), - (r"a\\_c%", r"\%", false), - (r"", r"\\%", false), - (r"\", r"\\%", true), - (r"\\", r"\\%", true), - (r"\\\", r"\\%", true), - (r"\\\\", r"\\%", true), - (r"a", r"\\%", false), - (r"\a", r"\\%", true), - (r"\\a", r"\\%", true), - (r"%", r"\\%", false), - (r"\%", r"\\%", true), - (r"\\%", r"\\%", true), - (r"%%", r"\\%", false), - (r"\%%", r"\\%", true), - (r"\\%%", r"\\%", true), - (r"_", r"\\%", false), - (r"\_", r"\\%", true), - (r"\\_", r"\\%", true), - (r"__", r"\\%", false), - (r"\__", r"\\%", true), - (r"\\__", r"\\%", true), - (r"abc", r"\\%", false), - (r"a_c", r"\\%", false), - (r"a\bc", r"\\%", false), - (r"a\_c", r"\\%", false), - (r"%abc", r"\\%", false), - (r"\%abc", r"\\%", true), - (r"a\\_c%", r"\\%", false), - (r"", r"%%", true), - (r"\", r"%%", true), - (r"\\", r"%%", true), - (r"\\\", r"%%", true), - (r"\\\\", r"%%", true), - (r"a", r"%%", true), - (r"\a", r"%%", true), - (r"\\a", r"%%", true), - (r"%", r"%%", true), - (r"\%", r"%%", true), - (r"\\%", r"%%", true), - (r"%%", r"%%", true), - (r"\%%", r"%%", true), - (r"\\%%", r"%%", true), - (r"_", r"%%", true), - (r"\_", r"%%", true), - (r"\\_", r"%%", true), - (r"__", 
r"%%", true), - (r"\__", r"%%", true), - (r"\\__", r"%%", true), - (r"abc", r"%%", true), - (r"a_c", r"%%", true), - (r"a\bc", r"%%", true), - (r"a\_c", r"%%", true), - (r"%abc", r"%%", true), - (r"\%abc", r"%%", true), - (r"a\\_c%", r"%%", true), - (r"", r"\%%", false), - (r"\", r"\%%", false), - (r"\\", r"\%%", false), - (r"\\\", r"\%%", false), - (r"\\\\", r"\%%", false), - (r"a", r"\%%", false), - (r"\a", r"\%%", false), - (r"\\a", r"\%%", false), - (r"%", r"\%%", true), - (r"\%", r"\%%", false), - (r"\\%", r"\%%", false), - (r"%%", r"\%%", true), - (r"\%%", r"\%%", false), - (r"\\%%", r"\%%", false), - (r"_", r"\%%", false), - (r"\_", r"\%%", false), - (r"\\_", r"\%%", false), - (r"__", r"\%%", false), - (r"\__", r"\%%", false), - (r"\\__", r"\%%", false), - (r"abc", r"\%%", false), - (r"a_c", r"\%%", false), - (r"a\bc", r"\%%", false), - (r"a\_c", r"\%%", false), - (r"%abc", r"\%%", true), - (r"\%abc", r"\%%", false), - (r"a\\_c%", r"\%%", false), - (r"", r"\\%%", false), - (r"\", r"\\%%", true), - (r"\\", r"\\%%", true), - (r"\\\", r"\\%%", true), - (r"\\\\", r"\\%%", true), - (r"a", r"\\%%", false), - (r"\a", r"\\%%", true), - (r"\\a", r"\\%%", true), - (r"%", r"\\%%", false), - (r"\%", r"\\%%", true), - (r"\\%", r"\\%%", true), - (r"%%", r"\\%%", false), - (r"\%%", r"\\%%", true), - (r"\\%%", r"\\%%", true), - (r"_", r"\\%%", false), - (r"\_", r"\\%%", true), - (r"\\_", r"\\%%", true), - (r"__", r"\\%%", false), - (r"\__", r"\\%%", true), - (r"\\__", r"\\%%", true), - (r"abc", r"\\%%", false), - (r"a_c", r"\\%%", false), - (r"a\bc", r"\\%%", false), - (r"a\_c", r"\\%%", false), - (r"%abc", r"\\%%", false), - (r"\%abc", r"\\%%", true), - (r"a\\_c%", r"\\%%", false), - (r"", r"_", false), - (r"\", r"_", true), - (r"\\", r"_", false), - (r"\\\", r"_", false), - (r"\\\\", r"_", false), - (r"a", r"_", true), - (r"\a", r"_", false), - (r"\\a", r"_", false), - (r"%", r"_", true), - (r"\%", r"_", false), - (r"\\%", r"_", false), - (r"%%", r"_", false), - (r"\%%", 
r"_", false), - (r"\\%%", r"_", false), - (r"_", r"_", true), - (r"\_", r"_", false), - (r"\\_", r"_", false), - (r"__", r"_", false), - (r"\__", r"_", false), - (r"\\__", r"_", false), - (r"abc", r"_", false), - (r"a_c", r"_", false), - (r"a\bc", r"_", false), - (r"a\_c", r"_", false), - (r"%abc", r"_", false), - (r"\%abc", r"_", false), - (r"a\\_c%", r"_", false), - (r"", r"\_", false), - (r"\", r"\_", false), - (r"\\", r"\_", false), - (r"\\\", r"\_", false), - (r"\\\\", r"\_", false), - (r"a", r"\_", false), - (r"\a", r"\_", false), - (r"\\a", r"\_", false), - (r"%", r"\_", false), - (r"\%", r"\_", false), - (r"\\%", r"\_", false), - (r"%%", r"\_", false), - (r"\%%", r"\_", false), - (r"\\%%", r"\_", false), - (r"_", r"\_", true), - (r"\_", r"\_", false), - (r"\\_", r"\_", false), - (r"__", r"\_", false), - (r"\__", r"\_", false), - (r"\\__", r"\_", false), - (r"abc", r"\_", false), - (r"a_c", r"\_", false), - (r"a\bc", r"\_", false), - (r"a\_c", r"\_", false), - (r"%abc", r"\_", false), - (r"\%abc", r"\_", false), - (r"a\\_c%", r"\_", false), - (r"", r"\\_", false), - (r"\", r"\\_", false), - (r"\\", r"\\_", true), - (r"\\\", r"\\_", false), - (r"\\\\", r"\\_", false), - (r"a", r"\\_", false), - (r"\a", r"\\_", true), - (r"\\a", r"\\_", false), - (r"%", r"\\_", false), - (r"\%", r"\\_", true), - (r"\\%", r"\\_", false), - (r"%%", r"\\_", false), - (r"\%%", r"\\_", false), - (r"\\%%", r"\\_", false), - (r"_", r"\\_", false), - (r"\_", r"\\_", true), - (r"\\_", r"\\_", false), - (r"__", r"\\_", false), - (r"\__", r"\\_", false), - (r"\\__", r"\\_", false), - (r"abc", r"\\_", false), - (r"a_c", r"\\_", false), - (r"a\bc", r"\\_", false), - (r"a\_c", r"\\_", false), - (r"%abc", r"\\_", false), - (r"\%abc", r"\\_", false), - (r"a\\_c%", r"\\_", false), - (r"", r"__", false), - (r"\", r"__", false), - (r"\\", r"__", true), - (r"\\\", r"__", false), - (r"\\\\", r"__", false), - (r"a", r"__", false), - (r"\a", r"__", true), - (r"\\a", r"__", false), - (r"%", r"__", 
false), - (r"\%", r"__", true), - (r"\\%", r"__", false), - (r"%%", r"__", true), - (r"\%%", r"__", false), - (r"\\%%", r"__", false), - (r"_", r"__", false), - (r"\_", r"__", true), - (r"\\_", r"__", false), - (r"__", r"__", true), - (r"\__", r"__", false), - (r"\\__", r"__", false), - (r"abc", r"__", false), - (r"a_c", r"__", false), - (r"a\bc", r"__", false), - (r"a\_c", r"__", false), - (r"%abc", r"__", false), - (r"\%abc", r"__", false), - (r"a\\_c%", r"__", false), - (r"", r"\__", false), - (r"\", r"\__", false), - (r"\\", r"\__", false), - (r"\\\", r"\__", false), - (r"\\\\", r"\__", false), - (r"a", r"\__", false), - (r"\a", r"\__", false), - (r"\\a", r"\__", false), - (r"%", r"\__", false), - (r"\%", r"\__", false), - (r"\\%", r"\__", false), - (r"%%", r"\__", false), - (r"\%%", r"\__", false), - (r"\\%%", r"\__", false), - (r"_", r"\__", false), - (r"\_", r"\__", false), - (r"\\_", r"\__", false), - (r"__", r"\__", true), - (r"\__", r"\__", false), - (r"\\__", r"\__", false), - (r"abc", r"\__", false), - (r"a_c", r"\__", false), - (r"a\bc", r"\__", false), - (r"a\_c", r"\__", false), - (r"%abc", r"\__", false), - (r"\%abc", r"\__", false), - (r"a\\_c%", r"\__", false), - (r"", r"\\__", false), - (r"\", r"\\__", false), - (r"\\", r"\\__", false), - (r"\\\", r"\\__", true), - (r"\\\\", r"\\__", false), - (r"a", r"\\__", false), - (r"\a", r"\\__", false), - (r"\\a", r"\\__", true), - (r"%", r"\\__", false), - (r"\%", r"\\__", false), - (r"\\%", r"\\__", true), - (r"%%", r"\\__", false), - (r"\%%", r"\\__", true), - (r"\\%%", r"\\__", false), - (r"_", r"\\__", false), - (r"\_", r"\\__", false), - (r"\\_", r"\\__", true), - (r"__", r"\\__", false), - (r"\__", r"\\__", true), - (r"\\__", r"\\__", false), - (r"abc", r"\\__", false), - (r"a_c", r"\\__", false), - (r"a\bc", r"\\__", false), - (r"a\_c", r"\\__", false), - (r"%abc", r"\\__", false), - (r"\%abc", r"\\__", false), - (r"a\\_c%", r"\\__", false), - (r"", r"abc", false), - (r"\", r"abc", false), - 
(r"\\", r"abc", false), - (r"\\\", r"abc", false), - (r"\\\\", r"abc", false), - (r"a", r"abc", false), - (r"\a", r"abc", false), - (r"\\a", r"abc", false), - (r"%", r"abc", false), - (r"\%", r"abc", false), - (r"\\%", r"abc", false), - (r"%%", r"abc", false), - (r"\%%", r"abc", false), - (r"\\%%", r"abc", false), - (r"_", r"abc", false), - (r"\_", r"abc", false), - (r"\\_", r"abc", false), - (r"__", r"abc", false), - (r"\__", r"abc", false), - (r"\\__", r"abc", false), - (r"abc", r"abc", true), - (r"a_c", r"abc", false), - (r"a\bc", r"abc", false), - (r"a\_c", r"abc", false), - (r"%abc", r"abc", false), - (r"\%abc", r"abc", false), - (r"a\\_c%", r"abc", false), - (r"", r"a_c", false), - (r"\", r"a_c", false), - (r"\\", r"a_c", false), - (r"\\\", r"a_c", false), - (r"\\\\", r"a_c", false), - (r"a", r"a_c", false), - (r"\a", r"a_c", false), - (r"\\a", r"a_c", false), - (r"%", r"a_c", false), - (r"\%", r"a_c", false), - (r"\\%", r"a_c", false), - (r"%%", r"a_c", false), - (r"\%%", r"a_c", false), - (r"\\%%", r"a_c", false), - (r"_", r"a_c", false), - (r"\_", r"a_c", false), - (r"\\_", r"a_c", false), - (r"__", r"a_c", false), - (r"\__", r"a_c", false), - (r"\\__", r"a_c", false), - (r"abc", r"a_c", true), - (r"a_c", r"a_c", true), - (r"a\bc", r"a_c", false), - (r"a\_c", r"a_c", false), - (r"%abc", r"a_c", false), - (r"\%abc", r"a_c", false), - (r"a\\_c%", r"a_c", false), - (r"", r"a\bc", false), - (r"\", r"a\bc", false), - (r"\\", r"a\bc", false), - (r"\\\", r"a\bc", false), - (r"\\\\", r"a\bc", false), - (r"a", r"a\bc", false), - (r"\a", r"a\bc", false), - (r"\\a", r"a\bc", false), - (r"%", r"a\bc", false), - (r"\%", r"a\bc", false), - (r"\\%", r"a\bc", false), - (r"%%", r"a\bc", false), - (r"\%%", r"a\bc", false), - (r"\\%%", r"a\bc", false), - (r"_", r"a\bc", false), - (r"\_", r"a\bc", false), - (r"\\_", r"a\bc", false), - (r"__", r"a\bc", false), - (r"\__", r"a\bc", false), - (r"\\__", r"a\bc", false), - (r"abc", r"a\bc", true), - (r"a_c", r"a\bc", false), - 
(r"a\bc", r"a\bc", false), - (r"a\_c", r"a\bc", false), - (r"%abc", r"a\bc", false), - (r"\%abc", r"a\bc", false), - (r"a\\_c%", r"a\bc", false), - (r"", r"a\_c", false), - (r"\", r"a\_c", false), - (r"\\", r"a\_c", false), - (r"\\\", r"a\_c", false), - (r"\\\\", r"a\_c", false), - (r"a", r"a\_c", false), - (r"\a", r"a\_c", false), - (r"\\a", r"a\_c", false), - (r"%", r"a\_c", false), - (r"\%", r"a\_c", false), - (r"\\%", r"a\_c", false), - (r"%%", r"a\_c", false), - (r"\%%", r"a\_c", false), - (r"\\%%", r"a\_c", false), - (r"_", r"a\_c", false), - (r"\_", r"a\_c", false), - (r"\\_", r"a\_c", false), - (r"__", r"a\_c", false), - (r"\__", r"a\_c", false), - (r"\\__", r"a\_c", false), - (r"abc", r"a\_c", false), - (r"a_c", r"a\_c", true), - (r"a\bc", r"a\_c", false), - (r"a\_c", r"a\_c", false), - (r"%abc", r"a\_c", false), - (r"\%abc", r"a\_c", false), - (r"a\\_c%", r"a\_c", false), - (r"", r"%abc", false), - (r"\", r"%abc", false), - (r"\\", r"%abc", false), - (r"\\\", r"%abc", false), - (r"\\\\", r"%abc", false), - (r"a", r"%abc", false), - (r"\a", r"%abc", false), - (r"\\a", r"%abc", false), - (r"%", r"%abc", false), - (r"\%", r"%abc", false), - (r"\\%", r"%abc", false), - (r"%%", r"%abc", false), - (r"\%%", r"%abc", false), - (r"\\%%", r"%abc", false), - (r"_", r"%abc", false), - (r"\_", r"%abc", false), - (r"\\_", r"%abc", false), - (r"__", r"%abc", false), - (r"\__", r"%abc", false), - (r"\\__", r"%abc", false), - (r"abc", r"%abc", true), - (r"a_c", r"%abc", false), - (r"a\bc", r"%abc", false), - (r"a\_c", r"%abc", false), - (r"%abc", r"%abc", true), - (r"\%abc", r"%abc", true), - (r"a\\_c%", r"%abc", false), - (r"", r"\%abc", false), - (r"\", r"\%abc", false), - (r"\\", r"\%abc", false), - (r"\\\", r"\%abc", false), - (r"\\\\", r"\%abc", false), - (r"a", r"\%abc", false), - (r"\a", r"\%abc", false), - (r"\\a", r"\%abc", false), - (r"%", r"\%abc", false), - (r"\%", r"\%abc", false), - (r"\\%", r"\%abc", false), - (r"%%", r"\%abc", false), - (r"\%%", r"\%abc", 
false), - (r"\\%%", r"\%abc", false), - (r"_", r"\%abc", false), - (r"\_", r"\%abc", false), - (r"\\_", r"\%abc", false), - (r"__", r"\%abc", false), - (r"\__", r"\%abc", false), - (r"\\__", r"\%abc", false), - (r"abc", r"\%abc", false), - (r"a_c", r"\%abc", false), - (r"a\bc", r"\%abc", false), - (r"a\_c", r"\%abc", false), - (r"%abc", r"\%abc", true), - (r"\%abc", r"\%abc", false), - (r"a\\_c%", r"\%abc", false), - (r"", r"a\\_c%", false), - (r"\", r"a\\_c%", false), - (r"\\", r"a\\_c%", false), - (r"\\\", r"a\\_c%", false), - (r"\\\\", r"a\\_c%", false), - (r"a", r"a\\_c%", false), - (r"\a", r"a\\_c%", false), - (r"\\a", r"a\\_c%", false), - (r"%", r"a\\_c%", false), - (r"\%", r"a\\_c%", false), - (r"\\%", r"a\\_c%", false), - (r"%%", r"a\\_c%", false), - (r"\%%", r"a\\_c%", false), - (r"\\%%", r"a\\_c%", false), - (r"_", r"a\\_c%", false), - (r"\_", r"a\\_c%", false), - (r"\\_", r"a\\_c%", false), - (r"__", r"a\\_c%", false), - (r"\__", r"a\\_c%", false), - (r"\\__", r"a\\_c%", false), - (r"abc", r"a\\_c%", false), - (r"a_c", r"a\\_c%", false), - (r"a\bc", r"a\\_c%", true), - (r"a\_c", r"a\\_c%", true), - (r"%abc", r"a\\_c%", false), - (r"\%abc", r"a\\_c%", false), - (r"a\\_c%", r"a\\_c%", false), - ]; - - let values = test_cases - .iter() - .map(|(value, _, _)| *value) - .collect::>(); - let patterns = test_cases - .iter() - .map(|(_, pattern, _)| *pattern) - .collect::>(); - let expected = BooleanArray::from( - test_cases - .iter() - .map(|(_, _, expected)| *expected) - .collect::>(), - ); - let unexpected = BooleanArray::from( - test_cases - .iter() - .map(|(_, _, expected)| !*expected) - .collect::>(), - ); - - for string_type in [DataType::Binary, DataType::LargeBinary] { - let values = make_binary_array(values.iter(), &string_type); - let patterns = make_binary_array(patterns.iter(), &string_type); - let (values, patterns) = (values.as_ref(), patterns.as_ref()); - - assert_eq!(like(&values, &patterns).unwrap(), expected,); - assert_eq!(ilike(&values, 
&patterns).unwrap(), expected,); - assert_eq!(nlike(&values, &patterns).unwrap(), unexpected,); - assert_eq!(nilike(&values, &patterns).unwrap(), unexpected,); - } - } - fn make_binary_datums( value: impl AsRef<[u8]>, data_type: &DataType, From 4d5c350e90523b08f33a737eb6297256d75d534b Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 31 Dec 2024 15:50:44 +0000 Subject: [PATCH 3/9] remove BinaryArrayType --- arrow-array/src/array/mod.rs | 46 ------------------------------------ 1 file changed, 46 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index f2d06448b617..23b3cb628aaf 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -620,52 +620,6 @@ impl<'a> StringArrayType<'a> for &'a StringViewArray { } } -/// A trait for Arrow Binary Arrays, currently the following types are supported: -/// - `BinaryArray` -/// - `LargeBinaryArray` -/// - `BinaryViewArray` -/// - `FixedSizeBinaryArray` -/// -/// This trait helps to abstract over the different types of binary arrays -/// so that we don't need to duplicate the implementation for each type. 
-pub trait BinaryArrayType<'a>: ArrayAccessor + Sized { - /// Returns true if all data within this binary array is ASCII - fn is_ascii(&self) -> bool; - - /// Constructs a new iterator - fn iter(&self) -> ArrayIter; -} - -impl<'a, O: OffsetSizeTrait> BinaryArrayType<'a> for &'a GenericBinaryArray { - fn is_ascii(&self) -> bool { - GenericBinaryArray::::is_ascii(self) - } - - fn iter(&self) -> ArrayIter { - GenericBinaryArray::::iter(self) - } -} - -impl<'a> BinaryArrayType<'a> for &'a BinaryViewArray { - fn is_ascii(&self) -> bool { - BinaryViewArray::is_ascii(self) - } - - fn iter(&self) -> ArrayIter { - BinaryViewArray::iter(self) - } -} - -impl<'a> BinaryArrayType<'a> for &'a FixedSizeBinaryArray { - fn is_ascii(&self) -> bool { - FixedSizeBinaryArray::is_ascii(self) - } - - fn iter(&self) -> ArrayIter { - FixedSizeBinaryArray::iter(self) - } -} - impl PartialEq for dyn Array + '_ { fn eq(&self, other: &Self) -> bool { self.to_data().eq(&other.to_data()) From 20b03b1f7eca6509125778df6b6048fc738cf5a1 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 31 Dec 2024 15:51:11 +0000 Subject: [PATCH 4/9] format --- arrow-array/src/array/byte_view_array.rs | 1 - .../src/array/fixed_size_binary_array.rs | 1 - arrow-string/src/like.rs | 125 ++++++++++++------ arrow-string/src/predicate.rs | 80 +++++------ 4 files changed, 130 insertions(+), 77 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 6d4f5f3ea8e3..c2cb46fa1394 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -754,7 +754,6 @@ where pub type BinaryViewArray = GenericByteViewArray; impl BinaryViewArray { - /// Returns true if all data within this array is ASCII pub fn is_ascii(&self) -> bool { // Alternative (but incorrect): directly check the underlying buffers diff --git a/arrow-array/src/array/fixed_size_binary_array.rs 
b/arrow-array/src/array/fixed_size_binary_array.rs index a65a4d391364..adbff30fbd5f 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -59,7 +59,6 @@ pub struct FixedSizeBinaryArray { } impl FixedSizeBinaryArray { - /// Returns true if all data within this array is ASCII pub fn is_ascii(&self) -> bool { // TODO - check if we can do similar to BinaryArray diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 34f249f101c1..15ca24bfc3ac 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -132,7 +132,7 @@ impl LikeSupportedArray for GenericStringArray impl Iterator> { + fn iter(&self) -> impl Iterator> { self.iter() } @@ -149,7 +149,7 @@ impl LikeSupportedArray for StringViewArray { self.is_ascii() } - fn iter(&self) -> impl Iterator> { + fn iter(&self) -> impl Iterator> { self.iter() } @@ -166,7 +166,7 @@ impl LikeSupportedArray for GenericBinaryArray impl Iterator> { + fn iter(&self) -> impl Iterator> { self.iter() } @@ -183,7 +183,7 @@ impl LikeSupportedArray for BinaryViewArray { self.is_ascii() } - fn iter(&self) -> impl Iterator> { + fn iter(&self) -> impl Iterator> { self.iter() } @@ -256,7 +256,10 @@ fn apply<'a, T: LikeSupportedArray>( r: &'a T, r_s: bool, r_v: Option<&'a dyn AnyDictionaryArray>, -) -> Result where &'a T: ArrayAccessor { +) -> Result +where + &'a T: ArrayAccessor, +{ let l_len = l_v.map(|l| l.len()).unwrap_or(l.len()); if r_s { let idx = match r_v { @@ -304,7 +307,10 @@ fn op_scalar<'a, T: LikeSupportedArray>( l: &'a T, l_v: Option<&dyn AnyDictionaryArray>, r: &'a T::UnsizedItem, -) -> Result where &'a T: arrow_array::ArrayAccessor { +) -> Result +where + &'a T: arrow_array::ArrayAccessor, +{ let r = match op { Op::Like(neg) => T::MatchingPredicate::like(r)?.evaluate_array(l, neg), Op::ILike(neg) => T::MatchingPredicate::ilike(r, l.is_ascii())?.evaluate_array(l, neg), @@ -322,7 +328,10 @@ fn op_scalar<'a, T: LikeSupportedArray>( fn 
vectored_iter<'a, T: LikeSupportedArray>( a: &'a T, a_v: &'a dyn AnyDictionaryArray, -) -> impl Iterator> + 'a where &'a T: arrow_array::ArrayAccessor + 'a { +) -> impl Iterator> + 'a +where + &'a T: arrow_array::ArrayAccessor + 'a, +{ let nulls = a_v.nulls(); let keys = a_v.normalized_keys(); keys.into_iter().enumerate().map(move |(idx, key)| { @@ -342,7 +351,10 @@ fn op_binary<'a, T: LikeSupportedArray + 'a>( match op { Op::Like(neg) => binary_predicate(l, r, neg, T::MatchingPredicate::like), Op::ILike(neg) => binary_predicate(l, r, neg, |s| T::MatchingPredicate::ilike(s, false)), - Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(str_contains(T::item_as_bytes(l?), T::item_as_bytes(r?)))).collect()), + Op::Contains => Ok(l + .zip(r) + .map(|(l, r)| Some(str_contains(T::item_as_bytes(l?), T::item_as_bytes(r?)))) + .collect()), Op::StartsWith => Ok(l .zip(r) .map(|(l, r)| Some(T::MatchingPredicate::starts_with(r?).evaluate(l?))) @@ -387,7 +399,9 @@ fn make_scalar(data_type: &DataType, scalar: &str) -> Result Ok(Arc::new(LargeStringArray::from_iter_values([scalar]))), DataType::Dictionary(_, v) => make_scalar(v.as_ref(), scalar), DataType::Binary => Ok(Arc::new(BinaryArray::from_iter_values([scalar.as_bytes()]))), - DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter_values([scalar.as_bytes()]))), + DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter_values([ + scalar.as_bytes() + ]))), d => Err(ArrowError::InvalidArgumentError(format!( "Unsupported string scalar data type {d:?}", ))), @@ -495,12 +509,17 @@ legacy_kernels!( #[allow(deprecated)] mod tests { use super::*; + use arrow_array::builder::BinaryDictionaryBuilder; use arrow_array::types::{ArrowDictionaryKeyType, Int8Type}; use std::iter::zip; - use arrow_array::builder::BinaryDictionaryBuilder; - - fn convert_binary_iterator_to_binary_dictionary<'a, K: ArrowDictionaryKeyType, I: IntoIterator>(iter: I) -> DictionaryArray { + fn convert_binary_iterator_to_binary_dictionary< + 'a, + K: 
ArrowDictionaryKeyType, + I: IntoIterator, + >( + iter: I, + ) -> DictionaryArray { let it = iter.into_iter(); let (lower, _) = it.size_hint(); let mut builder = BinaryDictionaryBuilder::with_capacity(lower, 256, 1024); @@ -513,14 +532,17 @@ mod tests { builder.finish() } - fn convert_string_iterator_to_binary_dictionary<'a, K: ArrowDictionaryKeyType, I: IntoIterator>>(iter: I) -> DictionaryArray { + fn convert_string_iterator_to_binary_dictionary< + 'a, + K: ArrowDictionaryKeyType, + I: IntoIterator>, + >( + iter: I, + ) -> DictionaryArray { let it = iter.into_iter(); let (lower, _) = it.size_hint(); let mut builder = BinaryDictionaryBuilder::with_capacity(lower, 256, 1024); - it.for_each(|i| { - builder - .append_option(i.map(|i| i.as_bytes())) - }); + it.for_each(|i| builder.append_option(i.map(|i| i.as_bytes()))); builder.finish() } @@ -571,8 +593,10 @@ mod tests { let res = $op(&left, &right).unwrap(); assert_eq!(res, expected); - let left: DictionaryArray = convert_binary_iterator_to_binary_dictionary(left_binary); - let right: DictionaryArray = convert_binary_iterator_to_binary_dictionary(right_binary); + let left: DictionaryArray = + convert_binary_iterator_to_binary_dictionary(left_binary); + let right: DictionaryArray = + convert_binary_iterator_to_binary_dictionary(right_binary); let res = $op(&left, &right).unwrap(); assert_eq!(res, expected); } @@ -625,15 +649,16 @@ mod tests { let res = $op(&left, &Scalar::new(&right)).unwrap(); assert_eq!(res, expected); - let left: DictionaryArray = convert_binary_iterator_to_binary_dictionary(left_binary); - let right: DictionaryArray = convert_binary_iterator_to_binary_dictionary([right_binary]); + let left: DictionaryArray = + convert_binary_iterator_to_binary_dictionary(left_binary); + let right: DictionaryArray = + convert_binary_iterator_to_binary_dictionary([right_binary]); let res = $op(&left, &Scalar::new(&right)).unwrap(); assert_eq!(res, expected); } }; } - trait IntoBinaryScalar { fn 
into_binary_scalar(self) -> Scalar; fn into_large_binary_scalar(self) -> Scalar; @@ -1276,46 +1301,74 @@ mod tests { vec![true, false, true, true, true] ); - fn like_utf8_scalar(data: &Vec>, pattern: &str) -> Result { + fn like_utf8_scalar( + data: &Vec>, + pattern: &str, + ) -> Result { let dict_array: DictionaryArray = data.clone().into_iter().collect(); like_utf8_scalar_dyn(&dict_array, pattern) } - fn like_binary_scalar(data: &Vec>, pattern: &str) -> Result { - let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data.clone()); + fn like_binary_scalar( + data: &Vec>, + pattern: &str, + ) -> Result { + let dict_array: DictionaryArray = + convert_string_iterator_to_binary_dictionary(data.clone()); like(&dict_array, &pattern.into_binary_scalar()) } - fn nlike_utf8_scalar(data: &Vec>, pattern: &str) -> Result { + fn nlike_utf8_scalar( + data: &Vec>, + pattern: &str, + ) -> Result { let dict_array: DictionaryArray = data.clone().into_iter().collect(); nlike_utf8_scalar_dyn(&dict_array, pattern) } - fn nlike_binary_scalar(data: &Vec>, pattern: &str) -> Result { - let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data.clone()); + fn nlike_binary_scalar( + data: &Vec>, + pattern: &str, + ) -> Result { + let dict_array: DictionaryArray = + convert_string_iterator_to_binary_dictionary(data.clone()); nlike(&dict_array, &pattern.into_binary_scalar()) } - fn ilike_utf8_scalar(data: &Vec>, pattern: &str) -> Result { + fn ilike_utf8_scalar( + data: &Vec>, + pattern: &str, + ) -> Result { let dict_array: DictionaryArray = data.clone().into_iter().collect(); ilike_utf8_scalar_dyn(&dict_array, pattern) } - fn ilike_binary_scalar(data: &Vec>, pattern: &str) -> Result { - let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data.clone()); + fn ilike_binary_scalar( + data: &Vec>, + pattern: &str, + ) -> Result { + let dict_array: DictionaryArray = + 
convert_string_iterator_to_binary_dictionary(data.clone()); ilike(&dict_array, &pattern.into_binary_scalar()) } - fn nilike_utf8_scalar(data: &Vec>, pattern: &str) -> Result { + fn nilike_utf8_scalar( + data: &Vec>, + pattern: &str, + ) -> Result { let dict_array: DictionaryArray = data.clone().into_iter().collect(); nilike_utf8_scalar_dyn(&dict_array, pattern) } - fn nilike_binary_scalar(data: &Vec>, pattern: &str) -> Result { - let dict_array: DictionaryArray = convert_string_iterator_to_binary_dictionary(data.clone()); + fn nilike_binary_scalar( + data: &Vec>, + pattern: &str, + ) -> Result { + let dict_array: DictionaryArray = + convert_string_iterator_to_binary_dictionary(data.clone()); nilike(&dict_array, &pattern.into_binary_scalar()) } @@ -1477,8 +1530,6 @@ mod tests { Some("bbbbb\nAir"), ]; - - for func in &[nlike_utf8_scalar, nlike_binary_scalar] { assert_eq!( func(&data, "Air").unwrap(), @@ -1769,8 +1820,6 @@ mod tests { Some("bbbbb\nAir"), ]; - - for func in &[nilike_utf8_scalar, nilike_binary_scalar] { assert_eq!( func(&data, "air").unwrap(), diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 034175b43de1..e5c12b99706f 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -15,13 +15,14 @@ // specific language governing permissions and limitations // under the License. 
- use arrow_array::{Array, ArrayAccessor, BinaryViewArray, BooleanArray, StringViewArray}; use arrow_buffer::BooleanBuffer; use arrow_schema::ArrowError; use memchr::memchr3; use memchr::memmem::Finder; -use regex::{Regex, RegexBuilder, bytes::Regex as BinaryRegex, bytes::RegexBuilder as BinaryRegexBuilder}; +use regex::{ + bytes::Regex as BinaryRegex, bytes::RegexBuilder as BinaryRegexBuilder, Regex, RegexBuilder, +}; use std::iter::zip; /// A string based predicate @@ -59,7 +60,6 @@ pub enum BinaryPredicate<'a> { } pub trait PredicateImpl<'a>: Sized { - type UnsizedItem: ?Sized + PartialEq; type RegexType; @@ -86,7 +86,8 @@ pub trait PredicateImpl<'a>: Sized { #[inline(never)] fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray where - T: ArrayAccessor, Self::UnsizedItem: 'i; + T: ArrayAccessor, + Self::UnsizedItem: 'i; /// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: /// @@ -95,7 +96,10 @@ pub trait PredicateImpl<'a>: Sized { /// 2. Replace `LIKE` single-character wildcards `_` => `.` /// 3. Escape regex meta characters to match them and not be evaluated as regex special chars. e.g. `.` => `\\.` /// 4. Replace escaped `LIKE` wildcards removing the escape characters to be able to match it as a regex. e.g. 
`\\%` => `%` - fn regex_like(pattern: &'a Self::UnsizedItem, case_insensitive: bool) -> Result; + fn regex_like( + pattern: &'a Self::UnsizedItem, + case_insensitive: bool, + ) -> Result; } impl<'a> PredicateImpl<'a> for Predicate<'a> { @@ -106,7 +110,9 @@ impl<'a> PredicateImpl<'a> for Predicate<'a> { fn like(pattern: &'a str) -> Result { if !contains_like_pattern(pattern.as_bytes()) { Ok(Self::Eq(pattern)) - } else if pattern.ends_with('%') && !contains_like_pattern(&pattern[..pattern.len() - 1].as_bytes()) { + } else if pattern.ends_with('%') + && !contains_like_pattern(&pattern[..pattern.len() - 1].as_bytes()) + { Ok(Self::StartsWith(&pattern[..pattern.len() - 1])) } else if pattern.starts_with('%') && !contains_like_pattern(&pattern[1..].as_bytes()) { Ok(Self::EndsWith(&pattern[1..])) @@ -156,11 +162,17 @@ impl<'a> PredicateImpl<'a> for Predicate<'a> { Self::IEqAscii(v) => haystack.eq_ignore_ascii_case(v), Self::Contains(finder) => finder.find(haystack.as_bytes()).is_some(), Self::StartsWith(v) => starts_with(haystack.as_bytes(), v.as_bytes(), equals_kernel), - Self::IStartsWithAscii(v) => { - starts_with(haystack.as_bytes(), v.as_bytes(), equals_ignore_ascii_case_kernel) - } + Self::IStartsWithAscii(v) => starts_with( + haystack.as_bytes(), + v.as_bytes(), + equals_ignore_ascii_case_kernel, + ), Self::EndsWith(v) => ends_with(haystack.as_bytes(), v.as_bytes(), equals_kernel), - Self::IEndsWithAscii(v) => ends_with(haystack.as_bytes(), v.as_bytes(), equals_ignore_ascii_case_kernel), + Self::IEndsWithAscii(v) => ends_with( + haystack.as_bytes(), + v.as_bytes(), + equals_ignore_ascii_case_kernel, + ), Self::Regex(v) => v.is_match(haystack), } } @@ -219,7 +231,11 @@ impl<'a> PredicateImpl<'a> for Predicate<'a> { BooleanArray::new(values, nulls) } else { BooleanArray::from_unary(array, |haystack| { - starts_with(haystack.as_bytes(), v.as_bytes(), equals_ignore_ascii_case_kernel) != negate + starts_with( + haystack.as_bytes(), + v.as_bytes(), + 
equals_ignore_ascii_case_kernel, + ) != negate }) } } @@ -259,7 +275,11 @@ impl<'a> PredicateImpl<'a> for Predicate<'a> { BooleanArray::new(values, nulls) } else { BooleanArray::from_unary(array, |haystack| { - ends_with(haystack.as_bytes(), v.as_bytes(), equals_ignore_ascii_case_kernel) != negate + ends_with( + haystack.as_bytes(), + v.as_bytes(), + equals_ignore_ascii_case_kernel, + ) != negate }) } } @@ -393,9 +413,7 @@ impl<'a> PredicateImpl<'a> for BinaryPredicate<'a> { Self::IEqAscii(v) => haystack.eq_ignore_ascii_case(v), Self::Contains(finder) => finder.find(haystack).is_some(), Self::StartsWith(v) => starts_with(haystack, v, equals_kernel), - Self::IStartsWithAscii(v) => { - starts_with(haystack, v, equals_ignore_ascii_case_kernel) - } + Self::IStartsWithAscii(v) => starts_with(haystack, v, equals_ignore_ascii_case_kernel), Self::EndsWith(v) => ends_with(haystack, v, equals_kernel), Self::IEndsWithAscii(v) => ends_with(haystack, v, equals_ignore_ascii_case_kernel), Self::Regex(v) => v.is_match(haystack), @@ -426,9 +444,7 @@ impl<'a> PredicateImpl<'a> for BinaryPredicate<'a> { let values = BooleanBuffer::from( binary_view_array .prefix_bytes_iter(v.len()) - .map(|haystack| { - equals_bytes(haystack, v, equals_kernel) != negate - }) + .map(|haystack| equals_bytes(haystack, v, equals_kernel) != negate) .collect::>(), ); BooleanArray::new(values, nulls) @@ -445,11 +461,7 @@ impl<'a> PredicateImpl<'a> for BinaryPredicate<'a> { binary_view_array .prefix_bytes_iter(v.len()) .map(|haystack| { - equals_bytes( - haystack, - v, - equals_ignore_ascii_case_kernel, - ) != negate + equals_bytes(haystack, v, equals_ignore_ascii_case_kernel) != negate }) .collect::>(), ); @@ -466,9 +478,7 @@ impl<'a> PredicateImpl<'a> for BinaryPredicate<'a> { let values = BooleanBuffer::from( binary_view_array .suffix_bytes_iter(v.len()) - .map(|haystack| { - equals_bytes(haystack, v, equals_kernel) != negate - }) + .map(|haystack| equals_bytes(haystack, v, equals_kernel) != negate) 
.collect::>(), ); BooleanArray::new(values, nulls) @@ -485,11 +495,7 @@ impl<'a> PredicateImpl<'a> for BinaryPredicate<'a> { binary_view_array .suffix_bytes_iter(v.len()) .map(|haystack| { - equals_bytes( - haystack, - v, - equals_ignore_ascii_case_kernel, - ) != negate + equals_bytes(haystack, v, equals_ignore_ascii_case_kernel) != negate }) .collect::>(), ); @@ -578,7 +584,11 @@ fn equals_bytes(lhs: &[u8], rhs: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) -> b /// This is faster than `str::starts_with` for small strings. /// See for more details. -fn starts_with(haystack: &[u8], needle: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { +fn starts_with( + haystack: &[u8], + needle: &[u8], + byte_eq_kernel: impl Fn((&u8, &u8)) -> bool, +) -> bool { if needle.len() > haystack.len() { false } else { @@ -591,11 +601,7 @@ fn ends_with(haystack: &[u8], needle: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) if needle.len() > haystack.len() { false } else { - zip( - haystack.iter().rev(), - needle.iter().rev(), - ) - .all(byte_eq_kernel) + zip(haystack.iter().rev(), needle.iter().rev()).all(byte_eq_kernel) } } From 9237d2526ca1a2ea543c0aaf1298b30d080ca7ce Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 31 Dec 2024 18:34:58 +0000 Subject: [PATCH 5/9] avoid duplicate code even more --- arrow-string/src/like.rs | 56 ++-- arrow-string/src/predicate.rs | 553 ++++++++++++---------------------- 2 files changed, 213 insertions(+), 396 deletions(-) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 15ca24bfc3ac..70fce4f017ca 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -27,7 +27,6 @@ use arrow_select::take::take; use std::sync::Arc; pub use arrow_array::StringArrayType; -use arrow_schema::DataType::{LargeUtf8, Utf8, Utf8View}; #[derive(Debug)] enum Op { @@ -382,7 +381,7 @@ fn binary_predicate<'a, Predicate: PredicateImpl<'a> + 'a>( (Some(l), Some(r)) => { let p: &Predicate = 
match previous { Some((expr, ref predicate)) if expr == r => predicate, - _ => &previous.insert((r, f(&r)?)).1, + _ => &previous.insert((r, f(r)?)).1, }; Ok(Some(p.evaluate(l) != neg)) } @@ -535,14 +534,13 @@ mod tests { fn convert_string_iterator_to_binary_dictionary< 'a, K: ArrowDictionaryKeyType, - I: IntoIterator>, + I: Iterator>, >( iter: I, ) -> DictionaryArray { - let it = iter.into_iter(); - let (lower, _) = it.size_hint(); + let (lower, _) = iter.size_hint(); let mut builder = BinaryDictionaryBuilder::with_capacity(lower, 256, 1024); - it.for_each(|i| builder.append_option(i.map(|i| i.as_bytes()))); + iter.for_each(|i| builder.append_option(i.map(|i| i.as_bytes()))); builder.finish() } @@ -661,17 +659,12 @@ mod tests { trait IntoBinaryScalar { fn into_binary_scalar(self) -> Scalar; - fn into_large_binary_scalar(self) -> Scalar; } impl IntoBinaryScalar for &str { fn into_binary_scalar(self) -> Scalar { Scalar::new(make_scalar(&DataType::Binary, self).unwrap()) } - - fn into_large_binary_scalar(self) -> Scalar { - Scalar::new(make_scalar(&DataType::LargeBinary, self).unwrap()) - } } test_utf8!( @@ -1301,74 +1294,65 @@ mod tests { vec![true, false, true, true, true] ); - fn like_utf8_scalar( - data: &Vec>, - pattern: &str, - ) -> Result { - let dict_array: DictionaryArray = data.clone().into_iter().collect(); + fn like_utf8_scalar(data: &[Option<&str>], pattern: &str) -> Result { + let dict_array: DictionaryArray = data.iter().cloned().collect(); like_utf8_scalar_dyn(&dict_array, pattern) } fn like_binary_scalar( - data: &Vec>, + data: &[Option<&str>], pattern: &str, ) -> Result { let dict_array: DictionaryArray = - convert_string_iterator_to_binary_dictionary(data.clone()); + convert_string_iterator_to_binary_dictionary(data.iter()); like(&dict_array, &pattern.into_binary_scalar()) } - fn nlike_utf8_scalar( - data: &Vec>, - pattern: &str, - ) -> Result { - let dict_array: DictionaryArray = data.clone().into_iter().collect(); + fn nlike_utf8_scalar(data: 
&[Option<&str>], pattern: &str) -> Result { + let dict_array: DictionaryArray = data.iter().cloned().collect(); nlike_utf8_scalar_dyn(&dict_array, pattern) } fn nlike_binary_scalar( - data: &Vec>, + data: &[Option<&str>], pattern: &str, ) -> Result { let dict_array: DictionaryArray = - convert_string_iterator_to_binary_dictionary(data.clone()); + convert_string_iterator_to_binary_dictionary(data.iter()); nlike(&dict_array, &pattern.into_binary_scalar()) } - fn ilike_utf8_scalar( - data: &Vec>, - pattern: &str, - ) -> Result { - let dict_array: DictionaryArray = data.clone().into_iter().collect(); + fn ilike_utf8_scalar(data: &[Option<&str>], pattern: &str) -> Result { + let dict_array: DictionaryArray = data.iter().cloned().collect(); ilike_utf8_scalar_dyn(&dict_array, pattern) } fn ilike_binary_scalar( - data: &Vec>, + data: &[Option<&str>], pattern: &str, ) -> Result { let dict_array: DictionaryArray = - convert_string_iterator_to_binary_dictionary(data.clone()); + convert_string_iterator_to_binary_dictionary(data.iter()); ilike(&dict_array, &pattern.into_binary_scalar()) } fn nilike_utf8_scalar( - data: &Vec>, + data: &[Option<&str>], pattern: &str, ) -> Result { - let dict_array: DictionaryArray = data.clone().into_iter().collect(); + let dict_array: DictionaryArray = data.iter().cloned().collect(); nilike_utf8_scalar_dyn(&dict_array, pattern) } fn nilike_binary_scalar( - data: &Vec>, + data: &[Option<&str>], pattern: &str, ) -> Result { let dict_array: DictionaryArray = - convert_string_iterator_to_binary_dictionary(data.clone()); + convert_string_iterator_to_binary_dictionary(data.iter()); nilike(&dict_array, &pattern.into_binary_scalar()) } diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index e5c12b99706f..e4a6186964f0 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -25,40 +25,6 @@ use regex::{ }; use std::iter::zip; -/// A string based predicate -pub enum Predicate<'a> { - Eq(&'a str), - 
Contains(Finder<'a>), - StartsWith(&'a str), - EndsWith(&'a str), - - /// Equality ignoring ASCII case - IEqAscii(&'a str), - /// Starts with ignoring ASCII case - IStartsWithAscii(&'a str), - /// Ends with ignoring ASCII case - IEndsWithAscii(&'a str), - - Regex(Regex), -} - -/// A string based predicate -pub enum BinaryPredicate<'a> { - Eq(&'a [u8]), - Contains(Finder<'a>), - StartsWith(&'a [u8]), - EndsWith(&'a [u8]), - - /// Equality ignoring ASCII case - IEqAscii(&'a [u8]), - /// Starts with ignoring ASCII case - IStartsWithAscii(&'a [u8]), - /// Ends with ignoring ASCII case - IEndsWithAscii(&'a [u8]), - - Regex(BinaryRegex), -} - pub trait PredicateImpl<'a>: Sized { type UnsizedItem: ?Sized + PartialEq; type RegexType; @@ -83,7 +49,6 @@ pub trait PredicateImpl<'a>: Sized { /// Evaluate this predicate against the elements of `array` /// /// If `negate` is true the result of the predicate will be negated - #[inline(never)] fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray where T: ArrayAccessor, @@ -101,24 +66,64 @@ pub trait PredicateImpl<'a>: Sized { case_insensitive: bool, ) -> Result; } +macro_rules! impl_predicate { + ( -impl<'a> PredicateImpl<'a> for Predicate<'a> { - type UnsizedItem = str; - type RegexType = Regex; +type PredicateUnsizedItem = $unsized_item: ty; + type MatchingRegexBuilder = $RegexBuilder: ty; + type ViewArray = $ViewArray: ident; + +impl<'a> PredicateImpl<'a> for $predicate: ident<'a> { + type UnsizedItem = PredicateUnsizedItem; + type RegexType = $regex_type: ty; + + ... 
+ +} + +fn as_bytes(item: &PredicateUnsizedItem) -> &[u8] { + item$($as_bytes_fn:tt)* +} + +fn to_char_iter(pattern: &PredicateUnsizedItem) -> impl Iterator { + pattern$($as_char_iter:tt)* +} + +const PERCENT: &'static PredicateUnsizedItem = $percent: literal; +const PERCENT_ESCAPED: &'static PredicateUnsizedItem = $percent_escaped: literal; + ) => { + +pub enum $predicate<'a> { + Eq(&'a $unsized_item), + Contains(Finder<'a>), + StartsWith(&'a $unsized_item), + EndsWith(&'a $unsized_item), + + /// Equality ignoring ASCII case + IEqAscii(&'a $unsized_item), + /// Starts with ignoring ASCII case + IStartsWithAscii(&'a $unsized_item), + /// Ends with ignoring ASCII case + IEndsWithAscii(&'a $unsized_item), + + Regex($regex_type), +} + +impl<'a> PredicateImpl<'a> for $predicate<'a> { + type UnsizedItem = $unsized_item; + type RegexType = $regex_type; /// Create a predicate for the given like pattern - fn like(pattern: &'a str) -> Result { - if !contains_like_pattern(pattern.as_bytes()) { + fn like(pattern: &'a Self::UnsizedItem) -> Result { + if !contains_like_pattern(pattern$($as_bytes_fn)*) { Ok(Self::Eq(pattern)) - } else if pattern.ends_with('%') - && !contains_like_pattern(&pattern[..pattern.len() - 1].as_bytes()) - { + } else if pattern.ends_with($percent) && !contains_like_pattern(&pattern[..pattern.len() - 1]$($as_bytes_fn)*) { Ok(Self::StartsWith(&pattern[..pattern.len() - 1])) - } else if pattern.starts_with('%') && !contains_like_pattern(&pattern[1..].as_bytes()) { + } else if pattern.starts_with($percent) && !contains_like_pattern(&pattern[1..]$($as_bytes_fn)*) { Ok(Self::EndsWith(&pattern[1..])) - } else if pattern.starts_with('%') - && pattern.ends_with('%') - && !contains_like_pattern(&pattern[1..pattern.len() - 1].as_bytes()) + } else if pattern.starts_with($percent) + && pattern.ends_with($percent) + && !contains_like_pattern(&pattern[1..pattern.len() - 1]$($as_bytes_fn)*) { Ok(Self::contains(&pattern[1..pattern.len() - 1])) } else { @@ -126,53 +131,47 
@@ impl<'a> PredicateImpl<'a> for Predicate<'a> { } } - fn contains(needle: &'a str) -> Self { - Self::Contains(Finder::new(needle.as_bytes())) + fn contains(needle: &'a Self::UnsizedItem) -> Self { + Self::Contains(Finder::new(needle$($as_bytes_fn)*)) } /// Create a predicate for the given ilike pattern - fn ilike(pattern: &'a str, is_ascii: bool) -> Result { + fn ilike(pattern: &'a Self::UnsizedItem, is_ascii: bool) -> Result { if is_ascii && pattern.is_ascii() { - if !contains_like_pattern(pattern.as_bytes()) { + if !contains_like_pattern(pattern$($as_bytes_fn)*) { return Ok(Self::IEqAscii(pattern)); - } else if pattern.ends_with('%') - && !pattern.ends_with("\\%") - && !contains_like_pattern(&pattern[..pattern.len() - 1].as_bytes()) + } else if pattern.ends_with($percent) + && !pattern.ends_with($percent_escaped) + && !contains_like_pattern(&pattern[..pattern.len() - 1]$($as_bytes_fn)*) { return Ok(Self::IStartsWithAscii(&pattern[..pattern.len() - 1])); - } else if pattern.starts_with('%') && !contains_like_pattern(&pattern[1..].as_bytes()) { + } else if pattern.starts_with($percent) && !contains_like_pattern(&pattern[1..]$($as_bytes_fn)*) { return Ok(Self::IEndsWithAscii(&pattern[1..])); } } Ok(Self::Regex(Self::regex_like(pattern, true)?)) } - fn starts_with(pattern: &'a str) -> Self { + fn starts_with(pattern: &'a Self::UnsizedItem) -> Self { Self::StartsWith(pattern) } - fn ends_with(pattern: &'a str) -> Self { + fn ends_with(pattern: &'a Self::UnsizedItem) -> Self { Self::EndsWith(pattern) } /// Evaluate this predicate against the given haystack - fn evaluate(&self, haystack: &'a str) -> bool { + fn evaluate(&self, haystack: &'a Self::UnsizedItem) -> bool { match self { Self::Eq(v) => *v == haystack, Self::IEqAscii(v) => haystack.eq_ignore_ascii_case(v), - Self::Contains(finder) => finder.find(haystack.as_bytes()).is_some(), - Self::StartsWith(v) => starts_with(haystack.as_bytes(), v.as_bytes(), equals_kernel), - Self::IStartsWithAscii(v) => starts_with( - 
haystack.as_bytes(), - v.as_bytes(), - equals_ignore_ascii_case_kernel, - ), - Self::EndsWith(v) => ends_with(haystack.as_bytes(), v.as_bytes(), equals_kernel), - Self::IEndsWithAscii(v) => ends_with( - haystack.as_bytes(), - v.as_bytes(), - equals_ignore_ascii_case_kernel, - ), + Self::Contains(finder) => finder.find(haystack$($as_bytes_fn)*).is_some(), + Self::StartsWith(v) => starts_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_kernel), + Self::IStartsWithAscii(v) => { + starts_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_ignore_ascii_case_kernel) + } + Self::EndsWith(v) => ends_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_kernel), + Self::IEndsWithAscii(v) => ends_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_ignore_ascii_case_kernel), Self::Regex(v) => v.is_match(haystack), } } @@ -183,7 +182,7 @@ impl<'a> PredicateImpl<'a> for Predicate<'a> { #[inline(never)] fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray where - T: ArrayAccessor, + T: ArrayAccessor, { match self { Self::Eq(v) => BooleanArray::from_unary(array, |haystack| { @@ -193,36 +192,36 @@ impl<'a> PredicateImpl<'a> for Predicate<'a> { haystack.eq_ignore_ascii_case(v) != negate }), Self::Contains(finder) => BooleanArray::from_unary(array, |haystack| { - finder.find(haystack.as_bytes()).is_some() != negate + finder.find(haystack$($as_bytes_fn)*).is_some() != negate }), Self::StartsWith(v) => { - if let Some(string_view_array) = array.as_any().downcast_ref::() { - let nulls = string_view_array.logical_nulls(); + if let Some(view_array) = array.as_any().downcast_ref::<$ViewArray>() { + let nulls = view_array.logical_nulls(); let values = BooleanBuffer::from( - string_view_array + view_array .prefix_bytes_iter(v.len()) .map(|haystack| { - equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate + equals_bytes(haystack, v$($as_bytes_fn)*, equals_kernel) != negate }) .collect::>(), ); BooleanArray::new(values, nulls) } else { 
BooleanArray::from_unary(array, |haystack| { - starts_with(haystack.as_bytes(), v.as_bytes(), equals_kernel) != negate + starts_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_kernel) != negate }) } } Self::IStartsWithAscii(v) => { - if let Some(string_view_array) = array.as_any().downcast_ref::() { - let nulls = string_view_array.logical_nulls(); + if let Some(view_array) = array.as_any().downcast_ref::<$ViewArray>() { + let nulls = view_array.logical_nulls(); let values = BooleanBuffer::from( - string_view_array + view_array .prefix_bytes_iter(v.len()) .map(|haystack| { equals_bytes( haystack, - v.as_bytes(), + v$($as_bytes_fn)*, equals_ignore_ascii_case_kernel, ) != negate }) @@ -231,42 +230,38 @@ impl<'a> PredicateImpl<'a> for Predicate<'a> { BooleanArray::new(values, nulls) } else { BooleanArray::from_unary(array, |haystack| { - starts_with( - haystack.as_bytes(), - v.as_bytes(), - equals_ignore_ascii_case_kernel, - ) != negate + starts_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_ignore_ascii_case_kernel) != negate }) } } Self::EndsWith(v) => { - if let Some(string_view_array) = array.as_any().downcast_ref::() { - let nulls = string_view_array.logical_nulls(); + if let Some(view_array) = array.as_any().downcast_ref::<$ViewArray>() { + let nulls = view_array.logical_nulls(); let values = BooleanBuffer::from( - string_view_array + view_array .suffix_bytes_iter(v.len()) .map(|haystack| { - equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate + equals_bytes(haystack, v$($as_bytes_fn)*, equals_kernel) != negate }) .collect::>(), ); BooleanArray::new(values, nulls) } else { BooleanArray::from_unary(array, |haystack| { - ends_with(haystack.as_bytes(), v.as_bytes(), equals_kernel) != negate + ends_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_kernel) != negate }) } } Self::IEndsWithAscii(v) => { - if let Some(string_view_array) = array.as_any().downcast_ref::() { - let nulls = string_view_array.logical_nulls(); + if let 
Some(view_array) = array.as_any().downcast_ref::<$ViewArray>() { + let nulls = view_array.logical_nulls(); let values = BooleanBuffer::from( - string_view_array + view_array .suffix_bytes_iter(v.len()) .map(|haystack| { equals_bytes( haystack, - v.as_bytes(), + v$($as_bytes_fn)*, equals_ignore_ascii_case_kernel, ) != negate }) @@ -275,11 +270,7 @@ impl<'a> PredicateImpl<'a> for Predicate<'a> { BooleanArray::new(values, nulls) } else { BooleanArray::from_unary(array, |haystack| { - ends_with( - haystack.as_bytes(), - v.as_bytes(), - equals_ignore_ascii_case_kernel, - ) != negate + ends_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_ignore_ascii_case_kernel) != negate }) } } @@ -289,61 +280,9 @@ impl<'a> PredicateImpl<'a> for Predicate<'a> { } } - /// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: - /// - /// 1. Replace `LIKE` multi-character wildcards `%` => `.*` (unless they're at the start or end of the pattern, - /// where the regex is just truncated - e.g. `%foo%` => `foo` rather than `^.*foo.*$`) - /// 2. Replace `LIKE` single-character wildcards `_` => `.` - /// 3. Escape regex meta characters to match them and not be evaluated as regex special chars. e.g. `.` => `\\.` - /// 4. Replace escaped `LIKE` wildcards removing the escape characters to be able to match it as a regex. e.g. 
`\\%` => `%` - fn regex_like(pattern: &str, case_insensitive: bool) -> Result { - let mut result = String::with_capacity(pattern.len() * 2); - let mut chars_iter = pattern.chars().peekable(); - match chars_iter.peek() { - // if the pattern starts with `%`, we avoid starting the regex with a slow but meaningless `^.*` - Some('%') => { - chars_iter.next(); - } - _ => result.push('^'), - }; - - while let Some(c) = chars_iter.next() { - match c { - '\\' => { - match chars_iter.peek() { - Some(&next) => { - if regex_syntax::is_meta_character(next) { - result.push('\\'); - } - result.push(next); - // Skipping the next char as it is already appended - chars_iter.next(); - } - None => { - // Trailing backslash in the pattern. E.g. PostgreSQL and Trino treat it as an error, but e.g. Snowflake treats it as a literal backslash - result.push('\\'); - result.push('\\'); - } - } - } - '%' => result.push_str(".*"), - '_' => result.push('.'), - c => { - if regex_syntax::is_meta_character(c) { - result.push('\\'); - } - result.push(c); - } - } - } - // instead of ending the regex with `.*$` and making it needlessly slow, we just end the regex - if result.ends_with(".*") { - result.pop(); - result.pop(); - } else { - result.push('$'); - } - RegexBuilder::new(&result) + fn regex_like(pattern: &Self::UnsizedItem, case_insensitive: bool) -> Result { + let regex_pattern = transform_pattern_like_to_regex_compatible_pattern(pattern$($as_char_iter)+, pattern.len()); + <$RegexBuilder>::new(®ex_pattern) .case_insensitive(case_insensitive) .dot_matches_new_line(true) .build() @@ -354,228 +293,120 @@ impl<'a> PredicateImpl<'a> for Predicate<'a> { }) } } + } +} -impl<'a> PredicateImpl<'a> for BinaryPredicate<'a> { - type UnsizedItem = [u8]; - type RegexType = BinaryRegex; +impl_predicate!( - /// Create a predicate for the given like pattern - fn like(pattern: &'a [u8]) -> Result { - if !contains_like_pattern(pattern) { - Ok(Self::Eq(pattern)) - } else if pattern.ends_with(b"%") && 
!contains_like_pattern(&pattern[..pattern.len() - 1]) { - Ok(Self::StartsWith(&pattern[..pattern.len() - 1])) - } else if pattern.starts_with(b"%") && !contains_like_pattern(&pattern[1..]) { - Ok(Self::EndsWith(&pattern[1..])) - } else if pattern.starts_with(b"%") - && pattern.ends_with(b"%") - && !contains_like_pattern(&pattern[1..pattern.len() - 1]) - { - Ok(Self::contains(&pattern[1..pattern.len() - 1])) - } else { - Ok(Self::Regex(Self::regex_like(pattern, false)?)) - } - } +type PredicateUnsizedItem = str; +type MatchingRegexBuilder = RegexBuilder; +type ViewArray = StringViewArray; - fn contains(needle: &'a [u8]) -> Self { - Self::Contains(Finder::new(needle)) - } +impl<'a> PredicateImpl<'a> for Predicate<'a> { + type UnsizedItem = PredicateUnsizedItem; + type RegexType = Regex; - /// Create a predicate for the given ilike pattern - fn ilike(pattern: &'a [u8], is_ascii: bool) -> Result { - if is_ascii && pattern.is_ascii() { - if !contains_like_pattern(pattern) { - return Ok(Self::IEqAscii(pattern)); - } else if pattern.ends_with(b"%") - && !pattern.ends_with(b"\\%") - && !contains_like_pattern(&pattern[..pattern.len() - 1]) - { - return Ok(Self::IStartsWithAscii(&pattern[..pattern.len() - 1])); - } else if pattern.starts_with(b"%") && !contains_like_pattern(&pattern[1..]) { - return Ok(Self::IEndsWithAscii(&pattern[1..])); - } - } - Ok(Self::Regex(Self::regex_like(pattern, true)?)) - } + ... 
- fn starts_with(pattern: &'a [u8]) -> Self { - Self::StartsWith(pattern) - } +} - fn ends_with(pattern: &'a [u8]) -> Self { - Self::EndsWith(pattern) - } +fn as_bytes(item: &PredicateUnsizedItem) -> &[u8] { + item.as_bytes() +} - /// Evaluate this predicate against the given haystack - fn evaluate(&self, haystack: &[u8]) -> bool { - match self { - Self::Eq(v) => *v == haystack, - Self::IEqAscii(v) => haystack.eq_ignore_ascii_case(v), - Self::Contains(finder) => finder.find(haystack).is_some(), - Self::StartsWith(v) => starts_with(haystack, v, equals_kernel), - Self::IStartsWithAscii(v) => starts_with(haystack, v, equals_ignore_ascii_case_kernel), - Self::EndsWith(v) => ends_with(haystack, v, equals_kernel), - Self::IEndsWithAscii(v) => ends_with(haystack, v, equals_ignore_ascii_case_kernel), - Self::Regex(v) => v.is_match(haystack), - } - } +fn to_char_iter(pattern: &PredicateUnsizedItem) -> impl Iterator { + pattern.chars() +} - /// Evaluate this predicate against the elements of `array` - /// - /// If `negate` is true the result of the predicate will be negated - #[inline(never)] - fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray - where - T: ArrayAccessor, - { - match self { - Self::Eq(v) => BooleanArray::from_unary(array, |haystack| { - (haystack.len() == v.len() && haystack == *v) != negate - }), - Self::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { - haystack.eq_ignore_ascii_case(v) != negate - }), - Self::Contains(finder) => BooleanArray::from_unary(array, |haystack| { - finder.find(haystack).is_some() != negate - }), - Self::StartsWith(v) => { - if let Some(binary_view_array) = array.as_any().downcast_ref::() { - let nulls = binary_view_array.logical_nulls(); - let values = BooleanBuffer::from( - binary_view_array - .prefix_bytes_iter(v.len()) - .map(|haystack| equals_bytes(haystack, v, equals_kernel) != negate) - .collect::>(), - ); - BooleanArray::new(values, nulls) - } else { - BooleanArray::from_unary(array, 
|haystack| { - starts_with(haystack, v, equals_kernel) != negate - }) - } - } - Self::IStartsWithAscii(v) => { - if let Some(binary_view_array) = array.as_any().downcast_ref::() { - let nulls = binary_view_array.logical_nulls(); - let values = BooleanBuffer::from( - binary_view_array - .prefix_bytes_iter(v.len()) - .map(|haystack| { - equals_bytes(haystack, v, equals_ignore_ascii_case_kernel) != negate - }) - .collect::>(), - ); - BooleanArray::new(values, nulls) - } else { - BooleanArray::from_unary(array, |haystack| { - starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate - }) - } - } - Self::EndsWith(v) => { - if let Some(binary_view_array) = array.as_any().downcast_ref::() { - let nulls = binary_view_array.logical_nulls(); - let values = BooleanBuffer::from( - binary_view_array - .suffix_bytes_iter(v.len()) - .map(|haystack| equals_bytes(haystack, v, equals_kernel) != negate) - .collect::>(), - ); - BooleanArray::new(values, nulls) - } else { - BooleanArray::from_unary(array, |haystack| { - ends_with(haystack, v, equals_kernel) != negate - }) - } - } - Self::IEndsWithAscii(v) => { - if let Some(binary_view_array) = array.as_any().downcast_ref::() { - let nulls = binary_view_array.logical_nulls(); - let values = BooleanBuffer::from( - binary_view_array - .suffix_bytes_iter(v.len()) - .map(|haystack| { - equals_bytes(haystack, v, equals_ignore_ascii_case_kernel) != negate - }) - .collect::>(), - ); - BooleanArray::new(values, nulls) - } else { - BooleanArray::from_unary(array, |haystack| { - ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate - }) - } - } - Self::Regex(v) => { - BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate) - } - } - } +const PERCENT: &'static PredicateUnsizedItem = "%"; +const PERCENT_ESCAPED: &'static PredicateUnsizedItem = "\\%"; +); - /// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: - /// - /// 1. 
Replace `LIKE` multi-character wildcards `%` => `.*` (unless they're at the start or end of the pattern, - /// where the regex is just truncated - e.g. `%foo%` => `foo` rather than `^.*foo.*$`) - /// 2. Replace `LIKE` single-character wildcards `_` => `.` - /// 3. Escape regex meta characters to match them and not be evaluated as regex special chars. e.g. `.` => `\\.` - /// 4. Replace escaped `LIKE` wildcards removing the escape characters to be able to match it as a regex. e.g. `\\%` => `%` - fn regex_like(pattern: &[u8], case_insensitive: bool) -> Result { - let mut result = String::with_capacity(pattern.len() * 2); - let mut chars_iter = pattern.iter().peekable(); - match chars_iter.peek() { - // if the pattern starts with `%`, we avoid starting the regex with a slow but meaningless `^.*` - Some(b'%') => { - chars_iter.next(); - } - _ => result.push('^'), - }; - - while let Some(b) = chars_iter.next() { - match b { - b'\\' => { - match chars_iter.peek() { - Some(&next) => { - if regex_syntax::is_meta_character(*next as char) { - result.push('\\'); - } - result.push(*next as char); - // Skipping the next char as it is already appended - chars_iter.next(); - } - None => { - // Trailing backslash in the pattern. E.g. PostgreSQL and Trino treat it as an error, but e.g. Snowflake treats it as a literal backslash - result.push('\\'); +impl_predicate!( + +type PredicateUnsizedItem = [u8]; +type MatchingRegexBuilder = BinaryRegexBuilder; +type ViewArray = BinaryViewArray; + +impl<'a> PredicateImpl<'a> for BinaryPredicate<'a> { + type UnsizedItem = PredicateUnsizedItem; + type RegexType = BinaryRegex; + + ... 
+ +} + +fn as_bytes(item: &PredicateUnsizedItem) -> &[u8] { + item +} + +fn to_char_iter(pattern: &PredicateUnsizedItem) -> impl Iterator { + pattern.iter().map(|&b| b as char) +} + +const PERCENT: &'static PredicateUnsizedItem = b"%"; +const PERCENT_ESCAPED: &'static PredicateUnsizedItem = b"\\%"; +); + +/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: +/// +/// 1. Replace `LIKE` multi-character wildcards `%` => `.*` (unless they're at the start or end of the pattern, +/// where the regex is just truncated - e.g. `%foo%` => `foo` rather than `^.*foo.*$`) +/// 2. Replace `LIKE` single-character wildcards `_` => `.` +/// 3. Escape regex meta characters to match them and not be evaluated as regex special chars. e.g. `.` => `\\.` +/// 4. Replace escaped `LIKE` wildcards removing the escape characters to be able to match it as a regex. e.g. `\\%` => `%` +fn transform_pattern_like_to_regex_compatible_pattern>( + pattern: PatternIter, + length: usize, +) -> String { + let mut result = String::with_capacity(length * 2); + let mut chars_iter = pattern.peekable(); + match chars_iter.peek() { + // if the pattern starts with `%`, we avoid starting the regex with a slow but meaningless `^.*` + Some('%') => { + chars_iter.next(); + } + _ => result.push('^'), + }; + + while let Some(c) = chars_iter.next() { + match c { + '\\' => { + match chars_iter.peek() { + Some(&next) => { + if regex_syntax::is_meta_character(next) { result.push('\\'); } + result.push(next); + // Skipping the next char as it is already appended + chars_iter.next(); } - } - b'%' => result.push_str(".*"), - b'_' => result.push('.'), - b => { - if regex_syntax::is_meta_character(*b as char) { + None => { + // Trailing backslash in the pattern. E.g. PostgreSQL and Trino treat it as an error, but e.g. 
Snowflake treats it as a literal backslash + result.push('\\'); result.push('\\'); } - result.push(*b as char); } } + '%' => result.push_str(".*"), + '_' => result.push('.'), + c => { + if regex_syntax::is_meta_character(c) { + result.push('\\'); + } + result.push(c); + } } - // instead of ending the regex with `.*$` and making it needlessly slow, we just end the regex - if result.ends_with(".*") { - result.pop(); - result.pop(); - } else { - result.push('$'); - } - BinaryRegexBuilder::new(&result) - .case_insensitive(case_insensitive) - .dot_matches_new_line(true) - .build() - .map_err(|e| { - ArrowError::InvalidArgumentError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - }) } + // instead of ending the regex with `.*$` and making it needlessly slow, we just end the regex + if result.ends_with(".*") { + result.pop(); + result.pop(); + } else { + result.push('$'); + } + + result } fn equals_bytes(lhs: &[u8], rhs: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { @@ -647,6 +478,8 @@ mod tests { for (like_pattern, expected_regexp) in test_cases { let r = Predicate::regex_like(like_pattern, false).unwrap(); assert_eq!(r.to_string(), expected_regexp); + let r = BinaryPredicate::regex_like(like_pattern.as_bytes(), false).unwrap(); + assert_eq!(r.to_string(), expected_regexp); } } From e32425e5094efbb5f93b4b0e2d66dcf453045782 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 31 Dec 2024 19:19:46 +0000 Subject: [PATCH 6/9] try to add back as much original code as possible --- arrow-string/src/predicate.rs | 510 ++++++++++++++++++---------------- 1 file changed, 272 insertions(+), 238 deletions(-) diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index e4a6186964f0..09e53df3db60 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -25,8 +25,62 @@ use regex::{ }; use std::iter::zip; +pub trait SupportedPredicateItem: PartialEq +where + Self: 
'static, +{ + const PERCENT: &'static Self; + const PERCENT_ESCAPED: &'static Self; + + fn len(&self) -> usize; + + fn as_bytes(&self) -> &[u8]; + + fn to_char_iter(&self) -> impl Iterator; +} + +impl SupportedPredicateItem for str { + const PERCENT: &'static str = "%"; + const PERCENT_ESCAPED: &'static str = "\\%"; + + #[inline] + fn len(&self) -> usize { + self.len() + } + + #[inline] + fn as_bytes(&self) -> &[u8] { + self.as_bytes() + } + + #[inline] + fn to_char_iter(&self) -> impl Iterator { + self.chars() + } +} + +impl SupportedPredicateItem for [u8] { + const PERCENT: &'static [u8] = b"%"; + const PERCENT_ESCAPED: &'static [u8] = b"\\%"; + + #[inline] + fn len(&self) -> usize { + self.len() + } + + #[inline] + fn as_bytes(&self) -> &[u8] { + self + } + + #[inline] + fn to_char_iter(&self) -> impl Iterator { + self.iter().map(|&b| b as char) + } +} + pub trait PredicateImpl<'a>: Sized { - type UnsizedItem: ?Sized + PartialEq; + type UnsizedItem: SupportedPredicateItem + ?Sized; type RegexType; /// Create a predicate for the given like pattern @@ -68,232 +122,223 @@ pub trait PredicateImpl<'a>: Sized { } macro_rules! impl_predicate { ( - -type PredicateUnsizedItem = $unsized_item: ty; - type MatchingRegexBuilder = $RegexBuilder: ty; - type ViewArray = $ViewArray: ident; +type PredicateUnsizedItem = $PredicateItem: ty; +type MatchingRegexBuilder = $RegexBuilder: ty; +type ViewArray = $ViewArray: ident; impl<'a> PredicateImpl<'a> for $predicate: ident<'a> { type UnsizedItem = PredicateUnsizedItem; type RegexType = $regex_type: ty; ... 
- -} - -fn as_bytes(item: &PredicateUnsizedItem) -> &[u8] { - item$($as_bytes_fn:tt)* -} - -fn to_char_iter(pattern: &PredicateUnsizedItem) -> impl Iterator { - pattern$($as_char_iter:tt)* } - -const PERCENT: &'static PredicateUnsizedItem = $percent: literal; -const PERCENT_ESCAPED: &'static PredicateUnsizedItem = $percent_escaped: literal; ) => { - -pub enum $predicate<'a> { - Eq(&'a $unsized_item), - Contains(Finder<'a>), - StartsWith(&'a $unsized_item), - EndsWith(&'a $unsized_item), - - /// Equality ignoring ASCII case - IEqAscii(&'a $unsized_item), - /// Starts with ignoring ASCII case - IStartsWithAscii(&'a $unsized_item), - /// Ends with ignoring ASCII case - IEndsWithAscii(&'a $unsized_item), - - Regex($regex_type), -} - -impl<'a> PredicateImpl<'a> for $predicate<'a> { - type UnsizedItem = $unsized_item; - type RegexType = $regex_type; - - /// Create a predicate for the given like pattern - fn like(pattern: &'a Self::UnsizedItem) -> Result { - if !contains_like_pattern(pattern$($as_bytes_fn)*) { - Ok(Self::Eq(pattern)) - } else if pattern.ends_with($percent) && !contains_like_pattern(&pattern[..pattern.len() - 1]$($as_bytes_fn)*) { - Ok(Self::StartsWith(&pattern[..pattern.len() - 1])) - } else if pattern.starts_with($percent) && !contains_like_pattern(&pattern[1..]$($as_bytes_fn)*) { - Ok(Self::EndsWith(&pattern[1..])) - } else if pattern.starts_with($percent) - && pattern.ends_with($percent) - && !contains_like_pattern(&pattern[1..pattern.len() - 1]$($as_bytes_fn)*) - { - Ok(Self::contains(&pattern[1..pattern.len() - 1])) - } else { - Ok(Self::Regex(Self::regex_like(pattern, false)?)) + pub enum $predicate<'a> { + Eq(&'a $PredicateItem), + Contains(Finder<'a>), + StartsWith(&'a $PredicateItem), + EndsWith(&'a $PredicateItem), + + /// Equality ignoring ASCII case + IEqAscii(&'a $PredicateItem), + /// Starts with ignoring ASCII case + IStartsWithAscii(&'a $PredicateItem), + /// Ends with ignoring ASCII case + IEndsWithAscii(&'a $PredicateItem), + + 
Regex($regex_type), } - } - fn contains(needle: &'a Self::UnsizedItem) -> Self { - Self::Contains(Finder::new(needle$($as_bytes_fn)*)) - } + impl<'a> PredicateImpl<'a> for $predicate<'a> { + type UnsizedItem = $PredicateItem; + type RegexType = $regex_type; + + /// Create a predicate for the given like pattern + fn like(pattern: &'a Self::UnsizedItem) -> Result { + if !contains_like_pattern(pattern) { + Ok(Self::Eq(pattern)) + } else if pattern.ends_with(Self::UnsizedItem::PERCENT) + && !contains_like_pattern(&pattern[..pattern.len() - 1]) + { + Ok(Self::StartsWith(&pattern[..pattern.len() - 1])) + } else if pattern.starts_with(Self::UnsizedItem::PERCENT) + && !contains_like_pattern(&pattern[1..]) + { + Ok(Self::EndsWith(&pattern[1..])) + } else if pattern.starts_with(Self::UnsizedItem::PERCENT) + && pattern.ends_with(Self::UnsizedItem::PERCENT) + && !contains_like_pattern(&pattern[1..pattern.len() - 1]) + { + Ok(Self::contains(&pattern[1..pattern.len() - 1])) + } else { + Ok(Self::Regex(Self::regex_like(pattern, false)?)) + } + } - /// Create a predicate for the given ilike pattern - fn ilike(pattern: &'a Self::UnsizedItem, is_ascii: bool) -> Result { - if is_ascii && pattern.is_ascii() { - if !contains_like_pattern(pattern$($as_bytes_fn)*) { - return Ok(Self::IEqAscii(pattern)); - } else if pattern.ends_with($percent) - && !pattern.ends_with($percent_escaped) - && !contains_like_pattern(&pattern[..pattern.len() - 1]$($as_bytes_fn)*) - { - return Ok(Self::IStartsWithAscii(&pattern[..pattern.len() - 1])); - } else if pattern.starts_with($percent) && !contains_like_pattern(&pattern[1..]$($as_bytes_fn)*) { - return Ok(Self::IEndsWithAscii(&pattern[1..])); + fn contains(needle: &'a Self::UnsizedItem) -> Self { + Self::Contains(Finder::new(needle.as_bytes())) } - } - Ok(Self::Regex(Self::regex_like(pattern, true)?)) - } - fn starts_with(pattern: &'a Self::UnsizedItem) -> Self { - Self::StartsWith(pattern) - } + /// Create a predicate for the given ilike pattern + fn 
ilike(pattern: &'a Self::UnsizedItem, is_ascii: bool) -> Result { + if is_ascii && pattern.is_ascii() { + if !contains_like_pattern(pattern) { + return Ok(Self::IEqAscii(pattern)); + } else if pattern.ends_with(Self::UnsizedItem::PERCENT) + && !pattern.ends_with(Self::UnsizedItem::PERCENT_ESCAPED) + && !contains_like_pattern(&pattern[..pattern.len() - 1]) + { + return Ok(Self::IStartsWithAscii(&pattern[..pattern.len() - 1])); + } else if pattern.starts_with(Self::UnsizedItem::PERCENT) + && !contains_like_pattern(&pattern[1..]) + { + return Ok(Self::IEndsWithAscii(&pattern[1..])); + } + } + Ok(Self::Regex(Self::regex_like(pattern, true)?)) + } - fn ends_with(pattern: &'a Self::UnsizedItem) -> Self { - Self::EndsWith(pattern) - } + fn starts_with(pattern: &'a Self::UnsizedItem) -> Self { + Self::StartsWith(pattern) + } - /// Evaluate this predicate against the given haystack - fn evaluate(&self, haystack: &'a Self::UnsizedItem) -> bool { - match self { - Self::Eq(v) => *v == haystack, - Self::IEqAscii(v) => haystack.eq_ignore_ascii_case(v), - Self::Contains(finder) => finder.find(haystack$($as_bytes_fn)*).is_some(), - Self::StartsWith(v) => starts_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_kernel), - Self::IStartsWithAscii(v) => { - starts_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_ignore_ascii_case_kernel) + fn ends_with(pattern: &'a Self::UnsizedItem) -> Self { + Self::EndsWith(pattern) } - Self::EndsWith(v) => ends_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_kernel), - Self::IEndsWithAscii(v) => ends_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_ignore_ascii_case_kernel), - Self::Regex(v) => v.is_match(haystack), - } - } - /// Evaluate this predicate against the elements of `array` - /// - /// If `negate` is true the result of the predicate will be negated - #[inline(never)] - fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray - where - T: ArrayAccessor, - { - match self { - Self::Eq(v) => 
BooleanArray::from_unary(array, |haystack| { - (haystack.len() == v.len() && haystack == *v) != negate - }), - Self::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { - haystack.eq_ignore_ascii_case(v) != negate - }), - Self::Contains(finder) => BooleanArray::from_unary(array, |haystack| { - finder.find(haystack$($as_bytes_fn)*).is_some() != negate - }), - Self::StartsWith(v) => { - if let Some(view_array) = array.as_any().downcast_ref::<$ViewArray>() { - let nulls = view_array.logical_nulls(); - let values = BooleanBuffer::from( - view_array - .prefix_bytes_iter(v.len()) - .map(|haystack| { - equals_bytes(haystack, v$($as_bytes_fn)*, equals_kernel) != negate - }) - .collect::>(), - ); - BooleanArray::new(values, nulls) - } else { - BooleanArray::from_unary(array, |haystack| { - starts_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_kernel) != negate - }) + /// Evaluate this predicate against the given haystack + fn evaluate(&self, haystack: &'a Self::UnsizedItem) -> bool { + match self { + Self::Eq(v) => *v == haystack, + Self::IEqAscii(v) => haystack.eq_ignore_ascii_case(v), + Self::Contains(finder) => finder.find(haystack.as_bytes()).is_some(), + Self::StartsWith(v) => starts_with(haystack, v, equals_kernel), + Self::IStartsWithAscii(v) => { + starts_with(haystack, v, equals_ignore_ascii_case_kernel) + } + Self::EndsWith(v) => ends_with(haystack, v, equals_kernel), + Self::IEndsWithAscii(v) => { + ends_with(haystack, v, equals_ignore_ascii_case_kernel) + } + Self::Regex(v) => v.is_match(haystack), } } - Self::IStartsWithAscii(v) => { - if let Some(view_array) = array.as_any().downcast_ref::<$ViewArray>() { - let nulls = view_array.logical_nulls(); - let values = BooleanBuffer::from( - view_array - .prefix_bytes_iter(v.len()) - .map(|haystack| { - equals_bytes( - haystack, - v$($as_bytes_fn)*, - equals_ignore_ascii_case_kernel, - ) != negate + + /// Evaluate this predicate against the elements of `array` + /// + /// If `negate` is true the 
result of the predicate will be negated + #[inline(never)] + fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray + where + T: ArrayAccessor, + { + match self { + Self::Eq(v) => BooleanArray::from_unary(array, |haystack| { + (haystack.len() == v.len() && haystack == *v) != negate + }), + Self::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { + haystack.eq_ignore_ascii_case(v) != negate + }), + Self::Contains(finder) => BooleanArray::from_unary(array, |haystack| { + finder.find(haystack.as_bytes()).is_some() != negate + }), + Self::StartsWith(v) => { + if let Some(view_array) = array.as_any().downcast_ref::<$ViewArray>() { + let nulls = view_array.logical_nulls(); + let values = BooleanBuffer::from( + view_array + .prefix_bytes_iter(v.len()) + .map(|haystack| { + equals_bytes(haystack, *v, equals_kernel) != negate + }) + .collect::>(), + ); + BooleanArray::new(values, nulls) + } else { + BooleanArray::from_unary(array, |haystack| { + starts_with(haystack, v, equals_kernel) != negate }) - .collect::>(), - ); - BooleanArray::new(values, nulls) - } else { - BooleanArray::from_unary(array, |haystack| { - starts_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_ignore_ascii_case_kernel) != negate - }) - } - } - Self::EndsWith(v) => { - if let Some(view_array) = array.as_any().downcast_ref::<$ViewArray>() { - let nulls = view_array.logical_nulls(); - let values = BooleanBuffer::from( - view_array - .suffix_bytes_iter(v.len()) - .map(|haystack| { - equals_bytes(haystack, v$($as_bytes_fn)*, equals_kernel) != negate + } + } + Self::IStartsWithAscii(v) => { + if let Some(view_array) = array.as_any().downcast_ref::<$ViewArray>() { + let nulls = view_array.logical_nulls(); + let values = BooleanBuffer::from( + view_array + .prefix_bytes_iter(v.len()) + .map(|haystack| { + equals_bytes(haystack, *v, equals_ignore_ascii_case_kernel) + != negate + }) + .collect::>(), + ); + BooleanArray::new(values, nulls) + } else { + 
BooleanArray::from_unary(array, |haystack| { + starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate }) - .collect::>(), - ); - BooleanArray::new(values, nulls) - } else { - BooleanArray::from_unary(array, |haystack| { - ends_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_kernel) != negate - }) - } - } - Self::IEndsWithAscii(v) => { - if let Some(view_array) = array.as_any().downcast_ref::<$ViewArray>() { - let nulls = view_array.logical_nulls(); - let values = BooleanBuffer::from( - view_array - .suffix_bytes_iter(v.len()) - .map(|haystack| { - equals_bytes( - haystack, - v$($as_bytes_fn)*, - equals_ignore_ascii_case_kernel, - ) != negate + } + } + Self::EndsWith(v) => { + if let Some(view_array) = array.as_any().downcast_ref::<$ViewArray>() { + let nulls = view_array.logical_nulls(); + let values = BooleanBuffer::from( + view_array + .suffix_bytes_iter(v.len()) + .map(|haystack| { + equals_bytes(haystack, *v, equals_kernel) != negate + }) + .collect::>(), + ); + BooleanArray::new(values, nulls) + } else { + BooleanArray::from_unary(array, |haystack| { + ends_with(haystack, v, equals_kernel) != negate }) - .collect::>(), - ); - BooleanArray::new(values, nulls) - } else { - BooleanArray::from_unary(array, |haystack| { - ends_with(haystack$($as_bytes_fn)*, v$($as_bytes_fn)*, equals_ignore_ascii_case_kernel) != negate - }) + } + } + Self::IEndsWithAscii(v) => { + if let Some(view_array) = array.as_any().downcast_ref::<$ViewArray>() { + let nulls = view_array.logical_nulls(); + let values = BooleanBuffer::from( + view_array + .suffix_bytes_iter(v.len()) + .map(|haystack| { + equals_bytes(haystack, *v, equals_ignore_ascii_case_kernel) + != negate + }) + .collect::>(), + ); + BooleanArray::new(values, nulls) + } else { + BooleanArray::from_unary(array, |haystack| { + ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + }) + } + } + Self::Regex(v) => { + BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate) + } 
} } - Self::Regex(v) => { - BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate) + + fn regex_like( + pattern: &Self::UnsizedItem, + case_insensitive: bool, + ) -> Result { + let regex_pattern = transform_pattern_like_to_regex_compatible_pattern(pattern); + <$RegexBuilder>::new(®ex_pattern) + .case_insensitive(case_insensitive) + .dot_matches_new_line(true) + .build() + .map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Unable to build regex from LIKE pattern: {e}" + )) + }) } } - } - - fn regex_like(pattern: &Self::UnsizedItem, case_insensitive: bool) -> Result { - let regex_pattern = transform_pattern_like_to_regex_compatible_pattern(pattern$($as_char_iter)+, pattern.len()); - <$RegexBuilder>::new(®ex_pattern) - .case_insensitive(case_insensitive) - .dot_matches_new_line(true) - .build() - .map_err(|e| { - ArrowError::InvalidArgumentError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - }) - } -} - } + }; } impl_predicate!( @@ -309,17 +354,6 @@ impl<'a> PredicateImpl<'a> for Predicate<'a> { ... } - -fn as_bytes(item: &PredicateUnsizedItem) -> &[u8] { - item.as_bytes() -} - -fn to_char_iter(pattern: &PredicateUnsizedItem) -> impl Iterator { - pattern.chars() -} - -const PERCENT: &'static PredicateUnsizedItem = "%"; -const PERCENT_ESCAPED: &'static PredicateUnsizedItem = "\\%"; ); impl_predicate!( @@ -335,17 +369,6 @@ impl<'a> PredicateImpl<'a> for BinaryPredicate<'a> { ... } - -fn as_bytes(item: &PredicateUnsizedItem) -> &[u8] { - item -} - -fn to_char_iter(pattern: &PredicateUnsizedItem) -> impl Iterator { - pattern.iter().map(|&b| b as char) -} - -const PERCENT: &'static PredicateUnsizedItem = b"%"; -const PERCENT_ESCAPED: &'static PredicateUnsizedItem = b"\\%"; ); /// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: @@ -355,12 +378,11 @@ const PERCENT_ESCAPED: &'static PredicateUnsizedItem = b"\\%"; /// 2. Replace `LIKE` single-character wildcards `_` => `.` /// 3. 
Escape regex meta characters to match them and not be evaluated as regex special chars. e.g. `.` => `\\.` /// 4. Replace escaped `LIKE` wildcards removing the escape characters to be able to match it as a regex. e.g. `\\%` => `%` -fn transform_pattern_like_to_regex_compatible_pattern>( - pattern: PatternIter, - length: usize, +fn transform_pattern_like_to_regex_compatible_pattern( + pattern: &T, ) -> String { - let mut result = String::with_capacity(length * 2); - let mut chars_iter = pattern.peekable(); + let mut result = String::with_capacity(pattern.len() * 2); + let mut chars_iter = pattern.to_char_iter().peekable(); match chars_iter.peek() { // if the pattern starts with `%`, we avoid starting the regex with a slow but meaningless `^.*` Some('%') => { @@ -409,30 +431,42 @@ fn transform_pattern_like_to_regex_compatible_pattern bool) -> bool { - lhs.len() == rhs.len() && zip(lhs, rhs).all(byte_eq_kernel) +fn equals_bytes( + lhs: &Lhs, + rhs: &Rhs, + byte_eq_kernel: impl Fn((&u8, &u8)) -> bool, +) -> bool { + lhs.len() == rhs.len() && zip(lhs.as_bytes(), rhs.as_bytes()).all(byte_eq_kernel) } /// This is faster than `str::starts_with` for small strings. /// See for more details. -fn starts_with( - haystack: &[u8], - needle: &[u8], +fn starts_with( + haystack: &T, + needle: &T, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool, ) -> bool { if needle.len() > haystack.len() { false } else { - zip(haystack, needle).all(byte_eq_kernel) + zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel) } } /// This is faster than `str::ends_with` for small strings. /// See for more details. 
-fn ends_with(haystack: &[u8], needle: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { +fn ends_with( + haystack: &T, + needle: &T, + byte_eq_kernel: impl Fn((&u8, &u8)) -> bool, +) -> bool { if needle.len() > haystack.len() { false } else { - zip(haystack.iter().rev(), needle.iter().rev()).all(byte_eq_kernel) + zip( + haystack.as_bytes().iter().rev(), + needle.as_bytes().iter().rev(), + ) + .all(byte_eq_kernel) } } @@ -444,8 +478,8 @@ fn equals_ignore_ascii_case_kernel((n, h): (&u8, &u8)) -> bool { n.eq_ignore_ascii_case(h) } -fn contains_like_pattern(pattern: &[u8]) -> bool { - memchr3(b'%', b'_', b'\\', pattern).is_some() +fn contains_like_pattern(pattern: &T) -> bool { + memchr3(b'%', b'_', b'\\', pattern.as_bytes()).is_some() } #[cfg(test)] From 333ecb9f5cda176177601c4ab722308a9a14b4bd Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 31 Dec 2024 19:21:37 +0000 Subject: [PATCH 7/9] reorder functions to have less diff --- arrow-string/src/predicate.rs | 94 +++++++++++++++++------------------ 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 09e53df3db60..e4c8140e0225 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -371,6 +371,53 @@ impl<'a> PredicateImpl<'a> for BinaryPredicate<'a> { } ); +fn equals_bytes( + lhs: &Lhs, + rhs: &Rhs, + byte_eq_kernel: impl Fn((&u8, &u8)) -> bool, +) -> bool { + lhs.len() == rhs.len() && zip(lhs.as_bytes(), rhs.as_bytes()).all(byte_eq_kernel) +} + +/// This is faster than `str::starts_with` for small strings. +/// See for more details. +fn starts_with( + haystack: &T, + needle: &T, + byte_eq_kernel: impl Fn((&u8, &u8)) -> bool, +) -> bool { + if needle.len() > haystack.len() { + false + } else { + zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel) + } +} +/// This is faster than `str::ends_with` for small strings. +/// See for more details. 
+fn ends_with( + haystack: &T, + needle: &T, + byte_eq_kernel: impl Fn((&u8, &u8)) -> bool, +) -> bool { + if needle.len() > haystack.len() { + false + } else { + zip( + haystack.as_bytes().iter().rev(), + needle.as_bytes().iter().rev(), + ) + .all(byte_eq_kernel) + } +} + +fn equals_kernel((n, h): (&u8, &u8)) -> bool { + n == h +} + +fn equals_ignore_ascii_case_kernel((n, h): (&u8, &u8)) -> bool { + n.eq_ignore_ascii_case(h) +} + /// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: /// /// 1. Replace `LIKE` multi-character wildcards `%` => `.*` (unless they're at the start or end of the pattern, @@ -431,53 +478,6 @@ fn transform_pattern_like_to_regex_compatible_pattern( - lhs: &Lhs, - rhs: &Rhs, - byte_eq_kernel: impl Fn((&u8, &u8)) -> bool, -) -> bool { - lhs.len() == rhs.len() && zip(lhs.as_bytes(), rhs.as_bytes()).all(byte_eq_kernel) -} - -/// This is faster than `str::starts_with` for small strings. -/// See for more details. -fn starts_with( - haystack: &T, - needle: &T, - byte_eq_kernel: impl Fn((&u8, &u8)) -> bool, -) -> bool { - if needle.len() > haystack.len() { - false - } else { - zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel) - } -} -/// This is faster than `str::ends_with` for small strings. -/// See for more details. 
-fn ends_with( - haystack: &T, - needle: &T, - byte_eq_kernel: impl Fn((&u8, &u8)) -> bool, -) -> bool { - if needle.len() > haystack.len() { - false - } else { - zip( - haystack.as_bytes().iter().rev(), - needle.as_bytes().iter().rev(), - ) - .all(byte_eq_kernel) - } -} - -fn equals_kernel((n, h): (&u8, &u8)) -> bool { - n == h -} - -fn equals_ignore_ascii_case_kernel((n, h): (&u8, &u8)) -> bool { - n.eq_ignore_ascii_case(h) -} - fn contains_like_pattern(pattern: &T) -> bool { memchr3(b'%', b'_', b'\\', pattern.as_bytes()).is_some() } From 1a695fbbf395468be3af1ef7f3762d541afe64b5 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 31 Dec 2024 19:39:21 +0000 Subject: [PATCH 8/9] remove use of impl Trait from function return type --- arrow-string/src/predicate.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index e4c8140e0225..044a4deeb1aa 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -23,12 +23,14 @@ use memchr::memmem::Finder; use regex::{ bytes::Regex as BinaryRegex, bytes::RegexBuilder as BinaryRegexBuilder, Regex, RegexBuilder, }; -use std::iter::zip; +use std::{iter::zip, str::Chars}; pub trait SupportedPredicateItem: PartialEq where Self: 'static, { + type CharIter<'a>: Iterator; + const PERCENT: &'static Self; const PERCENT_ESCAPED: &'static Self; @@ -36,10 +38,14 @@ where fn as_bytes(&self) -> &[u8]; - fn to_char_iter(&self) -> impl Iterator; + // After the minimum supported Rust version is >= 1.75.0 we can change the return type to be + // `impl Iterator` + fn to_char_iter(&self) -> Self::CharIter<'_>; } impl SupportedPredicateItem for str { + type CharIter<'a> = Chars<'a>; + const PERCENT: &'static str = "%"; const PERCENT_ESCAPED: &'static str = "\\%"; @@ -54,12 +60,14 @@ impl SupportedPredicateItem for str { } #[inline] - fn to_char_iter(&self) -> impl Iterator { + fn 
to_char_iter(&self) -> Self::CharIter<'_> { self.chars() } } impl SupportedPredicateItem for [u8] { + type CharIter<'a> = std::iter::Map, for<'b> fn(&'b u8) -> char>; + const PERCENT: &'static [u8] = b"%"; const PERCENT_ESCAPED: &'static [u8] = b"\\%"; @@ -74,7 +82,7 @@ impl SupportedPredicateItem for [u8] { } #[inline] - fn to_char_iter(&self) -> impl Iterator { + fn to_char_iter(&self) -> Self::CharIter<'_> { self.iter().map(|&b| b as char) } } From 30b432ee8e820bdc3d84d67f0559a9ff9ff743e6 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 31 Dec 2024 19:58:14 +0000 Subject: [PATCH 9/9] remove use of impl Trait from function return type --- arrow-string/src/like.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 70fce4f017ca..64e98adbb650 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -20,6 +20,7 @@ use crate::predicate::{BinaryPredicate, Predicate, PredicateImpl}; use arrow_array::cast::AsArray; +use arrow_array::iterator::ArrayIter; use arrow_array::*; use arrow_schema::*; use arrow_select::take::take; @@ -115,10 +116,13 @@ pub fn contains(left: &dyn Datum, right: &dyn Datum) -> Result: PredicateImpl<'a, UnsizedItem = Self::UnsizedItem>; + type Iter<'a>: Iterator> + where + Self: 'a; fn is_ascii(&self) -> bool; - fn iter(&self) -> impl Iterator>; + fn iter(&self) -> Self::Iter<'_>; fn item_as_bytes(item: &Self::UnsizedItem) -> &[u8]; } @@ -126,12 +130,13 @@ trait LikeSupportedArray: Array { impl LikeSupportedArray for GenericStringArray { type UnsizedItem = str; type MatchingPredicate<'a> = Predicate<'a>; + type Iter<'a> = ArrayIter<&'a Self>; fn is_ascii(&self) -> bool { self.is_ascii() } - fn iter(&self) -> impl Iterator> { + fn iter(&self) -> Self::Iter<'_> { self.iter() } @@ -143,12 +148,13 @@ impl LikeSupportedArray for GenericStringArray = Predicate<'a>; + type Iter<'a> = ArrayIter<&'a 
Self>; fn is_ascii(&self) -> bool { self.is_ascii() } - fn iter(&self) -> impl Iterator> { + fn iter(&self) -> Self::Iter<'_> { self.iter() } @@ -160,12 +166,13 @@ impl LikeSupportedArray for StringViewArray { impl LikeSupportedArray for GenericBinaryArray { type UnsizedItem = [u8]; type MatchingPredicate<'a> = BinaryPredicate<'a>; + type Iter<'a> = ArrayIter<&'a Self>; fn is_ascii(&self) -> bool { self.is_ascii() } - fn iter(&self) -> impl Iterator> { + fn iter(&self) -> Self::Iter<'_> { self.iter() } @@ -177,12 +184,13 @@ impl LikeSupportedArray for GenericBinaryArray = BinaryPredicate<'a>; + type Iter<'a> = ArrayIter<&'a Self>; fn is_ascii(&self) -> bool { self.is_ascii() } - fn iter(&self) -> impl Iterator> { + fn iter(&self) -> Self::Iter<'_> { self.iter() }