From 2500eaec282857166f87b1711063cc82501db752 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Wed, 17 Jan 2024 11:56:03 +0100 Subject: [PATCH] Add support for PG LIKE operators --- Cargo.toml | 2 +- datafusion-cli/Cargo.lock | 6 +- datafusion/expr/src/operator.rs | 24 ++++ datafusion/expr/src/type_coercion/binary.rs | 7 ++ .../physical-expr/src/expressions/binary.rs | 105 +++++++++++++++++- datafusion/sql/src/expr/binary_op.rs | 4 + datafusion/sql/src/planner.rs | 2 + datafusion/sql/src/statement.rs | 3 + .../sqllogictest/test_files/predicates.slt | 24 ++++ .../substrait/src/logical_plan/producer.rs | 4 + docs/source/user-guide/sql/operators.md | 52 +++++++++ 11 files changed, 226 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cc18616774760..ee63826be2028 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,7 +70,7 @@ parquet = { version = "50.0.0", default-features = false, features = ["arrow", " rand = "0.8" rstest = "0.18.0" serde_json = "1" -sqlparser = { version = "0.41.0", features = ["visitor"] } +sqlparser = { git = "https://github.com/splitgraph/sqlparser-rs", branch = "pg-like-op-support", features = ["visitor"] } tempfile = "3" thiserror = "1.0.44" url = "2.2" diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 5663e736dbd8a..e5602efd5ca19 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -3159,8 +3159,7 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqlparser" version = "0.41.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cc2c25a6c66789625ef164b4c7d2e548d627902280c13710d33da8222169964" +source = "git+https://github.com/splitgraph/sqlparser-rs?branch=pg-like-op-support#3de081e2098690aaf0e27f2d2e8a3a2468674689" dependencies = [ "log", "sqlparser_derive", @@ -3169,8 +3168,7 @@ dependencies = [ [[package]] name = "sqlparser_derive" version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" +source = "git+https://github.com/splitgraph/sqlparser-rs?branch=pg-like-op-support#3de081e2098690aaf0e27f2d2e8a3a2468674689" dependencies = [ "proc-macro2", "quote", diff --git a/datafusion/expr/src/operator.rs b/datafusion/expr/src/operator.rs index 57888a11d426c..a10312e234460 100644 --- a/datafusion/expr/src/operator.rs +++ b/datafusion/expr/src/operator.rs @@ -69,6 +69,14 @@ pub enum Operator { RegexNotMatch, /// Case insensitive regex not match RegexNotIMatch, + /// Case sensitive pattern match + LikeMatch, + /// Case insensitive pattern match + ILikeMatch, + /// Case sensitive pattern not match + NotLikeMatch, + /// Case insensitive pattern not match + NotILikeMatch, /// Bitwise and, like `&` BitwiseAnd, /// Bitwise or, like `|` @@ -100,6 +108,10 @@ impl Operator { Operator::GtEq => Some(Operator::Lt), Operator::IsDistinctFrom => Some(Operator::IsNotDistinctFrom), Operator::IsNotDistinctFrom => Some(Operator::IsDistinctFrom), + Operator::LikeMatch => Some(Operator::NotLikeMatch), + Operator::ILikeMatch => Some(Operator::NotILikeMatch), + Operator::NotLikeMatch => Some(Operator::LikeMatch), + Operator::NotILikeMatch => Some(Operator::ILikeMatch), Operator::Plus | Operator::Minus | Operator::Multiply @@ -192,6 +204,10 @@ impl Operator { | Operator::RegexIMatch | Operator::RegexNotMatch | Operator::RegexNotIMatch + | Operator::LikeMatch + | Operator::ILikeMatch + | Operator::NotLikeMatch + | Operator::NotILikeMatch | Operator::BitwiseAnd | Operator::BitwiseOr | Operator::BitwiseXor @@ -221,6 +237,10 @@ impl Operator { | Operator::RegexNotMatch | Operator::RegexIMatch | Operator::RegexNotIMatch + | Operator::LikeMatch + | Operator::ILikeMatch + | Operator::NotLikeMatch + | Operator::NotILikeMatch | Operator::BitwiseAnd | Operator::BitwiseOr | Operator::BitwiseShiftLeft @@ -253,6 +273,10 @@ impl fmt::Display for Operator { Operator::RegexIMatch => "~*", Operator::RegexNotMatch => "!~", Operator::RegexNotIMatch => "!~*", + Operator::LikeMatch => "~~", + Operator::ILikeMatch => "~~*", + Operator::NotLikeMatch => "!~~", + Operator::NotILikeMatch => "!~~*", Operator::IsDistinctFrom => "IS DISTINCT FROM", Operator::IsNotDistinctFrom => "IS NOT DISTINCT FROM", Operator::BitwiseAnd => "&", diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 6bacc18700798..70015c6992966 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -101,6 +101,13 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result ) }) } + LikeMatch | ILikeMatch | NotLikeMatch | NotILikeMatch => { + regex_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + plan_datafusion_err!( + "Cannot infer common argument type for regex operation {lhs} {op} {rhs}" + ) + }) + } BitwiseAnd | BitwiseOr | BitwiseXor | BitwiseShiftRight | BitwiseShiftLeft => { bitwise_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| { plan_datafusion_err!( diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 8c4078dbce8ce..8e10402228b30 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -28,7 +28,7 @@ use crate::sort_properties::SortProperties; use crate::PhysicalExpr; use arrow::array::*; -use arrow::compute::cast; +use arrow::compute::{cast, like, ilike, nlike, nilike}; use arrow::compute::kernels::boolean::{and_kleene, not, or_kleene}; use arrow::compute::kernels::cmp::*; use arrow::compute::kernels::comparison::regexp_is_match_utf8; @@ -281,6 +281,10 @@ impl PhysicalExpr for BinaryExpr { Operator::GtEq => return apply_cmp(&lhs, &rhs, gt_eq), Operator::IsDistinctFrom => return apply_cmp(&lhs, &rhs, distinct), Operator::IsNotDistinctFrom => return apply_cmp(&lhs, &rhs, not_distinct), + Operator::LikeMatch => return apply_cmp(&lhs, &rhs, like), + Operator::ILikeMatch => return apply_cmp(&lhs, &rhs, ilike), + Operator::NotLikeMatch => return apply_cmp(&lhs, &rhs, nlike), + Operator::NotILikeMatch => return apply_cmp(&lhs, &rhs, nilike), _ => {} } @@ -554,7 +558,8 @@ impl BinaryExpr { use Operator::*; match &self.op { IsDistinctFrom | IsNotDistinctFrom | Lt | LtEq | Gt | GtEq | Eq | NotEq - | Plus | Minus | Multiply | Divide | Modulo => unreachable!(), + | Plus | Minus | Multiply | Divide | Modulo | LikeMatch | ILikeMatch + | NotLikeMatch | NotILikeMatch => unreachable!(), And => { if left_data_type == &DataType::Boolean { boolean_op!(&left, &right, and_kleene) @@ -970,6 +975,102 @@ mod tests { DataType::Boolean, [false, false, false, false, true], ); + test_coercion!( + StringArray, + DataType::Utf8, + vec!["abc"; 5], + StringArray, + DataType::Utf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::LikeMatch, + BooleanArray, + DataType::Boolean, + [true, false, false, true, false], + ); + test_coercion!( + StringArray, + DataType::Utf8, + vec!["abc"; 5], + StringArray, + DataType::Utf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::ILikeMatch, + BooleanArray, + DataType::Boolean, + [true, true, false, true, true], + ); + test_coercion!( + StringArray, + DataType::Utf8, + vec!["abc"; 5], + StringArray, + DataType::Utf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::NotLikeMatch, + BooleanArray, + DataType::Boolean, + [false, true, true, false, true], + ); + test_coercion!( + StringArray, + DataType::Utf8, + vec!["abc"; 5], + StringArray, + DataType::Utf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::NotILikeMatch, + BooleanArray, + DataType::Boolean, + [false, false, true, false, false], + ); + test_coercion!( + LargeStringArray, + DataType::LargeUtf8, + vec!["abc"; 5], + LargeStringArray, + DataType::LargeUtf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::LikeMatch, + BooleanArray, + DataType::Boolean, + [true, false, false, true, false], + ); + test_coercion!( + LargeStringArray, + DataType::LargeUtf8, + vec!["abc"; 5], + LargeStringArray, + DataType::LargeUtf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::ILikeMatch, + BooleanArray, + DataType::Boolean, + [true, true, false, true, true], + ); + test_coercion!( + LargeStringArray, + DataType::LargeUtf8, + vec!["abc"; 5], + LargeStringArray, + DataType::LargeUtf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::NotLikeMatch, + BooleanArray, + DataType::Boolean, + [false, true, true, false, true], + ); + test_coercion!( + LargeStringArray, + DataType::LargeUtf8, + vec!["abc"; 5], + LargeStringArray, + DataType::LargeUtf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::NotILikeMatch, + BooleanArray, + DataType::Boolean, + [false, false, true, false, false], + ); test_coercion!( Int16Array, DataType::Int16, diff --git a/datafusion/sql/src/expr/binary_op.rs b/datafusion/sql/src/expr/binary_op.rs index d9c85663e50e2..78efaca09938d 100644 --- a/datafusion/sql/src/expr/binary_op.rs +++ b/datafusion/sql/src/expr/binary_op.rs @@ -40,6 +40,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { BinaryOperator::PGRegexIMatch => Ok(Operator::RegexIMatch), BinaryOperator::PGRegexNotMatch => Ok(Operator::RegexNotMatch), BinaryOperator::PGRegexNotIMatch => Ok(Operator::RegexNotIMatch), + BinaryOperator::PGLikeMatch => Ok(Operator::LikeMatch), + BinaryOperator::PGILikeMatch => Ok(Operator::ILikeMatch), + BinaryOperator::PGNotLikeMatch => Ok(Operator::NotLikeMatch), + BinaryOperator::PGNotILikeMatch => Ok(Operator::NotILikeMatch), BinaryOperator::BitwiseAnd => Ok(Operator::BitwiseAnd), BinaryOperator::BitwiseOr => Ok(Operator::BitwiseOr), BinaryOperator::BitwiseXor => Ok(Operator::BitwiseXor), diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index a04df5589b856..824e6af2f9ea5 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -452,6 +452,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { | SQLDataType::Int64 | SQLDataType::Float64 | SQLDataType::Struct(_) + | SQLDataType::JSONB + | SQLDataType::Unspecified => not_impl_err!( "Unsupported SQL type {sql_type:?}" ), diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index b9fb4c65dc2c4..2b07207109ac6 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -435,10 +435,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { overwrite, source, partitioned, + priority: _, after_columns, table, + table_alias: _, on, returning, + replace_into: _, ignore, } => { if or.is_some() { diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index e32e415338a7a..763829359d1d3 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -220,6 +220,30 @@ SELECT * FROM test WHERE column1 !~* 'z' foo Barrr +query T +SELECT * FROM test WHERE column1 ~~ '__z%' +---- +Bazzz + +query T +SELECT * FROM test WHERE column1 ~~* '__z%' +---- +Bazzz +ZZZZZ + +query T +SELECT * FROM test WHERE column1 !~~ '__z%' +---- +foo +Barrr +ZZZZZ + +query T +SELECT * FROM test WHERE column1 !~~* '__z%' +---- +foo +Barrr + statement ok DROP TABLE test; diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs index ab0e8c860858e..fc9517c90a45e 100644 --- a/datafusion/substrait/src/logical_plan/producer.rs +++ b/datafusion/substrait/src/logical_plan/producer.rs @@ -577,6 +577,10 @@ pub fn operator_to_name(op: Operator) -> &'static str { Operator::RegexIMatch => "regex_imatch", Operator::RegexNotMatch => "regex_not_match", Operator::RegexNotIMatch => "regex_not_imatch", + Operator::LikeMatch => "like_match", + Operator::ILikeMatch => "like_imatch", + Operator::NotLikeMatch => "like_not_match", + Operator::NotILikeMatch => "like_not_imatch", Operator::BitwiseAnd => "bitwise_and", Operator::BitwiseOr => "bitwise_or", Operator::StringConcat => "str_concat", diff --git a/docs/source/user-guide/sql/operators.md b/docs/source/user-guide/sql/operators.md index 265e56bb2c348..872ef55dd39d7 100644 --- a/docs/source/user-guide/sql/operators.md +++ b/docs/source/user-guide/sql/operators.md @@ -263,6 +263,58 @@ Not Regex Case-Insensitive Match +---------------------------------------------------+ ``` +### `~~` + +Like Match + +```sql +❯ SELECT 'datafusion' ~~ 'dat_f%n'; ++---------------------------------------+ +| Utf8("datafusion") ~~ Utf8("dat_f%n") | ++---------------------------------------+ +| true | ++---------------------------------------+ +``` + +### `~~*` + +Case-Insensitive Like Match + +```sql +❯ SELECT 'datafusion' ~~* 'Dat_F%n'; ++----------------------------------------+ +| Utf8("datafusion") ~~* Utf8("Dat_F%n") | ++----------------------------------------+ +| true | ++----------------------------------------+ +``` + +### `!~~` + +Not Like Match + +```sql +❯ SELECT 'datafusion' !~~ 'Dat_F%n'; ++----------------------------------------+ +| Utf8("datafusion") !~~ Utf8("Dat_F%n") | ++----------------------------------------+ +| true | ++----------------------------------------+ +``` + +### `!~~*` + +Not Case-Insensitive Like Match + +```sql +❯ SELECT 'datafusion' !~~* 'Dat%F_n'; ++-----------------------------------------+ +| Utf8("datafusion") !~~* Utf8("Dat%F_n") | ++-----------------------------------------+ +| true | ++-----------------------------------------+ +``` + ## Logical Operators - [AND](#and)