Skip to content

Commit

Permalink
Fix incorrect ... LIKE '%' simplification
Browse files Browse the repository at this point in the history
`expr LIKE '%'` was previously simplified to `true`, but the expression
returns `NULL` when `expr` is null.
The conversion was conditional on `!is_null(expr)` which means "is not
always true, i.e. is not a null literal".

This commit adds correct simplification logic. It additionally expands
the rule coverage to include string view (Utf8View) and large string
(LargeUtf8). This allows writing shared test cases even despite
`utf8_view LIKE '%'` returning incorrect results at execution time
(tracked by apache#12637). I.e. the
simplification masks the bug for cases where pattern is statically
known.
  • Loading branch information
findepi committed Nov 5, 2024
1 parent 2e52580 commit 3085835
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 14 deletions.
63 changes: 49 additions & 14 deletions datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1476,13 +1476,28 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> {
negated,
escape_char: _,
case_insensitive: _,
}) if !is_null(&expr)
&& matches!(
pattern.as_ref(),
Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%"
) =>
}) if matches!(
pattern.as_ref(),
Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%"
) || matches!(
pattern.as_ref(),
Expr::Literal(ScalarValue::LargeUtf8(Some(pattern_str))) if pattern_str == "%"
) || matches!(
pattern.as_ref(),
Expr::Literal(ScalarValue::Utf8View(Some(pattern_str))) if pattern_str == "%"
) =>
{
Transformed::yes(lit(!negated))
// exp LIKE '%' is
// - when exp is not NULL, it's true
// - when exp is NULL, it's NULL
// exp NOT LIKE '%' is
// - when exp is not NULL, it's false
// - when exp is NULL, it's NULL
Transformed::yes(Expr::Case(Case {
expr: Some(Box::new(Expr::IsNotNull(expr))),
when_then_expr: vec![(Box::new(lit(true)), Box::new(lit(!negated)))],
else_expr: None,
}))
}

// a is not null/unknown --> true (if a is not nullable)
Expand Down Expand Up @@ -2777,10 +2792,22 @@ mod tests {
assert_no_change(regex_match(col("c1"), lit("f_o")));

// empty cases
assert_change(regex_match(col("c1"), lit("")), lit(true));
assert_change(regex_not_match(col("c1"), lit("")), lit(false));
assert_change(regex_imatch(col("c1"), lit("")), lit(true));
assert_change(regex_not_imatch(col("c1"), lit("")), lit(false));
assert_change(
regex_match(col("c1"), lit("")),
if_not_null(col("c1"), true),
);
assert_change(
regex_not_match(col("c1"), lit("")),
if_not_null(col("c1"), false),
);
assert_change(
regex_imatch(col("c1"), lit("")),
if_not_null(col("c1"), true),
);
assert_change(
regex_not_imatch(col("c1"), lit("")),
if_not_null(col("c1"), false),
);

// single character
assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%"));
Expand Down Expand Up @@ -3608,16 +3635,16 @@ mod tests {
fn test_like_and_ilke() {
// test non-null values
let expr = like(col("c1"), "%");
assert_eq!(simplify(expr), lit(true));
assert_eq!(simplify(expr), if_not_null(col("c1"), true));

let expr = not_like(col("c1"), "%");
assert_eq!(simplify(expr), lit(false));
assert_eq!(simplify(expr), if_not_null(col("c1"), false));

let expr = ilike(col("c1"), "%");
assert_eq!(simplify(expr), lit(true));
assert_eq!(simplify(expr), if_not_null(col("c1"), true));

let expr = not_ilike(col("c1"), "%");
assert_eq!(simplify(expr), lit(false));
assert_eq!(simplify(expr), if_not_null(col("c1"), false));

// test null values
let null = lit(ScalarValue::Utf8(None));
Expand Down Expand Up @@ -4043,4 +4070,12 @@ mod tests {
);
}
}

fn if_not_null(expr: Expr, then: bool) -> Expr {
Expr::Case(Case {
expr: Some(expr.is_not_null().into()),
when_then_expr: vec![(lit(true).into(), lit(then).into())],
else_expr: None,
})
}
}
8 changes: 8 additions & 0 deletions datafusion/sqllogictest/test_files/string/string.slt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,14 @@ statement ok
create table test_substr as
select arrow_cast(col1, 'Utf8') as c1 from test_substr_base;

query BBB
SELECT
NULL LIKE '%',
'' LIKE '%',
'a' LIKE '%'
----
NULL true true

# TODO: move it back to `string_query.slt.part` after fixing the issue
# see detail: https://github.com/apache/datafusion/issues/12637
# Test pattern with wildcard characters
Expand Down
52 changes: 52 additions & 0 deletions datafusion/sqllogictest/test_files/string/string_query.slt.part
Original file line number Diff line number Diff line change
Expand Up @@ -873,6 +873,58 @@ NULL NULL NULL NULL NULL
#Raphael datafusionДатаФусион false false false false
#NULL NULL NULL NULL NULL NULL

# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections
query TTBB
SELECT ascii_1, unicode_1,
ascii_1 LIKE '%' AS ascii_1_like_percent,
unicode_1 LIKE '%' AS unicode_1_like_percent
-- ascii_1 LIKE '%%' AS ascii_1_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637
-- unicode_1 LIKE '%%' AS unicode_1_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637
FROM test_basic_operator
----
Andrew datafusion📊🔥 true true
Xiangpeng datafusion数据融合 true true
Raphael datafusionДатаФусион true true
under_score un iść core true true
percent pan Tadeusz ma iść w kąt true true
(empty) (empty) true true
NULL NULL NULL NULL
NULL NULL NULL NULL

# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections
query TTBB
SELECT ascii_1, unicode_1,
ascii_1 NOT LIKE '%' AS ascii_1_not_like_percent,
unicode_1 NOT LIKE '%' AS unicode_1_not_like_percent
-- ascii_1 NOT LIKE '%%' AS ascii_1_not_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637
-- unicode_1 NOT LIKE '%%' AS unicode_1_not_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637
FROM test_basic_operator
----
Andrew datafusion📊🔥 false false
Xiangpeng datafusion数据融合 false false
Raphael datafusionДатаФусион false false
under_score un iść core false false
percent pan Tadeusz ma iść w kąt false false
(empty) (empty) false false
NULL NULL NULL NULL
NULL NULL NULL NULL

query T
SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 LIKE '%'
----
Andrew
Xiangpeng
Raphael
under_score
percent
(empty)

# TODO: move it back to `string_query.slt.part` after fixing the issue
# see detail: https://github.com/apache/datafusion/issues/12637
query T
SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 NOT LIKE '%'
----

# Test pattern without wildcard characters
query TTBBBB
select ascii_1, unicode_1,
Expand Down

0 comments on commit 3085835

Please sign in to comment.