diff --git a/datafusion/core/tests/fuzz_cases/mod.rs b/datafusion/core/tests/fuzz_cases/mod.rs
index 49db0d31a8e9..d5511e2970f4 100644
--- a/datafusion/core/tests/fuzz_cases/mod.rs
+++ b/datafusion/core/tests/fuzz_cases/mod.rs
@@ -24,6 +24,8 @@ mod sort_fuzz;
 mod aggregation_fuzzer;
 mod equivalence;
 
+mod pruning;
+
 mod limit_fuzz;
 mod sort_preserving_repartition_fuzz;
 mod window_fuzz;
diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs
new file mode 100644
index 000000000000..ad35850a5f18
--- /dev/null
+++ b/datafusion/core/tests/fuzz_cases/pruning.rs
@@ -0,0 +1,247 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow_array::{Array, RecordBatch, StringArray};
+use arrow_schema::{DataType, Field, Schema};
+use bytes::{BufMut, Bytes, BytesMut};
+use datafusion::{
+    datasource::{
+        listing::PartitionedFile,
+        physical_plan::{parquet::ParquetExecBuilder, FileScanConfig},
+    },
+    prelude::*,
+};
+use datafusion_common::DFSchema;
+use datafusion_execution::object_store::ObjectStoreUrl;
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_plan::{collect, filter::FilterExec, ExecutionPlan};
+use itertools::Itertools;
+use object_store::{memory::InMemory, path::Path, ObjectStore, PutPayload};
+use parquet::{
+    arrow::ArrowWriter,
+    file::properties::{EnabledStatistics, WriterProperties},
+};
+use rand::seq::SliceRandom;
+use url::Url;
+
+#[tokio::test]
+async fn test_fuzz_utf8() {
+    // Fuzz testing for UTF8 predicate pruning.
+    // The basic idea is that query results should always be the same with or without stats/pruning.
+    // If we get this right we at least guarantee that there are no incorrect results.
+    // There may still be suboptimal pruning or stats, but that's something we can try to catch
+    // with more targeted tests.
+
+    // Since we know where the edge cases might be we don't do random black box fuzzing.
+    // Instead we fuzz on specific pre-defined axes:
+    //
+    // - Which characters are in each value. We want to make sure to include characters that when
+    //   incremented, truncated or otherwise manipulated might cause issues.
+    // - The values in each row group. This impacts which min/max stats are generated for each rg.
+    //   We'll generate combinations of the characters with lengths ranging from 1 to 3.
+    // - Truncation of statistics to 1, 2 or 3 characters as well as no truncation.
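+    //
+    // Illustrative example (hypothetical values; the RNG below picks the real ones):
+    // with characters ["a", "%"] and length 2, the generated values include "a%" and
+    // "%a", and a row group holding ["a%", "%a"] gets min stat "%a" and max stat "a%".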
+
+    let mut rng = rand::thread_rng();
+
+    let characters = [
+        "z",
+        "0",
+        "~",
+        "ß",
+        "℣",
+        "%", // this one is useful for like/not like tests since it will result in randomly inserted wildcards
+        "_", // this one is useful for like/not like tests since it will result in randomly inserted wildcards
+        "\u{7F}",
+        "\u{7FF}",
+        "\u{FF}",
+        "\u{10FFFF}",
+        "\u{D7FF}",
+        "\u{FDCF}",
+        // null character
+        "\u{0}",
+    ];
+
+    let value_lengths = [1, 2, 3];
+
+    // generate all combinations of characters with lengths ranging from 1 to 3
+    let mut values = vec![];
+    for length in &value_lengths {
+        values.extend(
+            characters
+                .iter()
+                .cloned()
+                .combinations(*length)
+                // now get all permutations of each combination
+                .flat_map(|c| c.into_iter().permutations(*length))
+                // and join them into strings
+                .map(|c| c.join("")),
+        );
+    }
+
+    println!("Generated {} values", values.len());
+
+    // randomly pick 100 values
+    values.shuffle(&mut rng);
+    values.truncate(100);
+
+    let mut row_groups = vec![];
+    // generate all combinations of values for row groups (1 or 2 values per rg, more is unnecessary since we only get min/max stats out)
+    for rg_length in [1, 2] {
+        row_groups.extend(values.iter().cloned().combinations(rg_length));
+    }
+
+    println!("Generated {} row groups", row_groups.len());
+
+    // Randomly pick 100 row groups (combinations of said values)
+    row_groups.shuffle(&mut rng);
+    row_groups.truncate(100);
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)]));
+    let df_schema = DFSchema::try_from(schema.clone()).unwrap();
+
+    let store = InMemory::new();
+    let mut files = vec![];
+    for (idx, truncation_length) in [Some(1), Some(2), None].iter().enumerate() {
+        // parquet files only support 32767 row groups per file, so chunk up into multiple files so we don't error if running on a large number of row groups
+        for (rg_idx, row_groups) in row_groups.chunks(32766).enumerate() {
+            let buf = write_parquet_file(
+                *truncation_length,
+                schema.clone(),
+                row_groups.to_vec(),
+            )
+            .await;
+            let filename = format!("test_fuzz_utf8_{idx}_{rg_idx}.parquet");
+            files.push((filename.clone(), buf.len()));
+            let payload = PutPayload::from(buf);
+            let path = Path::from(filename);
+            store.put(&path, payload).await.unwrap();
+        }
+    }
+
+    println!("Generated {} parquet files", files.len());
+
+    let ctx = SessionContext::new();
+
+    ctx.register_object_store(&Url::parse("memory://").unwrap(), Arc::new(store));
+
+    let mut predicates = vec![];
+    for value in values {
+        predicates.push(col("a").eq(lit(value.clone())));
+        predicates.push(col("a").not_eq(lit(value.clone())));
+        predicates.push(col("a").lt(lit(value.clone())));
+        predicates.push(col("a").lt_eq(lit(value.clone())));
+        predicates.push(col("a").gt(lit(value.clone())));
+        predicates.push(col("a").gt_eq(lit(value.clone())));
+        predicates.push(col("a").like(lit(value.clone())));
+        predicates.push(col("a").not_like(lit(value.clone())));
+        predicates.push(col("a").like(lit(format!("%{}", value.clone()))));
+        predicates.push(col("a").like(lit(format!("{}%", value.clone()))));
+        predicates.push(col("a").not_like(lit(format!("%{}", value.clone()))));
+        predicates.push(col("a").not_like(lit(format!("{}%", value.clone()))));
+    }
+
+    for predicate in predicates {
+        println!("Testing predicate {:?}", predicate);
+        let phys_expr_predicate = ctx
+            .create_physical_expr(predicate.clone(), &df_schema)
+            .unwrap();
+        let expected = execute_with_predicate(
+            &files,
+            phys_expr_predicate.clone(),
+            false,
+            schema.clone(),
+            &ctx,
+        )
+        .await;
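+        // The run without pruning (prune_stats == false) is the ground truth:
+        // every row group is scanned and FilterExec alone decides which rows
+        // survive, so any divergence in the pruned run below must come from pruning.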
+        let with_pruning = execute_with_predicate(
+            &files,
+            phys_expr_predicate,
+            true,
+            schema.clone(),
+            &ctx,
+        )
+        .await;
+        assert_eq!(expected, with_pruning);
+    }
+}
+
+async fn execute_with_predicate(
+    files: &[(String, usize)],
+    predicate: Arc<dyn PhysicalExpr>,
+    prune_stats: bool,
+    schema: Arc<Schema>,
+    ctx: &SessionContext,
+) -> Vec<String> {
+    let scan =
+        FileScanConfig::new(ObjectStoreUrl::parse("memory://").unwrap(), schema.clone())
+            .with_file_group(
+                files
+                    .iter()
+                    .map(|(path, size)| PartitionedFile::new(path.clone(), *size as u64))
+                    .collect(),
+            );
+    let mut builder = ParquetExecBuilder::new(scan);
+    if prune_stats {
+        builder = builder.with_predicate(predicate.clone())
+    }
+    let exec = Arc::new(builder.build()) as Arc<dyn ExecutionPlan>;
+    let exec =
+        Arc::new(FilterExec::try_new(predicate, exec).unwrap()) as Arc<dyn ExecutionPlan>;
+
+    let batches = collect(exec, ctx.task_ctx()).await.unwrap();
+    let mut values = vec![];
+    for batch in batches {
+        let column = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        for i in 0..column.len() {
+            values.push(column.value(i).to_string());
+        }
+    }
+    values
+}
+
+async fn write_parquet_file(
+    truncation_length: Option<usize>,
+    schema: Arc<Schema>,
+    row_groups: Vec<Vec<String>>,
+) -> Bytes {
+    let mut buf = BytesMut::new().writer();
+    let mut props = WriterProperties::builder();
+    if let Some(truncation_length) = truncation_length {
+        props = props.set_max_statistics_size(truncation_length);
+    }
+    props = props.set_statistics_enabled(EnabledStatistics::Chunk); // row group level
+    let props = props.build();
+    {
+        let mut writer =
+            ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap();
+        for rg_values in row_groups.iter() {
+            let arr = StringArray::from_iter_values(rg_values.iter());
+            let batch =
+                RecordBatch::try_new(schema.clone(), vec![Arc::new(arr)]).unwrap();
+            writer.write(&batch).unwrap();
+            writer.flush().unwrap(); // finishes the current row group and starts a new one
+        }
+        writer.finish().unwrap();
+    }
+    buf.into_inner().freeze()
+}
diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs
index 77fc76c35352..c16ed306efdf 100644
--- a/datafusion/physical-optimizer/src/pruning.rs
+++ b/datafusion/physical-optimizer/src/pruning.rs
@@ -1190,6 +1190,8 @@ fn is_compare_op(op: Operator) -> bool {
         | Operator::LtEq
         | Operator::Gt
         | Operator::GtEq
+        | Operator::LikeMatch
+        | Operator::NotLikeMatch
     )
 }
 
@@ -1482,6 +1484,21 @@ fn build_predicate_expression(
             *bin_expr.op(),
             Arc::clone(bin_expr.right()),
         )
+    } else if let Some(like_expr) = expr_any.downcast_ref::<phys_expr::LikeExpr>() {
+        if like_expr.case_insensitive() {
+            return unhandled_hook.handle(expr);
+        }
+        let op = match (like_expr.negated(), like_expr.case_insensitive()) {
+            (false, false) => Operator::LikeMatch,
+            (true, false) => Operator::NotLikeMatch,
+            (false, true) => Operator::ILikeMatch,
+            (true, true) => Operator::NotILikeMatch,
+        };
+        (
+            Arc::clone(like_expr.expr()),
+            op,
+            Arc::clone(like_expr.pattern()),
+        )
     } else {
         return unhandled_hook.handle(expr);
     }
@@ -1562,6 +1579,11 @@ fn build_statistics_expr(
                 )),
             ))
         }
+        Operator::LikeMatch => build_like_match(expr_builder).ok_or_else(|| {
+            plan_datafusion_err!(
+                "LIKE expression with wildcard at the beginning is not supported"
+            )
+        })?,
         Operator::Gt => {
             // column > literal => (min, max) > literal => max > literal
             Arc::new(phys_expr::BinaryExpr::new(
@@ -1605,7 +1627,129 @@
     Ok(statistics_expr)
 }
 
+/// Convert `column LIKE literal`, where the literal contains a constant prefix `P`,
+/// to a range check on the column: `P <= column && column < P'`, where `P'` is the
+/// lowest string greater than all strings starting with prefix `P`.
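+///
+/// For example (illustrative): `a LIKE 'fo%'` has the constant prefix `P = "fo"`;
+/// with `P' = increment_utf8("fo") = "fp"`, the container-level check built below
+/// becomes `min(a) <= "fp" AND "fo" <= max(a)`.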
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Option<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Some(s),
+            ScalarValue::LargeUtf8(Some(s)) => Some(s),
+            ScalarValue::Utf8View(Some(s)) => Some(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => None,
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String> {
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Some(s);
+        }
+        None
+    }
+
+    // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase;
+    // this may involve building the physical expressions that call lower() and upper()
+    let min_column_expr = expr_builder.min_column_expr().ok()?;
+    let max_column_expr = expr_builder.max_column_expr().ok()?;
+    let scalar_expr = expr_builder.scalar_expr();
+    // check that the scalar is a string literal
+    let s = extract_string_literal(scalar_expr)?;
+    // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
+    let first_wildcard_index = s.find(['%', '_']);
+    if first_wildcard_index == Some(0) {
+        // there's no filtering we could possibly do; return None so this ends up
+        // being handled by the unhandled hook
+        return None;
+    }
+    let (lower_bound, upper_bound) = if let Some(wildcard_index) = first_wildcard_index {
+        let prefix = &s[..wildcard_index];
+        let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+            prefix.to_string(),
+        ))));
+        let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+            increment_utf8(prefix)?,
+        ))));
+        (lower_bound_lit, upper_bound_lit)
+    } else {
+        // the like expression is a literal and can be converted into a comparison
+        let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));
+        (Arc::clone(&bound), bound)
+    };
+    let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+        lower_bound,
+        Operator::LtEq,
+        Arc::clone(&max_column_expr),
+    ));
+    let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+        Arc::clone(&min_column_expr),
+        Operator::LtEq,
+        upper_bound,
+    ));
+    let combined = Arc::new(phys_expr::BinaryExpr::new(
+        upper_bound_expr,
+        Operator::And,
+        lower_bound_expr,
+    ));
+    Some(combined)
+}
+
+/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
+/// This makes it so that the returned string will always compare greater than the input string
+/// or any other string with the same prefix.
+/// This is necessary since the statistics may have been truncated: if we have a min statistic
+/// of "fo" that may have originally been "foz" or anything else with the prefix "fo".
+/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`;
+/// in this example `increment_utf8("foo") == "fop"`.
+fn increment_utf8(data: &str) -> Option<String> {
+    // Helper function to check if a character is valid to use
+    fn is_valid_unicode(c: char) -> bool {
+        let cp = c as u32;
+
+        // Filter out non-characters (https://www.unicode.org/versions/corrigendum9.html)
+        if [0xFFFE, 0xFFFF].contains(&cp) || (0xFDD0..=0xFDEF).contains(&cp) {
+            return false;
+        }
+
+        // Filter out anything beyond the maximum valid code point (U+10FFFF)
+        if cp >= 0x110000 {
+            return false;
+        }
+
+        true
+    }
+
+    // Convert string to vector of code points
+    let mut code_points: Vec<char> = data.chars().collect();
+
+    // Work backwards through code points
+    for idx in (0..code_points.len()).rev() {
+        let original = code_points[idx] as u32;
+
+        // Try incrementing the code point
+        if let Some(next_char) = char::from_u32(original + 1) {
+            if is_valid_unicode(next_char) {
+                code_points[idx] = next_char;
+                // truncate the string to the current index
+                code_points.truncate(idx + 1);
+                return Some(code_points.into_iter().collect());
+            }
+        }
+    }
+
+    None
+}
+
 /// Wrap the statistics expression in a check that skips the expression if the column is all nulls.
+///
 /// This is important not only as an optimization but also because statistics may not be
 /// accurate for columns that are all nulls.
 /// For example, for an `int` column `x` with all nulls, the min/max/null_count statistics
@@ -3462,6 +3606,425 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_increment_utf8() {
+        // Basic ASCII
+        assert_eq!(increment_utf8("abc").unwrap(), "abd");
+        assert_eq!(increment_utf8("abz").unwrap(), "ab{");
+
+        // Test around ASCII 127 (DEL)
+        assert_eq!(increment_utf8("~").unwrap(), "\u{7f}"); // 126 -> 127
+        assert_eq!(increment_utf8("\u{7f}").unwrap(), "\u{80}"); // 127 -> 128
+
+        // Test 2-byte UTF-8 sequences
+        assert_eq!(increment_utf8("ß").unwrap(), "à"); // U+00DF -> U+00E0
+
+        // Test 3-byte UTF-8 sequences
+        assert_eq!(increment_utf8("℣").unwrap(), "ℤ"); // U+2123 -> U+2124
+
+        // Test at UTF-8 boundaries
+        assert_eq!(increment_utf8("\u{7FF}").unwrap(), "\u{800}"); // 2-byte to 3-byte boundary
+        assert_eq!(increment_utf8("\u{FFFF}").unwrap(), "\u{10000}"); // 3-byte to 4-byte boundary
+
+        // Test that if we can't increment we return None
+        assert!(increment_utf8("").is_none());
+        assert!(increment_utf8("\u{10FFFF}").is_none()); // U+10FFFF is the max code point
+
+        // Test that if we can't increment the last character we do the previous one and truncate
+        assert_eq!(increment_utf8("a\u{10FFFF}").unwrap(), "b");
+
+        // Test surrogate pair range (0xD800..=0xDFFF)
+        assert_eq!(increment_utf8("a\u{D7FF}").unwrap(), "b");
+        assert!(increment_utf8("\u{D7FF}").is_none());
+
+        // Test non-characters range (0xFDD0..=0xFDEF)
+        assert_eq!(increment_utf8("a\u{FDCF}").unwrap(), "b");
+        assert!(increment_utf8("\u{FDCF}").is_none());
+
+        // Test the maximum code point limit (>= 0x110000)
+        assert_eq!(increment_utf8("a\u{10FFFF}").unwrap(), "b");
+        assert!(increment_utf8("\u{10FFFF}").is_none()); // Can't increment past max valid codepoint
+    }
+
+    /// Creates a setup for chunk pruning, modeling a utf8 column "s1"
+    /// with 10 different containers (e.g. RowGroups). They have [min,
+    /// max]:
+    /// s1 ["A", "Z"]
+    /// s1 ["A", "L"]
+    /// s1 ["N", "Z"]
+    /// s1 ["M", "M"]
+    /// s1 [NULL, NULL]
+    /// s1 ["A", NULL]
+    /// s1 ["", "A"]
+    /// s1 ["", ""]
+    /// s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]
+    /// s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]
+    fn utf8_setup() -> (SchemaRef, TestStatistics) {
+        let schema = Arc::new(Schema::new(vec![Field::new("s1", DataType::Utf8, true)]));
+
+        let statistics = TestStatistics::new().with(
+            "s1",
+            ContainerStats::new_utf8(
+                vec![
+                    Some("A"),
+                    Some("A"),
+                    Some("N"),
+                    Some("M"),
+                    None,
+                    Some("A"),
+                    Some(""),
+                    Some(""),
+                    Some("AB"),
+                    Some("A\u{10ffff}\u{10ffff}"),
+                ], // min
+                vec![
+                    Some("Z"),
+                    Some("L"),
+                    Some("Z"),
+                    Some("M"),
+                    None,
+                    None,
+                    Some("A"),
+                    Some(""),
+                    Some("A\u{10ffff}\u{10ffff}\u{10ffff}"),
+                    Some("A\u{10ffff}\u{10ffff}"),
+                ], // max
+            ),
+        );
+        (schema, statistics)
+    }
+
+    #[test]
+    fn prune_utf8_eq() {
+        let (schema, statistics) = utf8_setup();
+
+        let expr = col("s1").eq(lit("A"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["M", "M"] ==> no rows can pass (not keep)
+            false,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> unknown (must keep)
+            true,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> no rows can pass (not keep)
+            false,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep)
+            false,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").eq(lit(""));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["A", "L"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["N", "Z"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["M", "M"] ==> no rows can pass (not keep)
+            false,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> no rows can pass (not keep)
+            false,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> all rows must pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep)
+            false,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+    }
+
+    #[test]
+    fn prune_utf8_not_eq() {
+        let (schema, statistics) = utf8_setup();
+
+        let expr = col("s1").not_eq(lit("A"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> all rows must pass (must keep)
+            true,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> unknown (must keep)
+            true,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> all rows must pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").not_eq(lit(""));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> all rows must pass (must keep)
+            true,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> unknown (must keep)
+            true,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> no rows can pass (not keep)
+            false,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+    }
+
+    #[test]
+    fn prune_utf8_like_one() {
+        let (schema, statistics) = utf8_setup();
+
+        let expr = col("s1").like(lit("A_"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["M", "M"] ==> no rows can pass (not keep)
+            false,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> unknown (must keep)
+            true,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> no rows can pass (not keep)
+            false,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").like(lit("_A_"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> some rows could pass (must keep)
+            true,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> unknown (must keep)
+            true,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> some rows could pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").like(lit("_"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> all rows must pass (must keep)
+            true,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> unknown (must keep)
+            true,
+            // s1 ["", "A"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["", ""] ==> all rows must pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").like(lit(""));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["A", "L"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["N", "Z"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["M", "M"] ==> no rows can pass (not keep)
+            false,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> no rows can pass (not keep)
+            false,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> all rows must pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep)
+            false,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+    }
+
+    #[test]
+    fn prune_utf8_like_many() {
+        let (schema, statistics) = utf8_setup();
+
+        let expr = col("s1").like(lit("A%"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["M", "M"] ==> no rows can pass (not keep)
+            false,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> unknown (must keep)
+            true,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> no rows can pass (not keep)
+            false,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").like(lit("%A%"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> some rows could pass (must keep)
+            true,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> unknown (must keep)
+            true,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> some rows could pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").like(lit("%"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> all rows must pass (must keep)
+            true,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> unknown (must keep)
+            true,
+            // s1 ["", "A"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["", ""] ==> all rows must pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").like(lit(""));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["A", "L"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["N", "Z"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["M", "M"] ==> no rows can pass (not keep)
+            false,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> no rows can pass (not keep)
+            false,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> all rows must pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep)
+            false,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+    }
+
     #[test]
     fn test_rewrite_expr_to_prunable() {
         let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);