From 4ed6b1771e62b9886f723cf26d681973d2e56cec Mon Sep 17 00:00:00 2001 From: my-vegetable-has-exploded Date: Tue, 17 Sep 2024 22:29:21 +0800 Subject: [PATCH] feat: Support applying parquet bloom filters to StringView columns --- .../physical_plan/parquet/row_group_filter.rs | 47 ++++++++++++++++++- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs index ccd77d90be57..4cdcb005018e 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs @@ -264,8 +264,12 @@ impl PruningStatistics for BloomFilterStatistics { .iter() .map(|value| { match value { - ScalarValue::Utf8(Some(v)) => sbbf.check(&v.as_str()), - ScalarValue::Binary(Some(v)) => sbbf.check(v), + ScalarValue::Utf8(Some(v)) | ScalarValue::Utf8View(Some(v)) => { + sbbf.check(&v.as_str()) + } + ScalarValue::Binary(Some(v)) | ScalarValue::BinaryView(Some(v)) => { + sbbf.check(v) + } ScalarValue::FixedSizeBinary(_size, Some(v)) => sbbf.check(v), ScalarValue::Boolean(Some(v)) => sbbf.check(v), ScalarValue::Float64(Some(v)) => sbbf.check(v), @@ -1219,6 +1223,25 @@ mod tests { .await } + #[tokio::test] + async fn test_row_group_bloom_filter_pruning_predicate_multiple_expr_view() { + BloomFilterTest::new_data_index_bloom_encoding_stats() + .with_expect_all_pruned() + // generate pruning predicate `(String = "Hello_Not_Exists" OR String = "Hello_Not_Exists2")` + .run( + lit("1").eq(lit("1")).and( + col(r#""String""#) + .eq(Expr::Literal(ScalarValue::Utf8View(Some(String::from( + "Hello_Not_Exists", + ))))) + .or(col(r#""String""#).eq(Expr::Literal(ScalarValue::Utf8View( + Some(String::from("Hello_Not_Exists2")), + )))), + ), + ) + .await + } + #[tokio::test] async fn test_row_group_bloom_filter_pruning_predicate_sql_in() { // load parquet file @@ -1286,6 
+1309,26 @@ mod tests { .await } + #[tokio::test] + async fn test_row_group_bloom_filter_pruning_predicate_with_exists_3_values_view() { + BloomFilterTest::new_data_index_bloom_encoding_stats() + .with_expect_none_pruned() + // generate pruning predicate `(String = "Hello") OR (String = "the quick") OR (String = "are you")` + .run( + col(r#""String""#) + .eq(Expr::Literal(ScalarValue::Utf8View(Some(String::from( + "Hello", + ))))) + .or(col(r#""String""#).eq(Expr::Literal(ScalarValue::Utf8View( + Some(String::from("the quick")), + )))) + .or(col(r#""String""#).eq(Expr::Literal(ScalarValue::Utf8View( + Some(String::from("are you")), + )))), + ) + .await + } + #[tokio::test] async fn test_row_group_bloom_filter_pruning_predicate_with_or_not_eq() { BloomFilterTest::new_data_index_bloom_encoding_stats()