From ccb4baf0fc6b4dee983bb29f2282b9c19510a481 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Jul 2024 15:52:35 -0400 Subject: [PATCH] Initial support for `StringView`, merge changes from `string-view` development branch (#11402) * Update `string-view` branch to arrow-rs main (#10966) * Pin to arrow main * Fix clippy with latest arrow * Uncomment test that needs new arrow-rs to work * Update datafusion-cli Cargo.lock * Update Cargo.lock * tapelo * feat: Implement equality = and inequality <> support for StringView (#10985) * feat: Implement equality = and inequality <> support for StringView * chore: Add tests for the StringView * chore * chore: Update tests for NULL * fix: Used build_array_string! * chore: Update string_coercion function to handle Utf8View type in binary.rs * chore: add tests * chore: ci * Add more StringView comparison test coverage (#10997) * Add more StringView comparison test coverage * add reference * Add another test showing casting on columns works correctly * feat: Implement equality = and inequality <> support for BinaryView (#11004) * feat: Implement equality = and inequality <> support for BinaryView Signed-off-by: Chojan Shang * chore: make fmt happy Signed-off-by: Chojan Shang --------- Signed-off-by: Chojan Shang * Implement support for LargeString and LargeBinary for StringView and BinaryView (#11034) * implement large binary * add tests for large string * better comments for string coercion * Improve filter predicates with `Utf8View` literals (#11043) * refactor: Improve type coercion logic in TypeCoercionRewriter * refactor: Improve type coercion logic in TypeCoercionRewriter * chore * chore: Update test * refactor: Improve type coercion logic in TypeCoercionRewriter * refactor: Remove unused import and update code formatting in unwrap_cast_in_comparison.rs * Remove arrow-patch --------- Signed-off-by: Chojan Shang Co-authored-by: Alex Huang Co-authored-by: Chojan Shang Co-authored-by: Xiangpeng Hao --- datafusion/common/src/scalar/mod.rs | 8 +- datafusion/expr/src/type_coercion/binary.rs | 36 +- .../src/unwrap_cast_in_comparison.rs | 26 +- .../sqllogictest/test_files/binary_view.slt | 202 +++++++++++ .../sqllogictest/test_files/string_view.slt | 326 ++++++++++++++++++ 5 files changed, 566 insertions(+), 32 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/binary_view.slt create mode 100644 datafusion/sqllogictest/test_files/string_view.slt diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index c891e85aa59b..38f70e4c1466 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -1682,8 +1682,10 @@ impl ScalarValue { DataType::UInt16 => build_array_primitive!(UInt16Array, UInt16), DataType::UInt32 => build_array_primitive!(UInt32Array, UInt32), DataType::UInt64 => build_array_primitive!(UInt64Array, UInt64), + DataType::Utf8View => build_array_string!(StringViewArray, Utf8View), DataType::Utf8 => build_array_string!(StringArray, Utf8), DataType::LargeUtf8 => build_array_string!(LargeStringArray, LargeUtf8), + DataType::BinaryView => build_array_string!(BinaryViewArray, BinaryView), DataType::Binary => build_array_string!(BinaryArray, Binary), DataType::LargeBinary => build_array_string!(LargeBinaryArray, LargeBinary), DataType::Date32 => build_array_primitive!(Date32Array, Date32), @@ -1841,8 +1843,6 @@ impl ScalarValue { | DataType::Time64(TimeUnit::Millisecond) | DataType::Map(_, _) | DataType::RunEndEncoded(_, _) - | DataType::Utf8View - | DataType::BinaryView | DataType::ListView(_) | DataType::LargeListView(_) => { return _internal_err!( @@ -5695,16 +5695,12 @@ mod tests { DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), ); - // needs https://github.com/apache/arrow-rs/issues/5893 - /* check_scalar_cast(ScalarValue::Utf8(None), DataType::Utf8View); check_scalar_cast(ScalarValue::from("foo"), DataType::Utf8View); check_scalar_cast( ScalarValue::from("larger than 12 bytes string"), DataType::Utf8View, ); - - */ } // mimics how casting work on scalar values by `casting` `scalar` to `desired_type` diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 4f79f3fa2b22..70139aaa4a0c 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -919,16 +919,21 @@ fn string_concat_internal_coercion( } } -/// Coercion rules for string types (Utf8/LargeUtf8): If at least one argument is -/// a string type and both arguments can be coerced into a string type, coerce -/// to string type. +/// Coercion rules for string view types (Utf8/LargeUtf8/Utf8View): +/// If at least one argument is a string view, we coerce to string view +/// based on the observation that StringArray to StringViewArray is cheap but not vice versa. +/// +/// Between Utf8 and LargeUtf8, we coerce to LargeUtf8. fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { + // If Utf8View is in any side, we coerce to Utf8View. + (Utf8View, Utf8View | Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, Utf8View) => { + Some(Utf8View) + } + // Then, if LargeUtf8 is in any side, we coerce to LargeUtf8. + (LargeUtf8, Utf8 | LargeUtf8) | (Utf8, LargeUtf8) => Some(LargeUtf8), (Utf8, Utf8) => Some(Utf8), - (LargeUtf8, Utf8) => Some(LargeUtf8), - (Utf8, LargeUtf8) => Some(LargeUtf8), - (LargeUtf8, LargeUtf8) => Some(LargeUtf8), _ => None, } } @@ -975,15 +980,26 @@ fn binary_to_string_coercion( } } -/// Coercion rules for binary types (Binary/LargeBinary): If at least one argument is +/// Coercion rules for binary types (Binary/LargeBinary/BinaryView): If at least one argument is /// a binary type and both arguments can be coerced into a binary type, coerce /// to binary type. fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { - (Binary | Utf8, Binary) | (Binary, Utf8) => Some(Binary), - (LargeBinary | Binary | Utf8 | LargeUtf8, LargeBinary) - | (LargeBinary, Binary | Utf8 | LargeUtf8) => Some(LargeBinary), + // If BinaryView is in any side, we coerce to BinaryView. + (BinaryView, BinaryView | Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View) + | (LargeBinary | Binary | Utf8 | LargeUtf8 | Utf8View, BinaryView) => { + Some(BinaryView) + } + // Prefer LargeBinary over Binary + (LargeBinary | Binary | Utf8 | LargeUtf8 | Utf8View, LargeBinary) + | (LargeBinary, Binary | Utf8 | LargeUtf8 | Utf8View) => Some(LargeBinary), + + // If Utf8View/LargeUtf8 presents need to be large Binary + (Utf8View | LargeUtf8, Binary) | (Binary, Utf8View | LargeUtf8) => { + Some(LargeBinary) + } + (Binary, Utf8) | (Utf8, Binary) => Some(Binary), _ => None, } } diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index 9941da9dd65e..7238dd5bbd97 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -33,7 +33,7 @@ use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; use datafusion_common::{internal_err, DFSchema, DFSchemaRef, Result, ScalarValue}; use datafusion_expr::expr::{BinaryExpr, Cast, InList, TryCast}; use datafusion_expr::utils::merge_schema; -use datafusion_expr::{lit, Expr, ExprSchemable, LogicalPlan, Operator}; +use datafusion_expr::{lit, Expr, ExprSchemable, LogicalPlan}; /// [`UnwrapCastInComparison`] attempts to remove casts from /// comparisons to literals ([`ScalarValue`]s) by applying the casts @@ -146,7 +146,7 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter { }; is_supported_type(&left_type) && is_supported_type(&right_type) - && is_comparison_op(op) + && op.is_comparison_operator() } => { match (left.as_mut(), right.as_mut()) { @@ -262,18 +262,6 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter { } } -fn is_comparison_op(op: &Operator) -> bool { - matches!( - op, - Operator::Eq - | Operator::NotEq - | Operator::Gt - | Operator::GtEq - | Operator::Lt - | Operator::LtEq - ) -} - /// Returns true if [UnwrapCastExprRewriter] supports this data type fn is_supported_type(data_type: &DataType) -> bool { is_supported_numeric_type(data_type) @@ -300,7 +288,10 @@ fn is_supported_numeric_type(data_type: &DataType) -> bool { /// Returns true if [UnwrapCastExprRewriter] supports casting this value as a string fn is_supported_string_type(data_type: &DataType) -> bool { - matches!(data_type, DataType::Utf8 | DataType::LargeUtf8) + matches!( + data_type, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + ) } /// Returns true if [UnwrapCastExprRewriter] supports casting this value as a dictionary @@ -473,12 +464,15 @@ fn try_cast_string_literal( target_type: &DataType, ) -> Option { let string_value = match lit_value { - ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => s.clone(), + ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) | ScalarValue::Utf8View(s) => { + s.clone() + } _ => return None, }; let scalar_value = match target_type { DataType::Utf8 => ScalarValue::Utf8(string_value), DataType::LargeUtf8 => ScalarValue::LargeUtf8(string_value), + DataType::Utf8View => ScalarValue::Utf8View(string_value), _ => return None, }; Some(scalar_value) diff --git a/datafusion/sqllogictest/test_files/binary_view.slt b/datafusion/sqllogictest/test_files/binary_view.slt new file mode 100644 index 000000000000..de0f0bea7ffb --- /dev/null +++ b/datafusion/sqllogictest/test_files/binary_view.slt @@ -0,0 +1,202 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +######## +## Test setup +######## + +statement ok +create table test_source as values + ('Andrew', 'X'), + ('Xiangpeng', 'Xiangpeng'), + ('Raphael', 'R'), + (NULL, 'R') +; + +# Table with the different combination of column types +statement ok +CREATE TABLE test AS +SELECT + arrow_cast(column1, 'Utf8') as column1_utf8, + arrow_cast(column2, 'Utf8') as column2_utf8, + arrow_cast(column1, 'Binary') AS column1_binary, + arrow_cast(column2, 'Binary') AS column2_binary, + arrow_cast(column1, 'LargeBinary') AS column1_large_binary, + arrow_cast(column2, 'LargeBinary') AS column2_large_binary, + arrow_cast(arrow_cast(column1, 'Binary'), 'BinaryView') AS column1_binaryview, + arrow_cast(arrow_cast(column2, 'Binary'), 'BinaryView') AS column2_binaryview, + arrow_cast(column1, 'Dictionary(Int32, Binary)') AS column1_dict, + arrow_cast(column2, 'Dictionary(Int32, Binary)') AS column2_dict +FROM test_source; + +statement ok +drop table test_source + +######## +## BinaryView to BinaryView +######## + +# BinaryView scalar to BinaryView scalar + +query BBBB +SELECT + arrow_cast(arrow_cast('NULL', 'Binary'), 'BinaryView') = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') AS comparison1, + arrow_cast(arrow_cast('NULL', 'Binary'), 'BinaryView') <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') AS comparison2, + arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') AS comparison3, + arrow_cast(arrow_cast('Xiangpeng', 'Binary'), 'BinaryView') <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') AS comparison4; +---- +false true true true + + +# BinaryView column to BinaryView column comparison as filters + +query TT +select column1_utf8, column2_utf8 from test where column1_binaryview = column2_binaryview; +---- +Xiangpeng Xiangpeng + +query TT +select column1_utf8, column2_utf8 from test where column1_binaryview <> column2_binaryview; +---- +Andrew X +Raphael R + +# BinaryView column to BinaryView column +query TTBB +select + column1_utf8, column2_utf8, + column1_binaryview = column2_binaryview, + column1_binaryview <> column2_binaryview +from test; +---- +Andrew X false true +Xiangpeng Xiangpeng true false +Raphael R false true +NULL R NULL NULL + +# BinaryView column to BinaryView scalar comparison +query TTBBBB +select + column1_utf8, column2_utf8, + column1_binaryview = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'), + arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = column1_binaryview, + column1_binaryview <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'), + arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') <> column1_binaryview +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + +######## +## BinaryView to Binary +######## + +# test BinaryViewArray with Binary columns +query TTBBBB +select + column1_utf8, column2_utf8, + column1_binaryview = column2_binary, + column2_binary = column1_binaryview, + column1_binaryview <> column2_binary, + column2_binary <> column1_binaryview +from test; +---- +Andrew X false false true true +Xiangpeng Xiangpeng true true false false +Raphael R false false true true +NULL R NULL NULL NULL NULL + +# test BinaryViewArray with LargeBinary columns +query TTBBBB +select + column1_utf8, column2_utf8, + column1_binaryview = column2_large_binary, + column2_large_binary = column1_binaryview, + column1_binaryview <> column2_large_binary, + column2_large_binary <> column1_binaryview +from test; +---- +Andrew X false false true true +Xiangpeng Xiangpeng true true false false +Raphael R false false true true +NULL R NULL NULL NULL NULL + +# BinaryView column to Binary scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_binaryview = arrow_cast('Andrew', 'Binary'), + arrow_cast('Andrew', 'Binary') = column1_binaryview, + column1_binaryview <> arrow_cast('Andrew', 'Binary'), + arrow_cast('Andrew', 'Binary') <> column1_binaryview +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + +# BinaryView column to LargeBinary scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_binaryview = arrow_cast('Andrew', 'LargeBinary'), + arrow_cast('Andrew', 'LargeBinary') = column1_binaryview, + column1_binaryview <> arrow_cast('Andrew', 'LargeBinary'), + arrow_cast('Andrew', 'LargeBinary') <> column1_binaryview +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + +# Binary column to BinaryView scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_binary = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'), + arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = column1_binary, + column1_binary <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'), + arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') <> column1_binary +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + + +# LargeBinary column to BinaryView scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_large_binary = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'), + arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = column1_large_binary, + column1_large_binary <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'), + arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') <> column1_large_binary +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + +statement ok +drop table test; \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt new file mode 100644 index 000000000000..3ba4e271c2f6 --- /dev/null +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -0,0 +1,326 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +######## +## Test setup +######## + +statement ok +create table test_source as values + ('Andrew', 'X'), + ('Xiangpeng', 'Xiangpeng'), + ('Raphael', 'R'), + (NULL, 'R') +; + +# Table with the different combination of column types +statement ok +create table test as +SELECT + arrow_cast(column1, 'Utf8') as column1_utf8, + arrow_cast(column2, 'Utf8') as column2_utf8, + arrow_cast(column1, 'LargeUtf8') as column1_large_utf8, + arrow_cast(column2, 'LargeUtf8') as column2_large_utf8, + arrow_cast(column1, 'Utf8View') as column1_utf8view, + arrow_cast(column2, 'Utf8View') as column2_utf8view, + arrow_cast(column1, 'Dictionary(Int32, Utf8)') as column1_dict, + arrow_cast(column2, 'Dictionary(Int32, Utf8)') as column2_dict +FROM test_source; + +statement ok +drop table test_source + +######## +## StringView to StringView +######## + +# StringView scalar to StringView scalar + +query BBBB +select + arrow_cast('NULL', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'), + arrow_cast('NULL', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Xiangpeng', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View'); +---- +false true true true + + +# StringView column to StringView column comparison as filters + +query TT +select column1_utf8, column2_utf8 from test where column1_utf8view = column2_utf8view; +---- +Xiangpeng Xiangpeng + +query TT +select column1_utf8, column2_utf8 from test where column1_utf8view <> column2_utf8view; +---- +Andrew X +Raphael R + +# StringView column to StringView column +query TTBB +select + column1_utf8, column2_utf8, + column1_utf8view = column2_utf8view, + column1_utf8view <> column2_utf8view +from test; +---- +Andrew X false true +Xiangpeng Xiangpeng true false +Raphael R false true +NULL R NULL NULL + +# StringView column to StringView scalar comparison +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8view = arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') = column1_utf8view, + column1_utf8view <> arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') <> column1_utf8view +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + +######## +## StringView to String +######## + +# test StringViewArray with Utf8 columns +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8view = column2_utf8, + column2_utf8 = column1_utf8view, + column1_utf8view <> column2_utf8, + column2_utf8 <> column1_utf8view +from test; +---- +Andrew X false false true true +Xiangpeng Xiangpeng true true false false +Raphael R false false true true +NULL R NULL NULL NULL NULL + +# test StringViewArray with LargeUtf8 columns +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8view = column2_large_utf8, + column2_large_utf8 = column1_utf8view, + column1_utf8view <> column2_large_utf8, + column2_large_utf8 <> column1_utf8view +from test; +---- +Andrew X false false true true +Xiangpeng Xiangpeng true true false false +Raphael R false false true true +NULL R NULL NULL NULL NULL + + +# StringView column to String scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8view = arrow_cast('Andrew', 'Utf8'), + arrow_cast('Andrew', 'Utf8') = column1_utf8view, + column1_utf8view <> arrow_cast('Andrew', 'Utf8'), + arrow_cast('Andrew', 'Utf8') <> column1_utf8view +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + +# StringView column to LargeString scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8view = arrow_cast('Andrew', 'LargeUtf8'), + arrow_cast('Andrew', 'LargeUtf8') = column1_utf8view, + column1_utf8view <> arrow_cast('Andrew', 'LargeUtf8'), + arrow_cast('Andrew', 'LargeUtf8') <> column1_utf8view +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + +# String column to StringView scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8 = arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') = column1_utf8, + column1_utf8 <> arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') <> column1_utf8 +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + +# LargeString column to StringView scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_large_utf8 = arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') = column1_large_utf8, + column1_large_utf8 <> arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') <> column1_large_utf8 +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + +######## +## StringView to Dictionary +######## + +# test StringViewArray with Dictionary columns +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8view = column2_dict, + column2_dict = column1_utf8view, + column1_utf8view <> column2_dict, + column2_dict <> column1_utf8view +from test; +---- +Andrew X false false true true +Xiangpeng Xiangpeng true true false false +Raphael R false false true true +NULL R NULL NULL NULL NULL + +# StringView column to Dict scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8view = arrow_cast('Andrew', 'Dictionary(Int32, Utf8)'), + arrow_cast('Andrew', 'Dictionary(Int32, Utf8)') = column1_utf8view, + column1_utf8view <> arrow_cast('Andrew', 'Dictionary(Int32, Utf8)'), + arrow_cast('Andrew', 'Dictionary(Int32, Utf8)') <> column1_utf8view +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + +# Dict column to StringView scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_dict = arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') = column1_dict, + column1_dict <> arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') <> column1_dict +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + + +######## +## Coercion Rules +######## + + +statement ok +set datafusion.explain.logical_plan_only = true; + + +# Filter should have a StringView literal and no column cast +query TT +explain SELECT column1_utf8 from test where column1_utf8view = 'Andrew'; +---- +logical_plan +01)Projection: test.column1_utf8 +02)--Filter: test.column1_utf8view = Utf8View("Andrew") +03)----TableScan: test projection=[column1_utf8, column1_utf8view] + +# reverse order should be the same +query TT +explain SELECT column1_utf8 from test where 'Andrew' = column1_utf8view; +---- +logical_plan +01)Projection: test.column1_utf8 +02)--Filter: test.column1_utf8view = Utf8View("Andrew") +03)----TableScan: test projection=[column1_utf8, column1_utf8view] + +query TT +explain SELECT column1_utf8 from test where column1_utf8 = arrow_cast('Andrew', 'Utf8View'); +---- +logical_plan +01)Filter: test.column1_utf8 = Utf8("Andrew") +02)--TableScan: test projection=[column1_utf8] + +query TT +explain SELECT column1_utf8 from test where arrow_cast('Andrew', 'Utf8View') = column1_utf8; +---- +logical_plan +01)Filter: test.column1_utf8 = Utf8("Andrew") +02)--TableScan: test projection=[column1_utf8] + +query TT +explain SELECT column1_utf8 from test where column1_utf8view = arrow_cast('Andrew', 'Dictionary(Int32, Utf8)'); +---- +logical_plan +01)Projection: test.column1_utf8 +02)--Filter: test.column1_utf8view = Utf8View("Andrew") +03)----TableScan: test projection=[column1_utf8, column1_utf8view] + +query TT +explain SELECT column1_utf8 from test where arrow_cast('Andrew', 'Dictionary(Int32, Utf8)') = column1_utf8view; +---- +logical_plan +01)Projection: test.column1_utf8 +02)--Filter: test.column1_utf8view = Utf8View("Andrew") +03)----TableScan: test projection=[column1_utf8, column1_utf8view] + +# compare string / stringview +# Should cast string -> stringview (which is cheap), not stringview -> string (which is not) +query TT +explain SELECT column1_utf8 from test where column1_utf8view = column2_utf8; +---- +logical_plan +01)Projection: test.column1_utf8 +02)--Filter: test.column1_utf8view = CAST(test.column2_utf8 AS Utf8View) +03)----TableScan: test projection=[column1_utf8, column2_utf8, column1_utf8view] + +query TT +explain SELECT column1_utf8 from test where column2_utf8 = column1_utf8view; +---- +logical_plan +01)Projection: test.column1_utf8 +02)--Filter: CAST(test.column2_utf8 AS Utf8View) = test.column1_utf8view +03)----TableScan: test projection=[column1_utf8, column2_utf8, column1_utf8view] + + +statement ok +drop table test;