From 19ed182918b102bf0b67b59e3359adae85839a01 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Fri, 21 Jun 2024 12:18:33 -0400 Subject: [PATCH] Implement support for LargeString and LargeBinary for StringView and BinaryView (#11034) * implement large binary * add tests for large string * better comments for string coercion --- datafusion/expr/src/type_coercion/binary.rs | 36 +++++++++----- .../sqllogictest/test_files/binary_view.slt | 48 +++++++++++++++++++ .../sqllogictest/test_files/string_view.slt | 47 ++++++++++++++++++ 3 files changed, 119 insertions(+), 12 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 694e5e13f9e1..36100a0b727c 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -922,17 +922,21 @@ fn string_concat_internal_coercion( } } -/// Coercion rules for string types (Utf8/LargeUtf8): If at least one argument is -/// a string type and both arguments can be coerced into a string type, coerce -/// to string type. +/// Coercion rules for string view types (Utf8/LargeUtf8/Utf8View): +/// If at least one argument is a string view, we coerce to string view +/// based on the observation that StringArray to StringViewArray is cheap but not vice versa. +/// +/// Between Utf8 and LargeUtf8, we coerce to LargeUtf8. fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { + // If Utf8View is in any side, we coerce to Utf8View. + (Utf8View, Utf8View | Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, Utf8View) => { + Some(Utf8View) + } + // Then, if LargeUtf8 is in any side, we coerce to LargeUtf8. + (LargeUtf8, Utf8 | LargeUtf8) | (Utf8, LargeUtf8) => Some(LargeUtf8), (Utf8, Utf8) => Some(Utf8), - (LargeUtf8, Utf8) => Some(LargeUtf8), - (Utf8, LargeUtf8) => Some(LargeUtf8), - (LargeUtf8, LargeUtf8) => Some(LargeUtf8), - (Utf8View, Utf8View) | (Utf8View, Utf8) | (Utf8, Utf8View) => Some(Utf8View), _ => None, } } @@ -982,18 +986,26 @@ fn binary_to_string_coercion( } } -/// Coercion rules for binary types (Binary/LargeBinary): If at least one argument is +/// Coercion rules for binary types (Binary/LargeBinary/BinaryView): If at least one argument is /// a binary type and both arguments can be coerced into a binary type, coerce /// to binary type. fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { - (Binary | Utf8, Binary) | (Binary, Utf8) => Some(Binary), - (LargeBinary | Binary | Utf8 | LargeUtf8, LargeBinary) - | (LargeBinary, Binary | Utf8 | LargeUtf8) => Some(LargeBinary), - (BinaryView, BinaryView) | (BinaryView, Binary) | (Binary, BinaryView) => { + // If BinaryView is in any side, we coerce to BinaryView. + (BinaryView, BinaryView | Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View) + | (LargeBinary | Binary | Utf8 | LargeUtf8 | Utf8View, BinaryView) => { Some(BinaryView) } + // Prefer LargeBinary over Binary + (LargeBinary | Binary | Utf8 | LargeUtf8 | Utf8View, LargeBinary) + | (LargeBinary, Binary | Utf8 | LargeUtf8 | Utf8View) => Some(LargeBinary), + + // If Utf8View/LargeUtf8 presents need to be large Binary + (Utf8View | LargeUtf8, Binary) | (Binary, Utf8View | LargeUtf8) => { + Some(LargeBinary) + } + (Binary, Utf8) | (Utf8, Binary) => Some(Binary), _ => None, } } diff --git a/datafusion/sqllogictest/test_files/binary_view.slt b/datafusion/sqllogictest/test_files/binary_view.slt index 2728d4803ce7..de0f0bea7ffb 100644 --- a/datafusion/sqllogictest/test_files/binary_view.slt +++ b/datafusion/sqllogictest/test_files/binary_view.slt @@ -35,6 +35,8 @@ SELECT arrow_cast(column2, 'Utf8') as column2_utf8, arrow_cast(column1, 'Binary') AS column1_binary, arrow_cast(column2, 'Binary') AS column2_binary, + arrow_cast(column1, 'LargeBinary') AS column1_large_binary, + arrow_cast(column2, 'LargeBinary') AS column2_large_binary, arrow_cast(arrow_cast(column1, 'Binary'), 'BinaryView') AS column1_binaryview, arrow_cast(arrow_cast(column2, 'Binary'), 'BinaryView') AS column2_binaryview, arrow_cast(column1, 'Dictionary(Int32, Binary)') AS column1_dict, @@ -120,6 +122,21 @@ Xiangpeng Xiangpeng true true false false Raphael R false false true true NULL R NULL NULL NULL NULL +# test BinaryViewArray with LargeBinary columns +query TTBBBB +select + column1_utf8, column2_utf8, + column1_binaryview = column2_large_binary, + column2_large_binary = column1_binaryview, + column1_binaryview <> column2_large_binary, + column2_large_binary <> column1_binaryview +from test; +---- +Andrew X false false true true +Xiangpeng Xiangpeng true true false false +Raphael R false false true true +NULL R NULL NULL NULL NULL + # BinaryView column to Binary scalar query TTBBBB select @@ -135,6 +152,21 @@ Xiangpeng Xiangpeng false false true true Raphael R false false true true NULL R NULL NULL NULL NULL +# BinaryView column to LargeBinary scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_binaryview = arrow_cast('Andrew', 'LargeBinary'), + arrow_cast('Andrew', 'LargeBinary') = column1_binaryview, + column1_binaryview <> arrow_cast('Andrew', 'LargeBinary'), + arrow_cast('Andrew', 'LargeBinary') <> column1_binaryview +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + # Binary column to BinaryView scalar query TTBBBB select @@ -150,5 +182,21 @@ Xiangpeng Xiangpeng false false true true Raphael R false false true true NULL R NULL NULL NULL NULL + +# LargeBinary column to BinaryView scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_large_binary = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'), + arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = column1_large_binary, + column1_large_binary <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'), + arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') <> column1_large_binary +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + statement ok drop table test; \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index f8824b23d1b9..7c9fbf4735fb 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -33,6 +33,8 @@ create table test as SELECT arrow_cast(column1, 'Utf8') as column1_utf8, arrow_cast(column2, 'Utf8') as column2_utf8, + arrow_cast(column1, 'LargeUtf8') as column1_large_utf8, + arrow_cast(column2, 'LargeUtf8') as column2_large_utf8, arrow_cast(column1, 'Utf8View') as column1_utf8view, arrow_cast(column2, 'Utf8View') as column2_utf8view, arrow_cast(column1, 'Dictionary(Int32, Utf8)') as column1_dict, @@ -118,6 +120,22 @@ Xiangpeng Xiangpeng true true false false Raphael R false false true true NULL R NULL NULL NULL NULL +# test StringViewArray with LargeUtf8 columns +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8view = column2_large_utf8, + column2_large_utf8 = column1_utf8view, + column1_utf8view <> column2_large_utf8, + column2_large_utf8 <> column1_utf8view +from test; +---- +Andrew X false false true true +Xiangpeng Xiangpeng true true false false +Raphael R false false true true +NULL R NULL NULL NULL NULL + + # StringView column to String scalar query TTBBBB select @@ -133,6 +151,21 @@ Xiangpeng Xiangpeng false false true true Raphael R false false true true NULL R NULL NULL NULL NULL +# StringView column to LargeString scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8view = arrow_cast('Andrew', 'LargeUtf8'), + arrow_cast('Andrew', 'LargeUtf8') = column1_utf8view, + column1_utf8view <> arrow_cast('Andrew', 'LargeUtf8'), + arrow_cast('Andrew', 'LargeUtf8') <> column1_utf8view +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + # String column to StringView scalar query TTBBBB select @@ -148,6 +181,20 @@ Xiangpeng Xiangpeng false false true true Raphael R false false true true NULL R NULL NULL NULL NULL +# LargeString column to StringView scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_large_utf8 = arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') = column1_large_utf8, + column1_large_utf8 <> arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') <> column1_large_utf8 +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL ######## ## StringView to Dictionary