diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index f6103cb84136..9f552ec72502 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -225,10 +225,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Timestamp(Millisecond, _) | Timestamp(Microsecond, _) | Timestamp(Nanosecond, _) - | Interval(_), + | Interval(_) + | BinaryView, ) => true, (Utf8 | LargeUtf8, Utf8View) => true, - (BinaryView, Binary | LargeBinary) => true, + (BinaryView, Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View ) => true, (Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), @@ -1229,6 +1230,9 @@ pub fn cast_with_options( cast_byte_container::(&binary) } Utf8View => Ok(Arc::new(StringViewArray::from(array.as_string::()))), + BinaryView => Ok(Arc::new( + StringViewArray::from(array.as_string::()).to_binary_view(), + )), LargeUtf8 => cast_byte_container::(array), Time32(TimeUnit::Second) => parse_string::(array, cast_options), Time32(TimeUnit::Millisecond) => { @@ -1282,6 +1286,7 @@ pub fn cast_with_options( Date64 => parse_string_view::(array, cast_options), Binary => cast_view_to_byte::>(array), LargeBinary => cast_view_to_byte::>(array), + BinaryView => Ok(Arc::new(array.as_string_view().clone().to_binary_view())), Utf8 => cast_view_to_byte::>(array), LargeUtf8 => cast_view_to_byte::>(array), Time32(TimeUnit::Second) => parse_string_view::(array, cast_options), @@ -1339,6 +1344,13 @@ pub fn cast_with_options( array.as_string::().clone(), ))), Utf8View => Ok(Arc::new(StringViewArray::from(array.as_string::()))), + BinaryView => Ok(Arc::new(BinaryViewArray::from( + array + .as_string::() + .into_iter() + .map(|x| x.map(|x| x.as_bytes())) + .collect::>(), + ))), Time32(TimeUnit::Second) => parse_string::(array, cast_options), Time32(TimeUnit::Millisecond) => { parse_string::(array, cast_options) @@ -1417,6 +1429,20 @@ pub fn cast_with_options( (BinaryView, LargeBinary) => { cast_view_to_byte::>(array) } + (BinaryView, Utf8) => { + let binary_arr = cast_view_to_byte::>(array)?; + cast_binary_to_string::(&binary_arr, cast_options) + } + (BinaryView, LargeUtf8) => { + let binary_arr = cast_view_to_byte::>(array)?; + cast_binary_to_string::(&binary_arr, cast_options) + } + (BinaryView, Utf8View) => { + Ok(Arc::new(array.as_binary_view().clone().to_string_view()?) as ArrayRef) + } + (BinaryView, _) => Err(ArrowError::CastError(format!( + "Casting from {from_type:?} to {to_type:?} not supported", + ))), (from_type, LargeUtf8) if from_type.is_primitive() => { value_to_string::(array, cast_options) } @@ -2008,7 +2034,6 @@ pub fn cast_with_options( })?, )) } - (Date64, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new( array .as_primitive::() @@ -5256,12 +5281,6 @@ mod tests { } } - #[test] - fn test_string_to_view() { - _test_string_to_view::(); - _test_string_to_view::(); - } - const VIEW_TEST_DATA: [Option<&str>; 5] = [ Some("hello"), Some("repeated"), @@ -5270,6 +5289,44 @@ mod tests { Some("repeated"), ]; + #[test] + fn test_string_view_to_binary_view() { + let string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA); + + assert!(can_cast_types( + string_view_array.data_type(), + &DataType::BinaryView + )); + + let binary_view_array = cast(&string_view_array, &DataType::BinaryView).unwrap(); + assert_eq!(binary_view_array.data_type(), &DataType::BinaryView); + + let expect_binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA); + assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array); + } + + #[test] + fn test_binary_view_to_string_view() { + let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA); + + assert!(can_cast_types( + binary_view_array.data_type(), + &DataType::Utf8View + )); + + let string_view_array = cast(&binary_view_array, &DataType::Utf8View).unwrap(); + assert_eq!(string_view_array.data_type(), &DataType::Utf8View); + + let expect_string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA); + assert_eq!(string_view_array.as_ref(), &expect_string_view_array); + } + + #[test] + fn test_string_to_view() { + _test_string_to_view::(); + _test_string_to_view::(); + } + fn _test_string_to_view() where O: OffsetSizeTrait, @@ -5281,11 +5338,22 @@ mod tests { &DataType::Utf8View )); + assert!(can_cast_types( + string_array.data_type(), + &DataType::BinaryView + )); + let string_view_array = cast(&string_array, &DataType::Utf8View).unwrap(); assert_eq!(string_view_array.data_type(), &DataType::Utf8View); + let binary_view_array = cast(&string_array, &DataType::BinaryView).unwrap(); + assert_eq!(binary_view_array.data_type(), &DataType::BinaryView); + let expect_string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA); assert_eq!(string_view_array.as_ref(), &expect_string_view_array); + + let expect_binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA); + assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array); } #[test] @@ -5380,7 +5448,7 @@ mod tests { where O: OffsetSizeTrait, { - let view_array = { + let string_view_array = { let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // multiple buffers. for s in VIEW_TEST_DATA.iter() { builder.append_option(*s); @@ -5388,15 +5456,21 @@ mod tests { builder.finish() }; + let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA); + let expected_string_array = GenericStringArray::::from_iter(VIEW_TEST_DATA); let expected_type = expected_string_array.data_type(); - assert!(can_cast_types(view_array.data_type(), expected_type)); + assert!(can_cast_types(string_view_array.data_type(), expected_type)); + assert!(can_cast_types(binary_view_array.data_type(), expected_type)); - let string_array = cast(&view_array, expected_type).unwrap(); - assert_eq!(string_array.data_type(), expected_type); + let string_view_casted_array = cast(&string_view_array, expected_type).unwrap(); + assert_eq!(string_view_casted_array.data_type(), expected_type); + assert_eq!(string_view_casted_array.as_ref(), &expected_string_array); - assert_eq!(string_array.as_ref(), &expected_string_array); + let binary_view_casted_array = cast(&binary_view_array, expected_type).unwrap(); + assert_eq!(binary_view_casted_array.data_type(), expected_type); + assert_eq!(binary_view_casted_array.as_ref(), &expected_string_array); } #[test] diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index 8803e8eea878..ec7990d3d764 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -114,6 +114,18 @@ fn build_decimal256_array(size: usize, precision: u8, scale: i8) -> ArrayRef { ) } +fn build_string_array(size: usize) -> ArrayRef { + let mut builder = StringBuilder::new(); + for v in 0..size { + match v % 3 { + 0 => builder.append_value("small"), + 1 => builder.append_value("larger string more than 12 bytes"), + _ => builder.append_null(), + } + } + Arc::new(builder.finish()) +} + fn build_dict_array(size: usize) -> ArrayRef { let values = StringArray::from_iter([ Some("small"), @@ -148,9 +160,12 @@ fn add_benchmark(c: &mut Criterion) { let decimal128_array = build_decimal128_array(512, 10, 3); let decimal256_array = build_decimal256_array(512, 50, 3); + let string_array = build_string_array(512); + let wide_string_array = cast(&string_array, &DataType::LargeUtf8).unwrap(); let dict_array = build_dict_array(10_000); let string_view_array = cast(&dict_array, &DataType::Utf8View).unwrap(); + let binary_view_array = cast(&string_view_array, &DataType::BinaryView).unwrap(); c.bench_function("cast int32 to int32 512", |b| { b.iter(|| cast_array(&i32_array, DataType::Int32)) @@ -262,6 +277,30 @@ fn add_benchmark(c: &mut Criterion) { ) }) }); + c.bench_function("cast string view to string", |b| { + b.iter(|| cast_array(&string_view_array, DataType::Utf8)) + }); + c.bench_function("cast string view to wide string", |b| { + b.iter(|| cast_array(&string_view_array, DataType::LargeUtf8)) + }); + c.bench_function("cast binary view to string", |b| { + b.iter(|| cast_array(&binary_view_array, DataType::Utf8)) + }); + c.bench_function("cast binary view to wide string", |b| { + b.iter(|| cast_array(&binary_view_array, DataType::LargeUtf8)) + }); + c.bench_function("cast string to binary view 512", |b| { + b.iter(|| cast_array(&string_array, DataType::BinaryView)) + }); + c.bench_function("cast wide string to binary view 512", |b| { + b.iter(|| cast_array(&wide_string_array, DataType::BinaryView)) + }); + c.bench_function("cast string view to binary view", |b| { + b.iter(|| cast_array(&string_view_array, DataType::BinaryView)) + }); + c.bench_function("cast binary view to string view", |b| { + b.iter(|| cast_array(&binary_view_array, DataType::Utf8View)) + }); } criterion_group!(benches, add_benchmark);