From 815a85ee5605376bc28522e9c3fe7051792129d9 Mon Sep 17 00:00:00 2001 From: kikkon Date: Mon, 13 Jan 2025 00:31:54 +0800 Subject: [PATCH 1/2] feat: add ListView equal --- arrow-data/src/equal/list_view.rs | 73 ++++++++++++++++ arrow-data/src/equal/mod.rs | 7 +- arrow/tests/array_equal.rs | 139 +++++++++++++++++++++++++++++- 3 files changed, 213 insertions(+), 6 deletions(-) create mode 100644 arrow-data/src/equal/list_view.rs diff --git a/arrow-data/src/equal/list_view.rs b/arrow-data/src/equal/list_view.rs new file mode 100644 index 000000000000..fe98762953a1 --- /dev/null +++ b/arrow-data/src/equal/list_view.rs @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::ArrayData; +use arrow_buffer::ArrowNativeType; +use num::Integer; + +use super::equal_range; + +pub(super) fn list_view_equal( + lhs: &ArrayData, + rhs: &ArrayData, + lhs_start: usize, + rhs_start: usize, + len: usize, +) -> bool { + let lhs_offsets = lhs.buffer::(0); + let rhs_offsets = rhs.buffer::(0); + let lhs_sizes = lhs.buffer::(1); + let rhs_sizes = rhs.buffer::(1); + let lhs_nulls = lhs.nulls(); + let rhs_nulls = rhs.nulls(); + for i in 0..len { + let lhs_pos = lhs_start + i; + let rhs_pos = rhs_start + i; + + // get offset and size + let lhs_offset_start = lhs_offsets[lhs_pos].to_usize().unwrap(); + let rhs_offset_start = rhs_offsets[rhs_pos].to_usize().unwrap(); + let lhs_size = lhs_sizes[lhs_pos].to_usize().unwrap(); + let rhs_size = rhs_sizes[rhs_pos].to_usize().unwrap(); + + if lhs_size != rhs_size { + return false; + } + + // check if null + if let (Some(lhs_null), Some(rhs_null)) = (lhs_nulls, rhs_nulls) { + if lhs_null.is_null(lhs_pos) != rhs_null.is_null(rhs_pos) { + return false; + } + if lhs_null.is_null(lhs_pos) { + continue; + } + } + + // compare values + if !equal_range( + &lhs.child_data()[0], + &rhs.child_data()[0], + lhs_offset_start, + rhs_offset_start, + lhs_size, + ) { + return false; + } + } + true +} diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index f24179b61700..9aa06df857c0 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -30,6 +30,7 @@ mod dictionary; mod fixed_binary; mod fixed_list; mod list; +mod list_view; mod null; mod primitive; mod run; @@ -47,6 +48,7 @@ use dictionary::dictionary_equal; use fixed_binary::fixed_binary_equal; use fixed_list::fixed_list_equal; use list::list_equal; +use list_view::list_view_equal; use null::null_equal; use primitive::primitive_equal; use structure::struct_equal; @@ -102,9 +104,8 @@ fn equal_values( byte_view_equal(lhs, rhs, lhs_start, rhs_start, len) } DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - 
DataType::ListView(_) | DataType::LargeListView(_) => {
-            unimplemented!("ListView/LargeListView not yet implemented")
-        }
+        DataType::ListView(_) => list_view_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len),
+        DataType::LargeListView(_) => list_view_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len),
         DataType::LargeList(_) => list_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len),
         DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len),
         DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len),
diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs
index 94fb85030bf3..e56c9badd0db 100644
--- a/arrow/tests/array_equal.rs
+++ b/arrow/tests/array_equal.rs
@@ -18,12 +18,12 @@
 use arrow::array::{
     make_array, Array, ArrayRef, BooleanArray, Decimal128Array, FixedSizeBinaryArray,
     FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, GenericStringArray,
-    Int32Array, Int32Builder, Int64Builder, ListArray, ListBuilder, NullArray, OffsetSizeTrait,
-    StringArray, StringDictionaryBuilder, StructArray, UnionBuilder,
+    Int32Array, Int32Builder, Int64Builder, ListArray, ListBuilder, ListViewBuilder, NullArray,
+    OffsetSizeTrait, StringArray, StringDictionaryBuilder, StructArray, UnionBuilder,
 };
 use arrow::datatypes::{Int16Type, Int32Type};
 use arrow_array::builder::{StringBuilder, StringViewBuilder, StructBuilder};
-use arrow_array::{DictionaryArray, FixedSizeListArray, StringViewArray};
+use arrow_array::{DictionaryArray, FixedSizeListArray, ListViewArray, StringViewArray};
 use arrow_buffer::{Buffer, ToByteSlice};
 use arrow_data::{ArrayData, ArrayDataBuilder};
 use arrow_schema::{DataType, Field, Fields};
@@ -1292,3 +1292,142 @@ fn test_list_excess_children_equal() {
     assert_eq!(b.value_offsets(), &[0, 0, 2]);
     assert_eq!(a, b);
 }
+
+/// Builds a `ListViewArray` of `Int32` values with one slot per item of `data`: `Some(v)`
+/// appends the values of `v` as a valid list and `None` appends a null slot.
+fn create_list_view_array<U: AsRef<[i32]>, T: AsRef<[Option<U>]>>(data: T) -> ListViewArray {
+    let mut builder = ListViewBuilder::new(Int32Builder::with_capacity(10));
+    for d in data.as_ref() {
+        if let Some(v) = d {
+            builder.values().append_slice(v.as_ref());
+            builder.append(true);
+        } else {
+            builder.append(false);
+        }
+    }
+    builder.finish()
+}
+
+// Test equal and unequal values when no slot is null
+#[test]
+fn test_list_view_equal() {
+    let a = create_list_view_array([Some(&[1, 2, 3]), Some(&[4, 5, 6])]);
+    let b = create_list_view_array([Some(&[1, 2, 3]), Some(&[4, 5, 6])]);
+    test_equal(&a, &b, true);
+
+    let b = create_list_view_array([Some(&[1, 2, 3]), Some(&[4, 5, 7])]);
+    test_equal(&a, &b, false);
+}
+
+// Test zero-length arrays built from empty offset and size buffers
+#[test]
+fn test_empty_offsets_list_view_equal() {
+    let empty: Vec<i32> = vec![];
+    let values = Int32Array::from(empty);
+    let empty_offsets: [u8; 0] = [];
+    let empty_sizes: [u8; 0] = [];
+    let a: ListViewArray = ArrayDataBuilder::new(DataType::ListView(Arc::new(
+        Field::new_list_field(DataType::Int32, true),
+    )))
+    .len(0)
+    .add_buffer(Buffer::from(&empty_offsets))
+    .add_buffer(Buffer::from(&empty_sizes))
+    .add_child_data(values.to_data())
+    .null_bit_buffer(Some(Buffer::from(&empty_offsets)))
+    .build()
+    .unwrap()
+    .into();
+
+    let b: ListViewArray = ArrayDataBuilder::new(DataType::ListView(Arc::new(
+        Field::new_list_field(DataType::Int32, true),
+    )))
+    .len(0)
+    .add_buffer(Buffer::from(&empty_offsets))
+    .add_buffer(Buffer::from(&empty_sizes))
+    .add_child_data(values.to_data())
+    .null_bit_buffer(Some(Buffer::from(&empty_offsets)))
+    .build()
+    .unwrap()
+    .into();
+
+    test_equal(&a, &b, true);
+}
+
+// Test the case where null_count > 0
+#[test]
+fn test_list_view_null() {
+    let a = create_list_view_array([Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]);
+    let b = create_list_view_array([Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]);
+    test_equal(&a, &b, true);
+
+    let b = create_list_view_array([
+        Some(&[1, 2]),
+        None,
+        Some(&[5, 6]),
+        Some(&[3, 4]),
+        None,
+        None,
+    ]);
+    test_equal(&a, &b, false);
+
+    let b = create_list_view_array([Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]);
+    test_equal(&a, &b, false);
+
+    // a list where the nullness of values is determined by the list's bitmap
+    // (validity 0b0001001 marks only slots 0 and 3 as valid)
+    let c_values = Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]);
+    let c: ListViewArray = ArrayDataBuilder::new(DataType::ListView(Arc::new(
+        Field::new_list_field(DataType::Int32, true),
+    )))
+    .len(8)
+    .add_buffer(Buffer::from([0i32, 2, 3, 4, 4, 1, 4, 4].to_byte_slice()))
+    .add_buffer(Buffer::from([3i32, 2, 1, 2, 1, 1, 1, 1].to_byte_slice()))
+    .add_child_data(c_values.into_data())
+    .null_bit_buffer(Some(Buffer::from([0b0001001])))
+    .build()
+    .unwrap()
+    .into();
+
+    // child nulls sit at indices 3, 6 and 7, which no valid slot references
+    let d_values = Int32Array::from(vec![
+        Some(1),
+        Some(2),
+        Some(-1),
+        None,
+        Some(3),
+        Some(4),
+        None,
+        None,
+    ]);
+    let d: ListViewArray = ArrayDataBuilder::new(DataType::ListView(Arc::new(
+        Field::new_list_field(DataType::Int32, true),
+    )))
+    .len(8)
+    .add_buffer(Buffer::from([0i32, 2, 3, 4, 4, 1, 4, 4].to_byte_slice()))
+    .add_buffer(Buffer::from([3i32, 2, 1, 2, 1, 1, 1, 1].to_byte_slice()))
+    .add_child_data(d_values.into_data())
+    .null_bit_buffer(Some(Buffer::from([0b0001001])))
+    .build()
+    .unwrap()
+    .into();
+    test_equal(&c, &d, true);
+}
+
+// Test the case where offset != 0
+#[test]
+fn test_list_view_offsets() {
+    let a = create_list_view_array([Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]);
+    let b = create_list_view_array([Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]);
+
+    let a_slice = a.slice(0, 3);
+    let b_slice = b.slice(0, 3);
+    test_equal(&a_slice, &b_slice, true);
+
+    let a_slice = a.slice(0, 5);
+    let b_slice = b.slice(0, 5);
+    test_equal(&a_slice, &b_slice, false);
+
+    let a_slice = a.slice(4, 1);
+    let b_slice = b.slice(4, 1);
+    test_equal(&a_slice, &b_slice, true);
+}

From 25cb0eaa791bd391aeaaa9c7dfb987d031f14f1d Mon Sep 17 00:00:00 2001
From: kikkon
Date: Mon, 13 Jan 2025 22:55:28 +0800
Subject: [PATCH 2/2] fix

---
 arrow-data/src/data.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index a35b5e8629e9..0647c6a2eaf5 100644
---
a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -944,7 +944,7 @@ impl ArrayData { ) -> Result<(), ArrowError> { let offsets: &[T] = self.typed_buffer(0, self.len)?; let sizes: &[T] = self.typed_buffer(1, self.len)?; - for i in 0..values_length { + for i in 0..sizes.len() { let size = sizes[i].to_usize().ok_or_else(|| { ArrowError::InvalidArgumentError(format!( "Error converting size[{}] ({}) to usize for {}",