diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 0a18062d9ae1..9ac8102dec8e 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -94,7 +94,7 @@ pub struct GenericByteArray { impl Clone for GenericByteArray { fn clone(&self) -> Self { Self { - data_type: self.data_type.clone(), + data_type: T::DATA_TYPE, value_offsets: self.value_offsets.clone(), value_data: self.value_data.clone(), nulls: self.nulls.clone(), diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs new file mode 100644 index 000000000000..52f5ee05840e --- /dev/null +++ b/arrow-array/src/array/byte_view_array.rs @@ -0,0 +1,346 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::array::print_long_array; +use crate::builder::GenericByteViewBuilder; +use crate::iterator::ArrayIter; +use crate::types::bytes::ByteArrayNativeType; +use crate::types::{BinaryViewType, ByteViewType, StringViewType}; +use crate::{Array, ArrayAccessor, ArrayRef}; +use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer}; +use arrow_data::view::View; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType}; +use std::any::Any; +use std::marker::PhantomData; +use std::sync::Arc; + +/// An array of variable length byte view arrays +pub struct GenericByteViewArray { + data_type: DataType, + views: ScalarBuffer, + buffers: Vec, + nulls: Option, + phantom: PhantomData, +} + +impl Clone for GenericByteViewArray { + fn clone(&self) -> Self { + Self { + data_type: T::DATA_TYPE, + views: self.views.clone(), + buffers: self.buffers.clone(), + nulls: self.nulls.clone(), + phantom: Default::default(), + } + } +} + +impl GenericByteViewArray { + /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure + /// + /// # Panics + /// + /// Panics if [`GenericByteViewArray::try_new`] returns an error + pub fn new( + views: ScalarBuffer, + buffers: Vec, + nulls: Option, + ) -> Self { + Self::try_new(views, buffers, nulls).unwrap() + } + + /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure + /// + /// # Errors + /// + /// * `views.len() != nulls.len()` + /// * [ByteViewType::validate] fails + pub fn try_new( + views: ScalarBuffer, + buffers: Vec, + nulls: Option, + ) -> Result { + // Verify data is valid + T::validate(&views, &buffers)?; + + if let Some(n) = nulls.as_ref() { + if n.len() != views.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for {}ViewArray, expected {} got {}", + T::PREFIX, + views.len(), + n.len(), + ))); + } + } + + Ok(Self { + data_type: T::DATA_TYPE, + phantom: Default::default(), + views, + buffers, + nulls, + }) + } + + /// Create a new [`GenericByteViewArray`] from the provided parts, without validation + /// + /// # Safety + /// + /// Safe if [`Self::try_new`] would not error + pub unsafe fn new_unchecked( + views: ScalarBuffer, + buffers: Vec, + nulls: Option, + ) -> Self { + Self { + data_type: T::DATA_TYPE, + phantom: Default::default(), + views, + buffers, + nulls, + } + } + + /// Create a new [`GenericByteViewArray`] of length `len` where all values are null + pub fn new_null(len: usize) -> Self { + Self { + data_type: T::DATA_TYPE, + views: vec![0; len].into(), + buffers: vec![], + nulls: Some(NullBuffer::new_null(len)), + phantom: Default::default(), + } + } + + /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls + pub fn from_iter_values(iter: I) -> Self + where + Ptr: AsRef, + I: IntoIterator, + { + let iter = iter.into_iter(); + let mut builder = GenericByteViewBuilder::::with_capacity(iter.size_hint().0); + for v in iter { + builder.append_value(v); + } + builder.finish() + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts(self) -> (ScalarBuffer, Vec, Option) { + (self.views, self.buffers, self.nulls) + } + + /// Returns the views buffer + #[inline] + pub fn views(&self) -> &ScalarBuffer { + &self.views + } + + /// Returns the buffers storing string data + #[inline] + pub fn data_buffers(&self) -> &[Buffer] { + &self.buffers + } + + /// Returns the element at index `i` + /// # Panics + /// Panics if index `i` is out of bounds. + pub fn value(&self, i: usize) -> &T::Native { + assert!( + i < self.len(), + "Trying to access an element at index {} from a {}ViewArray of length {}", + i, + T::PREFIX, + self.len() + ); + + assert!(i < self.views.len()); + unsafe { self.value_unchecked(i) } + } + + /// Returns the element at index `i` + /// # Safety + /// Caller is responsible for ensuring that the index is within the bounds of the array + pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native { + let v = self.views.get_unchecked(idx); + let len = *v as u32; + let b = if len <= 12 { + let ptr = self.views.as_ptr() as *const u8; + std::slice::from_raw_parts(ptr.add(idx * 16 + 4), len as usize) + } else { + let view = View::from(*v); + let data = self.buffers.get_unchecked(view.buffer_index as usize); + let offset = view.offset as usize; + data.get_unchecked(offset..offset + len as usize) + }; + T::Native::from_bytes_unchecked(b) + } + + /// constructs a new iterator + pub fn iter(&self) -> ArrayIter<&Self> { + ArrayIter::new(self) + } + + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + Self { + data_type: T::DATA_TYPE, + views: self.views.slice(offset, length), + buffers: self.buffers.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + phantom: Default::default(), + } + } +} + +impl std::fmt::Debug for GenericByteViewArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}ViewArray\n[\n", T::PREFIX)?; + print_long_array(self, f, |array, index, f| { + std::fmt::Debug::fmt(&array.value(index), f) + })?; + write!(f, "]") + } +} + +impl Array for GenericByteViewArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn to_data(&self) -> ArrayData { + self.clone().into() + } + + fn into_data(self) -> ArrayData { + self.into() + } + + fn data_type(&self) -> &DataType { + &self.data_type + } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + Arc::new(self.slice(offset, length)) + } + + fn len(&self) -> usize { + self.views.len() + } + + fn is_empty(&self) -> bool { + self.views.is_empty() + } + + fn offset(&self) -> usize { + 0 + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::(); + sum += self.views.inner().capacity(); + if let Some(x) = &self.nulls { + sum += x.buffer().capacity() + } + sum + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + self.get_buffer_memory_size() + } +} + +impl<'a, T: ByteViewType> ArrayAccessor for &'a GenericByteViewArray { + type Item = &'a T::Native; + + fn value(&self, index: usize) -> Self::Item { + GenericByteViewArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + GenericByteViewArray::value_unchecked(self, index) + } +} + +impl<'a, T: ByteViewType> IntoIterator for &'a GenericByteViewArray { + type Item = Option<&'a T::Native>; + type IntoIter = ArrayIter; + + fn into_iter(self) -> Self::IntoIter { + ArrayIter::new(self) + } +} + +impl From for GenericByteViewArray { + fn from(value: ArrayData) -> Self { + let views = value.buffers()[0].clone(); + let views = ScalarBuffer::new(views, value.offset(), value.len()); + let buffers = value.buffers()[1..].to_vec(); + Self { + data_type: T::DATA_TYPE, + views, + buffers, + nulls: value.nulls().cloned(), + phantom: Default::default(), + } + } +} + +impl From> for ArrayData { + fn from(mut array: GenericByteViewArray) -> Self { + let len = array.len(); + array.buffers.insert(0, array.views.into_inner()); + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .buffers(array.buffers) + .nulls(array.nulls); + + unsafe { builder.build_unchecked() } + } +} + +impl FromIterator> for GenericByteViewArray +where + Ptr: AsRef, +{ + fn from_iter>>(iter: I) -> Self { + let iter = iter.into_iter(); + let mut builder = GenericByteViewBuilder::::with_capacity(iter.size_hint().0); + builder.extend(iter); + builder.finish() + } +} + +/// A [`GenericByteViewArray`] of `str` +/// +/// ``` +/// # use arrow_array::StringViewArray; +/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "foo", "large payload over 12 bytes"]); +/// assert_eq!(array.value(0), "hello"); +/// assert_eq!(array.value(3), "large payload over 12 bytes"); +/// ``` +pub type StringViewArray = GenericByteViewArray; + +/// A [`GenericByteViewArray`] of `[u8]` +pub type BinaryViewArray = GenericByteViewArray; diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 9312770644a3..428e09bbdcaf 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -34,6 +34,9 @@ pub use boolean_array::*; mod byte_array; pub use byte_array::*; +mod byte_view_array; +pub use byte_view_array::*; + mod dictionary_array; pub use dictionary_array::*; diff --git a/arrow-array/src/builder/byte_view_builder.rs b/arrow-array/src/builder/byte_view_builder.rs new file mode 100644 index 000000000000..1119d55e321a --- /dev/null +++ b/arrow-array/src/builder/byte_view_builder.rs @@ -0,0 +1,212 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::builder::{ArrayBuilder, BufferBuilder}; +use crate::types::{BinaryViewType, ByteViewType, StringViewType}; +use crate::{ArrayRef, GenericByteViewArray}; +use arrow_buffer::{Buffer, NullBufferBuilder, ScalarBuffer}; +use arrow_data::view::View; +use std::any::Any; +use std::marker::PhantomData; +use std::sync::Arc; + +const DEFAULT_BLOCK_SIZE: u32 = 8 * 1024; + +/// A builder for [`GenericByteViewArray`] +/// +/// See [`Self::append_value`] for the allocation strategy +pub struct GenericByteViewBuilder { + views_builder: BufferBuilder, + null_buffer_builder: NullBufferBuilder, + completed: Vec, + in_progress: Vec, + block_size: u32, + phantom: PhantomData, +} + +impl GenericByteViewBuilder { + /// Creates a new [`GenericByteViewBuilder`]. + pub fn new() -> Self { + Self::with_capacity(1024) + } + + /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` strings + pub fn with_capacity(capacity: usize) -> Self { + Self { + views_builder: BufferBuilder::new(capacity), + null_buffer_builder: NullBufferBuilder::new(capacity), + completed: vec![], + in_progress: vec![], + block_size: DEFAULT_BLOCK_SIZE, + phantom: Default::default(), + } + } + + /// Override the minimum size of buffers to allocate for string data + pub fn with_block_size(self, block_size: u32) -> Self { + Self { block_size, ..self } + } + + /// Appends a value into the builder + /// + /// # Panics + /// + /// Panics if + /// - String buffer count exceeds `u32::MAX` + /// - String length exceeds `u32::MAX` + #[inline] + pub fn append_value(&mut self, value: impl AsRef) { + let v: &[u8] = value.as_ref().as_ref(); + let length: u32 = v.len().try_into().unwrap(); + if length <= 12 { + let mut offset = [0; 16]; + offset[0..4].copy_from_slice(&length.to_le_bytes()); + offset[4..4 + v.len()].copy_from_slice(v); + self.views_builder.append(u128::from_le_bytes(offset)); + self.null_buffer_builder.append_non_null(); + return; + } + + let required_cap = self.in_progress.len() + v.len(); + if self.in_progress.capacity() < required_cap { + let in_progress = Vec::with_capacity(v.len().max(self.block_size as usize)); + let flushed = std::mem::replace(&mut self.in_progress, in_progress); + if !flushed.is_empty() { + assert!(self.completed.len() < u32::MAX as usize); + self.completed.push(flushed.into()); + } + }; + let offset = self.in_progress.len() as u32; + self.in_progress.extend_from_slice(v); + + let view = View { + length, + prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()), + buffer_index: self.completed.len() as u32, + offset, + }; + self.views_builder.append(view.into()); + self.null_buffer_builder.append_non_null(); + } + + /// Append an `Option` value into the builder + #[inline] + pub fn append_option(&mut self, value: Option>) { + match value { + None => self.append_null(), + Some(v) => self.append_value(v), + }; + } + + /// Append a null value into the builder + #[inline] + pub fn append_null(&mut self) { + self.null_buffer_builder.append_null(); + self.views_builder.append(0); + } + + /// Builds the [`GenericByteViewArray`] and reset this builder + pub fn finish(&mut self) -> GenericByteViewArray { + let mut completed = std::mem::take(&mut self.completed); + if !self.in_progress.is_empty() { + completed.push(std::mem::take(&mut self.in_progress).into()); + } + let len = self.views_builder.len(); + let views = ScalarBuffer::new(self.views_builder.finish(), 0, len); + let nulls = self.null_buffer_builder.finish(); + // SAFETY: valid by construction + unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } + } + + /// Builds the [`GenericByteViewArray`] without resetting the builder + pub fn finish_cloned(&self) -> GenericByteViewArray { + let mut completed = self.completed.clone(); + if !self.in_progress.is_empty() { + completed.push(Buffer::from_slice_ref(&self.in_progress)); + } + let len = self.views_builder.len(); + let views = Buffer::from_slice_ref(self.views_builder.as_slice()); + let views = ScalarBuffer::new(views, 0, len); + let nulls = self.null_buffer_builder.finish_cloned(); + // SAFETY: valid by construction + unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } + } +} + +impl Default for GenericByteViewBuilder { + fn default() -> Self { + Self::new() + } +} + +impl std::fmt::Debug for GenericByteViewBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}ViewBuilder", T::PREFIX)?; + f.debug_struct("") + .field("views_builder", &self.views_builder) + .field("in_progress", &self.in_progress) + .field("completed", &self.completed) + .field("null_buffer_builder", &self.null_buffer_builder) + .finish() + } +} + +impl ArrayBuilder for GenericByteViewBuilder { + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn into_box_any(self: Box) -> Box { + self + } +} + +impl> Extend> + for GenericByteViewBuilder +{ + #[inline] + fn extend>>(&mut self, iter: I) { + for v in iter { + self.append_option(v) + } + } +} + +/// Array builder for [`StringViewArray`][crate::StringViewArray] +/// +/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with +/// [`GenericByteBuilder::append_null`] as normal. +pub type StringViewBuilder = GenericByteViewBuilder; + +/// Array builder for [`BinaryViewArray`][crate::BinaryViewArray] +pub type BinaryViewBuilder = GenericByteViewBuilder; diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 38a7500dd55f..48b3c551b57d 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -154,6 +154,8 @@ mod boolean_builder; pub use boolean_builder::*; mod buffer_builder; pub use buffer_builder::*; +mod byte_view_builder; +pub use byte_view_builder::*; mod fixed_size_binary_builder; pub use fixed_size_binary_builder::*; mod fixed_size_list_builder; diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 769dbf974b93..4f1957e118c3 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -26,6 +26,7 @@ use crate::timezone::Tz; use crate::{ArrowNativeTypeOp, OffsetSizeTrait}; use arrow_buffer::{i256, Buffer, OffsetBuffer}; use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision}; +use arrow_data::view::{validate_binary_view, validate_string_view}; use arrow_schema::{ ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, @@ -1378,6 +1379,10 @@ pub(crate) mod bytes { std::str::from_utf8_unchecked(b) } } + + pub trait ByteViewTypeSealed {} + impl ByteViewTypeSealed for BinaryViewType {} + impl ByteViewTypeSealed for StringViewType {} } /// A trait over the variable-size byte array types @@ -1490,6 +1495,49 @@ pub type BinaryType = GenericBinaryType; /// An arrow binary array with i64 offsets pub type LargeBinaryType = GenericBinaryType; +/// A trait over the variable-size byte view array types +pub trait ByteViewType: 'static + Send + Sync + bytes::ByteViewTypeSealed { + /// Type for representing its equivalent rust type i.e + /// Utf8Array will have native type has &str + /// BinaryArray will have type as [u8] + type Native: bytes::ByteArrayNativeType + AsRef + AsRef<[u8]> + ?Sized; + + /// Datatype of array elements + const DATA_TYPE: DataType; + + /// "Binary" or "String", for use in error messages + const PREFIX: &'static str; + + /// Verifies that the provided buffers are valid for this array type + fn validate(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError>; +} + +/// [`ByteViewType`] for string arrays +pub struct StringViewType {} + +impl ByteViewType for StringViewType { + type Native = str; + const DATA_TYPE: DataType = DataType::Utf8View; + const PREFIX: &'static str = "String"; + + fn validate(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> { + validate_string_view(views, buffers) + } +} + +/// [`ByteViewType`] for binary arrays +pub struct BinaryViewType {} + +impl ByteViewType for BinaryViewType { + type Native = [u8]; + const DATA_TYPE: DataType = DataType::BinaryView; + const PREFIX: &'static str = "Binary"; + + fn validate(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> { + validate_binary_view(views, buffers) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs index 8fe6cf2b7894..6a6c330007b2 100644 --- a/arrow-buffer/src/native.rs +++ b/arrow-buffer/src/native.rs @@ -149,6 +149,7 @@ native_integer!(u8); native_integer!(u16); native_integer!(u32); native_integer!(u64); +native_integer!(u128); macro_rules! native_float { ($t:ty, $s:ident, $as_usize: expr, $i:ident, $usize_as: expr) => { diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data.rs similarity index 97% rename from arrow-data/src/data/mod.rs rename to arrow-data/src/data.rs index 32aae1e92a51..3b268529b64e 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data.rs @@ -28,9 +28,10 @@ use std::ops::Range; use std::sync::Arc; use crate::equal; +use crate::view::{validate_binary_view, validate_string_view}; -mod buffers; -pub use buffers::*; +/// A collection of [`Buffer`] +pub type Buffers<'a> = &'a [Buffer]; #[inline] pub(crate) fn contains_nulls( @@ -109,6 +110,10 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff buffer.push(0i64); [buffer, MutableBuffer::new(capacity * mem::size_of::())] } + DataType::Utf8View | DataType::BinaryView => [ + MutableBuffer::new(capacity * mem::size_of::()), + empty_buffer, + ], DataType::List(_) | DataType::Map(_, _) => { // offset buffer always starts with a zero let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); @@ -148,30 +153,6 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff } } -/// Maps 2 [`MutableBuffer`]s into a vector of [Buffer]s whose size depends on `data_type`. -#[inline] -pub(crate) fn into_buffers( - data_type: &DataType, - buffer1: MutableBuffer, - buffer2: MutableBuffer, -) -> Vec { - match data_type { - DataType::Null | DataType::Struct(_) | DataType::FixedSizeList(_, _) => vec![], - DataType::Utf8 - | DataType::Binary - | DataType::LargeUtf8 - | DataType::LargeBinary => vec![buffer1.into(), buffer2.into()], - DataType::Union(_, mode) => { - match mode { - // Based on Union's DataTypeLayout - UnionMode::Sparse => vec![buffer1.into()], - UnionMode::Dense => vec![buffer1.into(), buffer2.into()], - } - } - _ => vec![buffer1.into()], - } -} - /// An generic representation of Arrow array data which encapsulates common attributes and /// operations for Arrow array. Specific operations for different arrays types (e.g., /// primitive, list, struct) are implemented in `Array`. @@ -345,10 +326,9 @@ impl ArrayData { &self.data_type } - /// Returns the [`Buffers`] storing data for this [`ArrayData`] - pub fn buffers(&self) -> Buffers<'_> { - // In future ArrayData won't store data contiguously as `Vec` (#1799) - Buffers::from_slice(&self.buffers) + /// Returns the [`Buffer`] storing data for this [`ArrayData`] + pub fn buffers(&self) -> &[Buffer] { + &self.buffers } /// Returns a slice of children [`ArrayData`]. This will be non @@ -719,7 +699,7 @@ impl ArrayData { ))); } - if self.buffers.len() != layout.buffers.len() { + if self.buffers.len() < layout.buffers.len() { return Err(ArrowError::InvalidArgumentError(format!( "Expected {} buffers in array of type {:?}, got {}", layout.buffers.len(), @@ -1213,6 +1193,14 @@ impl ArrayData { DataType::LargeBinary => { self.validate_offsets_full::(self.buffers[1].len()) } + DataType::BinaryView => { + let views = self.typed_buffer::(0, self.len)?; + validate_binary_view(views, &self.buffers[1..]) + } + DataType::Utf8View => { + let views = self.typed_buffer::(0, self.len)?; + validate_string_view(views, &self.buffers[1..]) + } DataType::List(_) | DataType::Map(_, _) => { let child = &self.child_data[0]; self.validate_offsets_full::(child.len) @@ -1486,6 +1474,9 @@ impl ArrayData { /// Return the expected [`DataTypeLayout`] Arrays of this data /// type are expected to have +/// +/// For types with a variadic number of buffers, such as [`DataType::Utf8View`], +/// this only returns the required buffers pub fn layout(data_type: &DataType) -> DataTypeLayout { // based on C/C++ implementation in // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc) @@ -1527,8 +1518,10 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataTypeLayout::new_fixed_width(bytes_per_value) } DataType::LargeBinary => DataTypeLayout::new_binary(size_of::()), + DataType::BinaryView => DataTypeLayout::new_fixed_width(size_of::()), DataType::Utf8 => DataTypeLayout::new_binary(size_of::()), DataType::LargeUtf8 => DataTypeLayout::new_binary(size_of::()), + DataType::Utf8View => DataTypeLayout::new_fixed_width(size_of::()), DataType::List(_) => DataTypeLayout::new_fixed_width(size_of::()), DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data DataType::LargeList(_) => DataTypeLayout::new_fixed_width(size_of::()), @@ -2036,21 +2029,4 @@ mod tests { assert!(!contains_nulls(Some(&buffer), 3, 2)); assert!(!contains_nulls(Some(&buffer), 0, 0)); } - - #[test] - fn test_into_buffers() { - let data_types = vec![ - DataType::Union(UnionFields::empty(), UnionMode::Dense), - DataType::Union(UnionFields::empty(), UnionMode::Sparse), - ]; - - for data_type in data_types { - let buffers = new_buffers(&data_type, 0); - let [buffer1, buffer2] = buffers; - let buffers = into_buffers(&data_type, buffer1, buffer2); - - let layout = layout(&data_type); - assert_eq!(buffers.len(), layout.buffers.len()); - } - } } diff --git a/arrow-data/src/data/buffers.rs b/arrow-data/src/data/buffers.rs deleted file mode 100644 index 883e92e36d82..000000000000 --- a/arrow-data/src/data/buffers.rs +++ /dev/null @@ -1,96 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow_buffer::Buffer; -use std::iter::Chain; -use std::ops::Index; - -/// A collection of [`Buffer`] -#[derive(Debug, Default, Copy, Clone, Eq, PartialEq)] -pub struct Buffers<'a>([Option<&'a Buffer>; 2]); - -impl<'a> Buffers<'a> { - /// Temporary will be removed once ArrayData does not store `Vec` directly (#3769) - pub(crate) fn from_slice(a: &'a [Buffer]) -> Self { - match a.len() { - 0 => Self([None, None]), - 1 => Self([Some(&a[0]), None]), - _ => Self([Some(&a[0]), Some(&a[1])]), - } - } - - /// Returns the number of [`Buffer`] in this collection - #[inline] - pub fn len(&self) -> usize { - self.0[0].is_some() as usize + self.0[1].is_some() as usize - } - - /// Returns `true` if this collection is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.0[0].is_none() && self.0[1].is_none() - } - - #[inline] - pub fn iter(&self) -> IntoIter<'a> { - self.into_iter() - } - - /// Converts this [`Buffers`] to a `Vec` - #[inline] - pub fn to_vec(&self) -> Vec { - self.iter().cloned().collect() - } -} - -impl<'a> Index for Buffers<'a> { - type Output = &'a Buffer; - - #[inline] - fn index(&self, index: usize) -> &Self::Output { - self.0[index].as_ref().unwrap() - } -} - -impl<'a> IntoIterator for Buffers<'a> { - type Item = &'a Buffer; - type IntoIter = IntoIter<'a>; - - #[inline] - fn into_iter(self) -> Self::IntoIter { - IntoIter(self.0[0].into_iter().chain(self.0[1].into_iter())) - } -} - -type OptionIter<'a> = std::option::IntoIter<&'a Buffer>; - -/// [`Iterator`] for [`Buffers`] -pub struct IntoIter<'a>(Chain, OptionIter<'a>>); - -impl<'a> Iterator for IntoIter<'a> { - type Item = &'a Buffer; - - #[inline] - fn next(&mut self) -> Option { - self.0.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.0.size_hint() - } -} diff --git a/arrow-data/src/equal/binary_view.rs b/arrow-data/src/equal/binary_view.rs new file mode 100644 index 000000000000..0908def473ed --- /dev/null +++ b/arrow-data/src/equal/binary_view.rs @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::view::View; +use crate::ArrayData; + +pub(super) fn binary_view_equal( + lhs: &ArrayData, + rhs: &ArrayData, + lhs_start: usize, + rhs_start: usize, + len: usize, +) -> bool { + let lhs_views = &lhs.buffer::(0)[lhs_start..lhs_start + len]; + let lhs_b = lhs.buffers(); + let rhs_views = &rhs.buffer::(0)[rhs_start..rhs_start + len]; + let rhs_b = rhs.buffers(); + + for (idx, (l, r)) in lhs_views.iter().zip(rhs_views).enumerate() { + // Only checking one null mask here because by the time the control flow reaches + // this point, the equality of the two masks would have already been verified. + if lhs.is_null(idx) { + continue; + } + + let l_len = *l as u32; + let r_len = *r as u32; + if l_len != r_len { + return false; + } else if l_len <= 12 { + // Inline storage + if l != r { + return false; + } + } else { + let l_view = View::from(*l); + let r_view = View::from(*r); + let l_b = &lhs_b[(l_view.buffer_index as usize) + 1]; + let r_b = &rhs_b[(r_view.buffer_index as usize) + 1]; + + let l_o = l_view.offset as usize; + let r_o = r_view.offset as usize; + let len = l_len as usize; + if l_b[l_o..l_o + len] != r_b[r_o..r_o + len] { + return false; + } + } + } + true +} diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index fbc868d3f5c4..8bf17a422822 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -24,6 +24,7 @@ use arrow_buffer::i256; use arrow_schema::{DataType, IntervalUnit}; use half::f16; +mod binary_view; mod boolean; mod dictionary; mod fixed_binary; @@ -40,6 +41,7 @@ mod variable_size; // these methods assume the same type, len and null count. // For this reason, they are not exposed and are instead used // to build the generic functions below (`equal_range` and `equal`). +use crate::equal::binary_view::binary_view_equal; use boolean::boolean_equal; use dictionary::dictionary_equal; use fixed_binary::fixed_binary_equal; @@ -74,6 +76,7 @@ fn equal_values( DataType::Int16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Int32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Int64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Float16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Decimal128(_, _) => { @@ -103,6 +106,9 @@ fn equal_values( DataType::LargeUtf8 | DataType::LargeBinary => { variable_sized_equal::(lhs, rhs, lhs_start, rhs_start, len) } + DataType::BinaryView | DataType::Utf8View => { + binary_view_equal(lhs, rhs, lhs_start, rhs_start, len) + } DataType::FixedSizeBinary(_) => { fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len) } @@ -138,7 +144,6 @@ fn equal_values( } _ => unreachable!(), }, - DataType::Float16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Map(_, _) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::RunEndEncoded(_, _) => run_equal(lhs, rhs, lhs_start, rhs_start, len), } diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs index cfa0dba66c35..901dcfe95a07 100644 --- a/arrow-data/src/lib.rs +++ b/arrow-data/src/lib.rs @@ -30,3 +30,5 @@ pub mod decimal; #[cfg(feature = "ffi")] pub mod ffi; + +pub mod view; diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index f4b2b46d1723..ee6f22e06425 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -15,13 +15,11 @@ // specific language governing permissions and limitations // under the License. -use super::{ - data::{into_buffers, new_buffers}, - ArrayData, ArrayDataBuilder, -}; +use super::{data::new_buffers, ArrayData, ArrayDataBuilder}; use crate::bit_mask::set_bits; +use crate::view::View; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; -use arrow_buffer::{bit_util, i256, ArrowNativeType, MutableBuffer}; +use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode}; use half::f16; use num::Integer; @@ -68,36 +66,6 @@ impl<'a> _MutableArrayData<'a> { .as_mut() .expect("MutableArrayData not nullable") } - - fn freeze(self, dictionary: Option) -> ArrayDataBuilder { - let buffers = into_buffers(&self.data_type, self.buffer1, self.buffer2); - - let child_data = match self.data_type { - DataType::Dictionary(_, _) => vec![dictionary.unwrap()], - _ => { - let mut child_data = Vec::with_capacity(self.child_data.len()); - for child in self.child_data { - child_data.push(child.freeze()); - } - child_data - } - }; - - let nulls = self - .null_buffer - .map(|nulls| { - let bools = BooleanBuffer::new(nulls.into(), 0, self.len); - unsafe { NullBuffer::new_unchecked(bools, self.null_count) } - }) - .filter(|n| n.null_count() > 0); - - ArrayDataBuilder::new(self.data_type) - .offset(0) - .len(self.len) - .nulls(nulls) - .buffers(buffers) - .child_data(child_data) - } } fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits { @@ -138,26 +106,31 @@ fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits pub struct MutableArrayData<'a> { #[allow(dead_code)] arrays: Vec<&'a ArrayData>, - // The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to - // mutability invariants (interior mutability): - // [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not - // [MutableArrayData] itself + /// The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to + /// mutability invariants (interior mutability): + /// [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not + /// [MutableArrayData] itself data: _MutableArrayData<'a>, - // the child data of the `Array` in Dictionary arrays. - // This is not stored in `MutableArrayData` because these values constant and only needed - // at the end, when freezing [_MutableArrayData]. + /// the child data of the `Array` in Dictionary arrays. + /// This is not stored in `MutableArrayData` because these values constant and only needed + /// at the end, when freezing [_MutableArrayData]. dictionary: Option, - // function used to extend values from arrays. This function's lifetime is bound to the array - // because it reads values from it. + /// View data buffers + /// This is not stored in `MutableArrayData` because these values constant and only needed + /// at the end, when freezing [_MutableArrayData]. + view_buffers: Vec, + + /// function used to extend values from arrays. This function's lifetime is bound to the array + /// because it reads values from it. extend_values: Vec>, - // function used to extend nulls from arrays. This function's lifetime is bound to the array - // because it reads nulls from it. + /// function used to extend nulls from arrays. This function's lifetime is bound to the array + /// because it reads nulls from it. extend_null_bits: Vec>, - // function used to extend nulls. - // this is independent of the arrays and therefore has no lifetime. + /// function used to extend nulls. + /// this is independent of the arrays and therefore has no lifetime. extend_nulls: ExtendNulls, } @@ -201,6 +174,26 @@ fn build_extend_dictionary( } } +/// Builds an extend that adds `buffer_offset` to any buffer indices encountered +fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend { + let views = array.buffer::(0); + Box::new( + move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { + mutable + .buffer1 + .extend(views[start..start + len].iter().map(|v| { + let len = *v as u32; + if len <= 12 { + return *v; // Stored inline + } + let mut view = View::from(*v); + view.buffer_index += buffer_offset; + view.into() + })) + }, + ) +} + fn build_extend(array: &ArrayData) -> Extend { match array.data_type() { DataType::Null => null::build_extend(array), @@ -238,6 +231,9 @@ fn build_extend(array: &ArrayData) -> Extend { } DataType::Map(_, _) | DataType::List(_) => list::build_extend::(array), DataType::LargeList(_) => list::build_extend::(array), + DataType::BinaryView | DataType::Utf8View => { + unreachable!("should use build_extend_view") + } DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"), DataType::Struct(_) => structure::build_extend(array), DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array), @@ -280,6 +276,7 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::, DataType::LargeList(_) => list::extend_nulls::, + DataType::Utf8View | DataType::BinaryView => primitive::extend_nulls::, DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { DataType::UInt8 => primitive::extend_nulls::, DataType::UInt16 => primitive::extend_nulls::, @@ -427,6 +424,8 @@ impl<'a> MutableArrayData<'a> { | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary + | DataType::Utf8View + | DataType::BinaryView | DataType::Interval(_) | DataType::FixedSizeBinary(_) => vec![], DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => { @@ -560,6 +559,15 @@ impl<'a> MutableArrayData<'a> { _ => (None, false), }; + let view_buffers = match &data_type { + DataType::BinaryView | DataType::Utf8View => arrays + .iter() + .flat_map(|x| x.buffers().into_iter().skip(1)) + .map(Buffer::clone) + .collect(), + _ => vec![], + }; + let extend_nulls = build_extend_nulls(data_type); let extend_null_bits = arrays @@ -592,6 +600,20 @@ impl<'a> MutableArrayData<'a> { extend_values.expect("MutableArrayData::new is infallible") } + DataType::Utf8View | DataType::BinaryView => { + let mut next_offset = 0_u32; + arrays + .iter() + .map(|array| { + let num_data_buffers = (array.buffers().len() - 1) as u32; + let offset = next_offset; + next_offset = next_offset + .checked_add(num_data_buffers) + .expect("buffer index overflow"); + build_extend_view(array, offset) + }) + .collect() + } _ => arrays.iter().map(|array| build_extend(array)).collect(), }; @@ -608,6 +630,7 @@ impl<'a> MutableArrayData<'a> { arrays, data, dictionary, + view_buffers, extend_values, extend_null_bits, extend_nulls, @@ -667,13 +690,56 @@ impl<'a> MutableArrayData<'a> { /// Creates a [ArrayData] from the pushed regions up to this point, consuming `self`. pub fn freeze(self) -> ArrayData { - unsafe { self.data.freeze(self.dictionary).build_unchecked() } + unsafe { self.into_builder().build_unchecked() } } /// Creates a [ArrayDataBuilder] from the pushed regions up to this point, consuming `self`. /// This is useful for extending the default behavior of MutableArrayData. pub fn into_builder(self) -> ArrayDataBuilder { - self.data.freeze(self.dictionary) + let data = self.data; + + let buffers = match data.data_type { + DataType::Null | DataType::Struct(_) | DataType::FixedSizeList(_, _) => { + vec![] + } + DataType::BinaryView | DataType::Utf8View => { + let mut b = self.view_buffers; + b.insert(0, data.buffer1.into()); + b + } + DataType::Utf8 + | DataType::Binary + | DataType::LargeUtf8 + | DataType::LargeBinary => vec![data.buffer1.into(), data.buffer2.into()], + DataType::Union(_, mode) => { + match mode { + // Based on Union's DataTypeLayout + UnionMode::Sparse => vec![data.buffer1.into()], + UnionMode::Dense => vec![data.buffer1.into(), data.buffer2.into()], + } + } + _ => vec![data.buffer1.into()], + }; + + let child_data = match data.data_type { + DataType::Dictionary(_, _) => vec![self.dictionary.unwrap()], + _ => data.child_data.into_iter().map(|x| x.freeze()).collect(), + }; + + let nulls = data + .null_buffer + .map(|nulls| { + let bools = BooleanBuffer::new(nulls.into(), 0, data.len); + unsafe { NullBuffer::new_unchecked(bools, data.null_count) } + }) + .filter(|n| n.null_count() > 0); + + ArrayDataBuilder::new(data.data_type) + .offset(0) + .len(data.len) + .nulls(nulls) + .buffers(buffers) + .child_data(child_data) } } diff --git a/arrow-data/src/view.rs b/arrow-data/src/view.rs new file mode 100644 index 000000000000..4108b9a55e52 --- /dev/null +++ b/arrow-data/src/view.rs @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! View array utilities + +use arrow_buffer::Buffer; +use arrow_schema::ArrowError; + +/// The element layout of a view buffer +/// +/// See [`DataType::Utf8View`](arrow_schema::DataType) +pub struct View { + /// The length of the string + pub length: u32, + /// The first 4 bytes of string data + pub prefix: u32, + /// The buffer index + pub buffer_index: u32, + /// The offset into the buffer + pub offset: u32, +} + +impl From for View { + #[inline] + fn from(value: u128) -> Self { + Self { + length: value as u32, + prefix: (value >> 32) as u32, + buffer_index: (value >> 64) as u32, + offset: (value >> 96) as u32, + } + } +} + +impl From for u128 { + #[inline] + fn from(value: View) -> Self { + (value.length as u128) + | ((value.prefix as u128) << 32) + | ((value.buffer_index as u128) << 64) + | ((value.offset as u128) << 96) + } +} + +/// Validates the combination of `views` and `buffers` is a valid BinaryView +pub fn validate_binary_view( + views: &[u128], + buffers: &[Buffer], +) -> Result<(), ArrowError> { + validate_view_impl(views, buffers, |_, _| Ok(())) +} + +/// Validates the combination of `views` and `buffers` is a valid StringView +pub fn validate_string_view( + views: &[u128], + buffers: &[Buffer], +) -> Result<(), ArrowError> { + validate_view_impl(views, buffers, |idx, b| { + std::str::from_utf8(b).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Encountered non-UTF-8 data at index {idx}: {e}" + )) + })?; + Ok(()) + }) +} + +fn validate_view_impl( + views: &[u128], + buffers: &[Buffer], + f: F, +) -> Result<(), ArrowError> +where + F: Fn(usize, &[u8]) -> Result<(), ArrowError>, +{ + for (idx, v) in views.iter().enumerate() { + let len = *v as u32; + if len <= 12 { + f(idx, &v.to_le_bytes()[4..4 + len as usize])? + } else { + let view = View::from(*v); + let data = buffers.get(view.buffer_index as usize).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Invalid buffer index at {idx}: got index {} but only has {} buffers", + view.buffer_index, + buffers.len() + )) + })?; + + let start = view.offset as usize; + let end = start + len as usize; + let b = data.get(start..end).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Invalid buffer slice at {idx}: got {start}..{end} but buffer {} has length {}", + view.buffer_index, + data.len() + )) + })?; + + if !b.starts_with(&view.prefix.to_le_bytes()) { + return Err(ArrowError::InvalidArgumentError( + "Mismatch between embedded prefix and data".to_string(), + )); + } + + f(idx, b)?; + } + } + Ok(()) +} diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 07f716dea843..187b19b5a127 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -787,6 +787,7 @@ pub(crate) fn get_fb_field_type<'a>( children: Some(fbb.create_vector(&children[..])), } } + BinaryView | Utf8View => unimplemented!(), } } diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index edd1dd09620e..e182625ff238 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -169,6 +169,11 @@ pub enum DataType { /// A single LargeBinary array can store up to [`i64::MAX`] bytes /// of binary data in total LargeBinary, + /// Opaque binary data of variable length. + /// + /// This is a less memory efficient layout than [`Self::Binary`] but can + /// be processed by some kernels more efficiently + BinaryView, /// A variable-length string in Unicode with UTF-8 encoding /// /// A single Utf8 array can store up to [`i32::MAX`] bytes @@ -179,6 +184,11 @@ pub enum DataType { /// A single LargeUtf8 array can store up to [`i64::MAX`] bytes /// of string data in total LargeUtf8, + /// A variable-length string in Unicode with UTF-8 encoding + /// + /// This is a less memory efficient layout than [`Self::Utf8`] but can + /// be processed by some kernels more efficiently + Utf8View, /// A list of some logical data type with variable length. /// /// A single List array can store up to [`i32::MAX`] elements in total @@ -490,8 +500,8 @@ impl DataType { DataType::Interval(IntervalUnit::MonthDayNano) => Some(16), DataType::Decimal128(_, _) => Some(16), DataType::Decimal256(_, _) => Some(32), - DataType::Utf8 | DataType::LargeUtf8 => None, - DataType::Binary | DataType::LargeBinary => None, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => None, + DataType::Binary | DataType::LargeBinary | DataType::BinaryView => None, DataType::FixedSizeBinary(_) => None, DataType::List(_) | DataType::LargeList(_) | DataType::Map(_, _) => None, DataType::FixedSizeList(_, _) => None, @@ -530,8 +540,10 @@ impl DataType { | DataType::Binary | DataType::FixedSizeBinary(_) | DataType::LargeBinary + | DataType::BinaryView | DataType::Utf8 | DataType::LargeUtf8 + | DataType::Utf8View | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => 0, DataType::Timestamp(_, s) => { diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 00deecf06283..1cb8b32068cf 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -482,6 +482,7 @@ impl Field { | DataType::Duration(_) | DataType::Binary | DataType::LargeBinary + | DataType::BinaryView | DataType::Interval(_) | DataType::LargeList(_) | DataType::List(_) @@ -492,6 +493,7 @@ impl Field { | DataType::FixedSizeBinary(_) | DataType::Utf8 | DataType::LargeUtf8 + | DataType::Utf8View | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { if self.data_type != from.data_type { diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index ebbadc00aecd..7e2e5f7b3cc6 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -22,6 +22,7 @@ use arrow::array::{ StringDictionaryBuilder, StructArray, UInt8Array, }; use arrow::datatypes::Int16Type; +use arrow_array::StringViewArray; use arrow_buffer::Buffer; use arrow_data::transform::MutableArrayData; use arrow_data::ArrayData; @@ -945,6 +946,48 @@ fn test_extend_nulls_panic() { mutable.extend_nulls(2); } +#[test] +fn test_string_view() { + let a1 = StringViewArray::from_iter_values(vec![ + "foo", + "very long string over 12 bytes", + "bar", + ]) + .into_data(); + let a2 = StringViewArray::from_iter(vec![ + Some("bar"), + None, + Some("long string also over 12 bytes"), + ]) + .into_data(); + + a1.validate_full().unwrap(); + a2.validate_full().unwrap(); + + let mut mutable = MutableArrayData::new(vec![&a1, &a2], false, 4); + mutable.extend(1, 0, 1); + mutable.extend(0, 1, 2); + mutable.extend(0, 0, 1); + mutable.extend(1, 2, 3); + + let array = StringViewArray::from(mutable.freeze()); + assert_eq!(array.data_buffers().len(), 2); + // Should have reused data buffers + assert_eq!(array.data_buffers()[0].as_ptr(), a1.buffers()[1].as_ptr()); + assert_eq!(array.data_buffers()[1].as_ptr(), a2.buffers()[1].as_ptr()); + + let v = array.iter().collect::>(); + assert_eq!( + v, + vec![ + Some("bar"), + Some("very long string over 12 bytes"), + Some("foo"), + Some("long string also over 12 bytes") + ] + ) +} + /* // this is an old test used on a meanwhile removed dead code // that is still useful when `MutableArrayData` supports fixed-size lists.