Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Safe conversion between arrow-rs and arrow2 Arrays #1446

Merged
merged 16 commits into from
Apr 12, 2023
8 changes: 5 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,10 @@ odbc-api = { version = "0.36", optional = true }
ahash = "0.8"

# Support conversion to/from arrow-rs
arrow-buffer = { version = "35.0.0", optional = true }
arrow-schema = { version = "35.0.0", optional = true }
arrow-buffer = { version = "36.0.0", optional = true }
arrow-schema = { version = "36.0.0", optional = true }
arrow-data = { version = "36.0.0", optional = true }
arrow-array = { version = "36.0.0", optional = true }

[target.wasm32-unknown-unknown.dependencies]
getrandom = { version = "0.2", features = ["js"] }
Expand Down Expand Up @@ -159,7 +161,7 @@ full = [
# parses timezones used in timestamp conversions
"chrono-tz",
]
arrow = ["arrow-buffer", "arrow-schema"]
arrow = ["arrow-buffer", "arrow-schema", "arrow-data", "arrow-array"]
io_odbc = ["odbc-api"]
io_csv = ["io_csv_read", "io_csv_write"]
io_csv_async = ["io_csv_read_async"]
Expand Down
42 changes: 42 additions & 0 deletions src/array/binary/data.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
use crate::array::{Arrow2Arrow, BinaryArray};
use crate::bitmap::Bitmap;
use crate::offset::{Offset, OffsetsBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};

impl<O: Offset> Arrow2Arrow for BinaryArray<O> {
    /// Builds an arrow-rs [`ArrayData`] from clones of this array's offsets,
    /// values and (optional) validity.
    fn to_data(&self) -> ArrayData {
        let arrow_type = self.data_type.clone().into();
        let len = self.offsets().len_proxy();

        // Buffer order is offsets first, values second.
        let offsets = self.offsets.clone().into_inner().into();
        let values = self.values.clone().into();
        let nulls = self.validity.as_ref().map(|bitmap| bitmap.clone().into());

        let builder = ArrayDataBuilder::new(arrow_type)
            .len(len)
            .buffers(vec![offsets, values])
            .nulls(nulls);

        // Safety: Array is valid
        unsafe { builder.build_unchecked() }
    }

    /// Creates a [`BinaryArray`] from an arrow-rs [`ArrayData`].
    ///
    /// Empty `data` yields `Self::new_empty`; otherwise buffer 0 is read as the
    /// offsets and buffer 1 as the values.
    ///
    /// # Panics
    ///
    /// Panics if non-empty `data` has fewer than two buffers.
    fn from_data(data: &ArrayData) -> Self {
        let data_type = data.data_type().clone().into();

        // Handle empty offsets
        if data.is_empty() {
            return Self::new_empty(data_type);
        }

        let offsets = {
            // Safety: ArrayData is valid
            let mut sliced =
                unsafe { OffsetsBuffer::new_unchecked(data.buffers()[0].clone().into()) };
            // `len` elements need `len + 1` offsets, starting at the data's offset.
            sliced.slice(data.offset(), data.len() + 1);
            sliced
        };
        let values = data.buffers()[1].clone().into();
        let validity = data
            .nulls()
            .map(|nulls| Bitmap::from_null_buffer(nulls.clone()));

        Self {
            data_type,
            offsets,
            values,
            validity,
        }
    }
}
3 changes: 3 additions & 0 deletions src/array/binary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ pub use mutable_values::*;
mod mutable;
pub use mutable::*;

#[cfg(feature = "arrow")]
mod data;

/// A [`BinaryArray`] is Arrow's semantically equivalent of an immutable `Vec<Option<Vec<u8>>>`.
/// It implements [`Array`].
///
Expand Down
35 changes: 35 additions & 0 deletions src/array/boolean/data.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
use crate::array::{Arrow2Arrow, BooleanArray};
use crate::bitmap::Bitmap;
use crate::datatypes::DataType;
use arrow_buffer::{BooleanBuffer, NullBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};

impl Arrow2Arrow for BooleanArray {
    /// Builds a `Boolean` [`ArrayData`] from clones of this array's values
    /// bitmap and (optional) validity.
    fn to_data(&self) -> ArrayData {
        // Going through `NullBuffer` gives access to the length, the bit
        // offset and the underlying buffer of the values bitmap.
        let bits = NullBuffer::from(self.values.clone());
        let len = bits.len();
        let bit_offset = bits.offset();
        let nulls = self.validity.as_ref().map(|bitmap| bitmap.clone().into());

        let builder = ArrayDataBuilder::new(arrow_schema::DataType::Boolean)
            .len(len)
            .offset(bit_offset)
            .buffers(vec![bits.into_inner().into_inner()])
            .nulls(nulls);

        // Safety: Array is valid
        unsafe { builder.build_unchecked() }
    }

    /// Creates a [`BooleanArray`] from an arrow-rs [`ArrayData`].
    ///
    /// # Panics
    ///
    /// Panics if the data type of `data` is not `Boolean`, or if `data` has no
    /// buffers.
    fn from_data(data: &ArrayData) -> Self {
        assert_eq!(data.data_type(), &arrow_schema::DataType::Boolean);

        let bits = BooleanBuffer::new(data.buffers()[0].clone(), data.offset(), data.len());
        // Use NullBuffer to compute set count
        let values = Bitmap::from_null_buffer(NullBuffer::new(bits));
        let validity = data
            .nulls()
            .map(|nulls| Bitmap::from_null_buffer(nulls.clone()));

        Self {
            data_type: DataType::Boolean,
            values,
            validity,
        }
    }
}
2 changes: 2 additions & 0 deletions src/array/boolean/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ use either::Either;

use super::Array;

#[cfg(feature = "arrow")]
mod data;
mod ffi;
pub(super) mod fmt;
mod from;
Expand Down
48 changes: 48 additions & 0 deletions src/array/dictionary/data.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
use crate::array::{
from_data, to_data, Arrow2Arrow, DictionaryArray, DictionaryKey, PrimitiveArray,
};
use crate::datatypes::{DataType, PhysicalType};
use arrow_data::{ArrayData, ArrayDataBuilder};

impl<K: DictionaryKey> Arrow2Arrow for DictionaryArray<K> {
    /// Builds a dictionary [`ArrayData`]: the keys' data is re-tagged with this
    /// array's data type and the values are attached as the single child.
    fn to_data(&self) -> ArrayData {
        let key_data = self.keys.to_data();
        let key_builder = key_data.into_builder();
        let arrow_type = self.data_type.clone().into();
        let dictionary = to_data(self.values.as_ref());

        let builder = key_builder
            .data_type(arrow_type)
            .child_data(vec![dictionary]);

        // Safety: Dictionary is valid
        unsafe { builder.build_unchecked() }
    }

    /// Creates a [`DictionaryArray`] from an arrow-rs [`ArrayData`].
    ///
    /// The keys are rebuilt from buffer 0 together with the offset, length and
    /// nulls of `data`; the values come from the first child.
    ///
    /// # Panics
    ///
    /// Panics if `data` is not of a dictionary type, if its physical type does
    /// not match `K`, or if the first buffer or first child is missing.
    fn from_data(data: &ArrayData) -> Self {
        let key_type = match data.data_type() {
            arrow_schema::DataType::Dictionary(k, _) => k.as_ref(),
            d => panic!("unsupported dictionary type {d}"),
        };

        let data_type = DataType::from(data.data_type().clone());
        let expected = PhysicalType::Dictionary(K::KEY_TYPE);
        assert_eq!(data_type.to_physical_type(), expected);

        // View the dictionary's own buffer as a plain primitive array of keys.
        let key_buffers = vec![data.buffers()[0].clone()];
        let key_builder = ArrayDataBuilder::new(key_type.clone())
            .buffers(key_buffers)
            .offset(data.offset())
            .len(data.len())
            .nulls(data.nulls().cloned());

        // Safety: Dictionary is valid
        let key_data = unsafe { key_builder.build_unchecked() };

        let keys = PrimitiveArray::from_data(&key_data);
        let values = from_data(&data.child_data()[0]);

        Self {
            data_type,
            keys,
            values,
        }
    }
}
2 changes: 2 additions & 0 deletions src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ use crate::{
types::NativeType,
};

#[cfg(feature = "arrow")]
mod data;
mod ffi;
pub(super) mod fmt;
mod iterator;
Expand Down
36 changes: 36 additions & 0 deletions src/array/fixed_size_binary/data.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use crate::array::{Arrow2Arrow, FixedSizeBinaryArray};
use crate::bitmap::Bitmap;
use crate::buffer::Buffer;
use crate::datatypes::DataType;
use arrow_data::{ArrayData, ArrayDataBuilder};

impl Arrow2Arrow for FixedSizeBinaryArray {
    /// Builds an arrow-rs [`ArrayData`] from clones of this array's values
    /// buffer and (optional) validity.
    fn to_data(&self) -> ArrayData {
        let arrow_type = self.data_type.clone().into();
        let len = self.len();
        let values = self.values.clone().into();
        let nulls = self.validity.as_ref().map(|bitmap| bitmap.clone().into());

        let builder = ArrayDataBuilder::new(arrow_type)
            .len(len)
            .buffers(vec![values])
            .nulls(nulls);

        // Safety: Array is valid
        unsafe { builder.build_unchecked() }
    }

    /// Creates a [`FixedSizeBinaryArray`] from an arrow-rs [`ArrayData`].
    ///
    /// # Panics
    ///
    /// Panics if the data type of `data` is not `FixedSizeBinary`, or if `data`
    /// has no buffers.
    fn from_data(data: &ArrayData) -> Self {
        let data_type: DataType = data.data_type().clone().into();
        let size = match &data_type {
            DataType::FixedSizeBinary(size) => *size,
            _ => unreachable!("must be FixedSizeBinary"),
        };

        // The element offset and length are scaled by `size` before slicing
        // the values buffer.
        let start = data.offset() * size;
        let length = data.len() * size;
        let mut values: Buffer<u8> = data.buffers()[0].clone().into();
        values.slice(start, length);

        let validity = data
            .nulls()
            .map(|nulls| Bitmap::from_null_buffer(nulls.clone()));

        Self {
            size,
            data_type,
            values,
            validity,
        }
    }
}
2 changes: 2 additions & 0 deletions src/array/fixed_size_binary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::Error};

use super::Array;

#[cfg(feature = "arrow")]
mod data;
mod ffi;
pub(super) mod fmt;
mod iterator;
Expand Down
35 changes: 35 additions & 0 deletions src/array/fixed_size_list/data.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
use crate::array::{from_data, to_data, Arrow2Arrow, FixedSizeListArray};
use crate::bitmap::Bitmap;
use crate::datatypes::DataType;
use arrow_data::{ArrayData, ArrayDataBuilder};

impl Arrow2Arrow for FixedSizeListArray {
    /// Builds an arrow-rs [`ArrayData`] with this array's (optional) validity
    /// and its values as the single child.
    fn to_data(&self) -> ArrayData {
        let arrow_type = self.data_type.clone().into();
        let len = self.len();
        let nulls = self.validity.as_ref().map(|bitmap| bitmap.clone().into());
        let child = to_data(self.values.as_ref());

        let builder = ArrayDataBuilder::new(arrow_type)
            .len(len)
            .nulls(nulls)
            .child_data(vec![child]);

        // Safety: Array is valid
        unsafe { builder.build_unchecked() }
    }

    /// Creates a [`FixedSizeListArray`] from an arrow-rs [`ArrayData`].
    ///
    /// # Panics
    ///
    /// Panics if the data type of `data` is not `FixedSizeList`, or if `data`
    /// has no child data.
    fn from_data(data: &ArrayData) -> Self {
        let data_type: DataType = data.data_type().clone().into();
        let size = match &data_type {
            DataType::FixedSizeList(_, size) => *size,
            _ => unreachable!("must be FixedSizeList type"),
        };

        // The list offset and length are scaled by `size` before slicing the
        // child values.
        let start = data.offset() * size;
        let length = data.len() * size;
        let mut values = from_data(&data.child_data()[0]);
        values.slice(start, length);

        let validity = data
            .nulls()
            .map(|nulls| Bitmap::from_null_buffer(nulls.clone()));

        Self {
            size,
            data_type,
            values,
            validity,
        }
    }
}
2 changes: 2 additions & 0 deletions src/array/fixed_size_list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ use crate::{

use super::{new_empty_array, new_null_array, Array};

#[cfg(feature = "arrow")]
mod data;
mod ffi;
pub(super) mod fmt;
mod iterator;
Expand Down
37 changes: 37 additions & 0 deletions src/array/list/data.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
use crate::array::{from_data, to_data, Arrow2Arrow, ListArray};
use crate::bitmap::Bitmap;
use crate::offset::{Offset, OffsetsBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};

impl<O: Offset> Arrow2Arrow for ListArray<O> {
    /// Builds an arrow-rs [`ArrayData`] from clones of this array's offsets and
    /// (optional) validity, with the values as the single child.
    fn to_data(&self) -> ArrayData {
        let arrow_type = self.data_type.clone().into();
        let len = self.len();
        let offsets = self.offsets.clone().into_inner().into();
        let nulls = self.validity.as_ref().map(|bitmap| bitmap.clone().into());
        let child = to_data(self.values.as_ref());

        let builder = ArrayDataBuilder::new(arrow_type)
            .len(len)
            .buffers(vec![offsets])
            .nulls(nulls)
            .child_data(vec![child]);

        // Safety: Array is valid
        unsafe { builder.build_unchecked() }
    }

    /// Creates a [`ListArray`] from an arrow-rs [`ArrayData`].
    ///
    /// Empty `data` yields `Self::new_empty`; otherwise buffer 0 is read as the
    /// offsets and the first child as the values.
    ///
    /// # Panics
    ///
    /// Panics if non-empty `data` has no buffers or no child data.
    fn from_data(data: &ArrayData) -> Self {
        let data_type = data.data_type().clone().into();

        // Handle empty offsets
        if data.is_empty() {
            return Self::new_empty(data_type);
        }

        let offsets = {
            // Safety: assumes `data` is a valid ArrayData
            let mut sliced =
                unsafe { OffsetsBuffer::new_unchecked(data.buffers()[0].clone().into()) };
            // `len` lists need `len + 1` offsets, starting at the data's offset.
            sliced.slice(data.offset(), data.len() + 1);
            sliced
        };
        let values = from_data(&data.child_data()[0]);
        let validity = data
            .nulls()
            .map(|nulls| Bitmap::from_null_buffer(nulls.clone()));

        Self {
            data_type,
            offsets,
            values,
            validity,
        }
    }
}
2 changes: 2 additions & 0 deletions src/array/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ use crate::{

use super::{new_empty_array, specification::try_check_offsets_bounds, Array};

#[cfg(feature = "arrow")]
mod data;
mod ffi;
pub(super) mod fmt;
mod iterator;
Expand Down
37 changes: 37 additions & 0 deletions src/array/map/data.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
use crate::array::{from_data, to_data, Arrow2Arrow, MapArray};
use crate::bitmap::Bitmap;
use crate::offset::OffsetsBuffer;
use arrow_data::{ArrayData, ArrayDataBuilder};

impl Arrow2Arrow for MapArray {
    /// Builds an arrow-rs [`ArrayData`] from clones of this array's offsets and
    /// (optional) validity, with the entries field as the single child.
    fn to_data(&self) -> ArrayData {
        let data_type = self.data_type.clone().into();

        let builder = ArrayDataBuilder::new(data_type)
            .len(self.len())
            .buffers(vec![self.offsets.clone().into_inner().into()])
            .nulls(self.validity.as_ref().map(|b| b.clone().into()))
            .child_data(vec![to_data(self.field.as_ref())]);

        // Safety: Array is valid
        unsafe { builder.build_unchecked() }
    }

    /// Creates a [`MapArray`] from an arrow-rs [`ArrayData`].
    ///
    /// Empty `data` yields `Self::new_empty`; otherwise buffer 0 is read as the
    /// offsets and the first child as the entries field.
    ///
    /// # Panics
    ///
    /// Panics if non-empty `data` has no buffers or no child data.
    fn from_data(data: &ArrayData) -> Self {
        let data_type = data.data_type().clone().into();
        if data.is_empty() {
            // Handle empty offsets
            return Self::new_empty(data_type);
        }

        // Safety: assumes `data` is a valid ArrayData
        let mut offsets = unsafe { OffsetsBuffer::new_unchecked(data.buffers()[0].clone().into()) };
        // `len` entries need `len + 1` offsets, starting at the data's offset.
        offsets.slice(data.offset(), data.len() + 1);

        Self {
            // Reuse the already-converted type instead of cloning and
            // converting the arrow-rs data type a second time.
            data_type,
            offsets,
            field: from_data(&data.child_data()[0]),
            validity: data.nulls().map(|n| Bitmap::from_null_buffer(n.clone())),
        }
    }
}
2 changes: 2 additions & 0 deletions src/array/map/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ use crate::{

use super::{new_empty_array, specification::try_check_offsets_bounds, Array};

#[cfg(feature = "arrow")]
mod data;
mod ffi;
pub(super) mod fmt;
mod iterator;
Expand Down
Loading