From a949bba22b4f13611470cf89f4cf9b0747c4e078 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 24 Jul 2024 18:00:09 -0400 Subject: [PATCH 01/21] fix alignment UB in VarBinViewArray --- vortex-array/src/array/varbinview/builder.rs | 10 ++--- vortex-array/src/array/varbinview/compute.rs | 12 ++++-- vortex-array/src/array/varbinview/mod.rs | 44 ++++++++++++++------ vortex-datafusion/src/lib.rs | 2 + 4 files changed, 46 insertions(+), 22 deletions(-) diff --git a/vortex-array/src/array/varbinview/builder.rs b/vortex-array/src/array/varbinview/builder.rs index a3b44f3bd9..d8eb8153a8 100644 --- a/vortex-array/src/array/varbinview/builder.rs +++ b/vortex-array/src/array/varbinview/builder.rs @@ -6,7 +6,7 @@ use arrow_buffer::NullBufferBuilder; use vortex_dtype::DType; use crate::array::primitive::PrimitiveArray; -use crate::array::varbinview::{BinaryView, Inlined, Ref, VarBinViewArray, VIEW_SIZE}; +use crate::array::varbinview::{BinaryView, Inlined, Ref, VarBinViewArray, VIEW_SIZE_TO_U64_SIZE}; use crate::validity::Validity; use crate::{ArrayData, IntoArray, IntoArrayData, ToArray}; @@ -99,17 +99,17 @@ impl> VarBinViewBuilder { }; // convert Vec to Vec which can be stored as an array - let views_u8: Vec = unsafe { + let views_u64: Vec = unsafe { let mut views_clone = ManuallyDrop::new(mem::take(&mut self.views)); Vec::from_raw_parts( views_clone.as_mut_ptr() as _, - views_clone.len() * VIEW_SIZE, - views_clone.capacity() * VIEW_SIZE, + views_clone.len() * VIEW_SIZE_TO_U64_SIZE, + views_clone.capacity() * VIEW_SIZE_TO_U64_SIZE, ) }; VarBinViewArray::try_new( - PrimitiveArray::from(views_u8).to_array(), + PrimitiveArray::from(views_u64).to_array(), completed, dtype, validity, diff --git a/vortex-array/src/array/varbinview/compute.rs b/vortex-array/src/array/varbinview/compute.rs index 938236d7d9..1bf973c665 100644 --- a/vortex-array/src/array/varbinview/compute.rs +++ b/vortex-array/src/array/varbinview/compute.rs @@ -2,7 +2,7 @@ use vortex_error::VortexResult; use vortex_scalar::Scalar; use crate::array::varbin::varbin_scalar; -use crate::array::varbinview::{VarBinViewArray, VIEW_SIZE}; +use crate::array::varbinview::{VarBinViewArray, VIEW_SIZE_TO_U64_SIZE}; use crate::compute::unary::ScalarAtFn; use crate::compute::{slice, ArrayCompute, SliceFn}; use crate::validity::ArrayValidity; @@ -32,9 +32,13 @@ impl ScalarAtFn for VarBinViewArray { impl SliceFn for VarBinViewArray { fn slice(&self, start: usize, stop: usize) -> VortexResult { Ok(Self::try_new( - slice(&self.views(), start * VIEW_SIZE, stop * VIEW_SIZE)? - .into_array_data() - .into_array(), + slice( + &self.views(), + start * VIEW_SIZE_TO_U64_SIZE, + stop * VIEW_SIZE_TO_U64_SIZE, + )? + .into_array_data() + .into_array(), (0..self.metadata().data_lens.len()) .map(|i| self.bytes(i)) .collect::>(), diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index 0379f9ca9c..565228e7dd 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -38,7 +38,6 @@ struct Inlined { } impl Inlined { - #[allow(dead_code)] pub fn new(value: &[u8]) -> Self { assert!( value.len() <= BinaryView::MAX_INLINED_SIZE, @@ -106,7 +105,9 @@ impl Debug for BinaryView { } } -pub const VIEW_SIZE: usize = mem::size_of::(); +// this is a hack; views are 16 bytes with 8-byte alignment +pub(crate) const VIEW_SIZE_TO_U64_SIZE: usize = + mem::size_of::() / mem::size_of::(); impl_encoding!("vortex.varbinview", 5u16, VarBinView); @@ -123,8 +124,11 @@ impl VarBinViewArray { dtype: DType, validity: Validity, ) -> VortexResult { - if !matches!(views.dtype(), &DType::BYTES) { - vortex_bail!(MismatchedTypes: "u8", views.dtype()); + if !matches!( + views.dtype(), + &DType::Primitive(PType::U64, Nullability::NonNullable) + ) { + vortex_bail!(MismatchedTypes: "u64", views.dtype()); } for d in data.iter() { @@ -141,10 +145,9 @@ impl VarBinViewArray { vortex_bail!("incorrect validity {:?}", validity); } - let length = views.len() / VIEW_SIZE; - + let num_views = views.len() / VIEW_SIZE_TO_U64_SIZE; let metadata = VarBinViewMetadata { - validity: validity.to_metadata(views.len() / VIEW_SIZE)?, + validity: validity.to_metadata(num_views)?, data_lens: data.iter().map(|a| a.len()).collect_vec(), }; @@ -155,7 +158,7 @@ impl VarBinViewArray { children.push(a) } - Self::try_from_parts(dtype, length, metadata, children.into(), StatsSet::new()) + Self::try_from_parts(dtype, num_views, metadata, children.into(), StatsSet::new()) } fn view_slice(&self) -> &[BinaryView] { @@ -163,9 +166,9 @@ impl VarBinViewArray { slice::from_raw_parts( PrimitiveArray::try_from(self.views()) .expect("Views must be a primitive array") - .maybe_null_slice::() + .maybe_null_slice::() .as_ptr() as _, - self.views().len() / VIEW_SIZE, + self.views().len() / VIEW_SIZE_TO_U64_SIZE, ) } } @@ -177,7 +180,11 @@ impl VarBinViewArray { #[inline] pub fn views(&self) -> Array { self.array() - .child(0, &DType::BYTES, self.len() * VIEW_SIZE) + .child( + 0, + &DType::Primitive(PType::U64, Nullability::NonNullable), + self.len() * VIEW_SIZE_TO_U64_SIZE, + ) .expect("missing views") } @@ -254,7 +261,7 @@ fn as_arrow(var_bin_view: VarBinViewArray) -> ArrayRef { .views() .into_primitive() .expect("views must be primitive"); - assert_eq!(views.ptype(), PType::U8); + assert_eq!(views.ptype(), PType::U64); let nulls = var_bin_view .logical_validity() .to_null_buffer() @@ -362,7 +369,7 @@ impl<'a> FromIterator> for VarBinViewArray { mod test { use vortex_scalar::Scalar; - use crate::array::varbinview::VarBinViewArray; + use crate::array::varbinview::{BinaryView, VarBinViewArray, VIEW_SIZE_TO_U64_SIZE}; use crate::compute::slice; use crate::compute::unary::scalar_at; use crate::{Canonical, IntoArray, IntoCanonical}; @@ -408,4 +415,15 @@ mod test { assert_eq!(scalar_at(&var_bin, 0).unwrap(), Scalar::from("string1")); assert_eq!(scalar_at(&var_bin, 1).unwrap(), Scalar::from("string2")); } + + #[test] + pub fn views_fit_in_u64() { + assert_eq!(VIEW_SIZE_TO_U64_SIZE, 2); + assert_eq!(std::mem::size_of::(), 16); + assert_eq!(std::mem::align_of::(), 8); + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::() * VIEW_SIZE_TO_U64_SIZE + ); + } } diff --git a/vortex-datafusion/src/lib.rs b/vortex-datafusion/src/lib.rs index 4d1d5f5d25..af4cd1be7f 100644 --- a/vortex-datafusion/src/lib.rs +++ b/vortex-datafusion/src/lib.rs @@ -510,6 +510,7 @@ mod test { } #[tokio::test] + #[cfg_attr(miri, ignore)] async fn test_datafusion_pushdown() { let ctx = SessionContext::new(); @@ -538,6 +539,7 @@ mod test { } #[tokio::test] + #[cfg_attr(miri, ignore)] async fn test_datafusion_no_pushdown() { let ctx = SessionContext::new(); From f56f7db089504ed790855ace409acdd7505fa4bc Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 24 Jul 2024 18:17:30 -0400 Subject: [PATCH 02/21] skip a couple of very slow tests on miri --- encodings/fastlanes/src/bitpacking/compress.rs | 7 ++++++- encodings/fastlanes/src/bitpacking/compute/take.rs | 1 + vortex-sampling-compressor/tests/smoketest.rs | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/encodings/fastlanes/src/bitpacking/compress.rs b/encodings/fastlanes/src/bitpacking/compress.rs index b07a81ae1d..0a3f5480bc 100644 --- a/encodings/fastlanes/src/bitpacking/compress.rs +++ b/encodings/fastlanes/src/bitpacking/compress.rs @@ -320,8 +320,13 @@ mod test { } #[test] - fn test_compression_roundtrip() { + fn test_compression_roundtrip_fast() { compression_roundtrip(125); + } + + #[test] + #[cfg_attr(miri, ignore)] // This test is too slow on miri + fn test_compression_roundtrip() { compression_roundtrip(1024); compression_roundtrip(10_000); compression_roundtrip(10_240); diff --git a/encodings/fastlanes/src/bitpacking/compute/take.rs b/encodings/fastlanes/src/bitpacking/compute/take.rs index de0540b586..ff355b662d 100644 --- a/encodings/fastlanes/src/bitpacking/compute/take.rs +++ b/encodings/fastlanes/src/bitpacking/compute/take.rs @@ -172,6 +172,7 @@ mod test { } #[test] + #[cfg_attr(miri, ignore)] // This test is too slow on miri fn take_random_indices() { let num_patches: usize = 128; let values = (0..u16::MAX as u32 + num_patches as u32).collect::>(); diff --git a/vortex-sampling-compressor/tests/smoketest.rs b/vortex-sampling-compressor/tests/smoketest.rs index 4449be47ba..fe21044485 100644 --- a/vortex-sampling-compressor/tests/smoketest.rs +++ b/vortex-sampling-compressor/tests/smoketest.rs @@ -24,6 +24,7 @@ use vortex_sampling_compressor::compressors::CompressorRef; use vortex_sampling_compressor::{CompressConfig, SamplingCompressor}; #[test] +#[cfg_attr(miri, ignore)] // This test is too slow on miri pub fn smoketest_compressor() { let compressor = SamplingCompressor::new_with_options( HashSet::from([ From dbf11d55fc4567b70b5b3a408115570ac42a7fde Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 24 Jul 2024 18:25:09 -0400 Subject: [PATCH 03/21] run tests with miri in GH actions --- .github/workflows/miri.yml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/miri.yml diff --git a/.github/workflows/miri.yml b/.github/workflows/miri.yml new file mode 100644 index 0000000000..3488158f65 --- /dev/null +++ b/.github/workflows/miri.yml @@ -0,0 +1,31 @@ +name: Miri + +on: + push: + branches: [ "develop" ] + pull_request: { } + workflow_dispatch: { } + +permissions: + actions: read + contents: read + +jobs: + miri: + name: 'miri' + runs-on: ubuntu-latest + env: + RUST_BACKTRACE: 1 + MIRIFLAGS: -Zmiri-strict-provenance -Zmiri-symbolic-alignment-check -Zmiri-backtrace=full + steps: + - uses: actions/checkout@v4 + + - uses: ./.github/actions/cleanup + + - name: Install Protoc + uses: arduino/setup-protoc@v3 + + - uses: ./.github/actions/setup-rust + + - name: Rust Miri + run: cargo +nightly miri test From a7b041c4298f658c970437a8478b30a599582c5f Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 24 Jul 2024 18:27:04 -0400 Subject: [PATCH 04/21] move miri job into ci.yml --- .github/workflows/ci.yml | 19 +++++++++++++++++++ .github/workflows/miri.yml | 31 ------------------------------- 2 files changed, 19 insertions(+), 31 deletions(-) delete mode 100644 .github/workflows/miri.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d719ab02d3..3741ded90f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,3 +44,22 @@ jobs: - name: Pytest - PyVortex run: rye run pytest --benchmark-disable test/ working-directory: pyvortex/ + + miri: + name: 'miri' + runs-on: ubuntu-latest + env: + RUST_BACKTRACE: 1 + MIRIFLAGS: -Zmiri-strict-provenance -Zmiri-symbolic-alignment-check -Zmiri-backtrace=full + steps: + - uses: actions/checkout@v4 + + - uses: ./.github/actions/cleanup + + - name: Install Protoc + uses: arduino/setup-protoc@v3 + + - uses: ./.github/actions/setup-rust + + - name: Rust Miri + run: cargo +nightly miri test diff --git a/.github/workflows/miri.yml b/.github/workflows/miri.yml deleted file mode 100644 index 3488158f65..0000000000 --- a/.github/workflows/miri.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: Miri - -on: - push: - branches: [ "develop" ] - pull_request: { } - workflow_dispatch: { } - -permissions: - actions: read - contents: read - -jobs: - miri: - name: 'miri' - runs-on: ubuntu-latest - env: - RUST_BACKTRACE: 1 - MIRIFLAGS: -Zmiri-strict-provenance -Zmiri-symbolic-alignment-check -Zmiri-backtrace=full - steps: - - uses: actions/checkout@v4 - - - uses: ./.github/actions/cleanup - - - name: Install Protoc - uses: arduino/setup-protoc@v3 - - - uses: ./.github/actions/setup-rust - - - name: Rust Miri - run: cargo +nightly miri test From 0d73dd25736bceb67e1d273334141728ab11b2bd Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 24 Jul 2024 21:48:37 -0400 Subject: [PATCH 05/21] add miri to rust setup --- .github/actions/setup-rust/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/setup-rust/action.yml b/.github/actions/setup-rust/action.yml index 45f96b3ddd..e3a8a9d4da 100644 --- a/.github/actions/setup-rust/action.yml +++ b/.github/actions/setup-rust/action.yml @@ -14,7 +14,7 @@ runs: if: steps.rustup-cache.outputs.cache-hit != 'true' with: toolchain: "${{ steps.rust-version.outputs.version }}" - components: clippy, rustfmt + components: clippy, rustfmt, miri - name: Rust Dependency Cache uses: Swatinem/rust-cache@v2 From ae3beff6366dd7638fac6b867580a5f922bc6c2e Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 24 Jul 2024 21:55:17 -0400 Subject: [PATCH 06/21] fixes --- .github/workflows/ci.yml | 2 +- vortex-array/src/array/varbinview/compute.rs | 6 ++---- vortex-array/src/array/varbinview/mod.rs | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3741ded90f..bf7c57c225 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,4 +62,4 @@ jobs: - uses: ./.github/actions/setup-rust - name: Rust Miri - run: cargo +nightly miri test + run: cargo miri test diff --git a/vortex-array/src/array/varbinview/compute.rs b/vortex-array/src/array/varbinview/compute.rs index 1bf973c665..1dd8220ab0 100644 --- a/vortex-array/src/array/varbinview/compute.rs +++ b/vortex-array/src/array/varbinview/compute.rs @@ -6,7 +6,7 @@ use crate::array::varbinview::{VarBinViewArray, VIEW_SIZE_TO_U64_SIZE}; use crate::compute::unary::ScalarAtFn; use crate::compute::{slice, ArrayCompute, SliceFn}; use crate::validity::ArrayValidity; -use crate::{Array, ArrayDType, IntoArray, IntoArrayData}; +use crate::{Array, ArrayDType, IntoArray}; impl ArrayCompute for VarBinViewArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { @@ -36,9 +36,7 @@ impl SliceFn for VarBinViewArray { &self.views(), start * VIEW_SIZE_TO_U64_SIZE, stop * VIEW_SIZE_TO_U64_SIZE, - )? - .into_array_data() - .into_array(), + )?, (0..self.metadata().data_lens.len()) .map(|i| self.bytes(i)) .collect::>(), diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index 565228e7dd..7d822c2b2f 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -105,7 +105,7 @@ impl Debug for BinaryView { } } -// this is a hack; views are 16 bytes with 8-byte alignment +// views are 16 bytes with 8-byte alignment; we transmute to u64 instead of u8 to preserve alignment pub(crate) const VIEW_SIZE_TO_U64_SIZE: usize = mem::size_of::() / mem::size_of::(); From 82f8d56ffc5cf281e4b26b19f9277da30fa22b0a Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 10:35:09 -0400 Subject: [PATCH 07/21] fix it --- vortex-array/src/array/varbinview/builder.rs | 44 ++++++++++++++------ vortex-array/src/array/varbinview/compute.rs | 8 +--- vortex-array/src/array/varbinview/mod.rs | 38 +++++++---------- vortex-array/src/lib.rs | 1 + vortex-buffer/src/allocator.rs | 31 ++++++++++++++ vortex-buffer/src/lib.rs | 3 ++ 6 files changed, 83 insertions(+), 42 deletions(-) create mode 100644 vortex-buffer/src/allocator.rs diff --git a/vortex-array/src/array/varbinview/builder.rs b/vortex-array/src/array/varbinview/builder.rs index d8eb8153a8..67817e8c70 100644 --- a/vortex-array/src/array/varbinview/builder.rs +++ b/vortex-array/src/array/varbinview/builder.rs @@ -1,17 +1,26 @@ +use core::ptr::NonNull; use std::marker::PhantomData; use std::mem; use std::mem::ManuallyDrop; +use std::sync::Arc; -use arrow_buffer::NullBufferBuilder; -use vortex_dtype::DType; +use arrow_buffer::{Buffer as ArrowBuffer, NullBufferBuilder}; +use vortex_buffer::allocator::MinAlignmentAllocator; +use vortex_buffer::Buffer; +use vortex_dtype::{DType, PType}; use crate::array::primitive::PrimitiveArray; -use crate::array::varbinview::{BinaryView, Inlined, Ref, VarBinViewArray, VIEW_SIZE_TO_U64_SIZE}; +use crate::array::varbinview::{BinaryView, Inlined, Ref, VarBinViewArray, VIEW_SIZE}; use crate::validity::Validity; use crate::{ArrayData, IntoArray, IntoArrayData, ToArray}; +// BinaryView has 8 byte alignment (because that's what's in the arrow spec), but arrow-rs +// erroneously requires 16 byte alignment. +const BINARY_VIEW_ALLOCATOR: MinAlignmentAllocator = + MinAlignmentAllocator::new(size_of::()); + pub struct VarBinViewBuilder> { - views: Vec, + views: Vec, nulls: NullBufferBuilder, completed: Vec, in_progress: Vec, @@ -22,7 +31,7 @@ pub struct VarBinViewBuilder> { impl> VarBinViewBuilder { pub fn with_capacity(capacity: usize) -> Self { Self { - views: Vec::with_capacity(capacity), + views: Vec::with_capacity_in(capacity, BINARY_VIEW_ALLOCATOR), nulls: NullBufferBuilder::new(capacity), completed: Vec::new(), in_progress: Vec::new(), @@ -99,17 +108,28 @@ impl> VarBinViewBuilder { }; // convert Vec to Vec which can be stored as an array - let views_u64: Vec = unsafe { - let mut views_clone = ManuallyDrop::new(mem::take(&mut self.views)); - Vec::from_raw_parts( + // have to ensure that we use the correct allocator at deallocation time + let views: Buffer = unsafe { + let mut views_clone = ManuallyDrop::new(mem::replace( + &mut self.views, + Vec::new_in(BINARY_VIEW_ALLOCATOR), + )); + let mut views_bytes: Vec = Vec::from_raw_parts_in( views_clone.as_mut_ptr() as _, - views_clone.len() * VIEW_SIZE_TO_U64_SIZE, - views_clone.capacity() * VIEW_SIZE_TO_U64_SIZE, - ) + views_clone.len() * VIEW_SIZE, + views_clone.capacity() * VIEW_SIZE, + BINARY_VIEW_ALLOCATOR, + ); + let buf = ArrowBuffer::from_custom_allocation( + NonNull::new_unchecked(views_bytes.as_mut_ptr()), + views_bytes.len(), + Arc::new(views_bytes), + ); + Buffer::Arrow(buf) }; VarBinViewArray::try_new( - PrimitiveArray::from(views_u64).to_array(), + PrimitiveArray::new(views, PType::U8, Validity::NonNullable).to_array(), completed, dtype, validity, diff --git a/vortex-array/src/array/varbinview/compute.rs b/vortex-array/src/array/varbinview/compute.rs index 1dd8220ab0..a966a8981d 100644 --- a/vortex-array/src/array/varbinview/compute.rs +++ b/vortex-array/src/array/varbinview/compute.rs @@ -2,7 +2,7 @@ use vortex_error::VortexResult; use vortex_scalar::Scalar; use crate::array::varbin::varbin_scalar; -use crate::array::varbinview::{VarBinViewArray, VIEW_SIZE_TO_U64_SIZE}; +use crate::array::varbinview::{VarBinViewArray, VIEW_SIZE}; use crate::compute::unary::ScalarAtFn; use crate::compute::{slice, ArrayCompute, SliceFn}; use crate::validity::ArrayValidity; @@ -32,11 +32,7 @@ impl ScalarAtFn for VarBinViewArray { impl SliceFn for VarBinViewArray { fn slice(&self, start: usize, stop: usize) -> VortexResult { Ok(Self::try_new( - slice( - &self.views(), - start * VIEW_SIZE_TO_U64_SIZE, - stop * VIEW_SIZE_TO_U64_SIZE, - )?, + slice(&self.views(), start * VIEW_SIZE, stop * VIEW_SIZE)?, (0..self.metadata().data_lens.len()) .map(|i| self.bytes(i)) .collect::>(), diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index 7d822c2b2f..d13a37aad1 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -105,9 +105,8 @@ impl Debug for BinaryView { } } -// views are 16 bytes with 8-byte alignment; we transmute to u64 instead of u8 to preserve alignment -pub(crate) const VIEW_SIZE_TO_U64_SIZE: usize = - mem::size_of::() / mem::size_of::(); +// reminder: views are 16 bytes with 8-byte alignment +pub(crate) const VIEW_SIZE: usize = mem::size_of::(); impl_encoding!("vortex.varbinview", 5u16, VarBinView); @@ -124,11 +123,8 @@ impl VarBinViewArray { dtype: DType, validity: Validity, ) -> VortexResult { - if !matches!( - views.dtype(), - &DType::Primitive(PType::U64, Nullability::NonNullable) - ) { - vortex_bail!(MismatchedTypes: "u64", views.dtype()); + if !matches!(views.dtype(), &DType::BYTES) { + vortex_bail!(MismatchedTypes: "u8", views.dtype()); } for d in data.iter() { @@ -145,7 +141,7 @@ impl VarBinViewArray { vortex_bail!("incorrect validity {:?}", validity); } - let num_views = views.len() / VIEW_SIZE_TO_U64_SIZE; + let num_views = views.len() / VIEW_SIZE; let metadata = VarBinViewMetadata { validity: validity.to_metadata(num_views)?, data_lens: data.iter().map(|a| a.len()).collect_vec(), @@ -166,9 +162,9 @@ impl VarBinViewArray { slice::from_raw_parts( PrimitiveArray::try_from(self.views()) .expect("Views must be a primitive array") - .maybe_null_slice::() + .maybe_null_slice::() .as_ptr() as _, - self.views().len() / VIEW_SIZE_TO_U64_SIZE, + self.views().len() / VIEW_SIZE, ) } } @@ -180,11 +176,7 @@ impl VarBinViewArray { #[inline] pub fn views(&self) -> Array { self.array() - .child( - 0, - &DType::Primitive(PType::U64, Nullability::NonNullable), - self.len() * VIEW_SIZE_TO_U64_SIZE, - ) + .child(0, &DType::BYTES, self.len() * VIEW_SIZE) .expect("missing views") } @@ -261,7 +253,7 @@ fn as_arrow(var_bin_view: VarBinViewArray) -> ArrayRef { .views() .into_primitive() .expect("views must be primitive"); - assert_eq!(views.ptype(), PType::U64); + assert_eq!(views.ptype(), PType::U8); let nulls = var_bin_view .logical_validity() .to_null_buffer() @@ -369,7 +361,7 @@ impl<'a> FromIterator> for VarBinViewArray { mod test { use vortex_scalar::Scalar; - use crate::array::varbinview::{BinaryView, VarBinViewArray, VIEW_SIZE_TO_U64_SIZE}; + use crate::array::varbinview::{BinaryView, Inlined, Ref, VarBinViewArray, VIEW_SIZE}; use crate::compute::slice; use crate::compute::unary::scalar_at; use crate::{Canonical, IntoArray, IntoCanonical}; @@ -417,13 +409,11 @@ mod test { } #[test] - pub fn views_fit_in_u64() { - assert_eq!(VIEW_SIZE_TO_U64_SIZE, 2); + pub fn binary_view_size_and_alignment() { + assert_eq!(std::mem::size_of::(), 16); + assert_eq!(std::mem::size_of::(), 16); + assert_eq!(std::mem::size_of::(), VIEW_SIZE); assert_eq!(std::mem::size_of::(), 16); assert_eq!(std::mem::align_of::(), 8); - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::() * VIEW_SIZE_TO_U64_SIZE - ); } } diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index 5c7f4f3c79..f23c889e4e 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -8,6 +8,7 @@ //! Every data type recognized by Vortex also has a canonical physical encoding format, which //! arrays can be [canonicalized](Canonical) into for ease of access in compute functions. //! +#![feature(allocator_api)] use std::fmt::{Debug, Display, Formatter}; use std::future::ready; diff --git a/vortex-buffer/src/allocator.rs b/vortex-buffer/src/allocator.rs new file mode 100644 index 0000000000..6a43f2ddf0 --- /dev/null +++ b/vortex-buffer/src/allocator.rs @@ -0,0 +1,31 @@ +use std::alloc::{AllocError, Allocator, Global, Layout}; +use std::ptr::NonNull; + +pub struct MinAlignmentAllocator { + min_alignment: usize, +} + +unsafe impl Allocator for MinAlignmentAllocator { + fn allocate(&self, layout: Layout) -> Result, AllocError> { + Global.allocate( + layout + .align_to(self.min_alignment) + .map_err(|_| AllocError)?, + ) + } + + unsafe fn deallocate(&self, ptr: NonNull, layout: Layout) { + let layout = layout.align_to(self.min_alignment).expect( + "align_to failed, which can only happen if self.min_alignment is not a power of two", + ); + unsafe { Global.deallocate(ptr, layout) } + } +} + +impl MinAlignmentAllocator { + pub const fn new(min_alignment: usize) -> Self { + assert!(min_alignment.is_power_of_two()); + let min_alignment = min_alignment.next_power_of_two(); + Self { min_alignment } + } +} diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs index 1d7122f3db..56928c349b 100644 --- a/vortex-buffer/src/lib.rs +++ b/vortex-buffer/src/lib.rs @@ -1,9 +1,12 @@ +#![feature(allocator_api)] + use std::cmp::Ordering; use std::ops::{Deref, Range}; use arrow_buffer::{ArrowNativeType, Buffer as ArrowBuffer}; pub use string::*; +pub mod allocator; mod flexbuffers; pub mod io_buf; mod string; From b073de129f2998361b1da0ac91a9b0a0845e7b60 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 10:41:54 -0400 Subject: [PATCH 08/21] rename step --- .github/workflows/ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bf7c57c225..a27a5b398e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,7 +16,6 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: ./.github/actions/cleanup - name: Install Protoc @@ -61,5 +60,5 @@ jobs: - uses: ./.github/actions/setup-rust - - name: Rust Miri + - name: Run tests with Miri run: cargo miri test From 8604255cd5c77d19a09247c12ffb0102fddf415e Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 10:44:53 -0400 Subject: [PATCH 09/21] run rustsec audit on every PR --- .github/workflows/ci.yml | 4 ++++ .github/workflows/rustsec-audit.yml | 12 ------------ 2 files changed, 4 insertions(+), 12 deletions(-) delete mode 100644 .github/workflows/rustsec-audit.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 70f81ae967..365fdc3104 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,8 +39,12 @@ jobs: run: cargo test --workspace --all-features - name: Rust Build run: cargo build --all-features --all-targets + - name: License Check run: cargo install --locked cargo-deny && cargo deny check + - uses: rustsec/audit-check@v1.4.1 + with: + token: ${{ secrets.GITHUB_TOKEN }} - name: Pytest - PyVortex run: rye run pytest --benchmark-disable test/ diff --git a/.github/workflows/rustsec-audit.yml b/.github/workflows/rustsec-audit.yml deleted file mode 100644 index 4c179e4457..0000000000 --- a/.github/workflows/rustsec-audit.yml +++ /dev/null @@ -1,12 +0,0 @@ -name: RustSec audit -on: - schedule: - - cron: '0 0 * * *' -jobs: - audit: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: rustsec/audit-check@v1.4.1 - with: - token: ${{ secrets.GITHUB_TOKEN }} From c18f01e8d28e96efb44fa673665455dfe89c77c5 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 11:00:28 -0400 Subject: [PATCH 10/21] try using nextest to speed up miri tests --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 365fdc3104..730bb1a3b7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -65,6 +65,6 @@ jobs: uses: arduino/setup-protoc@v3 - uses: ./.github/actions/setup-rust - + - uses: taiki-e/install-action@nextest - name: Run tests with Miri - run: cargo miri test + run: cargo miri nextest run From 5e6d6708dbf68477b8d25eac3a4ecc87d9a5768f Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 11:10:34 -0400 Subject: [PATCH 11/21] parallelism on a bigger runner --- .github/workflows/ci.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 730bb1a3b7..c83fef79b8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,6 +10,9 @@ permissions: actions: read contents: read +env: + CARGO_TERM_COLOR: always + jobs: build: name: 'build' @@ -52,7 +55,7 @@ jobs: miri: name: 'miri' - runs-on: ubuntu-latest + runs-on: ubuntu-latest-medium env: RUST_BACKTRACE: 1 MIRIFLAGS: -Zmiri-strict-provenance -Zmiri-symbolic-alignment-check -Zmiri-backtrace=full @@ -67,4 +70,4 @@ jobs: - uses: ./.github/actions/setup-rust - uses: taiki-e/install-action@nextest - name: Run tests with Miri - run: cargo miri nextest run + run: cargo miri nextest run -j4 From 4e27946edcf391bb46e74d8fd25a8ff30e98c608 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 11:17:43 -0400 Subject: [PATCH 12/21] no nextest --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c83fef79b8..69e8caaea9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -68,6 +68,6 @@ jobs: uses: arduino/setup-protoc@v3 - uses: ./.github/actions/setup-rust - - uses: taiki-e/install-action@nextest + - name: Run tests with Miri - run: cargo miri nextest run -j4 + run: cargo miri test From ef710dca0b37fa264c4fa7b0057ee3582fb76cfc Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 11:20:20 -0400 Subject: [PATCH 13/21] ugh bigger runners take too long to provision --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 69e8caaea9..9f69203947 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,7 +55,7 @@ jobs: miri: name: 'miri' - runs-on: ubuntu-latest-medium + runs-on: ubuntu-latest env: RUST_BACKTRACE: 1 MIRIFLAGS: -Zmiri-strict-provenance -Zmiri-symbolic-alignment-check -Zmiri-backtrace=full From b02a5cebb208bb269a554d79fbc3c7600b5ec602 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 11:32:00 -0400 Subject: [PATCH 14/21] self cr --- vortex-buffer/src/allocator.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/vortex-buffer/src/allocator.rs b/vortex-buffer/src/allocator.rs index 6a43f2ddf0..2f69b0570c 100644 --- a/vortex-buffer/src/allocator.rs +++ b/vortex-buffer/src/allocator.rs @@ -24,7 +24,6 @@ unsafe impl Allocator for MinAlignmentAllocator { impl MinAlignmentAllocator { pub const fn new(min_alignment: usize) -> Self { - assert!(min_alignment.is_power_of_two()); let min_alignment = min_alignment.next_power_of_two(); Self { min_alignment } } From ad460783371ed8f2ddb191c93dbf3ac604654fa3 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 12:02:45 -0400 Subject: [PATCH 15/21] streamline --- vortex-array/src/array/varbinview/builder.rs | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/vortex-array/src/array/varbinview/builder.rs b/vortex-array/src/array/varbinview/builder.rs index 67817e8c70..7812c622a4 100644 --- a/vortex-array/src/array/varbinview/builder.rs +++ b/vortex-array/src/array/varbinview/builder.rs @@ -1,7 +1,6 @@ use core::ptr::NonNull; use std::marker::PhantomData; use std::mem; -use std::mem::ManuallyDrop; use std::sync::Arc; use arrow_buffer::{Buffer as ArrowBuffer, NullBufferBuilder}; @@ -110,20 +109,11 @@ impl> VarBinViewBuilder { // convert Vec to Vec which can be stored as an array // have to ensure that we use the correct allocator at deallocation time let views: Buffer = unsafe { - let mut views_clone = ManuallyDrop::new(mem::replace( - &mut self.views, - Vec::new_in(BINARY_VIEW_ALLOCATOR), - )); - let mut views_bytes: Vec = Vec::from_raw_parts_in( - views_clone.as_mut_ptr() as _, - views_clone.len() * VIEW_SIZE, - views_clone.capacity() * VIEW_SIZE, - BINARY_VIEW_ALLOCATOR, - ); + let mut views_clone = mem::replace(&mut self.views, Vec::new_in(BINARY_VIEW_ALLOCATOR)); let buf = ArrowBuffer::from_custom_allocation( - NonNull::new_unchecked(views_bytes.as_mut_ptr()), - views_bytes.len(), - Arc::new(views_bytes), + NonNull::new_unchecked(views_clone.as_mut_ptr() as *mut u8), + views_clone.len() * VIEW_SIZE, + Arc::new(views_clone), ); Buffer::Arrow(buf) }; From 30956425d21719f594df9b59d6e12c3bfe284b81 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Thu, 25 Jul 2024 17:13:59 +0100 Subject: [PATCH 16/21] Remove varbin builder --- vortex-array/src/array/varbinview/builder.rs | 119 ------------------- vortex-array/src/array/varbinview/mod.rs | 86 +++++++------- 2 files changed, 41 insertions(+), 164 deletions(-) delete mode 100644 vortex-array/src/array/varbinview/builder.rs diff --git a/vortex-array/src/array/varbinview/builder.rs b/vortex-array/src/array/varbinview/builder.rs deleted file mode 100644 index a3b44f3bd9..0000000000 --- a/vortex-array/src/array/varbinview/builder.rs +++ /dev/null @@ -1,119 +0,0 @@ -use std::marker::PhantomData; -use std::mem; -use std::mem::ManuallyDrop; - -use arrow_buffer::NullBufferBuilder; -use vortex_dtype::DType; - -use crate::array::primitive::PrimitiveArray; -use crate::array::varbinview::{BinaryView, Inlined, Ref, VarBinViewArray, VIEW_SIZE}; -use crate::validity::Validity; -use crate::{ArrayData, IntoArray, IntoArrayData, ToArray}; - -pub struct VarBinViewBuilder> { - views: Vec, - nulls: NullBufferBuilder, - completed: Vec, - in_progress: Vec, - block_size: u32, - phantom: PhantomData, -} - -impl> VarBinViewBuilder { - pub fn with_capacity(capacity: usize) -> Self { - Self { - views: Vec::with_capacity(capacity), - nulls: NullBufferBuilder::new(capacity), - completed: Vec::new(), - in_progress: Vec::new(), - block_size: 16 * 1024, - phantom: Default::default(), - } - } - - #[inline] - pub fn push(&mut self, value: Option) { - match value { - None => self.push_null(), - Some(v) => self.push_value(v), - } - } - - #[inline] - pub fn push_value(&mut self, value: T) { - let vbytes = value.as_ref(); - if self.in_progress.len() + vbytes.len() > self.in_progress.capacity() { - let done = mem::replace( - &mut self.in_progress, - Vec::with_capacity(vbytes.len().max(self.block_size as usize)), - ); - if !done.is_empty() { - assert!(self.completed.len() < u32::MAX as usize); - self.completed - .push(PrimitiveArray::from(done).into_array_data()); - } - } - - if vbytes.len() > BinaryView::MAX_INLINED_SIZE { - self.views.push(BinaryView { - _ref: Ref::new( - vbytes.len() as u32, - vbytes[0..4].try_into().unwrap(), - self.completed.len() as u32, - self.in_progress.len() as u32, - ), - }); - self.in_progress.extend_from_slice(vbytes); - } else { - self.views.push(BinaryView { - inlined: Inlined::new(vbytes), - }); - } - self.nulls.append_non_null(); - } - - #[inline] - pub fn push_null(&mut self) { - self.views.push(BinaryView { - inlined: Inlined::new(b""), - }); - self.nulls.append_null(); - } - - pub fn finish(mut self, dtype: DType) -> VarBinViewArray { - let mut completed = self - .completed - .into_iter() - .map(|d| d.into_array()) - .collect::>(); - if !self.in_progress.is_empty() { - completed.push(PrimitiveArray::from(self.in_progress).into_array()); - } - - let nulls = self.nulls.finish(); - let validity = if dtype.is_nullable() { - nulls.map(Validity::from).unwrap_or(Validity::AllValid) - } else { - assert!(nulls.is_none(), "dtype and validity mismatch"); - Validity::NonNullable - }; - - // convert Vec to Vec which can be stored as an array - let views_u8: Vec = unsafe { - let mut views_clone = ManuallyDrop::new(mem::take(&mut self.views)); - Vec::from_raw_parts( - views_clone.as_mut_ptr() as _, - views_clone.len() * VIEW_SIZE, - views_clone.capacity() * VIEW_SIZE, - ) - }; - - VarBinViewArray::try_new( - PrimitiveArray::from(views_u8).to_array(), - completed, - dtype, - validity, - ) - .unwrap() - } -} diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index 0379f9ca9c..3aef8f6cca 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -4,16 +4,16 @@ use std::sync::Arc; use std::{mem, slice}; use ::serde::{Deserialize, Serialize}; +use arrow_array::builder::{BinaryViewBuilder, StringViewBuilder}; use arrow_array::{ArrayRef, BinaryViewArray, StringViewArray}; use arrow_buffer::ScalarBuffer; use arrow_schema::DataType; use itertools::Itertools; -use vortex_dtype::{DType, Nullability, PType}; +use vortex_dtype::{DType, PType}; use vortex_error::{vortex_bail, VortexResult}; use crate::array::primitive::PrimitiveArray; use crate::array::varbin::VarBinArray; -use crate::array::varbinview::builder::VarBinViewBuilder; use crate::arrow::FromArrowArray; use crate::compute::slice; use crate::stats::StatsSet; @@ -25,7 +25,6 @@ use crate::{ }; mod accessor; -mod builder; mod compute; mod stats; mod variants; @@ -196,24 +195,45 @@ impl VarBinViewArray { )) } - pub fn from_vec>(vec: Vec, dtype: DType) -> Self { - let mut builder = VarBinViewBuilder::with_capacity(vec.len()); - for v in vec { - builder.push_value(v) + pub fn from_iter_str, I: IntoIterator>(iter: I) -> Self { + let iter = iter.into_iter(); + let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0); + for s in iter { + builder.append_value(s); } - builder.finish(dtype) + let array_data = ArrayData::from_arrow(&builder.finish(), false); + VarBinViewArray::try_from(array_data.into_array()).expect("should be var bin view array") } - pub fn from_iter, I: IntoIterator>>( + pub fn from_iter_nullable_str, I: IntoIterator>>( iter: I, - dtype: DType, ) -> Self { let iter = iter.into_iter(); - let mut builder = VarBinViewBuilder::with_capacity(iter.size_hint().0); - for v in iter { - builder.push(v) + let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0); + builder.extend(iter); + + let array_data = ArrayData::from_arrow(&builder.finish(), true); + VarBinViewArray::try_from(array_data.into_array()).expect("should be var bin view array") + } + + pub fn from_iter_bin, I: IntoIterator>(iter: I) -> Self { + let iter = iter.into_iter(); + let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0); + for b in iter { + builder.append_value(b); } - builder.finish(dtype) + let array_data = ArrayData::from_arrow(&builder.finish(), true); + VarBinViewArray::try_from(array_data.into_array()).expect("should be var bin view array") + } + + pub fn from_iter_nullable_bin, I: IntoIterator>>( + iter: I, + ) -> Self { + let iter = iter.into_iter(); + let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0); + builder.extend(iter); + let array_data = ArrayData::from_arrow(&builder.finish(), true); + VarBinViewArray::try_from(array_data.into_array()).expect("should be var bin view array") } pub fn bytes_at(&self, index: usize) -> VortexResult> { @@ -310,51 +330,27 @@ impl AcceptArrayVisitor for VarBinViewArray { } } -impl From> for VarBinViewArray { - fn from(value: Vec<&[u8]>) -> Self { - Self::from_vec(value, DType::Binary(Nullability::NonNullable)) - } -} - -impl From>> for VarBinViewArray { - fn from(value: Vec>) -> Self { - Self::from_vec(value, DType::Binary(Nullability::NonNullable)) - } -} - -impl From> for VarBinViewArray { - fn from(value: Vec) -> Self { - Self::from_vec(value, DType::Utf8(Nullability::NonNullable)) - } -} - -impl From> for VarBinViewArray { - fn from(value: Vec<&str>) -> Self { - Self::from_vec(value, DType::Utf8(Nullability::NonNullable)) - } -} - impl<'a> FromIterator> for VarBinViewArray { fn from_iter>>(iter: T) -> Self { - Self::from_iter(iter, DType::Binary(Nullability::NonNullable)) + Self::from_iter_nullable_bin(iter) } } impl FromIterator>> for VarBinViewArray { fn from_iter>>>(iter: T) -> Self { - Self::from_iter(iter, DType::Binary(Nullability::NonNullable)) + Self::from_iter_nullable_bin(iter) } } impl FromIterator> for VarBinViewArray { fn from_iter>>(iter: T) -> Self { - Self::from_iter(iter, DType::Utf8(Nullability::NonNullable)) + Self::from_iter_nullable_str(iter) } } impl<'a> FromIterator> for VarBinViewArray { fn from_iter>>(iter: T) -> Self { - Self::from_iter(iter, DType::Utf8(Nullability::NonNullable)) + Self::from_iter_nullable_str(iter) } } @@ -370,7 +366,7 @@ mod test { #[test] pub fn varbin_view() { let binary_arr = - VarBinViewArray::from(vec!["hello world", "hello world this is a long string"]); + VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"]); assert_eq!(binary_arr.len(), 2); assert_eq!( scalar_at(binary_arr.array(), 0).unwrap(), @@ -385,7 +381,7 @@ mod test { #[test] pub fn slice_array() { let binary_arr = slice( - &VarBinViewArray::from(vec!["hello world", "hello world this is a long string"]) + &VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"]) .into_array(), 1, 2, @@ -399,7 +395,7 @@ mod test { #[test] pub fn flatten_array() { - let binary_arr = VarBinViewArray::from(vec!["string1", "string2"]); + let binary_arr = VarBinViewArray::from_iter_str(["string1", "string2"]); let flattened = binary_arr.into_canonical().unwrap(); assert!(matches!(flattened, Canonical::VarBin(_))); From f6f085080fe3e4204365a9bb21ecf2cdfc5bb43b Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 12:16:59 -0400 Subject: [PATCH 17/21] nick feedback --- vortex-array/src/array/varbinview/builder.rs | 9 ++---- vortex-array/src/lib.rs | 2 -- vortex-buffer/src/allocator.rs | 30 -------------------- vortex-buffer/src/lib.rs | 1 - 4 files changed, 3 insertions(+), 39 deletions(-) delete mode 100644 vortex-buffer/src/allocator.rs diff --git a/vortex-array/src/array/varbinview/builder.rs b/vortex-array/src/array/varbinview/builder.rs index 7812c622a4..925439a681 100644 --- a/vortex-array/src/array/varbinview/builder.rs +++ b/vortex-array/src/array/varbinview/builder.rs @@ -4,7 +4,6 @@ use std::mem; use std::sync::Arc; use arrow_buffer::{Buffer as ArrowBuffer, NullBufferBuilder}; -use vortex_buffer::allocator::MinAlignmentAllocator; use vortex_buffer::Buffer; use vortex_dtype::{DType, PType}; @@ -15,11 +14,9 @@ use crate::{ArrayData, IntoArray, IntoArrayData, ToArray}; // BinaryView has 8 byte alignment (because that's what's in the arrow spec), but arrow-rs // erroneously requires 16 byte alignment. -const BINARY_VIEW_ALLOCATOR: MinAlignmentAllocator = - MinAlignmentAllocator::new(size_of::()); pub struct VarBinViewBuilder> { - views: Vec, + views: Vec, nulls: NullBufferBuilder, completed: Vec, in_progress: Vec, @@ -30,7 +27,7 @@ pub struct VarBinViewBuilder> { impl> VarBinViewBuilder { pub fn with_capacity(capacity: usize) -> Self { Self { - views: Vec::with_capacity_in(capacity, BINARY_VIEW_ALLOCATOR), + views: Vec::with_capacity(capacity), nulls: NullBufferBuilder::new(capacity), completed: Vec::new(), in_progress: Vec::new(), @@ -109,7 +106,7 @@ impl> VarBinViewBuilder { // convert Vec to Vec which can be stored as an array // have to ensure that we use the correct allocator at deallocation time let views: Buffer = unsafe { - let mut views_clone = mem::replace(&mut self.views, Vec::new_in(BINARY_VIEW_ALLOCATOR)); + let mut views_clone = mem::take(&mut self.views); let buf = ArrowBuffer::from_custom_allocation( NonNull::new_unchecked(views_clone.as_mut_ptr() as *mut u8), views_clone.len() * VIEW_SIZE, diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index f23c889e4e..5a6adb6195 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -8,8 +8,6 @@ //! Every data type recognized by Vortex also has a canonical physical encoding format, which //! arrays can be [canonicalized](Canonical) into for ease of access in compute functions. //! -#![feature(allocator_api)] - use std::fmt::{Debug, Display, Formatter}; use std::future::ready; diff --git a/vortex-buffer/src/allocator.rs b/vortex-buffer/src/allocator.rs deleted file mode 100644 index 2f69b0570c..0000000000 --- a/vortex-buffer/src/allocator.rs +++ /dev/null @@ -1,30 +0,0 @@ -use std::alloc::{AllocError, Allocator, Global, Layout}; -use std::ptr::NonNull; - -pub struct MinAlignmentAllocator { - min_alignment: usize, -} - -unsafe impl Allocator for MinAlignmentAllocator { - fn allocate(&self, layout: Layout) -> Result, AllocError> { - Global.allocate( - layout - .align_to(self.min_alignment) - .map_err(|_| AllocError)?, - ) - } - - unsafe fn deallocate(&self, ptr: NonNull, layout: Layout) { - let layout = layout.align_to(self.min_alignment).expect( - "align_to failed, which can only happen if self.min_alignment is not a power of two", - ); - unsafe { Global.deallocate(ptr, layout) } - } -} - -impl MinAlignmentAllocator { - pub const fn new(min_alignment: usize) -> Self { - let min_alignment = min_alignment.next_power_of_two(); - Self { min_alignment } - } -} diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs index 56928c349b..bdabdfe50a 100644 --- a/vortex-buffer/src/lib.rs +++ b/vortex-buffer/src/lib.rs @@ -6,7 +6,6 @@ use std::ops::{Deref, Range}; use arrow_buffer::{ArrowNativeType, Buffer as ArrowBuffer}; pub use string::*; -pub mod allocator; mod flexbuffers; pub mod io_buf; mod string; From 096bc96a356e01372f0a1e4bb82e0573d05af1a8 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 12:27:38 -0400 Subject: [PATCH 18/21] bring back our bigger block size --- vortex-array/src/array/varbinview/mod.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index 42753b8a89..2b0d207148 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -31,7 +31,7 @@ mod variants; #[derive(Clone, Copy, Debug)] #[repr(C, align(8))] -struct Inlined { +pub struct Inlined { size: u32, data: [u8; BinaryView::MAX_INLINED_SIZE], } @@ -54,7 +54,7 @@ impl Inlined { #[derive(Clone, Copy, Debug)] #[repr(C, align(8))] -struct Ref { +pub struct Ref { size: u32, prefix: [u8; 4], buffer_index: u32, @@ -194,9 +194,11 @@ impl VarBinViewArray { )) } + const BUILDER_BLOCK_SIZE: u32 = 16 * 1024; pub fn from_iter_str, I: IntoIterator>(iter: I) -> Self { let iter = iter.into_iter(); - let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0); + let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0) + .with_block_size(Self::BUILDER_BLOCK_SIZE); for s in iter { builder.append_value(s); } @@ -208,7 +210,8 @@ impl VarBinViewArray { iter: I, ) -> Self { let iter = iter.into_iter(); - let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0); + let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0) + .with_block_size(Self::BUILDER_BLOCK_SIZE); builder.extend(iter); let array_data = ArrayData::from_arrow(&builder.finish(), true); @@ -217,7 +220,8 @@ impl VarBinViewArray { pub fn from_iter_bin, I: IntoIterator>(iter: I) -> Self { let iter = iter.into_iter(); - let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0); + let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0) + .with_block_size(Self::BUILDER_BLOCK_SIZE); for b in iter { builder.append_value(b); } @@ -229,7 +233,8 @@ impl VarBinViewArray { iter: I, ) -> Self { let iter = iter.into_iter(); - let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0); + let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0) + .with_block_size(Self::BUILDER_BLOCK_SIZE); builder.extend(iter); let array_data = ArrayData::from_arrow(&builder.finish(), true); VarBinViewArray::try_from(array_data.into_array()).expect("should be var bin view array") From 678bbf633f86e2a32b20c85832556294373ef24c Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 12:29:03 -0400 Subject: [PATCH 19/21] missed one --- vortex-buffer/src/lib.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs index bdabdfe50a..1d7122f3db 100644 --- a/vortex-buffer/src/lib.rs +++ b/vortex-buffer/src/lib.rs @@ -1,5 +1,3 @@ -#![feature(allocator_api)] - use std::cmp::Ordering; use std::ops::{Deref, Range}; From cd2b45f207ed7641985214cf6b9fa872e87f0efa Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 12:30:04 -0400 Subject: [PATCH 20/21] Revert "bring back our bigger block size" This reverts commit 096bc96a356e01372f0a1e4bb82e0573d05af1a8. --- vortex-array/src/array/varbinview/mod.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index 2b0d207148..42753b8a89 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -31,7 +31,7 @@ mod variants; #[derive(Clone, Copy, Debug)] #[repr(C, align(8))] -pub struct Inlined { +struct Inlined { size: u32, data: [u8; BinaryView::MAX_INLINED_SIZE], } @@ -54,7 +54,7 @@ impl Inlined { #[derive(Clone, Copy, Debug)] #[repr(C, align(8))] -pub struct Ref { +struct Ref { size: u32, prefix: [u8; 4], buffer_index: u32, @@ -194,11 +194,9 @@ impl VarBinViewArray { )) } - const BUILDER_BLOCK_SIZE: u32 = 16 * 1024; pub fn from_iter_str, I: IntoIterator>(iter: I) -> Self { let iter = iter.into_iter(); - let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0) - .with_block_size(Self::BUILDER_BLOCK_SIZE); + let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0); for s in iter { builder.append_value(s); } @@ -210,8 +208,7 @@ impl VarBinViewArray { iter: I, ) -> Self { let iter = iter.into_iter(); - let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0) - .with_block_size(Self::BUILDER_BLOCK_SIZE); + let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0); builder.extend(iter); let array_data = ArrayData::from_arrow(&builder.finish(), true); @@ -220,8 +217,7 @@ impl VarBinViewArray { pub fn from_iter_bin, I: IntoIterator>(iter: I) -> Self { let iter = iter.into_iter(); - let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0) - .with_block_size(Self::BUILDER_BLOCK_SIZE); + let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0); for b in iter { builder.append_value(b); } @@ -233,8 +229,7 @@ impl VarBinViewArray { iter: I, ) -> Self { let iter = iter.into_iter(); - let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0) - .with_block_size(Self::BUILDER_BLOCK_SIZE); + let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0); builder.extend(iter); let array_data = ArrayData::from_arrow(&builder.finish(), true); VarBinViewArray::try_from(array_data.into_array()).expect("should be var bin view array") From 6e4f5e5ef5d2d0909314a024c5b5c2d1e8570b9d Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 25 Jul 2024 12:30:30 -0400 Subject: [PATCH 21/21] pub --- vortex-array/src/array/varbinview/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index 42753b8a89..12addc709e 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -31,7 +31,7 @@ mod variants; #[derive(Clone, Copy, Debug)] #[repr(C, align(8))] -struct Inlined { +pub struct Inlined { size: u32, data: [u8; BinaryView::MAX_INLINED_SIZE], } @@ -54,7 +54,7 @@ impl Inlined { #[derive(Clone, Copy, Debug)] #[repr(C, align(8))] -struct Ref { +pub struct Ref { size: u32, prefix: [u8; 4], buffer_index: u32,