From 0c3732fcae50ea188f4677fcf0c598f723100db6 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 2 Aug 2024 09:31:06 -0400 Subject: [PATCH] Optimize `take` kernel for `BinaryViewArray` and `StringViewArray` (#6168) * improve speed of view take kernel * ArrayData -> new_unchecked * Update arrow-select/src/take.rs Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- arrow-select/src/take.rs | 9 ++++----- arrow/benches/take_kernels.rs | 36 +++++++++++++++++++++++++++++++++++ arrow/src/util/bench_util.rs | 28 +++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index d6892eb0a9e4..b66133ac71f0 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -487,11 +487,10 @@ fn take_byte_view( ) -> Result<GenericByteViewArray<T>, ArrowError> { let new_views = take_native(array.views(), indices); let new_nulls = take_nulls(array.nulls(), indices); - Ok(GenericByteViewArray::new( - new_views, - array.data_buffers().to_vec(), - new_nulls, - )) + // Safety: array.views was valid, and take_native copies only valid values, and verifies bounds + Ok(unsafe { + GenericByteViewArray::new_unchecked(new_views, array.data_buffers().to_vec(), new_nulls) + }) } /// `take` implementation for list arrays diff --git a/arrow/benches/take_kernels.rs b/arrow/benches/take_kernels.rs index 9c3f1eb40909..77ec54c97bc5 100644 --- a/arrow/benches/take_kernels.rs +++ b/arrow/benches/take_kernels.rs @@ -149,6 +149,42 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_take(&values, &indices)) }); + let values = create_string_view_array(512, 0.0); + let indices = create_random_index(512, 0.0); + c.bench_function("take stringview 512", |b| { + b.iter(|| bench_take(&values, &indices)) + }); + + let values = create_string_view_array(1024, 0.0); + let indices = create_random_index(1024, 0.0); + c.bench_function("take stringview 1024", |b| { + b.iter(|| bench_take(&values, &indices)) + }); + + let values 
= create_string_view_array(512, 0.0); + let indices = create_random_index(512, 0.5); + c.bench_function("take stringview null indices 512", |b| { + b.iter(|| bench_take(&values, &indices)) + }); + + let values = create_string_view_array(1024, 0.0); + let indices = create_random_index(1024, 0.5); + c.bench_function("take stringview null indices 1024", |b| { + b.iter(|| bench_take(&values, &indices)) + }); + + let values = create_string_view_array(1024, 0.5); + let indices = create_random_index(1024, 0.0); + c.bench_function("take stringview null values 1024", |b| { + b.iter(|| bench_take(&values, &indices)) + }); + + let values = create_string_view_array(1024, 0.5); + let indices = create_random_index(1024, 0.5); + c.bench_function("take stringview null values null indices 1024", |b| { + b.iter(|| bench_take(&values, &indices)) + }); + let values = create_primitive_run_array::<Int32Type, Int32Type>(1024, 512); let indices = create_random_index(1024, 0.0); c.bench_function( diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index ac7f86d561d5..2561c925aaec 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -160,6 +160,34 @@ pub fn create_string_array_with_len( .collect() } +/// Creates a random (but fixed-seeded) string view array of a given size and null density. +/// +/// See `create_string_array` above for more details. 
+pub fn create_string_view_array(size: usize, null_density: f32) -> StringViewArray { + create_string_view_array_with_max_len(size, null_density, 400) +} + +/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length +fn create_string_view_array_with_max_len( + size: usize, + null_density: f32, + max_str_len: usize, +) -> StringViewArray { + let rng = &mut seedable_rng(); + (0..size) + .map(|_| { + if rng.gen::<f32>() < null_density { + None + } else { + let str_len = rng.gen_range(0..max_str_len); + let value = rng.sample_iter(&Alphanumeric).take(str_len).collect(); + let value = String::from_utf8(value).unwrap(); + Some(value) + } + }) + .collect() +} + /// Creates a random (but fixed-seeded) array of a given size, null density and length pub fn create_string_view_array_with_len( size: usize,