Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve zip performance #6812

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions arrow-data/src/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -702,6 +702,8 @@ impl<'a> MutableArrayData<'a> {

/// Extends the in progress array with a region of the input arrays
///
/// For extending scalar value, use [MutableArrayData::extend_scalar].
///
/// # Arguments
/// * `index` - the index of array that you what to copy values from
/// * `start` - the start index of the chunk (inclusive)
Expand All @@ -718,6 +720,28 @@ impl<'a> MutableArrayData<'a> {
self.data.len += len;
}

/// Extends the in progress array with the same value from the input arrays
///
/// # Arguments
/// * `index` - the index of array that you what to copy values from
/// * `scalar_index` - the index of the scalar value to copy
/// * `count` - the number of times to repeat the value
///
/// # Panic
/// This function panics if there is an invalid index,
/// i.e. `index` >= the number of source arrays
/// or `start` + `count` > the length of the `index`th array
///
pub fn extend_scalar(&mut self, index: usize, scalar_index: usize, count: usize) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
pub fn extend_scalar(&mut self, index: usize, scalar_index: usize, count: usize) {
pub fn extend_n(&mut self, index: usize, scalar_index: usize, count: usize) {

Scalars don't really exist as a notion at this level

let extend_null_fn = &self.extend_null_bits[index];
let extend_value_fn = &self.extend_values[index];
for _ in 0..count {
extend_null_fn(&mut self.data, scalar_index, 1);
extend_value_fn(&mut self.data, index, scalar_index, 1);
self.data.len += 1;
}
}

/// Extends the in progress array with null elements, ignoring the input arrays.
///
/// # Panics
Expand Down
106 changes: 82 additions & 24 deletions arrow-select/src/zip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,46 +70,104 @@ pub fn zip(
// the SlicesIterator slices only the true values. So the gaps left by this iterator we need to
// fill with falsy values

// keep track of how much is filled
match (truthy_is_scalar, falsy_is_scalar) {
(true, true) => zip_both_scalar(mask, &mut mutable),
(true, false) => zip_truthy_scalar_falsy_array(mask, &mut mutable),
(false, true) => zip_truthy_array_falsy_scalar(mask, &mut mutable),
(false, false) => zip_both_array(mask, &mut mutable),
};

let data = mutable.freeze();
Ok(make_array(data))
}

fn zip_both_scalar(mask: &BooleanArray, mutable: &mut MutableArrayData) {
// keep track of how much is filled
let mut filled = 0;

SlicesIterator::new(mask).for_each(|(start, end)| {
// the gap needs to be filled with falsy values
if start > filled {
if falsy_is_scalar {
for _ in filled..start {
// Copy the first item from the 'falsy' array into the output buffer.
mutable.extend(1, 0, 1);
}
} else {
mutable.extend(1, filled, start);
}
mutable.extend_scalar(1, 0, start - filled);
}
// fill with truthy values
if truthy_is_scalar {
for _ in start..end {
// Copy the first item from the 'truthy' array into the output buffer.
mutable.extend(0, 0, 1);
}
} else {
mutable.extend(0, start, end);
mutable.extend_scalar(0, 0, end - start);

filled = end;
});

// the remaining part is falsy
if filled < mask.len() {
// Copy the first item from the 'falsy' array into the output buffer.
mutable.extend_scalar(1, 0, mask.len() - filled);
}
}

fn zip_truthy_scalar_falsy_array(mask: &BooleanArray, mutable: &mut MutableArrayData) {
// keep track of how much is filled
let mut filled = 0;

SlicesIterator::new(mask).for_each(|(start, end)| {
// the gap needs to be filled with falsy values
if start > filled {
mutable.extend(1, filled, start);
}

// fill with truthy values
mutable.extend_scalar(0, 0, end - start);

filled = end;
});

// the remaining part is falsy
if filled < mask.len() {
if falsy_is_scalar {
for _ in filled..mask.len() {
// Copy the first item from the 'falsy' array into the output buffer.
mutable.extend(1, 0, 1);
}
} else {
mutable.extend(1, filled, mask.len());
mutable.extend(1, filled, mask.len());
}
}

fn zip_truthy_array_falsy_scalar(mask: &BooleanArray, mutable: &mut MutableArrayData) {
// keep track of how much is filled
let mut filled = 0;

SlicesIterator::new(mask).for_each(|(start, end)| {
// the gap needs to be filled with falsy values
if start > filled {
mutable.extend_scalar(1, 0, start - filled);
}

// fill with truthy values
mutable.extend(0, start, end);

filled = end;
});

// the remaining part is falsy
if filled < mask.len() {
// Copy the first item from the 'falsy' array into the output buffer.
mutable.extend_scalar(1, 0, mask.len() - filled);
}
}

let data = mutable.freeze();
Ok(make_array(data))
fn zip_both_array(mask: &BooleanArray, mutable: &mut MutableArrayData) {
let mut filled = 0;

SlicesIterator::new(mask).for_each(|(start, end)| {
// the gap needs to be filled with falsy values
if start > filled {
mutable.extend(1, filled, start);
}

// fill with truthy values
mutable.extend(0, start, end);

filled = end;
});

// the remaining part is falsy
if filled < mask.len() {
mutable.extend(1, filled, mask.len());
}
}

#[cfg(test)]
Expand Down
Loading