-
Notifications
You must be signed in to change notification settings - Fork 838
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use BitChunks in equal_bits #2194
Changes from all commits
0e0c6e0
5c9e1cb
3afd6c6
f8c719b
37a4816
06217ae
4801872
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ use crate::datatypes::{ | |
UnionMode, | ||
}; | ||
use crate::error::{ArrowError, Result}; | ||
use crate::util::bit_iterator::BitSliceIterator; | ||
use crate::{bitmap::Bitmap, datatypes::ArrowNativeType}; | ||
use crate::{ | ||
buffer::{Buffer, MutableBuffer}, | ||
|
@@ -37,6 +38,21 @@ use std::sync::Arc; | |
|
||
use super::equal::equal; | ||
|
||
#[inline] | ||
pub(crate) fn contains_nulls( | ||
null_bit_buffer: Option<&Buffer>, | ||
offset: usize, | ||
len: usize, | ||
) -> bool { | ||
match null_bit_buffer { | ||
Some(buffer) => match BitSliceIterator::new(buffer, offset, len).next() { | ||
Some((start, end)) => start != 0 || end != len, | ||
None => len != 0, // No non-null values | ||
}, | ||
None => false, // No null buffer | ||
} | ||
} | ||
|
||
#[inline] | ||
pub(crate) fn count_nulls( | ||
null_bit_buffer: Option<&Buffer>, | ||
|
@@ -2865,4 +2881,15 @@ mod tests { | |
let err = data.validate_values().unwrap_err(); | ||
assert_eq!(err.to_string(), "Invalid argument error: Offset invariant failure: offset at position 1 out of bounds: 3 > 2"); | ||
} | ||
|
||
#[test] | ||
fn test_contains_nulls() { | ||
let buffer: Buffer = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if using a buffer with more than 64 bits is important (or is that already well enough covered in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm happy they are well covered by BitSliceIterator |
||
MutableBuffer::from_iter([false, false, false, true, true, false]).into(); | ||
|
||
assert!(contains_nulls(Some(&buffer), 0, 6)); | ||
assert!(contains_nulls(Some(&buffer), 0, 3)); | ||
assert!(!contains_nulls(Some(&buffer), 3, 2)); | ||
assert!(!contains_nulls(Some(&buffer), 0, 0)); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,11 +15,14 @@ | |
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
use crate::array::{data::count_nulls, ArrayData}; | ||
use crate::array::{data::contains_nulls, ArrayData}; | ||
use crate::util::bit_iterator::BitIndexIterator; | ||
use crate::util::bit_util::get_bit; | ||
|
||
use super::utils::{equal_bits, equal_len}; | ||
|
||
/// Returns true if the value data for the arrays is equal, assuming the null masks have | ||
/// already been checked for equality | ||
pub(super) fn boolean_equal( | ||
lhs: &ArrayData, | ||
rhs: &ArrayData, | ||
|
@@ -30,10 +33,9 @@ pub(super) fn boolean_equal( | |
let lhs_values = lhs.buffers()[0].as_slice(); | ||
let rhs_values = rhs.buffers()[0].as_slice(); | ||
|
||
let lhs_null_count = count_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len); | ||
let rhs_null_count = count_nulls(rhs.null_buffer(), rhs_start + rhs.offset(), len); | ||
let contains_nulls = contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len); | ||
|
||
if lhs_null_count == 0 && rhs_null_count == 0 { | ||
if !contains_nulls { | ||
// Optimize performance for starting offset at u8 boundary. | ||
if lhs_start % 8 == 0 | ||
&& rhs_start % 8 == 0 | ||
|
@@ -75,20 +77,14 @@ pub(super) fn boolean_equal( | |
} else { | ||
// get a ref of the null buffer bytes, to use in testing for nullness | ||
let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); | ||
let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); | ||
|
||
let lhs_start = lhs.offset() + lhs_start; | ||
let rhs_start = rhs.offset() + rhs_start; | ||
|
||
(0..len).all(|i| { | ||
BitIndexIterator::new(lhs_null_bytes, lhs_start, len).all(|i| { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this correct? It seems like it will would not find positions where It seems like we need to iterate over the positions where either is set -- like Maybe there is a lack of test coverage 🤔 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or maybe you can just call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These methods are purely checking value equality, at this point the null masks have already been checked for equality. Otherwise the previous logic would have been ill-formed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I think I missed that this was checking |
||
let lhs_pos = lhs_start + i; | ||
let rhs_pos = rhs_start + i; | ||
let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos); | ||
let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos); | ||
|
||
lhs_is_null | ||
|| (lhs_is_null == rhs_is_null) | ||
&& equal_bits(lhs_values, rhs_values, lhs_pos, rhs_pos, 1) | ||
get_bit(lhs_values, lhs_pos) == get_bit(rhs_values, rhs_pos) | ||
}) | ||
} | ||
} | ||
|
@@ -109,4 +105,13 @@ mod tests { | |
let slice = array.slice(8, 24); | ||
assert_eq!(slice.data(), slice.data()); | ||
} | ||
|
||
#[test] | ||
fn test_sliced_nullable_boolean_array() { | ||
let a = BooleanArray::from(vec![None; 32]); | ||
let b = BooleanArray::from(vec![true; 32]); | ||
let slice_a = a.slice(1, 12); | ||
let slice_b = b.slice(1, 12); | ||
assert_ne!(slice_a.data(), slice_b.data()); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given the never ending confusion about "is this a size in bits or bytes" I recommend making it explicit -- either add a docstring that says
offset
andlen
are inbits
or perhaps name thembit_offset
andbit_len
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given the potential usefulness (and non obviousness) of using
BitSliceIterator
to check for nulls, I wonder what you think about making this a function onBitSliceIterator
such asBitSliceIterator::contains_only_unset_bits
or something?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I could definitely see this being promoted to a method on BitMap, once that supports slicing #1802