forked from apache/arrow-rs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Expose BitSliceIterator and BitIndexIterator (apache#1864)
- Loading branch information
Showing
4 changed files
with
162 additions
and
104 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
use crate::util::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator}; | ||
|
||
/// Iterator of contiguous ranges of set bits within a provided packed bitmask | ||
/// | ||
/// Returns `(usize, usize)` each representing an interval where the corresponding | ||
/// bits in the provides mask are set | ||
/// | ||
#[derive(Debug)] | ||
pub struct BitSliceIterator<'a> { | ||
iter: UnalignedBitChunkIterator<'a>, | ||
len: usize, | ||
current_offset: i64, | ||
current_chunk: u64, | ||
} | ||
|
||
impl<'a> BitSliceIterator<'a> { | ||
/// Create a new [`BitSliceIterator`] from the provide `buffer`, | ||
/// and `offset` and `len` in bits | ||
pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self { | ||
let chunk = UnalignedBitChunk::new(buffer, offset, len); | ||
let mut iter = chunk.iter(); | ||
|
||
let current_offset = -(chunk.lead_padding() as i64); | ||
let current_chunk = iter.next().unwrap_or(0); | ||
|
||
Self { | ||
iter, | ||
len, | ||
current_offset, | ||
current_chunk, | ||
} | ||
} | ||
|
||
/// Returns `Some((chunk_offset, bit_offset))` for the next chunk that has at | ||
/// least one bit set, or None if there is no such chunk. | ||
/// | ||
/// Where `chunk_offset` is the bit offset to the current `u64` chunk | ||
/// and `bit_offset` is the offset of the first `1` bit in that chunk | ||
fn advance_to_set_bit(&mut self) -> Option<(i64, u32)> { | ||
loop { | ||
if self.current_chunk != 0 { | ||
// Find the index of the first 1 | ||
let bit_pos = self.current_chunk.trailing_zeros(); | ||
return Some((self.current_offset, bit_pos)); | ||
} | ||
|
||
self.current_chunk = self.iter.next()?; | ||
self.current_offset += 64; | ||
} | ||
} | ||
} | ||
|
||
impl<'a> Iterator for BitSliceIterator<'a> { | ||
type Item = (usize, usize); | ||
|
||
fn next(&mut self) -> Option<Self::Item> { | ||
// Used as termination condition | ||
if self.len == 0 { | ||
return None; | ||
} | ||
|
||
let (start_chunk, start_bit) = self.advance_to_set_bit()?; | ||
|
||
// Set bits up to start | ||
self.current_chunk |= (1 << start_bit) - 1; | ||
|
||
loop { | ||
if self.current_chunk != u64::MAX { | ||
// Find the index of the first 0 | ||
let end_bit = self.current_chunk.trailing_ones(); | ||
|
||
// Zero out up to end_bit | ||
self.current_chunk &= !((1 << end_bit) - 1); | ||
|
||
return Some(( | ||
(start_chunk + start_bit as i64) as usize, | ||
(self.current_offset + end_bit as i64) as usize, | ||
)); | ||
} | ||
|
||
match self.iter.next() { | ||
Some(next) => { | ||
self.current_chunk = next; | ||
self.current_offset += 64; | ||
} | ||
None => { | ||
return Some(( | ||
(start_chunk + start_bit as i64) as usize, | ||
std::mem::replace(&mut self.len, 0), | ||
)); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
/// An iterator of `usize` whose index in a provided bitmask is true | ||
/// | ||
/// This provides the best performance on most masks, apart from those which contain | ||
/// large runs and therefore favour [`BitSliceIterator`] | ||
#[derive(Debug)] | ||
pub struct BitIndexIterator<'a> { | ||
current_chunk: u64, | ||
chunk_offset: i64, | ||
iter: UnalignedBitChunkIterator<'a>, | ||
} | ||
|
||
impl<'a> BitIndexIterator<'a> { | ||
/// Create a new [`BitIndexIterator`] from the provide `buffer`, | ||
/// and `offset` and `len` in bits | ||
pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self { | ||
let chunks = UnalignedBitChunk::new(buffer, offset, len); | ||
let mut iter = chunks.iter(); | ||
|
||
let current_chunk = iter.next().unwrap_or(0); | ||
let chunk_offset = -(chunks.lead_padding() as i64); | ||
|
||
Self { | ||
current_chunk, | ||
chunk_offset, | ||
iter, | ||
} | ||
} | ||
} | ||
|
||
impl<'a> Iterator for BitIndexIterator<'a> { | ||
type Item = usize; | ||
|
||
fn next(&mut self) -> Option<Self::Item> { | ||
loop { | ||
if self.current_chunk != 0 { | ||
let bit_pos = self.current_chunk.trailing_zeros(); | ||
self.current_chunk ^= 1 << bit_pos; | ||
return Some((self.chunk_offset + bit_pos as i64) as usize); | ||
} | ||
|
||
self.current_chunk = self.iter.next()?; | ||
self.chunk_offset += 64; | ||
} | ||
} | ||
} | ||
|
||
// Note: tests located in filter module |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters