-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Mask sets entries of an array to null. I like the analogy to light: the array is a sequence of lights (each value might be a different wavelength). Null is represented by the absence of light. Placing a mask (i.e. a piece of plastic with slits) over the array causes those values where the mask is present (i.e. "on", "true") to be dark.

An example in pseudo-code:

```rust
a = [1, 2, 3, 4, 5]
a_mask = [t, f, f, t, f]
mask(a, a_mask) == [null, 2, 3, null, 5]
```

Specializations
---------------

I only fall back to Arrow for two of the core arrays:

- Sparse. I was skeptical that I could do better than decompressing the array and applying the mask.
- Constant. If the mask is sparse, SparseArray might be a good choice. I didn't investigate.

For the non-core arrays, I'm missing the following. I'm not clear that I can beat decompression for run end. The others are easy enough but require some amount of typing and testing.

- fastlanes
- fsst
- roaring
- runend
- runend-bool
- zigzag

Naming
------

Pandas also calls this operation [`mask`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.mask.html) but accepts an optional second argument which is an array of values to use instead of null (which makes Pandas' mask more like an `if_else`).

Arrow-rs calls this [`nullif`](https://arrow.apache.org/rust/arrow/compute/fn.nullif.html).

Arrow-cpp has [`if_else(condition, consequent, alternate)`](https://arrow.apache.org/docs/cpp/compute.html#cpp-compute-scalar-selections) and [`replace_with_mask(array, mask, replacements)`](https://arrow.apache.org/docs/cpp/compute.html#replace-functions), both of which can implement our `mask` by passing a `NullArray` as the third argument.
- Loading branch information
Showing
39 changed files
with
1,583 additions
and
93 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
use vortex_array::compute::{mask, try_cast, FilterMask, MaskFn}; | ||
use vortex_array::{ArrayDType as _, ArrayData, IntoArrayData}; | ||
use vortex_error::VortexResult; | ||
|
||
use crate::{ALPArray, ALPEncoding}; | ||
|
||
impl MaskFn<ALPArray> for ALPEncoding { | ||
fn mask(&self, array: &ALPArray, filter_mask: FilterMask) -> VortexResult<ArrayData> { | ||
ALPArray::try_new( | ||
mask(&array.encoded(), filter_mask)?, | ||
array.exponents(), | ||
array | ||
.patches() | ||
.map(|patches| { | ||
patches.map_values(|values| try_cast(&values, &values.dtype().as_nullable())) | ||
}) | ||
.transpose()?, | ||
) | ||
.map(IntoArrayData::into_array) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use vortex_array::array::PrimitiveArray;
    use vortex_array::compute::test_harness::test_mask;
    use vortex_array::validity::Validity;
    use vortex_array::IntoArrayData as _;
    use vortex_buffer::buffer;

    use crate::alp_encode;

    /// Runs the shared `test_mask` harness over ALP-encoded `f32` data, once with
    /// `Validity::AllValid` and once with `Validity::NonNullable`.
    ///
    /// Per the test name these inputs are expected to encode without patches; that
    /// is not asserted here.
    #[test]
    fn test_mask_no_patches_alp_array() {
        for validity in [Validity::AllValid, Validity::NonNullable] {
            let floats = PrimitiveArray::new(buffer![1.0f32, 2.0, 3.0, 4.0, 5.0], validity);
            let encoded = alp_encode(&floats).unwrap();
            test_mask(encoded.into_array());
        }
    }

    /// Runs the shared `test_mask` harness over ALP-encoded `f32` data that contains
    /// `1e10`, asserting that the encoded array carries patches, for both
    /// `Validity::AllValid` and `Validity::NonNullable`.
    #[test]
    fn test_mask_patched_alp_array() {
        for validity in [Validity::AllValid, Validity::NonNullable] {
            let floats = PrimitiveArray::new(buffer![1.0f32, 2.0, 3.0, 4.0, 1e10], validity);
            let encoded = alp_encode(&floats).unwrap();
            assert!(encoded.patches().is_some());
            test_mask(encoded.into_array());
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
use vortex_array::compute::{mask, FilterMask, MaskFn}; | ||
use vortex_array::{ArrayDType, ArrayData, IntoArrayData}; | ||
use vortex_error::VortexResult; | ||
|
||
use crate::{ALPRDArray, ALPRDEncoding}; | ||
|
||
impl MaskFn<ALPRDArray> for ALPRDEncoding { | ||
fn mask(&self, array: &ALPRDArray, filter_mask: FilterMask) -> VortexResult<ArrayData> { | ||
Ok(ALPRDArray::try_new( | ||
array.dtype().as_nullable(), | ||
mask(&array.left_parts(), filter_mask)?, | ||
array.left_parts_dict(), | ||
array.right_parts(), | ||
array.right_bit_width(), | ||
array.left_parts_patches(), | ||
)? | ||
.into_array()) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use rstest::rstest;
    use vortex_array::array::PrimitiveArray;
    use vortex_array::compute::test_harness::test_mask;
    use vortex_array::IntoArrayData as _;

    use crate::{ALPRDFloat, RDEncoder};

    /// Runs the shared `test_mask` harness over an ALP-RD array built from two
    /// sample values plus a repeated outlier, for both `f32` and `f64`.
    #[rstest]
    #[case(0.1f32, 0.2f32, 3e25f32)]
    #[case(0.1f64, 0.2f64, 3e100f64)]
    fn test_mask_simple<T: ALPRDFloat>(#[case] a: T, #[case] b: T, #[case] outlier: T) {
        // The encoder is built from the two "ordinary" values only; the outlier
        // appears solely in the data being encoded.
        let encoder = RDEncoder::new(&[a, b]);
        let values = PrimitiveArray::from_iter([a, b, outlier, b, outlier]);
        let encoded = encoder.encode(&values);

        test_mask(encoded.into_array());
    }

    /// Runs the shared `test_mask` harness over an ALP-RD array whose input already
    /// contains nulls alongside an outlier, for both `f32` and `f64`.
    #[rstest]
    #[case(0.1f32, 3e25f32)]
    #[case(0.5f64, 1e100f64)]
    fn test_mask_with_nulls<T: ALPRDFloat>(#[case] a: T, #[case] outlier: T) {
        let encoder = RDEncoder::new(&[a]);
        let values =
            PrimitiveArray::from_option_iter([Some(a), None, Some(outlier), Some(a), None]);
        let encoded = encoder.encode(&values);

        test_mask(encoded.into_array());
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
use vortex_array::compute::{try_cast, CastFn}; | ||
use vortex_array::{ArrayDType, ArrayData, IntoArrayData}; | ||
use vortex_dtype::DType; | ||
use vortex_error::{vortex_bail, VortexResult}; | ||
|
||
use crate::{DateTimePartsArray, DateTimePartsEncoding}; | ||
|
||
impl CastFn<DateTimePartsArray> for DateTimePartsEncoding { | ||
fn cast(&self, array: &DateTimePartsArray, dtype: &DType) -> VortexResult<ArrayData> { | ||
if !array.dtype().eq_ignore_nullability(dtype) { | ||
vortex_bail!("cannot cast from {} to {}", array.dtype(), dtype); | ||
}; | ||
|
||
Ok(DateTimePartsArray::try_new( | ||
array.dtype().clone().as_nullable(), | ||
try_cast( | ||
array.days().as_ref(), | ||
&array.days().dtype().with_nullability(dtype.nullability()), | ||
)?, | ||
array.seconds(), | ||
array.subsecond(), | ||
)? | ||
.into_array()) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
use vortex_array::compute::{mask, FilterMask, MaskFn}; | ||
use vortex_array::{ArrayDType, ArrayData, IntoArrayData}; | ||
use vortex_error::VortexResult; | ||
|
||
use crate::{DateTimePartsArray, DateTimePartsEncoding}; | ||
|
||
impl MaskFn<DateTimePartsArray> for DateTimePartsEncoding { | ||
fn mask(&self, array: &DateTimePartsArray, filter_mask: FilterMask) -> VortexResult<ArrayData> { | ||
Ok(DateTimePartsArray::try_new( | ||
array.dtype().clone().as_nullable(), | ||
mask(array.days().as_ref(), filter_mask)?, | ||
array.seconds(), | ||
array.subsecond(), | ||
)? | ||
.into_array()) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use vortex_array::array::TemporalArray;
    use vortex_array::compute::test_harness::test_mask;
    use vortex_array::IntoArrayData as _;
    use vortex_buffer::buffer;
    use vortex_datetime_dtype::TimeUnit;
    use vortex_dtype::DType;

    use crate::{split_temporal, DateTimePartsArray, TemporalParts};

    /// Runs the shared `test_mask` harness over a `DateTimePartsArray` built from
    /// millisecond UTC timestamps that together exercise the day, second and
    /// sub-second components.
    #[test]
    fn test_mask_datetime_parts_array() {
        // The timestamps below are in milliseconds (`TimeUnit::Ms`), so one day is
        // 86_400_000, one second is 1000, and anything smaller is sub-second.
        const MILLIS_PER_DAY: i64 = 86_400_000;

        let raw_millis = buffer![
            MILLIS_PER_DAY,             // element with only day component
            MILLIS_PER_DAY + 1000,      // element with day + second components
            MILLIS_PER_DAY + 1000 + 1,  // element with day + second + sub-second components
            MILLIS_PER_DAY + 1000 + 5,  // element with day + second + sub-second components
            MILLIS_PER_DAY + 1000 + 55, // element with day + second + sub-second components
        ]
        .into_array();
        let temporal_array =
            TemporalArray::new_timestamp(raw_millis, TimeUnit::Ms, Some("UTC".to_string()));
        // `temporal_array` is cloned because its extension dtype is still needed
        // after the split.
        let TemporalParts {
            days,
            seconds,
            subseconds,
        } = split_temporal(temporal_array.clone()).unwrap();
        let date_times = DateTimePartsArray::try_new(
            DType::Extension(temporal_array.ext_dtype()),
            days,
            seconds,
            subseconds,
        )
        .unwrap()
        .into_array();

        test_mask(date_times);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.