diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index 12b6ddd6a830..a0fd96415a1d 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -56,6 +56,8 @@ arrow-string = { workspace = true }
 rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true }
 pyo3 = { version = "0.22.2", default-features = false, optional = true }
 
+chrono = { workspace = true, optional = true }
+
 [package.metadata.docs.rs]
 features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"]
 
@@ -70,7 +72,7 @@ prettyprint = ["arrow-cast/prettyprint"]
 # not the core arrow code itself. Be aware that `rand` must be kept as
 # an optional dependency for supporting compile to wasm32-unknown-unknown
 # target without assuming an environment containing JavaScript.
-test_utils = ["rand"]
+test_utils = ["rand", "dep:chrono"]
 pyarrow = ["pyo3", "ffi"]
 # force_validate runs full data validation for all arrays that are created
 # this is not enabled by default as it is too computationally expensive
diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs
index d563fa36240a..edb406c0f3ca 100644
--- a/arrow/src/util/data_gen.rs
+++ b/arrow/src/util/data_gen.rs
@@ -19,6 +19,7 @@
 
 use std::sync::Arc;
 
+use rand::distributions::uniform::SampleRange;
 use rand::{distributions::uniform::SampleUniform, Rng};
 
 use crate::array::*;
@@ -117,30 +118,57 @@ pub fn create_random_array(
             size,
             primitive_null_density,
         )),
-        Timestamp(_, _) => {
-            let int64_array = Arc::new(create_primitive_array::<Int64Type>(
-                size,
-                primitive_null_density,
-            )) as ArrayRef;
-            return crate::compute::cast(&int64_array, field.data_type());
+        Timestamp(unit, tz) => {
+            // Carry the field's timezone over so the generated array's data type matches
+            // `field`, as the previous cast-based implementation did.
+            let tz = tz.clone();
+            match unit {
+                TimeUnit::Second => Arc::new(
+                    create_random_temporal_array::<TimestampSecondType>(
+                        size,
+                        primitive_null_density,
+                    )
+                    .with_timezone_opt(tz),
+                ) as ArrayRef,
+                TimeUnit::Millisecond => Arc::new(
+                    create_random_temporal_array::<TimestampMillisecondType>(
+                        size,
+                        primitive_null_density,
+                    )
+                    .with_timezone_opt(tz),
+                ),
+                TimeUnit::Microsecond => Arc::new(
+                    create_random_temporal_array::<TimestampMicrosecondType>(
+                        size,
+                        primitive_null_density,
+                    )
+                    .with_timezone_opt(tz),
+                ),
+                TimeUnit::Nanosecond => Arc::new(
+                    create_random_temporal_array::<TimestampNanosecondType>(
+                        size,
+                        primitive_null_density,
+                    )
+                    .with_timezone_opt(tz),
+                ),
+            }
         }
-        Date32 => Arc::new(create_primitive_array::<Date32Type>(
+        Date32 => Arc::new(create_random_temporal_array::<Date32Type>(
             size,
             primitive_null_density,
         )),
-        Date64 => Arc::new(create_primitive_array::<Date64Type>(
+        Date64 => Arc::new(create_random_temporal_array::<Date64Type>(
             size,
             primitive_null_density,
         )),
         Time32(unit) => match unit {
-            TimeUnit::Second => Arc::new(create_primitive_array::<Time32SecondType>(
+            TimeUnit::Second => Arc::new(create_random_temporal_array::<Time32SecondType>(
                 size,
                 primitive_null_density,
             )) as ArrayRef,
-            TimeUnit::Millisecond => Arc::new(create_primitive_array::<Time32MillisecondType>(
-                size,
-                primitive_null_density,
-            )),
+            TimeUnit::Millisecond => Arc::new(
+                create_random_temporal_array::<Time32MillisecondType>(size, primitive_null_density),
+            ),
             _ => {
                 return Err(ArrowError::InvalidArgumentError(format!(
                     "Unsupported unit {unit:?} for Time32"
@@ -148,11 +176,10 @@ pub fn create_random_array(
             }
         },
         Time64(unit) => match unit {
-            TimeUnit::Microsecond => Arc::new(create_primitive_array::<Time64MicrosecondType>(
-                size,
-                primitive_null_density,
-            )) as ArrayRef,
-            TimeUnit::Nanosecond => Arc::new(create_primitive_array::<Time64NanosecondType>(
+            TimeUnit::Microsecond => Arc::new(
+                create_random_temporal_array::<Time64MicrosecondType>(size, primitive_null_density),
+            ) as ArrayRef,
+            TimeUnit::Nanosecond => Arc::new(create_random_temporal_array::<Time64NanosecondType>(
                 size,
                 primitive_null_density,
             )),
@@ -382,6 +409,131 @@ fn create_random_null_buffer(size: usize, null_density: f32) -> Buffer {
     mut_buf.into()
 }
 
+/// Useful for testing. The range of values is not likely to be representative of the
+/// actual bounds.
+pub trait RandomTemporalValue: ArrowTemporalType {
+    /// Returns the range from which random values of this type are sampled.
+    fn value_range() -> impl SampleRange<Self::Native>;
+
+    /// Samples one value from [`Self::value_range`] using `rng`.
+    fn gen_range<R: Rng>(rng: &mut R) -> Self::Native
+    where
+        Self::Native: SampleUniform,
+    {
+        rng.gen_range(Self::value_range())
+    }
+
+    /// Generates a random value of this type; delegates to [`Self::gen_range`] by default.
+    fn random<R: Rng>(rng: &mut R) -> Self::Native
+    where
+        Self::Native: SampleUniform,
+    {
+        Self::gen_range(rng)
+    }
+}
+
+impl RandomTemporalValue for TimestampSecondType {
+    /// Range of values for a timestamp in seconds. The range begins at the start
+    /// of the unix epoch and continues for 100 years.
+    fn value_range() -> impl SampleRange<Self::Native> {
+        0..60 * 60 * 24 * 365 * 100
+    }
+}
+
+impl RandomTemporalValue for TimestampMillisecondType {
+    /// Range of values for a timestamp in milliseconds. The range begins at the start
+    /// of the unix epoch and continues for 100 years.
+    fn value_range() -> impl SampleRange<Self::Native> {
+        0..1_000 * 60 * 60 * 24 * 365 * 100
+    }
+}
+
+impl RandomTemporalValue for TimestampMicrosecondType {
+    /// Range of values for a timestamp in microseconds. The range begins at the start
+    /// of the unix epoch and continues for 100 years.
+    fn value_range() -> impl SampleRange<Self::Native> {
+        0..1_000 * 1_000 * 60 * 60 * 24 * 365 * 100
+    }
+}
+
+impl RandomTemporalValue for TimestampNanosecondType {
+    /// Range of values for a timestamp in nanoseconds. The range begins at the start
+    /// of the unix epoch and continues for 100 years.
+    fn value_range() -> impl SampleRange<Self::Native> {
+        0..1_000 * 1_000 * 1_000 * 60 * 60 * 24 * 365 * 100
+    }
+}
+
+impl RandomTemporalValue for Date32Type {
+    /// Range of values representing the elapsed time since UNIX epoch in days. The
+    /// range begins at the start of the unix epoch and continues for 100 years.
+    fn value_range() -> impl SampleRange<Self::Native> {
+        0..365 * 100
+    }
+}
+
+impl RandomTemporalValue for Date64Type {
+    /// Range of values representing the elapsed time since UNIX epoch in milliseconds.
+    /// The range begins at the start of the unix epoch and continues for 100 years.
+    fn value_range() -> impl SampleRange<Self::Native> {
+        0..1_000 * 60 * 60 * 24 * 365 * 100
+    }
+}
+
+impl RandomTemporalValue for Time32SecondType {
+    /// Range of values representing the elapsed time since midnight in seconds. The
+    /// range is from 0 to 24 hours.
+    fn value_range() -> impl SampleRange<Self::Native> {
+        0..60 * 60 * 24
+    }
+}
+
+impl RandomTemporalValue for Time32MillisecondType {
+    /// Range of values representing the elapsed time since midnight in milliseconds. The
+    /// range is from 0 to 24 hours.
+    fn value_range() -> impl SampleRange<Self::Native> {
+        0..1_000 * 60 * 60 * 24
+    }
+}
+
+impl RandomTemporalValue for Time64MicrosecondType {
+    /// Range of values representing the elapsed time since midnight in microseconds. The
+    /// range is from 0 to 24 hours.
+    fn value_range() -> impl SampleRange<Self::Native> {
+        0..1_000 * 1_000 * 60 * 60 * 24
+    }
+}
+
+impl RandomTemporalValue for Time64NanosecondType {
+    /// Range of values representing the elapsed time since midnight in nanoseconds. The
+    /// range is from 0 to 24 hours.
+    fn value_range() -> impl SampleRange<Self::Native> {
+        0..1_000 * 1_000 * 1_000 * 60 * 60 * 24
+    }
+}
+
+/// Creates a random [`PrimitiveArray`] of temporal type `T` with `size` slots.
+///
+/// A slot is null when a freshly drawn `f32` is below `null_density`; otherwise it holds
+/// a value produced by [`RandomTemporalValue::random`].
+fn create_random_temporal_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
+where
+    T: RandomTemporalValue,
+    <T as ArrowPrimitiveType>::Native: SampleUniform,
+{
+    let mut rng = seedable_rng();
+
+    (0..size)
+        .map(|_| {
+            if rng.gen::<f32>() < null_density {
+                None
+            } else {
+                Some(T::random(&mut rng))
+            }
+        })
+        .collect()
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;