Skip to content

Commit

Permalink
[Parquet] Add benchmark and test for writing NaNs to Parquet (#6955)
Browse files Browse the repository at this point in the history
* Add test and benchmarks for writing floats with NaNs

* Remove extra benchmark with no NaNs
  • Loading branch information
adamreeve authored Jan 8, 2025
1 parent 485dbb1 commit d0260fc
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 1 deletion.
3 changes: 2 additions & 1 deletion arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ arrow-string = { workspace = true }

rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true }
pyo3 = { version = "0.23", default-features = false, optional = true }
half = { version = "2.1", default-features = false, optional = true }

[package.metadata.docs.rs]
features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"]
Expand All @@ -70,7 +71,7 @@ prettyprint = ["arrow-cast/prettyprint"]
# not the core arrow code itself. Be aware that `rand` must be kept as
# an optional dependency for supporting compile to wasm32-unknown-unknown
# target without assuming an environment containing JavaScript.
test_utils = ["dep:rand"]
test_utils = ["dep:rand", "dep:half"]
pyarrow = ["pyo3", "ffi"]
# force_validate runs full data validation for all arrays that are created
# this is not enabled by default as it is too computationally expensive
Expand Down
46 changes: 46 additions & 0 deletions arrow/src/util/bench_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use crate::array::*;
use crate::datatypes::*;
use crate::util::test_util::seedable_rng;
use arrow_buffer::{Buffer, IntervalMonthDayNano};
use half::f16;
use rand::distributions::uniform::SampleUniform;
use rand::thread_rng;
use rand::Rng;
Expand Down Expand Up @@ -416,3 +417,48 @@ where

DictionaryArray::from(data)
}

/// Builds a pseudo-random [`Float16Array`] holding `size` non-null values.
///
/// The generator comes from `seedable_rng()`. For every element one `f32` is
/// drawn and compared against `nan_density`: when the draw is strictly below
/// it the element is `f16::NAN`, otherwise a second `f32` is drawn and
/// converted to `f16`.
pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
    let mut rng = seedable_rng();

    let next_value = || {
        // First draw decides NaN vs. regular value; the second draw (only
        // taken for regular values) supplies the value itself.
        let roll: f32 = rng.gen();
        let value = if roll < nan_density {
            f16::NAN
        } else {
            f16::from_f32(rng.gen())
        };
        Some(value)
    };

    std::iter::repeat_with(next_value).take(size).collect()
}

/// Builds a pseudo-random [`Float32Array`] holding `size` non-null values.
///
/// The generator comes from `seedable_rng()`. For every element one `f32` is
/// drawn and compared against `nan_density`: when the draw is strictly below
/// it the element is `f32::NAN`, otherwise a second `f32` is drawn and used
/// as the element value.
pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
    let mut rng = seedable_rng();

    let next_value = || {
        // First draw decides NaN vs. regular value; the second draw (only
        // taken for regular values) supplies the value itself.
        let roll: f32 = rng.gen();
        let value = if roll < nan_density {
            f32::NAN
        } else {
            rng.gen::<f32>()
        };
        Some(value)
    };

    std::iter::repeat_with(next_value).take(size).collect()
}

/// Builds a pseudo-random [`Float64Array`] holding `size` non-null values.
///
/// The generator comes from `seedable_rng()`. For every element one `f32` is
/// drawn and compared against `nan_density`: when the draw is strictly below
/// it the element is `f64::NAN`, otherwise an `f64` is drawn and used as the
/// element value.
pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
    let mut rng = seedable_rng();

    let next_value = || {
        // The NaN decision is always made with an `f32` draw, independent of
        // the element type; only the regular value is drawn as `f64`.
        let roll: f32 = rng.gen();
        let value = if roll < nan_density {
            f64::NAN
        } else {
            rng.gen::<f64>()
        };
        Some(value)
    };

    std::iter::repeat_with(next_value).take(size).collect()
}
33 changes: 33 additions & 0 deletions parquet/benches/arrow_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ extern crate parquet;
use std::sync::Arc;

use arrow::datatypes::*;
use arrow::util::bench_util::{create_f16_array, create_f32_array, create_f64_array};
use arrow::{record_batch::RecordBatch, util::data_gen::*};
use arrow_array::RecordBatchOptions;
use parquet::file::properties::WriterProperties;
use parquet::{arrow::ArrowWriter, errors::Result};

Expand Down Expand Up @@ -181,6 +183,25 @@ fn create_bool_bench_batch_non_null(
)?)
}

/// Builds a three-column record batch (`Float16`, `Float32`, `Float64`) for
/// the NaN-writing benchmark.
///
/// Each column has `size` values generated by the corresponding
/// `create_f*_array` helper with the given `nan_density`. All fields are
/// declared non-nullable. Returns an error if the batch cannot be constructed.
fn create_float_bench_batch_with_nans(size: usize, nan_density: f32) -> Result<RecordBatch> {
    let schema = Arc::new(Schema::new(vec![
        Field::new("_1", DataType::Float16, false),
        Field::new("_2", DataType::Float32, false),
        Field::new("_3", DataType::Float64, false),
    ]));

    let f16_column: arrow_array::ArrayRef = Arc::new(create_f16_array(size, nan_density));
    let f32_column: arrow_array::ArrayRef = Arc::new(create_f32_array(size, nan_density));
    let f64_column: arrow_array::ArrayRef = Arc::new(create_f64_array(size, nan_density));

    let options = RecordBatchOptions::new().with_match_field_names(false);
    let batch = RecordBatch::try_new_with_options(
        schema,
        vec![f16_column, f32_column, f64_column],
        &options,
    )?;
    Ok(batch)
}

fn create_list_primitive_bench_batch(
size: usize,
null_density: f32,
Expand Down Expand Up @@ -459,6 +480,18 @@ fn bench_primitive_writer(c: &mut Criterion) {
b.iter(|| write_batch_enable_bloom_filter(&batch).unwrap())
});

let batch = create_float_bench_batch_with_nans(4096, 0.5).unwrap();
group.throughput(Throughput::Bytes(
batch
.columns()
.iter()
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
group.bench_function("4096 values float with NaNs", |b| {
b.iter(|| write_batch(&batch).unwrap())
});

group.finish();
}

Expand Down
39 changes: 39 additions & 0 deletions parquet/src/arrow/arrow_writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1095,6 +1095,7 @@ mod tests {
use arrow::{array::*, buffer::Buffer};
use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer};
use arrow_schema::Fields;
use half::f16;

use crate::basic::Encoding;
use crate::data_type::AsBytes;
Expand Down Expand Up @@ -1763,6 +1764,44 @@ mod tests {
);
}

/// Round-trips a batch of non-nullable `Float16`, `Float32` and `Float64`
/// columns in which every even index holds NaN and every odd index holds the
/// index itself converted to the column's float type.
#[test]
fn arrow_writer_float_nans() {
    // Even positions are NaN; odd positions carry a regular value.
    let is_nan_slot = |i: usize| i % 2 == 0;

    let f16_values: Float16Array = (0..MEDIUM_SIZE)
        .map(|i| {
            if is_nan_slot(i) {
                Some(f16::NAN)
            } else {
                Some(f16::from_f32(i as f32))
            }
        })
        .collect();

    let f32_values: Float32Array = (0..MEDIUM_SIZE)
        .map(|i| {
            if is_nan_slot(i) {
                Some(f32::NAN)
            } else {
                Some(i as f32)
            }
        })
        .collect();

    let f64_values: Float64Array = (0..MEDIUM_SIZE)
        .map(|i| {
            if is_nan_slot(i) {
                Some(f64::NAN)
            } else {
                Some(i as f64)
            }
        })
        .collect();

    let schema = Schema::new(vec![
        Field::new("a", DataType::Float16, false),
        Field::new("b", DataType::Float32, false),
        Field::new("c", DataType::Float64, false),
    ]);

    let batch = RecordBatch::try_new(
        Arc::new(schema),
        vec![
            Arc::new(f16_values),
            Arc::new(f32_values),
            Arc::new(f64_values),
        ],
    )
    .unwrap();

    roundtrip(batch, None);
}

const SMALL_SIZE: usize = 7;
const MEDIUM_SIZE: usize = 63;

Expand Down

0 comments on commit d0260fc

Please sign in to comment.