Skip to content

Commit

Permalink
perf: Use Cow in get_format_string in FFI_ArrowSchema (#6853)
Browse files Browse the repository at this point in the history
* add cast_decimal bench

* format

* save

* revert

* criterion disable default features

* address feedback
  • Loading branch information
andygrove committed Jan 3, 2025
1 parent f5b51ff commit 97c3d4a
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 52 deletions.
5 changes: 5 additions & 0 deletions arrow-schema/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,8 @@ features = ["ffi"]
[dev-dependencies]
serde_json = "1.0"
bincode = { version = "1.3.3", default-features = false }
criterion = { version = "0.5", default-features = false }

[[bench]]
name = "ffi"
harness = false
38 changes: 38 additions & 0 deletions arrow-schema/benches/ffi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow_schema::ffi::FFI_ArrowSchema;
use arrow_schema::{DataType, Field};
use criterion::*;
use std::sync::Arc;

/// Benchmarks `FFI_ArrowSchema::try_from` on a struct data type made of five
/// non-nullable `Utf8` fields named `c1` through `c5`.
///
/// The benchmark is registered under the id `ffi_arrow_schema_try_from`.
fn criterion_benchmark(c: &mut Criterion) {
    // Build the five identical columns programmatically instead of repeating
    // the same constructor call five times.
    let fields: Vec<_> = (1..=5)
        .map(|i| Arc::new(Field::new(format!("c{i}"), DataType::Utf8, false)))
        .collect();
    let data_type = DataType::Struct(fields.into());
    c.bench_function("ffi_arrow_schema_try_from", |b| {
        // Route the input through `black_box` so the optimizer cannot treat the
        // schema as a loop-invariant constant and hoist or fold the conversion
        // out of the measured loop.
        b.iter(|| FFI_ArrowSchema::try_from(std::hint::black_box(&data_type)));
    });
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
107 changes: 55 additions & 52 deletions arrow-schema/src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ use crate::{
ArrowError, DataType, Field, FieldRef, IntervalUnit, Schema, TimeUnit, UnionFields, UnionMode,
};
use bitflags::bitflags;
use std::borrow::Cow;
use std::sync::Arc;
use std::{
collections::HashMap,
Expand Down Expand Up @@ -685,66 +686,68 @@ impl TryFrom<&DataType> for FFI_ArrowSchema {
}
}

fn get_format_string(dtype: &DataType) -> Result<String, ArrowError> {
fn get_format_string(dtype: &DataType) -> Result<Cow<'static, str>, ArrowError> {
match dtype {
DataType::Null => Ok("n".to_string()),
DataType::Boolean => Ok("b".to_string()),
DataType::Int8 => Ok("c".to_string()),
DataType::UInt8 => Ok("C".to_string()),
DataType::Int16 => Ok("s".to_string()),
DataType::UInt16 => Ok("S".to_string()),
DataType::Int32 => Ok("i".to_string()),
DataType::UInt32 => Ok("I".to_string()),
DataType::Int64 => Ok("l".to_string()),
DataType::UInt64 => Ok("L".to_string()),
DataType::Float16 => Ok("e".to_string()),
DataType::Float32 => Ok("f".to_string()),
DataType::Float64 => Ok("g".to_string()),
DataType::BinaryView => Ok("vz".to_string()),
DataType::Binary => Ok("z".to_string()),
DataType::LargeBinary => Ok("Z".to_string()),
DataType::Utf8View => Ok("vu".to_string()),
DataType::Utf8 => Ok("u".to_string()),
DataType::LargeUtf8 => Ok("U".to_string()),
DataType::FixedSizeBinary(num_bytes) => Ok(format!("w:{num_bytes}")),
DataType::FixedSizeList(_, num_elems) => Ok(format!("+w:{num_elems}")),
DataType::Decimal128(precision, scale) => Ok(format!("d:{precision},{scale}")),
DataType::Decimal256(precision, scale) => Ok(format!("d:{precision},{scale},256")),
DataType::Date32 => Ok("tdD".to_string()),
DataType::Date64 => Ok("tdm".to_string()),
DataType::Time32(TimeUnit::Second) => Ok("tts".to_string()),
DataType::Time32(TimeUnit::Millisecond) => Ok("ttm".to_string()),
DataType::Time64(TimeUnit::Microsecond) => Ok("ttu".to_string()),
DataType::Time64(TimeUnit::Nanosecond) => Ok("ttn".to_string()),
DataType::Timestamp(TimeUnit::Second, None) => Ok("tss:".to_string()),
DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("tsm:".to_string()),
DataType::Timestamp(TimeUnit::Microsecond, None) => Ok("tsu:".to_string()),
DataType::Timestamp(TimeUnit::Nanosecond, None) => Ok("tsn:".to_string()),
DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(format!("tss:{tz}")),
DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(format!("tsm:{tz}")),
DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(format!("tsu:{tz}")),
DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(format!("tsn:{tz}")),
DataType::Duration(TimeUnit::Second) => Ok("tDs".to_string()),
DataType::Duration(TimeUnit::Millisecond) => Ok("tDm".to_string()),
DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".to_string()),
DataType::Duration(TimeUnit::Nanosecond) => Ok("tDn".to_string()),
DataType::Interval(IntervalUnit::YearMonth) => Ok("tiM".to_string()),
DataType::Interval(IntervalUnit::DayTime) => Ok("tiD".to_string()),
DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".to_string()),
DataType::List(_) => Ok("+l".to_string()),
DataType::LargeList(_) => Ok("+L".to_string()),
DataType::Struct(_) => Ok("+s".to_string()),
DataType::Map(_, _) => Ok("+m".to_string()),
DataType::RunEndEncoded(_, _) => Ok("+r".to_string()),
DataType::Null => Ok("n".into()),
DataType::Boolean => Ok("b".into()),
DataType::Int8 => Ok("c".into()),
DataType::UInt8 => Ok("C".into()),
DataType::Int16 => Ok("s".into()),
DataType::UInt16 => Ok("S".into()),
DataType::Int32 => Ok("i".into()),
DataType::UInt32 => Ok("I".into()),
DataType::Int64 => Ok("l".into()),
DataType::UInt64 => Ok("L".into()),
DataType::Float16 => Ok("e".into()),
DataType::Float32 => Ok("f".into()),
DataType::Float64 => Ok("g".into()),
DataType::BinaryView => Ok("vz".into()),
DataType::Binary => Ok("z".into()),
DataType::LargeBinary => Ok("Z".into()),
DataType::Utf8View => Ok("vu".into()),
DataType::Utf8 => Ok("u".into()),
DataType::LargeUtf8 => Ok("U".into()),
DataType::FixedSizeBinary(num_bytes) => Ok(Cow::Owned(format!("w:{num_bytes}"))),
DataType::FixedSizeList(_, num_elems) => Ok(Cow::Owned(format!("+w:{num_elems}"))),
DataType::Decimal128(precision, scale) => Ok(Cow::Owned(format!("d:{precision},{scale}"))),
DataType::Decimal256(precision, scale) => {
Ok(Cow::Owned(format!("d:{precision},{scale},256")))
}
DataType::Date32 => Ok("tdD".into()),
DataType::Date64 => Ok("tdm".into()),
DataType::Time32(TimeUnit::Second) => Ok("tts".into()),
DataType::Time32(TimeUnit::Millisecond) => Ok("ttm".into()),
DataType::Time64(TimeUnit::Microsecond) => Ok("ttu".into()),
DataType::Time64(TimeUnit::Nanosecond) => Ok("ttn".into()),
DataType::Timestamp(TimeUnit::Second, None) => Ok("tss:".into()),
DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("tsm:".into()),
DataType::Timestamp(TimeUnit::Microsecond, None) => Ok("tsu:".into()),
DataType::Timestamp(TimeUnit::Nanosecond, None) => Ok("tsn:".into()),
DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(Cow::Owned(format!("tss:{tz}"))),
DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(Cow::Owned(format!("tsm:{tz}"))),
DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(Cow::Owned(format!("tsu:{tz}"))),
DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(Cow::Owned(format!("tsn:{tz}"))),
DataType::Duration(TimeUnit::Second) => Ok("tDs".into()),
DataType::Duration(TimeUnit::Millisecond) => Ok("tDm".into()),
DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".into()),
DataType::Duration(TimeUnit::Nanosecond) => Ok("tDn".into()),
DataType::Interval(IntervalUnit::YearMonth) => Ok("tiM".into()),
DataType::Interval(IntervalUnit::DayTime) => Ok("tiD".into()),
DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".into()),
DataType::List(_) => Ok("+l".into()),
DataType::LargeList(_) => Ok("+L".into()),
DataType::Struct(_) => Ok("+s".into()),
DataType::Map(_, _) => Ok("+m".into()),
DataType::RunEndEncoded(_, _) => Ok("+r".into()),
DataType::Dictionary(key_data_type, _) => get_format_string(key_data_type),
DataType::Union(fields, mode) => {
let formats = fields
.iter()
.map(|(t, _)| t.to_string())
.collect::<Vec<_>>();
match mode {
UnionMode::Dense => Ok(format!("{}:{}", "+ud", formats.join(","))),
UnionMode::Sparse => Ok(format!("{}:{}", "+us", formats.join(","))),
UnionMode::Dense => Ok(Cow::Owned(format!("{}:{}", "+ud", formats.join(",")))),
UnionMode::Sparse => Ok(Cow::Owned(format!("{}:{}", "+us", formats.join(",")))),
}
}
other => Err(ArrowError::CDataInterface(format!(
Expand Down

0 comments on commit 97c3d4a

Please sign in to comment.