Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use StringView by default when reading from parquet #11723

Closed
wants to merge 13 commits into from
2 changes: 1 addition & 1 deletion benchmarks/queries/clickbench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ ClickBench is focused on aggregation and filtering performance (though it has no

The "extended" queries are not part of the official ClickBench benchmark.
Instead they are used to test other DataFusion features that are not covered by
the standard benchmark Each description below is for the corresponding line in
the standard benchmark. Each description below is for the corresponding line in
`extended.sql` (line 1 is `Q0`, line 2 is `Q1`, etc.)

### Q0: Data Exploration
Expand Down
9 changes: 1 addition & 8 deletions benchmarks/src/clickbench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,14 +116,7 @@ impl RunOpt {
None => queries.min_query_id()..=queries.max_query_id(),
};

let mut config = self.common.config();
config
.options_mut()
.execution
.parquet
.schema_force_string_view = self.common.string_view;

let ctx = SessionContext::new_with_config(config);
let ctx = SessionContext::new();
self.register_hits(&ctx).await?;

let iterations = self.common.iterations;
Expand Down
5 changes: 0 additions & 5 deletions benchmarks/src/tpch/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,11 +120,6 @@ impl RunOpt {
.config()
.with_collect_statistics(!self.disable_statistics);
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
config
.options_mut()
.execution
.parquet
.schema_force_string_view = self.common.string_view;
let ctx = SessionContext::new_with_config(config);

// register tables
Expand Down
5 changes: 0 additions & 5 deletions benchmarks/src/util/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,6 @@ pub struct CommonOpt {
/// Activate debug mode to see more details
#[structopt(short, long)]
pub debug: bool,

/// If true, will use StringView/BinaryViewArray instead of String/BinaryArray
/// when reading ParquetFiles
#[structopt(long)]
pub string_view: bool,
}

impl CommonOpt {
Expand Down
7 changes: 4 additions & 3 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -470,9 +470,10 @@ config_namespace! {
/// data frame.
pub maximum_buffered_record_batches_per_stream: usize, default = 2

/// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
/// and `Binary/BinaryLarge` with `BinaryView`.
pub schema_force_string_view: bool, default = false
/// (reading) If true, parquet reader will read columns of
/// `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with
/// `BinaryView`.
pub schema_force_string_view: bool, default = true
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

use arrow::array::{
BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder,
StringViewBuilder,
};
use arrow::datatypes::i256;
use arrow::{array::ArrayRef, datatypes::DataType};
Expand Down Expand Up @@ -438,6 +439,25 @@ macro_rules! get_statistics {
}
Ok(Arc::new(builder.finish()))
},
DataType::Utf8View => {
let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator);
let mut builder = StringViewBuilder::new();
for x in iterator {
let Some(x) = x else {
builder.append_null(); // no statistics value
continue;
};

let Ok(x) = std::str::from_utf8(x) else {
log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it.");
builder.append_null();
continue;
};

builder.append_value(x);
}
Ok(Arc::new(builder.finish()))
},
DataType::FixedSizeBinary(size) => {
let iterator = [<$stat_type_prefix FixedLenByteArrayStatsIterator>]::new($iterator);
let mut builder = FixedSizeBinaryBuilder::new(*size);
Expand Down Expand Up @@ -482,8 +502,8 @@ macro_rules! get_statistics {
DataType::Duration(_) |
DataType::Interval(_) |
DataType::Null |
// TODO binary view
DataType::BinaryView |
DataType::Utf8View |
DataType::List(_) |
DataType::ListView(_) |
DataType::FixedSizeList(_, _) |
Expand Down Expand Up @@ -901,6 +921,29 @@ macro_rules! get_data_page_statistics {
}
Ok(Arc::new(builder.finish()))
},
// TODO file upstream in Arrowrs --
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I filed upstream as apache/arrow-rs#6164

Suggested change
// TODO file upstream in Arrowrs --
// https://github.com/apache/arrow-rs/issues/6164

// support Utf8View and BinaryView in statistics
Some(DataType::Utf8View) => {
let mut builder = StringViewBuilder::new();
let iterator = [<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator);
for x in iterator {
for x in x.into_iter() {
let Some(x) = x else {
builder.append_null(); // no statistics value
continue;
};

let Ok(x) = std::str::from_utf8(x.data()) else {
log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it.");
builder.append_null();
continue;
};

builder.append_value(x);
}
}
Ok(Arc::new(builder.finish()))
},
Some(DataType::Dictionary(_, value_type)) => {
[<$stat_type_prefix:lower _ page_statistics>](Some(value_type), $iterator)
},
Expand Down Expand Up @@ -983,6 +1026,7 @@ macro_rules! get_data_page_statistics {
}
Ok(Arc::new(builder.finish()))
},
// TODO file upstream in arrow-rs -- return not implemented for unsupported types rather than panic
_ => unimplemented!()
}
}
Expand Down Expand Up @@ -1104,6 +1148,7 @@ where
.iter()
.map(|x| x.null_count.map(|x| x as u64))
.collect::<Vec<_>>(),
// TODO file upstream in Arrow-rs -- return not implemented
_ => unimplemented!(),
});

Expand Down
39 changes: 36 additions & 3 deletions datafusion/core/src/datasource/schema_adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@
//! physical format into how they should be used by DataFusion. For instance, a schema
//! can be stored external to a parquet file that maps parquet logical types to arrow types.
use arrow::compute::{can_cast_types, cast};
use arrow_array::{new_null_array, RecordBatch, RecordBatchOptions};
use arrow_schema::{Schema, SchemaRef};
use arrow_array::builder::StringBuilder;
use arrow_array::cast::AsArray;
use arrow_array::{new_null_array, Array, ArrayRef, RecordBatch, RecordBatchOptions};
use arrow_schema::{ArrowError, DataType, Schema, SchemaRef};
use datafusion_common::plan_err;
use std::fmt::Debug;
use std::sync::Arc;
Expand Down Expand Up @@ -165,6 +166,38 @@ impl SchemaAdapter for DefaultSchemaAdapter {
}
}

/// Local replacement for `arrow::compute::can_cast_types` that works around
/// an arrow-rs limitation, originally observed as:
///
/// `External error: query failed: DataFusion error: Arrow error: Cast error: Casting from BinaryView to Utf8 not supported`
///
/// Returns `true` when the arrow kernel already reports the pair as castable,
/// or when the pair is `BinaryView` <-> `Utf8` / `LargeUtf8` (either
/// direction).
fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
    // Anything arrow itself accepts is accepted unchanged.
    if arrow::compute::can_cast_types(from_type, to_type) {
        return true;
    }

    let is_string = |dt: &DataType| matches!(dt, DataType::Utf8 | DataType::LargeUtf8);
    let is_binary_view = |dt: &DataType| matches!(dt, DataType::BinaryView);

    // Extra pairs accepted on top of what the arrow kernel reports.
    (is_binary_view(from_type) && is_string(to_type))
        || (is_string(from_type) && is_binary_view(to_type))
}

// Work around arrow-rs casting bug
// External error: query failed: DataFusion error: Arrow error: Cast error: Casting from BinaryView to Utf8 not supported
fn cast(array: &dyn Array, to_type: &DataType) -> Result<ArrayRef, ArrowError> {
match (array.data_type(), to_type) {
(DataType::BinaryView, DataType::Utf8) => {
let array = array.as_binary_view();
let mut builder = StringBuilder::with_capacity(array.len(), 8 * 1024);
for value in array.iter() {
// check if the value is valid utf8 (should do this once, not each value)
let value = value.map(|value| std::str::from_utf8(value)).transpose()?;

builder.append_option(value);
}

Ok(Arc::new(builder.finish()))
}
// fallback to arrow kernel
(_, _) => arrow::compute::cast(array, to_type),
}
}

/// The SchemaMapping struct holds a mapping from the file schema to the table schema
/// and any necessary type conversions that need to be applied.
#[derive(Debug)]
Expand Down
5 changes: 4 additions & 1 deletion datafusion/functions-aggregate/src/count.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,14 +237,17 @@ impl AggregateUDFImpl for Count {
Box::new(BytesDistinctCountAccumulator::<i32>::new(OutputType::Utf8))
}
DataType::Utf8View => {
Box::new(BytesViewDistinctCountAccumulator::new(OutputType::Utf8))
Box::new(BytesViewDistinctCountAccumulator::new(OutputType::Utf8View))
}
DataType::LargeUtf8 => {
Box::new(BytesDistinctCountAccumulator::<i64>::new(OutputType::Utf8))
}
DataType::Binary => Box::new(BytesDistinctCountAccumulator::<i32>::new(
OutputType::Binary,
)),
DataType::BinaryView => Box::new(BytesViewDistinctCountAccumulator::new(
OutputType::BinaryView,
)),
DataType::LargeBinary => Box::new(BytesDistinctCountAccumulator::<i64>::new(
OutputType::Binary,
)),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,9 @@ pub(crate) fn convert_schema_to_types(columns: &Fields) -> Vec<DFColumnType> {
| DataType::Float64
| DataType::Decimal128(_, _)
| DataType::Decimal256(_, _) => DFColumnType::Float,
DataType::Utf8 | DataType::LargeUtf8 => DFColumnType::Text,
DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
DFColumnType::Text
}
DataType::Date32
| DataType::Date64
| DataType::Time32(_)
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/arrow_typeof.slt
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ select arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)');
[1, 2, 3]

# Tests for Utf8View
query ?T
query TT
select arrow_cast('MyAwesomeString', 'Utf8View'), arrow_typeof(arrow_cast('MyAwesomeString', 'Utf8View'))
----
MyAwesomeString Utf8View
Expand Down
18 changes: 18 additions & 0 deletions datafusion/sqllogictest/test_files/clickbench.slt
Original file line number Diff line number Diff line change
Expand Up @@ -274,5 +274,23 @@ query PI
SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-14' AND "EventDate"::INT::DATE <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
----

# Clickbench "Extended" queries that test count distinct

query III
SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
----
1 1 1

query III
SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage") FROM hits;
----
1 1 1

query TIIII
SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10;
----
� 1 1 1 1


statement ok
drop table hits;
4 changes: 2 additions & 2 deletions datafusion/sqllogictest/test_files/describe.slt
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ int_col Int32 YES
bigint_col Int64 YES
float_col Float32 YES
double_col Float64 YES
date_string_col Utf8 YES
string_col Utf8 YES
date_string_col Utf8View YES
string_col Utf8View YES
timestamp_col Timestamp(Nanosecond, None) YES
year Int32 YES
month Int32 YES
Loading