Skip to content

Commit

Permalink
Update comments
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb committed Oct 16, 2024
1 parent 49f67b7 commit 124f0c7
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 13 deletions.
21 changes: 15 additions & 6 deletions datafusion/core/src/datasource/file_format/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,8 @@ pub fn file_type_to_format(
}

/// Transform a schema to use view types for Utf8 and Binary
///
/// See [parquet::ParquetFormat::force_view_types] for details
pub fn transform_schema_to_view(schema: &Schema) -> Schema {
let transformed_fields: Vec<Arc<Field>> = schema
.fields
Expand Down Expand Up @@ -302,7 +304,7 @@ pub(crate) fn coerce_file_schema_to_view_type(
))
}

/// Transform a schema to force binary types to be strings
/// Transform a schema so that any binary types are strings
pub fn transform_binary_to_string(schema: &Schema) -> Schema {
let transformed_fields: Vec<Arc<Field>> = schema
.fields
Expand All @@ -316,13 +318,19 @@ pub fn transform_binary_to_string(schema: &Schema) -> Schema {
Field::new(field.name(), DataType::LargeUtf8, field.is_nullable())
.with_metadata(field.metadata().to_owned()),
),
DataType::BinaryView => Arc::new(
Field::new(field.name(), DataType::BinaryView, field.is_nullable())
.with_metadata(field.metadata().to_owned()),
),
_ => field.clone(),
})
.collect();
Schema::new_with_metadata(transformed_fields, schema.metadata.clone())
}

/// If the table schema uses a string type, coerce the file schema to use a string type.
///
/// See [parquet::ParquetFormat::binary_as_string] for details
pub(crate) fn coerce_file_schema_to_string_type(
table_schema: &Schema,
file_schema: &Schema,
Expand All @@ -338,25 +346,26 @@ pub(crate) fn coerce_file_schema_to_string_type(
.iter()
.map(
|field| match (table_fields.get(field.name()), field.data_type()) {
(Some(DataType::Utf8), DataType::Binary) => {
// table schema uses string type, coerce the file schema to use string type
(Some(DataType::Utf8),
DataType::Binary | DataType::LargeBinary | DataType::BinaryView) => {
transform = true;
Arc::new(Field::new(
field.name(),
DataType::Utf8,
field.is_nullable(),
))
}
(Some(DataType::LargeUtf8), DataType::LargeBinary) => {
// table schema uses large string type, coerce the file schema to use large string type
(Some(DataType::LargeUtf8), DataType::Binary | DataType::LargeBinary | DataType::BinaryView) => {
transform = true;
Arc::new(Field::new(
field.name(),
DataType::LargeUtf8,
field.is_nullable(),
))
}
// If `schema_force_view_types` is enabled, the actual data could be `Binary` or `LargeBinary`
// because we will first change the table schema for binary-to-string coercion, then apply the
// string-to-view transformation. So we need all binary types to be coerced to `Utf8View` here.
// table schema uses string view type, coerce the file schema to use view type
(
Some(DataType::Utf8View),
DataType::Binary | DataType::LargeBinary | DataType::BinaryView,
Expand Down
17 changes: 10 additions & 7 deletions datafusion/core/src/datasource/file_format/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -254,22 +254,25 @@ impl ParquetFormat {
self.options.global.schema_force_view_types
}

/// If true, will use view types (StringView and BinaryView).
///
/// Refer to [`Self::force_view_types`].
/// If true, will use view types. See [`Self::force_view_types`] for details.
pub fn with_force_view_types(mut self, use_views: bool) -> Self {
self.options.global.schema_force_view_types = use_views;
self
}

/// Return `true` if binary type will be read as string.
/// Return `true` if binary types will be read as strings.
///
/// If this returns true, DataFusion will instruct the parquet reader
/// to read binary columns such as `Binary`, `LargeBinary`, or `BinaryView`
/// as a string type such as `Utf8`, `LargeUtf8`, or `Utf8View`.
/// The parquet reader has special optimizations for `Utf8` and `LargeUtf8`
/// validation, and such queries are significantly faster than reading
/// binary columns and then casting to string columns.
pub fn binary_as_string(&self) -> bool {
self.options.global.binary_as_string
}

/// If true, will read binary type as string.
///
/// Refer to [`Self::binary_as_string`].
/// If true, will read binary types as strings. See [`Self::binary_as_string`] for details.
pub fn with_binary_as_string(mut self, binary_as_string: bool) -> Self {
self.options.global.binary_as_string = binary_as_string;
self
Expand Down

0 comments on commit 124f0c7

Please sign in to comment.