Skip to content

Commit

Permalink
remove unused max_statistics_size field
Browse files: browse the repository at this point in the history
  • Loading branch information
alamb committed Dec 21, 2024
1 parent 00a66bd commit 2fe785b
Show file tree
Hide file tree
Showing 9 changed files with 1 addition and 144 deletions.
8 changes: 0 additions & 8 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -459,10 +459,6 @@ config_namespace! {
/// default parquet writer setting
pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into())

/// (writing) Sets max statistics size for any column. If NULL, uses
/// default parquet writer setting
pub max_statistics_size: Option<usize>, default = Some(4096)

/// (writing) Target maximum number of rows in each row group (defaults to 1M
/// rows). Writing larger row groups requires more memory to write, but
/// can get better compression and be faster to read.
Expand Down Expand Up @@ -1653,10 +1649,6 @@ config_namespace_with_hashmap! {
/// Sets bloom filter number of distinct values. If NULL, uses
/// default parquet options
pub bloom_filter_ndv: Option<u64>, default = None

/// Sets max statistics size for the column path. If NULL, uses
/// default parquet options
pub max_statistics_size: Option<usize>, default = None
}
}

Expand Down
15 changes: 1 addition & 14 deletions datafusion/common/src/file_options/parquet_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use parquet::{
basic::{BrotliLevel, GzipLevel, ZstdLevel},
file::properties::{
EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion,
DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED,
DEFAULT_STATISTICS_ENABLED,
},
format::KeyValue,
schema::types::ColumnPath,
Expand Down Expand Up @@ -129,11 +129,6 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
builder =
builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
}

if let Some(max_statistics_size) = options.max_statistics_size {
builder =
builder.set_column_max_statistics_size(path, max_statistics_size);
}
}

Ok(builder)
Expand All @@ -154,7 +149,6 @@ impl ParquetOptions {
dictionary_enabled,
dictionary_page_size_limit,
statistics_enabled,
max_statistics_size,
max_row_group_size,
created_by,
column_index_truncate_length,
Expand Down Expand Up @@ -190,9 +184,6 @@ impl ParquetOptions {
.and_then(|s| parse_statistics_string(s).ok())
.unwrap_or(DEFAULT_STATISTICS_ENABLED),
)
.set_max_statistics_size(
max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE),
)
.set_max_row_group_size(*max_row_group_size)
.set_created_by(created_by.clone())
.set_column_index_truncate_length(*column_index_truncate_length)
Expand Down Expand Up @@ -395,7 +386,6 @@ mod tests {
compression: Some("zstd(22)".into()),
dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
statistics_enabled: Some("none".into()),
max_statistics_size: Some(72),
encoding: Some("RLE".into()),
bloom_filter_enabled: Some(true),
bloom_filter_fpp: Some(0.72),
Expand All @@ -419,7 +409,6 @@ mod tests {
dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)),
dictionary_page_size_limit: 42,
statistics_enabled: Some("chunk".into()),
max_statistics_size: Some(42),
max_row_group_size: 42,
created_by: "wordy".into(),
column_index_truncate_length: Some(42),
Expand Down Expand Up @@ -473,7 +462,6 @@ mod tests {
),
bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
max_statistics_size: Some(props.max_statistics_size(&col)),
}
}

Expand Down Expand Up @@ -523,7 +511,6 @@ mod tests {
compression: default_col_props.compression,
dictionary_enabled: default_col_props.dictionary_enabled,
statistics_enabled: default_col_props.statistics_enabled,
max_statistics_size: default_col_props.max_statistics_size,
bloom_filter_on_write: default_col_props
.bloom_filter_enabled
.unwrap_or_default(),
Expand Down
8 changes: 0 additions & 8 deletions datafusion/proto-common/proto/datafusion_common.proto
Original file line number Diff line number Diff line change
Expand Up @@ -473,10 +473,6 @@ message ParquetColumnOptions {
oneof bloom_filter_ndv_opt {
uint64 bloom_filter_ndv = 7;
}

oneof max_statistics_size_opt {
uint32 max_statistics_size = 8;
}
}

message ParquetOptions {
Expand Down Expand Up @@ -514,10 +510,6 @@ message ParquetOptions {
string statistics_enabled = 13;
}

oneof max_statistics_size_opt {
uint64 max_statistics_size = 14;
}

oneof column_index_truncate_length_opt {
uint64 column_index_truncate_length = 17;
}
Expand Down
12 changes: 0 additions & 12 deletions datafusion/proto-common/src/from_proto/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -926,12 +926,6 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v),
})
.unwrap_or(None),
max_statistics_size: value
.max_statistics_size_opt.as_ref()
.map(|opt| match opt {
protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(*v as usize),
})
.unwrap_or(None),
max_row_group_size: value.max_row_group_size as usize,
created_by: value.created_by.clone(),
column_index_truncate_length: value
Expand Down Expand Up @@ -986,12 +980,6 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions {
protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v),
})
.unwrap_or(None),
max_statistics_size: value
.max_statistics_size_opt
.map(|opt| match opt {
protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(v as usize),
})
.unwrap_or(None),
encoding: value
.encoding_opt.clone()
.map(|opt| match opt {
Expand Down
46 changes: 0 additions & 46 deletions datafusion/proto-common/src/generated/pbjson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4466,9 +4466,6 @@ impl serde::Serialize for ParquetColumnOptions {
if self.bloom_filter_ndv_opt.is_some() {
len += 1;
}
if self.max_statistics_size_opt.is_some() {
len += 1;
}
let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetColumnOptions", len)?;
if let Some(v) = self.bloom_filter_enabled_opt.as_ref() {
match v {
Expand Down Expand Up @@ -4521,13 +4518,6 @@ impl serde::Serialize for ParquetColumnOptions {
}
}
}
if let Some(v) = self.max_statistics_size_opt.as_ref() {
match v {
parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => {
struct_ser.serialize_field("maxStatisticsSize", v)?;
}
}
}
struct_ser.end()
}
}
Expand All @@ -4550,8 +4540,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
"bloomFilterFpp",
"bloom_filter_ndv",
"bloomFilterNdv",
"max_statistics_size",
"maxStatisticsSize",
];

#[allow(clippy::enum_variant_names)]
Expand All @@ -4563,7 +4551,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
StatisticsEnabled,
BloomFilterFpp,
BloomFilterNdv,
MaxStatisticsSize,
}
impl<'de> serde::Deserialize<'de> for GeneratedField {
fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
Expand Down Expand Up @@ -4592,7 +4579,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
"statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled),
"bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp),
"bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv),
"maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize),
_ => Err(serde::de::Error::unknown_field(value, FIELDS)),
}
}
Expand All @@ -4619,7 +4605,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
let mut statistics_enabled_opt__ = None;
let mut bloom_filter_fpp_opt__ = None;
let mut bloom_filter_ndv_opt__ = None;
let mut max_statistics_size_opt__ = None;
while let Some(k) = map_.next_key()? {
match k {
GeneratedField::BloomFilterEnabled => {
Expand Down Expand Up @@ -4664,12 +4649,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
}
bloom_filter_ndv_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(x.0));
}
GeneratedField::MaxStatisticsSize => {
if max_statistics_size_opt__.is_some() {
return Err(serde::de::Error::duplicate_field("maxStatisticsSize"));
}
max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0));
}
}
}
Ok(ParquetColumnOptions {
Expand All @@ -4680,7 +4659,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
statistics_enabled_opt: statistics_enabled_opt__,
bloom_filter_fpp_opt: bloom_filter_fpp_opt__,
bloom_filter_ndv_opt: bloom_filter_ndv_opt__,
max_statistics_size_opt: max_statistics_size_opt__,
})
}
}
Expand Down Expand Up @@ -4964,9 +4942,6 @@ impl serde::Serialize for ParquetOptions {
if self.statistics_enabled_opt.is_some() {
len += 1;
}
if self.max_statistics_size_opt.is_some() {
len += 1;
}
if self.column_index_truncate_length_opt.is_some() {
len += 1;
}
Expand Down Expand Up @@ -5081,15 +5056,6 @@ impl serde::Serialize for ParquetOptions {
}
}
}
if let Some(v) = self.max_statistics_size_opt.as_ref() {
match v {
parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => {
#[allow(clippy::needless_borrow)]
#[allow(clippy::needless_borrows_for_generic_args)]
struct_ser.serialize_field("maxStatisticsSize", ToString::to_string(&v).as_str())?;
}
}
}
if let Some(v) = self.column_index_truncate_length_opt.as_ref() {
match v {
parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v) => {
Expand Down Expand Up @@ -5176,8 +5142,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
"dictionaryEnabled",
"statistics_enabled",
"statisticsEnabled",
"max_statistics_size",
"maxStatisticsSize",
"column_index_truncate_length",
"columnIndexTruncateLength",
"encoding",
Expand Down Expand Up @@ -5212,7 +5176,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
Compression,
DictionaryEnabled,
StatisticsEnabled,
MaxStatisticsSize,
ColumnIndexTruncateLength,
Encoding,
BloomFilterFpp,
Expand Down Expand Up @@ -5261,7 +5224,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
"compression" => Ok(GeneratedField::Compression),
"dictionaryEnabled" | "dictionary_enabled" => Ok(GeneratedField::DictionaryEnabled),
"statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled),
"maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize),
"columnIndexTruncateLength" | "column_index_truncate_length" => Ok(GeneratedField::ColumnIndexTruncateLength),
"encoding" => Ok(GeneratedField::Encoding),
"bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp),
Expand Down Expand Up @@ -5308,7 +5270,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
let mut compression_opt__ = None;
let mut dictionary_enabled_opt__ = None;
let mut statistics_enabled_opt__ = None;
let mut max_statistics_size_opt__ = None;
let mut column_index_truncate_length_opt__ = None;
let mut encoding_opt__ = None;
let mut bloom_filter_fpp_opt__ = None;
Expand Down Expand Up @@ -5467,12 +5428,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
}
statistics_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::StatisticsEnabledOpt::StatisticsEnabled);
}
GeneratedField::MaxStatisticsSize => {
if max_statistics_size_opt__.is_some() {
return Err(serde::de::Error::duplicate_field("maxStatisticsSize"));
}
max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0));
}
GeneratedField::ColumnIndexTruncateLength => {
if column_index_truncate_length_opt__.is_some() {
return Err(serde::de::Error::duplicate_field("columnIndexTruncateLength"));
Expand Down Expand Up @@ -5523,7 +5478,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
compression_opt: compression_opt__,
dictionary_enabled_opt: dictionary_enabled_opt__,
statistics_enabled_opt: statistics_enabled_opt__,
max_statistics_size_opt: max_statistics_size_opt__,
column_index_truncate_length_opt: column_index_truncate_length_opt__,
encoding_opt: encoding_opt__,
bloom_filter_fpp_opt: bloom_filter_fpp_opt__,
Expand Down
18 changes: 0 additions & 18 deletions datafusion/proto-common/src/generated/prost.rs
Original file line number Diff line number Diff line change
Expand Up @@ -664,10 +664,6 @@ pub struct ParquetColumnOptions {
pub bloom_filter_ndv_opt: ::core::option::Option<
parquet_column_options::BloomFilterNdvOpt,
>,
#[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")]
pub max_statistics_size_opt: ::core::option::Option<
parquet_column_options::MaxStatisticsSizeOpt,
>,
}
/// Nested message and enum types in `ParquetColumnOptions`.
pub mod parquet_column_options {
Expand Down Expand Up @@ -706,11 +702,6 @@ pub mod parquet_column_options {
#[prost(uint64, tag = "7")]
BloomFilterNdv(u64),
}
#[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
pub enum MaxStatisticsSizeOpt {
#[prost(uint32, tag = "8")]
MaxStatisticsSize(u32),
}
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ParquetOptions {
Expand Down Expand Up @@ -785,10 +776,6 @@ pub struct ParquetOptions {
pub statistics_enabled_opt: ::core::option::Option<
parquet_options::StatisticsEnabledOpt,
>,
#[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")]
pub max_statistics_size_opt: ::core::option::Option<
parquet_options::MaxStatisticsSizeOpt,
>,
#[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")]
pub column_index_truncate_length_opt: ::core::option::Option<
parquet_options::ColumnIndexTruncateLengthOpt,
Expand Down Expand Up @@ -823,11 +810,6 @@ pub mod parquet_options {
StatisticsEnabled(::prost::alloc::string::String),
}
#[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
pub enum MaxStatisticsSizeOpt {
#[prost(uint64, tag = "14")]
MaxStatisticsSize(u64),
}
#[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
pub enum ColumnIndexTruncateLengthOpt {
#[prost(uint64, tag = "17")]
ColumnIndexTruncateLength(u64),
Expand Down
6 changes: 0 additions & 6 deletions datafusion/proto-common/src/to_proto/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,6 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled),
dictionary_page_size_limit: value.dictionary_page_size_limit as u64,
statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled),
max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)),
max_row_group_size: value.max_row_group_size as u64,
created_by: value.created_by.clone(),
column_index_truncate_length_opt: value.column_index_truncate_length.map(|v| protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v as u64)),
Expand Down Expand Up @@ -857,11 +856,6 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions {
.statistics_enabled
.clone()
.map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled),
max_statistics_size_opt: value.max_statistics_size.map(|v| {
protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(
v as u32,
)
}),
encoding_opt: value
.encoding
.clone()
Expand Down
Loading

0 comments on commit 2fe785b

Please sign in to comment.