diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index cedcebbfb6e33..47a00016b94b0 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4371,6 +4371,7 @@ TEST_P(TestArrowWriteDictionary, Statistics) { ->data_page_version(this->GetParquetDataPageVersion()) ->write_batch_size(2) ->data_pagesize(2) + ->disable_write_page_index() ->build(); std::unique_ptr writer; ASSERT_OK_AND_ASSIGN( @@ -4476,6 +4477,7 @@ TEST_P(TestArrowWriteDictionary, StatisticsUnifiedDictionary) { ->data_page_version(this->GetParquetDataPageVersion()) ->write_batch_size(3) ->data_pagesize(3) + ->disable_write_page_index() ->build(); std::unique_ptr writer; ASSERT_OK_AND_ASSIGN( @@ -5290,7 +5292,10 @@ TEST(TestArrowReadWrite, WriteAndReadRecordBatch) { auto pool = ::arrow::default_memory_pool(); auto sink = CreateOutputStream(); // Limit the max number of rows in a row group to 10 - auto writer_properties = WriterProperties::Builder().max_row_group_length(10)->build(); + auto writer_properties = WriterProperties::Builder() + .max_row_group_length(10) + ->disable_write_page_index() + ->build(); auto arrow_writer_properties = default_arrow_writer_properties(); // Prepare schema @@ -5346,7 +5351,7 @@ TEST(TestArrowReadWrite, WriteAndReadRecordBatch) { ASSERT_EQ(10, file_metadata->RowGroup(0)->num_rows()); ASSERT_EQ(2, file_metadata->RowGroup(1)->num_rows()); - // Verify that page index is not written by default. + // Verify that page index is not written. for (int i = 0; i < num_row_groups; ++i) { auto row_group_metadata = file_metadata->RowGroup(i); for (int j = 0; j < row_group_metadata->num_columns(); ++j) { diff --git a/cpp/src/parquet/arrow/size_stats_benchmark.cc b/cpp/src/parquet/arrow/size_stats_benchmark.cc index d43a3737b18b8..c5c95fc6141b6 100644 --- a/cpp/src/parquet/arrow/size_stats_benchmark.cc +++ b/cpp/src/parquet/arrow/size_stats_benchmark.cc @@ -80,12 +80,16 @@ int64_t GetTotalPageIndexSize(const std::shared_ptr<::parquet::FileMetaData>& me } void WriteColumn(::benchmark::State& state, const std::shared_ptr<::arrow::Table>& table, - SizeStatisticsLevel stats_level) { + SizeStatisticsLevel stats_level, bool enable_page_index) { // Use the fastest possible encoding and compression settings, to better exhibit // the size statistics overhead. - auto properties = WriterProperties::Builder() - .enable_statistics() - ->enable_write_page_index() + auto builder = WriterProperties::Builder(); + if (enable_page_index) { + builder.enable_write_page_index(); + } else { + builder.disable_write_page_index(); + } + auto properties = builder.enable_statistics() ->disable_dictionary() ->encoding(Encoding::PLAIN) ->set_size_statistics_level(stats_level) @@ -113,17 +117,17 @@ void WriteColumn(::benchmark::State& state, const std::shared_ptr<::arrow::Table state.SetBytesProcessed(state.iterations() * GetTotalBytes(table)); } -template +template void BM_WritePrimitiveColumn(::benchmark::State& state) { ::arrow::random::RandomArrayGenerator generator(/*seed=*/42); auto type = std::make_shared(); auto array = generator.ArrayOf(type, kBenchmarkSize, kNullProbability); auto table = ::arrow::Table::Make( ::arrow::schema({::arrow::field("column", type, kNullProbability > 0)}), {array}); - WriteColumn(state, table, level); + WriteColumn(state, table, level, enable_page_index); } -template +template void BM_WriteListColumn(::benchmark::State& state) { ::arrow::random::RandomArrayGenerator generator(/*seed=*/42); auto element_type = std::make_shared(); @@ -133,33 +137,43 @@ void BM_WriteListColumn(::benchmark::State& state) { auto table = ::arrow::Table::Make( ::arrow::schema({::arrow::field("column", list_type, kNullProbability > 0)}), {list_array}); - WriteColumn(state, table, level); + WriteColumn(state, table, level, enable_page_index); } -BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None, - ::arrow::Int64Type); +BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None, ::arrow::Int64Type, + /*enable_page_index=*/false); +BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None, ::arrow::Int64Type, + /*enable_page_index=*/true); BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::ColumnChunk, - ::arrow::Int64Type); + ::arrow::Int64Type, /*enable_page_index=*/true); BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::PageAndColumnChunk, - ::arrow::Int64Type); + ::arrow::Int64Type, /*enable_page_index=*/true); BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None, - ::arrow::StringType); + ::arrow::StringType, /*enable_page_index=*/false); +BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None, + ::arrow::StringType, /*enable_page_index=*/true); BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::ColumnChunk, - ::arrow::StringType); + ::arrow::StringType, /*enable_page_index=*/true); BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::PageAndColumnChunk, - ::arrow::StringType); + ::arrow::StringType, /*enable_page_index=*/true); -BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::Int64Type); +BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::Int64Type, + /*enable_page_index=*/false); +BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::Int64Type, + /*enable_page_index=*/true); BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::ColumnChunk, - ::arrow::Int64Type); + ::arrow::Int64Type, /*enable_page_index=*/true); BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::PageAndColumnChunk, - ::arrow::Int64Type); + ::arrow::Int64Type, /*enable_page_index=*/true); -BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::StringType); +BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::StringType, + /*enable_page_index=*/false); +BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::StringType, + /*enable_page_index=*/true); BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::ColumnChunk, - ::arrow::StringType); + ::arrow::StringType, /*enable_page_index=*/true); BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::PageAndColumnChunk, - ::arrow::StringType); + ::arrow::StringType, /*enable_page_index=*/true); } // namespace parquet::benchmark diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index edaf28cd92ae6..8ae3660014f76 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -164,7 +164,9 @@ static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096; static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN; static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; -static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false; +static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true; +static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL = + SizeStatisticsLevel::PageAndColumnChunk; class PARQUET_EXPORT ColumnProperties { public: @@ -258,7 +260,7 @@ class PARQUET_EXPORT WriterProperties { created_by_(DEFAULT_CREATED_BY), store_decimal_as_integer_(false), page_checksum_enabled_(false), - size_statistics_level_(SizeStatisticsLevel::None) {} + size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL) {} explicit Builder(const WriterProperties& properties) : pool_(properties.memory_pool()),