Skip to content

Commit

Permalink
apacheGH-45227: [C++][Parquet] Enable Size Stats and Page Index by de…
Browse files Browse the repository at this point in the history
…fault (apache#45249)

### Rationale for this change

Benchmark data shows that enabling the page index and size statistics by default does not incur a significant performance penalty.

### What changes are included in this PR?

Make the Parquet writer generate the page index and size statistics by default.

### Are these changes tested?

Yes; all CI checks pass.

### Are there any user-facing changes?

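No API changes. Note, however, that files written with the default `WriterProperties` now include the page index and size statistics unless they are explicitly disabled.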
* GitHub Issue: apache#45227

Authored-by: Gang Wu <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
  • Loading branch information
wgtmac authored Jan 21, 2025
1 parent 43afdce commit 1fcc892
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 25 deletions.
9 changes: 7 additions & 2 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4371,6 +4371,7 @@ TEST_P(TestArrowWriteDictionary, Statistics) {
->data_page_version(this->GetParquetDataPageVersion())
->write_batch_size(2)
->data_pagesize(2)
->disable_write_page_index()
->build();
std::unique_ptr<FileWriter> writer;
ASSERT_OK_AND_ASSIGN(
Expand Down Expand Up @@ -4476,6 +4477,7 @@ TEST_P(TestArrowWriteDictionary, StatisticsUnifiedDictionary) {
->data_page_version(this->GetParquetDataPageVersion())
->write_batch_size(3)
->data_pagesize(3)
->disable_write_page_index()
->build();
std::unique_ptr<FileWriter> writer;
ASSERT_OK_AND_ASSIGN(
Expand Down Expand Up @@ -5290,7 +5292,10 @@ TEST(TestArrowReadWrite, WriteAndReadRecordBatch) {
auto pool = ::arrow::default_memory_pool();
auto sink = CreateOutputStream();
// Limit the max number of rows in a row group to 10
auto writer_properties = WriterProperties::Builder().max_row_group_length(10)->build();
auto writer_properties = WriterProperties::Builder()
.max_row_group_length(10)
->disable_write_page_index()
->build();
auto arrow_writer_properties = default_arrow_writer_properties();

// Prepare schema
Expand Down Expand Up @@ -5346,7 +5351,7 @@ TEST(TestArrowReadWrite, WriteAndReadRecordBatch) {
ASSERT_EQ(10, file_metadata->RowGroup(0)->num_rows());
ASSERT_EQ(2, file_metadata->RowGroup(1)->num_rows());

// Verify that page index is not written by default.
// Verify that page index is not written.
for (int i = 0; i < num_row_groups; ++i) {
auto row_group_metadata = file_metadata->RowGroup(i);
for (int j = 0; j < row_group_metadata->num_columns(); ++j) {
Expand Down
56 changes: 35 additions & 21 deletions cpp/src/parquet/arrow/size_stats_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,16 @@ int64_t GetTotalPageIndexSize(const std::shared_ptr<::parquet::FileMetaData>& me
}

void WriteColumn(::benchmark::State& state, const std::shared_ptr<::arrow::Table>& table,
SizeStatisticsLevel stats_level) {
SizeStatisticsLevel stats_level, bool enable_page_index) {
// Use the fastest possible encoding and compression settings, to better exhibit
// the size statistics overhead.
auto properties = WriterProperties::Builder()
.enable_statistics()
->enable_write_page_index()
auto builder = WriterProperties::Builder();
if (enable_page_index) {
builder.enable_write_page_index();
} else {
builder.disable_write_page_index();
}
auto properties = builder.enable_statistics()
->disable_dictionary()
->encoding(Encoding::PLAIN)
->set_size_statistics_level(stats_level)
Expand Down Expand Up @@ -113,17 +117,17 @@ void WriteColumn(::benchmark::State& state, const std::shared_ptr<::arrow::Table
state.SetBytesProcessed(state.iterations() * GetTotalBytes(table));
}

template <SizeStatisticsLevel level, typename ArrowType>
template <SizeStatisticsLevel level, typename ArrowType, bool enable_page_index>
void BM_WritePrimitiveColumn(::benchmark::State& state) {
::arrow::random::RandomArrayGenerator generator(/*seed=*/42);
auto type = std::make_shared<ArrowType>();
auto array = generator.ArrayOf(type, kBenchmarkSize, kNullProbability);
auto table = ::arrow::Table::Make(
::arrow::schema({::arrow::field("column", type, kNullProbability > 0)}), {array});
WriteColumn(state, table, level);
WriteColumn(state, table, level, enable_page_index);
}

template <SizeStatisticsLevel level, typename ArrowType>
template <SizeStatisticsLevel level, typename ArrowType, bool enable_page_index>
void BM_WriteListColumn(::benchmark::State& state) {
::arrow::random::RandomArrayGenerator generator(/*seed=*/42);
auto element_type = std::make_shared<ArrowType>();
Expand All @@ -133,33 +137,43 @@ void BM_WriteListColumn(::benchmark::State& state) {
auto table = ::arrow::Table::Make(
::arrow::schema({::arrow::field("column", list_type, kNullProbability > 0)}),
{list_array});
WriteColumn(state, table, level);
WriteColumn(state, table, level, enable_page_index);
}

BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
::arrow::Int64Type);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None, ::arrow::Int64Type,
/*enable_page_index=*/false);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None, ::arrow::Int64Type,
/*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::ColumnChunk,
::arrow::Int64Type);
::arrow::Int64Type, /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::PageAndColumnChunk,
::arrow::Int64Type);
::arrow::Int64Type, /*enable_page_index=*/true);

BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
::arrow::StringType);
::arrow::StringType, /*enable_page_index=*/false);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
::arrow::StringType, /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::ColumnChunk,
::arrow::StringType);
::arrow::StringType, /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::PageAndColumnChunk,
::arrow::StringType);
::arrow::StringType, /*enable_page_index=*/true);

BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::Int64Type);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::Int64Type,
/*enable_page_index=*/false);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::Int64Type,
/*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::ColumnChunk,
::arrow::Int64Type);
::arrow::Int64Type, /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::PageAndColumnChunk,
::arrow::Int64Type);
::arrow::Int64Type, /*enable_page_index=*/true);

BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::StringType);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::StringType,
/*enable_page_index=*/false);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::StringType,
/*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::ColumnChunk,
::arrow::StringType);
::arrow::StringType, /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::PageAndColumnChunk,
::arrow::StringType);
::arrow::StringType, /*enable_page_index=*/true);

} // namespace parquet::benchmark
6 changes: 4 additions & 2 deletions cpp/src/parquet/properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,9 @@ static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false;
static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true;
static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL =
SizeStatisticsLevel::PageAndColumnChunk;

class PARQUET_EXPORT ColumnProperties {
public:
Expand Down Expand Up @@ -258,7 +260,7 @@ class PARQUET_EXPORT WriterProperties {
created_by_(DEFAULT_CREATED_BY),
store_decimal_as_integer_(false),
page_checksum_enabled_(false),
size_statistics_level_(SizeStatisticsLevel::None) {}
size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL) {}

explicit Builder(const WriterProperties& properties)
: pool_(properties.memory_pool()),
Expand Down

0 comments on commit 1fcc892

Please sign in to comment.