From 6959c5ba23c9756265eefe9e64a2d2f6d93d9873 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 17 Jan 2025 00:00:43 +0800 Subject: [PATCH 1/4] GH-45283: [C++][Parquet] Omit level histogram when max level is 0 --- cpp/src/parquet/column_writer.cc | 3 +++ cpp/src/parquet/size_statistics.cc | 32 +++++++++++++++++-------- cpp/src/parquet/size_statistics_test.cc | 18 ++++++++++++++ 3 files changed, 43 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 683a5ab735aed..4998e6f301a00 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1609,6 +1609,9 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< auto add_levels = [](std::vector& level_histogram, ::arrow::util::span levels, int16_t max_level) { + if (max_level == 0) { + return; + } ARROW_DCHECK_EQ(static_cast(max_level) + 1, level_histogram.size()); ::parquet::UpdateLevelHistogram(levels, level_histogram); }; diff --git a/cpp/src/parquet/size_statistics.cc b/cpp/src/parquet/size_statistics.cc index 7292f9222a684..2be1828625921 100644 --- a/cpp/src/parquet/size_statistics.cc +++ b/cpp/src/parquet/size_statistics.cc @@ -64,14 +64,19 @@ void SizeStatistics::IncrementUnencodedByteArrayDataBytes(int64_t value) { } void SizeStatistics::Validate(const ColumnDescriptor* descr) const { - if (repetition_level_histogram.size() != - static_cast(descr->max_repetition_level() + 1)) { - throw ParquetException("Repetition level histogram size mismatch"); - } - if (definition_level_histogram.size() != - static_cast(descr->max_definition_level() + 1)) { - throw ParquetException("Definition level histogram size mismatch"); - } + auto validate_histogram = [](const std::vector& histogram, int16_t max_level, + const std::string& name) { + if (max_level == 0 && histogram.empty()) { + return; + } + if (histogram.size() != static_cast(max_level + 1)) { + throw ParquetException(name + " level histogram size mismatch"); + } + }; + validate_histogram(repetition_level_histogram, descr->max_repetition_level(), + "Repetition"); + validate_histogram(definition_level_histogram, descr->max_definition_level(), + "Definition"); if (unencoded_byte_array_data_bytes.has_value() && descr->physical_type() != Type::BYTE_ARRAY) { throw ParquetException("Unencoded byte array data bytes does not support " + @@ -93,8 +98,15 @@ void SizeStatistics::Reset() { std::unique_ptr SizeStatistics::Make(const ColumnDescriptor* descr) { auto size_stats = std::make_unique(); - size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0); - size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0); + // If the max level is 0, the level histogram can be omitted because it contains + // only single level (a.k.a. 0) and its count is equivalent to `num_values` of the + // column chunk or data page. + if (descr->max_repetition_level() != 0) { + size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0); + } + if (descr->max_definition_level() != 0) { + size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0); + } if (descr->physical_type() == Type::BYTE_ARRAY) { size_stats->unencoded_byte_array_data_bytes = 0; } diff --git a/cpp/src/parquet/size_statistics_test.cc b/cpp/src/parquet/size_statistics_test.cc index 0958ae4dec2ca..ee2b79daabf3b 100644 --- a/cpp/src/parquet/size_statistics_test.cc +++ b/cpp/src/parquet/size_statistics_test.cc @@ -376,4 +376,22 @@ TEST_F(SizeStatisticsRoundTripTest, LargePage) { /*byte_array_bytes=*/{90000}})); } +TEST_F(SizeStatisticsRoundTripTest, MaxLevelZero) { + auto schema = + ::arrow::schema({::arrow::field("a", ::arrow::utf8(), /*nullable=*/false)}); + WriteFile(SizeStatisticsLevel::PageAndColumnChunk, + ::arrow::TableFromJSON(schema, {R"([["foo"],["bar"]])"}), + /*max_row_group_length=*/2, + /*page_size=*/1024); + ReadSizeStatistics(); + EXPECT_THAT(row_group_stats_, + ::testing::ElementsAre(SizeStatistics{/*def_levels=*/{}, + /*rep_levels=*/{}, + /*byte_array_bytes=*/6})); + EXPECT_THAT(page_stats_, + ::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{}, + /*rep_levels=*/{}, + /*byte_array_bytes=*/{6}})); +} + } // namespace parquet From 420c12f3e715fba2236852a1c7a13933294a5dd0 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 17 Jan 2025 11:40:40 +0800 Subject: [PATCH 2/4] fix test --- cpp/src/parquet/size_statistics.cc | 8 ++++-- cpp/src/parquet/size_statistics_test.cc | 33 ++++++++++++++++++------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/size_statistics.cc b/cpp/src/parquet/size_statistics.cc index 2be1828625921..c225e324891d2 100644 --- a/cpp/src/parquet/size_statistics.cc +++ b/cpp/src/parquet/size_statistics.cc @@ -66,11 +66,15 @@ void SizeStatistics::IncrementUnencodedByteArrayDataBytes(int64_t value) { void SizeStatistics::Validate(const ColumnDescriptor* descr) const { auto validate_histogram = [](const std::vector& histogram, int16_t max_level, const std::string& name) { - if (max_level == 0 && histogram.empty()) { + if (histogram.empty()) { + // A levels histogram is always allowed to be missing. return; } if (histogram.size() != static_cast(max_level + 1)) { - throw ParquetException(name + " level histogram size mismatch"); + std::stringstream ss; + ss << name << " level histogram size mismatch, size: " << histogram.size() + << ", expected: " << (max_level + 1); + throw ParquetException(ss.str()); } }; validate_histogram(repetition_level_histogram, descr->max_repetition_level(), diff --git a/cpp/src/parquet/size_statistics_test.cc b/cpp/src/parquet/size_statistics_test.cc index ee2b79daabf3b..e379d38bf2bc5 100644 --- a/cpp/src/parquet/size_statistics_test.cc +++ b/cpp/src/parquet/size_statistics_test.cc @@ -216,6 +216,20 @@ class SizeStatisticsRoundTripTest : public ::testing::Test { } } + void ReadData() { + auto reader = + ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer_)); + auto metadata = reader->metadata(); + for (int i = 0; i < metadata->num_row_groups(); ++i) { + int64_t num_rows = metadata->RowGroup(i)->num_rows(); + auto row_group_reader = reader->RowGroup(i); + for (int j = 0; j < metadata->num_columns(); ++j) { + auto column_reader = row_group_reader->RecordReader(j); + ASSERT_EQ(column_reader->ReadRecords(num_rows), num_rows); + } + } + } + void Reset() { buffer_.reset(); } protected: @@ -300,23 +314,23 @@ TEST_F(SizeStatisticsRoundTripTest, WriteDictionaryArray) { ReadSizeStatistics(); EXPECT_THAT(row_group_stats_, ::testing::ElementsAre(SizeStatistics{/*def_levels=*/{0, 2}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/5}, SizeStatistics{/*def_levels=*/{1, 1}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/1}, SizeStatistics{/*def_levels=*/{0, 2}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/4})); EXPECT_THAT(page_stats_, ::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{0, 2}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/{5}}, PageSizeStatistics{/*def_levels=*/{1, 1}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/{1}}, PageSizeStatistics{/*def_levels=*/{0, 2}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/{4}})); } @@ -368,11 +382,11 @@ TEST_F(SizeStatisticsRoundTripTest, LargePage) { ReadSizeStatistics(); EXPECT_THAT(row_group_stats_, ::testing::ElementsAre(SizeStatistics{/*def_levels=*/{30000, 60000}, - /*rep_levels=*/{90000}, + /*rep_levels=*/{}, /*byte_array_bytes=*/90000})); EXPECT_THAT(page_stats_, ::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{30000, 60000}, - /*rep_levels=*/{90000}, + /*rep_levels=*/{}, /*byte_array_bytes=*/{90000}})); } @@ -383,7 +397,8 @@ TEST_F(SizeStatisticsRoundTripTest, MaxLevelZero) { ::arrow::TableFromJSON(schema, {R"([["foo"],["bar"]])"}), /*max_row_group_length=*/2, /*page_size=*/1024); - ReadSizeStatistics(); + ASSERT_NO_FATAL_FAILURE(ReadSizeStatistics()); + ASSERT_NO_FATAL_FAILURE(ReadData()); EXPECT_THAT(row_group_stats_, ::testing::ElementsAre(SizeStatistics{/*def_levels=*/{}, /*rep_levels=*/{}, From f16e450f17a46ac4a7cedfafab415cfc9e32f537 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 17 Jan 2025 21:33:02 +0800 Subject: [PATCH 3/4] loose the check for unencoded_byte_array_data_bytes --- cpp/src/parquet/size_statistics.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/src/parquet/size_statistics.cc b/cpp/src/parquet/size_statistics.cc index c225e324891d2..1ce6c937ad5e6 100644 --- a/cpp/src/parquet/size_statistics.cc +++ b/cpp/src/parquet/size_statistics.cc @@ -86,10 +86,6 @@ void SizeStatistics::Validate(const ColumnDescriptor* descr) const { throw ParquetException("Unencoded byte array data bytes does not support " + TypeToString(descr->physical_type())); } - if (!unencoded_byte_array_data_bytes.has_value() && - descr->physical_type() == Type::BYTE_ARRAY) { - throw ParquetException("Missing unencoded byte array data bytes"); - } } void SizeStatistics::Reset() { From cf08cc82a5f540ca881a8b462e66144dded3b957 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Mon, 20 Jan 2025 16:01:30 +0800 Subject: [PATCH 4/4] Update cpp/src/parquet/size_statistics_test.cc Co-authored-by: Antoine Pitrou --- cpp/src/parquet/size_statistics_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/size_statistics_test.cc b/cpp/src/parquet/size_statistics_test.cc index e379d38bf2bc5..90d6df57e7f43 100644 --- a/cpp/src/parquet/size_statistics_test.cc +++ b/cpp/src/parquet/size_statistics_test.cc @@ -225,7 +225,7 @@ class SizeStatisticsRoundTripTest : public ::testing::Test { auto row_group_reader = reader->RowGroup(i); for (int j = 0; j < metadata->num_columns(); ++j) { auto column_reader = row_group_reader->RecordReader(j); - ASSERT_EQ(column_reader->ReadRecords(num_rows), num_rows); + ASSERT_EQ(column_reader->ReadRecords(num_rows + 1), num_rows); } } }