Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-45283: [C++][Parquet] Omit level histogram when max level is 0 #45285

Merged
merged 4 commits into from
Jan 21, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
@@ -1609,6 +1609,9 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<

auto add_levels = [](std::vector<int64_t>& level_histogram,
::arrow::util::span<const int16_t> levels, int16_t max_level) {
if (max_level == 0) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor nit: should this be a constant?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO this is already clear.

return;
}
ARROW_DCHECK_EQ(static_cast<size_t>(max_level) + 1, level_histogram.size());
::parquet::UpdateLevelHistogram(levels, level_histogram);
};
40 changes: 26 additions & 14 deletions cpp/src/parquet/size_statistics.cc
Original file line number Diff line number Diff line change
@@ -64,23 +64,28 @@ void SizeStatistics::IncrementUnencodedByteArrayDataBytes(int64_t value) {
}

void SizeStatistics::Validate(const ColumnDescriptor* descr) const {
if (repetition_level_histogram.size() !=
static_cast<size_t>(descr->max_repetition_level() + 1)) {
throw ParquetException("Repetition level histogram size mismatch");
}
if (definition_level_histogram.size() !=
static_cast<size_t>(descr->max_definition_level() + 1)) {
throw ParquetException("Definition level histogram size mismatch");
}
auto validate_histogram = [](const std::vector<int64_t>& histogram, int16_t max_level,
const std::string& name) {
if (histogram.empty()) {
// A levels histogram is always allowed to be missing.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also allow for missing unencoded_byte_array_data_bytes below? Maybe a different PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I just thought about this and will loose it as well.

return;
}
if (histogram.size() != static_cast<size_t>(max_level + 1)) {
std::stringstream ss;
ss << name << " level histogram size mismatch, size: " << histogram.size()
<< ", expected: " << (max_level + 1);
throw ParquetException(ss.str());
}
};
validate_histogram(repetition_level_histogram, descr->max_repetition_level(),
"Repetition");
validate_histogram(definition_level_histogram, descr->max_definition_level(),
"Definition");
if (unencoded_byte_array_data_bytes.has_value() &&
descr->physical_type() != Type::BYTE_ARRAY) {
throw ParquetException("Unencoded byte array data bytes does not support " +
TypeToString(descr->physical_type()));
}
if (!unencoded_byte_array_data_bytes.has_value() &&
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this can be also emit ( though now we didn't do that?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Writers are free not to write unencoded_byte_array_data_bytes therefore we shouldn't assume it exists.

descr->physical_type() == Type::BYTE_ARRAY) {
throw ParquetException("Missing unencoded byte array data bytes");
}
}

void SizeStatistics::Reset() {
@@ -93,8 +98,15 @@ void SizeStatistics::Reset() {

std::unique_ptr<SizeStatistics> SizeStatistics::Make(const ColumnDescriptor* descr) {
auto size_stats = std::make_unique<SizeStatistics>();
size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0);
size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0);
// If the max level is 0, the level histogram can be omitted because it contains
// only single level (a.k.a. 0) and its count is equivalent to `num_values` of the
// column chunk or data page.
if (descr->max_repetition_level() != 0) {
size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0);
}
if (descr->max_definition_level() != 0) {
size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0);
}
if (descr->physical_type() == Type::BYTE_ARRAY) {
size_stats->unencoded_byte_array_data_bytes = 0;
}
49 changes: 41 additions & 8 deletions cpp/src/parquet/size_statistics_test.cc
Original file line number Diff line number Diff line change
@@ -216,6 +216,20 @@ class SizeStatisticsRoundTripTest : public ::testing::Test {
}
}

void ReadData() {
auto reader =
ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer_));
auto metadata = reader->metadata();
for (int i = 0; i < metadata->num_row_groups(); ++i) {
int64_t num_rows = metadata->RowGroup(i)->num_rows();
auto row_group_reader = reader->RowGroup(i);
for (int j = 0; j < metadata->num_columns(); ++j) {
auto column_reader = row_group_reader->RecordReader(j);
ASSERT_EQ(column_reader->ReadRecords(num_rows + 1), num_rows);
}
}
}

void Reset() { buffer_.reset(); }

protected:
@@ -300,23 +314,23 @@ TEST_F(SizeStatisticsRoundTripTest, WriteDictionaryArray) {
ReadSizeStatistics();
EXPECT_THAT(row_group_stats_,
::testing::ElementsAre(SizeStatistics{/*def_levels=*/{0, 2},
/*rep_levels=*/{2},
/*rep_levels=*/{},
/*byte_array_bytes=*/5},
SizeStatistics{/*def_levels=*/{1, 1},
/*rep_levels=*/{2},
/*rep_levels=*/{},
/*byte_array_bytes=*/1},
SizeStatistics{/*def_levels=*/{0, 2},
/*rep_levels=*/{2},
/*rep_levels=*/{},
/*byte_array_bytes=*/4}));
EXPECT_THAT(page_stats_,
::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{0, 2},
/*rep_levels=*/{2},
/*rep_levels=*/{},
/*byte_array_bytes=*/{5}},
PageSizeStatistics{/*def_levels=*/{1, 1},
/*rep_levels=*/{2},
/*rep_levels=*/{},
/*byte_array_bytes=*/{1}},
PageSizeStatistics{/*def_levels=*/{0, 2},
/*rep_levels=*/{2},
/*rep_levels=*/{},
/*byte_array_bytes=*/{4}}));
}

@@ -368,12 +382,31 @@ TEST_F(SizeStatisticsRoundTripTest, LargePage) {
ReadSizeStatistics();
EXPECT_THAT(row_group_stats_,
::testing::ElementsAre(SizeStatistics{/*def_levels=*/{30000, 60000},
/*rep_levels=*/{90000},
/*rep_levels=*/{},
/*byte_array_bytes=*/90000}));
EXPECT_THAT(page_stats_,
::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{30000, 60000},
/*rep_levels=*/{90000},
/*rep_levels=*/{},
/*byte_array_bytes=*/{90000}}));
}

TEST_F(SizeStatisticsRoundTripTest, MaxLevelZero) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add a file to parquet-testing that has max_def==1 and omits the def histogram?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think if any open-sourced Parquet writer is able to create this yet...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could coerce one into doing so. But with the change allowing empty histograms, it's probably not worth creating one now.

auto schema =
::arrow::schema({::arrow::field("a", ::arrow::utf8(), /*nullable=*/false)});
WriteFile(SizeStatisticsLevel::PageAndColumnChunk,
::arrow::TableFromJSON(schema, {R"([["foo"],["bar"]])"}),
/*max_row_group_length=*/2,
/*page_size=*/1024);
ASSERT_NO_FATAL_FAILURE(ReadSizeStatistics());
ASSERT_NO_FATAL_FAILURE(ReadData());
EXPECT_THAT(row_group_stats_,
::testing::ElementsAre(SizeStatistics{/*def_levels=*/{},
/*rep_levels=*/{},
/*byte_array_bytes=*/6}));
EXPECT_THAT(page_stats_,
::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{},
/*rep_levels=*/{},
/*byte_array_bytes=*/{6}}));
}

} // namespace parquet