-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-45283: [C++][Parquet] Omit level histogram when max level is 0 #45285
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,23 +64,28 @@ void SizeStatistics::IncrementUnencodedByteArrayDataBytes(int64_t value) { | |
} | ||
|
||
void SizeStatistics::Validate(const ColumnDescriptor* descr) const { | ||
if (repetition_level_histogram.size() != | ||
static_cast<size_t>(descr->max_repetition_level() + 1)) { | ||
throw ParquetException("Repetition level histogram size mismatch"); | ||
} | ||
if (definition_level_histogram.size() != | ||
static_cast<size_t>(descr->max_definition_level() + 1)) { | ||
throw ParquetException("Definition level histogram size mismatch"); | ||
} | ||
auto validate_histogram = [](const std::vector<int64_t>& histogram, int16_t max_level, | ||
const std::string& name) { | ||
if (histogram.empty()) { | ||
// A levels histogram is always allowed to be missing. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we also allow for missing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I just thought about this and will loose it as well. |
||
return; | ||
} | ||
if (histogram.size() != static_cast<size_t>(max_level + 1)) { | ||
std::stringstream ss; | ||
ss << name << " level histogram size mismatch, size: " << histogram.size() | ||
<< ", expected: " << (max_level + 1); | ||
throw ParquetException(ss.str()); | ||
} | ||
}; | ||
validate_histogram(repetition_level_histogram, descr->max_repetition_level(), | ||
"Repetition"); | ||
validate_histogram(definition_level_histogram, descr->max_definition_level(), | ||
"Definition"); | ||
if (unencoded_byte_array_data_bytes.has_value() && | ||
descr->physical_type() != Type::BYTE_ARRAY) { | ||
throw ParquetException("Unencoded byte array data bytes does not support " + | ||
TypeToString(descr->physical_type())); | ||
} | ||
if (!unencoded_byte_array_data_bytes.has_value() && | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So this can be also emit ( though now we didn't do that?) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Writers are free not to write |
||
descr->physical_type() == Type::BYTE_ARRAY) { | ||
throw ParquetException("Missing unencoded byte array data bytes"); | ||
} | ||
} | ||
|
||
void SizeStatistics::Reset() { | ||
|
@@ -93,8 +98,15 @@ void SizeStatistics::Reset() { | |
|
||
std::unique_ptr<SizeStatistics> SizeStatistics::Make(const ColumnDescriptor* descr) { | ||
auto size_stats = std::make_unique<SizeStatistics>(); | ||
size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0); | ||
size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0); | ||
// If the max level is 0, the level histogram can be omitted because it contains | ||
// only single level (a.k.a. 0) and its count is equivalent to `num_values` of the | ||
// column chunk or data page. | ||
if (descr->max_repetition_level() != 0) { | ||
size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0); | ||
} | ||
if (descr->max_definition_level() != 0) { | ||
size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0); | ||
} | ||
if (descr->physical_type() == Type::BYTE_ARRAY) { | ||
size_stats->unencoded_byte_array_data_bytes = 0; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -216,6 +216,20 @@ class SizeStatisticsRoundTripTest : public ::testing::Test { | |
} | ||
} | ||
|
||
void ReadData() { | ||
auto reader = | ||
ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer_)); | ||
auto metadata = reader->metadata(); | ||
for (int i = 0; i < metadata->num_row_groups(); ++i) { | ||
int64_t num_rows = metadata->RowGroup(i)->num_rows(); | ||
auto row_group_reader = reader->RowGroup(i); | ||
for (int j = 0; j < metadata->num_columns(); ++j) { | ||
auto column_reader = row_group_reader->RecordReader(j); | ||
ASSERT_EQ(column_reader->ReadRecords(num_rows + 1), num_rows); | ||
} | ||
} | ||
} | ||
|
||
void Reset() { buffer_.reset(); } | ||
|
||
protected: | ||
|
@@ -300,23 +314,23 @@ TEST_F(SizeStatisticsRoundTripTest, WriteDictionaryArray) { | |
ReadSizeStatistics(); | ||
EXPECT_THAT(row_group_stats_, | ||
::testing::ElementsAre(SizeStatistics{/*def_levels=*/{0, 2}, | ||
/*rep_levels=*/{2}, | ||
/*rep_levels=*/{}, | ||
/*byte_array_bytes=*/5}, | ||
SizeStatistics{/*def_levels=*/{1, 1}, | ||
/*rep_levels=*/{2}, | ||
/*rep_levels=*/{}, | ||
/*byte_array_bytes=*/1}, | ||
SizeStatistics{/*def_levels=*/{0, 2}, | ||
/*rep_levels=*/{2}, | ||
/*rep_levels=*/{}, | ||
/*byte_array_bytes=*/4})); | ||
EXPECT_THAT(page_stats_, | ||
::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{0, 2}, | ||
/*rep_levels=*/{2}, | ||
/*rep_levels=*/{}, | ||
/*byte_array_bytes=*/{5}}, | ||
PageSizeStatistics{/*def_levels=*/{1, 1}, | ||
/*rep_levels=*/{2}, | ||
/*rep_levels=*/{}, | ||
/*byte_array_bytes=*/{1}}, | ||
PageSizeStatistics{/*def_levels=*/{0, 2}, | ||
/*rep_levels=*/{2}, | ||
/*rep_levels=*/{}, | ||
/*byte_array_bytes=*/{4}})); | ||
} | ||
|
||
|
@@ -368,12 +382,31 @@ TEST_F(SizeStatisticsRoundTripTest, LargePage) { | |
ReadSizeStatistics(); | ||
EXPECT_THAT(row_group_stats_, | ||
::testing::ElementsAre(SizeStatistics{/*def_levels=*/{30000, 60000}, | ||
/*rep_levels=*/{90000}, | ||
/*rep_levels=*/{}, | ||
/*byte_array_bytes=*/90000})); | ||
EXPECT_THAT(page_stats_, | ||
::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{30000, 60000}, | ||
/*rep_levels=*/{90000}, | ||
/*rep_levels=*/{}, | ||
/*byte_array_bytes=*/{90000}})); | ||
} | ||
|
||
TEST_F(SizeStatisticsRoundTripTest, MaxLevelZero) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we add a file to parquet-testing that has max_def==1 and omits the def histogram? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think if any open-sourced Parquet writer is able to create this yet... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I could coerce one into doing so. But with the change allowing empty histograms, it's probably not worth creating one now. |
||
auto schema = | ||
::arrow::schema({::arrow::field("a", ::arrow::utf8(), /*nullable=*/false)}); | ||
WriteFile(SizeStatisticsLevel::PageAndColumnChunk, | ||
::arrow::TableFromJSON(schema, {R"([["foo"],["bar"]])"}), | ||
/*max_row_group_length=*/2, | ||
/*page_size=*/1024); | ||
ASSERT_NO_FATAL_FAILURE(ReadSizeStatistics()); | ||
ASSERT_NO_FATAL_FAILURE(ReadData()); | ||
EXPECT_THAT(row_group_stats_, | ||
::testing::ElementsAre(SizeStatistics{/*def_levels=*/{}, | ||
/*rep_levels=*/{}, | ||
/*byte_array_bytes=*/6})); | ||
EXPECT_THAT(page_stats_, | ||
::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{}, | ||
/*rep_levels=*/{}, | ||
/*byte_array_bytes=*/{6}})); | ||
} | ||
|
||
} // namespace parquet |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
minor nit: should this be a constant?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMHO this is already clear.