Add file metadata columns support for Spark Parquet #7880

7 changes: 5 additions & 2 deletions velox/connectors/hive/HiveConnectorSplit.h
@@ -33,6 +33,7 @@ struct HiveConnectorSplit : public connector::ConnectorSplit {
std::unordered_map<std::string, std::string> customSplitInfo;
std::shared_ptr<std::string> extraFileInfo;
std::unordered_map<std::string, std::string> serdeParameters;
std::unordered_map<std::string, std::string> metadataColumns;

HiveConnectorSplit(
const std::string& connectorId,
@@ -45,7 +46,8 @@ struct HiveConnectorSplit : public connector::ConnectorSplit {
std::optional<int32_t> _tableBucketNumber = std::nullopt,
const std::unordered_map<std::string, std::string>& _customSplitInfo = {},
const std::shared_ptr<std::string>& _extraFileInfo = {},
const std::unordered_map<std::string, std::string>& _serdeParameters = {})
const std::unordered_map<std::string, std::string>& _serdeParameters = {},
const std::unordered_map<std::string, std::string>& _metadataColumns = {})
: ConnectorSplit(connectorId),
filePath(_filePath),
fileFormat(_fileFormat),
@@ -55,7 +57,8 @@ struct HiveConnectorSplit : public connector::ConnectorSplit {
tableBucketNumber(_tableBucketNumber),
customSplitInfo(_customSplitInfo),
extraFileInfo(_extraFileInfo),
serdeParameters(_serdeParameters) {}
serdeParameters(_serdeParameters),
metadataColumns(_metadataColumns) {}

std::string toString() const override {
if (tableBucketNumber.has_value()) {
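Note (not part of the diff): a minimal usage sketch for the new field, assuming the constructor parameters elided above (file start/length, partition keys) keep their defaults; the connector id, file path, and metadata column names/values are placeholders.

#include "velox/connectors/hive/HiveConnectorSplit.h"

using facebook::velox::connector::hive::HiveConnectorSplit;
using facebook::velox::dwio::common::FileFormat;

// Build a split and attach per-file metadata values. metadataColumns maps a
// metadata column name to its string-encoded value for this particular file;
// SplitReader::adaptColumns later turns each entry into a per-split constant.
auto split = std::make_shared<HiveConnectorSplit>(
    "test-hive", "/data/part-00000.parquet", FileFormat::PARQUET);
split->metadataColumns["file_size"] = "1024";
split->metadataColumns["file_modification_time"] = "2023-12-05 10:00:00";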
9 changes: 8 additions & 1 deletion velox/connectors/hive/HiveDataSource.cpp
@@ -382,6 +382,9 @@ HiveDataSource::HiveDataSource(
if (handle->columnType() == HiveColumnHandle::ColumnType::kPartitionKey) {
partitionKeys_.emplace(handle->name(), handle);
}
if (handle->columnType() == HiveColumnHandle::ColumnType::kMetadata) {
metadataColumns_.emplace(handle->name(), handle);
}
}

std::vector<std::string> readerRowNames;
@@ -468,6 +471,7 @@ HiveDataSource::HiveDataSource(
filters,
hiveTableHandle_->dataColumns(),
partitionKeys_,
metadataColumns_,
pool_);
if (remainingFilter) {
metadataFilter_ = std::make_shared<common::MetadataFilter>(
@@ -724,6 +728,8 @@ std::shared_ptr<common::ScanSpec> HiveDataSource::makeScanSpec(
const RowTypePtr& dataColumns,
const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
partitionKeys,
const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
metadataColumns,
memory::MemoryPool* pool) {
auto spec = std::make_shared<common::ScanSpec>("root");
folly::F14FastMap<std::string, std::vector<const common::Subfield*>>
@@ -782,7 +788,8 @@
// $bucket column. This filter is redundant and needs to be removed.
// TODO Remove this check when Presto is fixed to not specify a filter
// on $path and $bucket column.
if (auto name = pair.first.toString(); name == kPath || name == kBucket) {
if (auto name = pair.first.toString();
name == kPath || name == kBucket || metadataColumns.count(name) != 0) {
continue;
}
auto fieldSpec = spec->getOrCreateChild(pair.first);
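Note (not part of the diff): the call shape of the updated makeScanSpec, mirroring the test call sites further down; the "file_size" metadata column and the nullptr handle are placeholders. As with $path and $bucket, a filter keyed on a metadata column is not turned into a child ScanSpec here.

// metadataColumns is keyed on metadata column name, like partitionKeys.
std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>> metadataColumns{
    {"file_size", nullptr}};
auto scanSpec = HiveDataSource::makeScanSpec(
    rowType,          // scan output row type
    {},               // required subfields grouped by top-level column
    filters,          // subfield filters
    rowType,          // table data columns
    {},               // partition keys
    metadataColumns,  // new: metadata columns
    pool_.get());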
7 changes: 7 additions & 0 deletions velox/connectors/hive/HiveDataSource.h
@@ -82,6 +82,8 @@ class HiveDataSource : public DataSource {
const RowTypePtr& dataColumns,
const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
partitionKeys,
const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
metadataColumns,
memory::MemoryPool* pool);

// Internal API, made public to be accessible in unit tests. Do not use in
@@ -117,6 +119,11 @@ class HiveDataSource : public DataSource {
std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>
partitionKeys_;

// Column handles for the metadata columns keyed on metadata column
// name.
std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>
metadataColumns_;

private:
// Evaluates remainingFilter_ on the specified vector. Returns number of rows
// passed. Populates filterEvalCtx_.selectedIndices and selectedBits if only
15 changes: 12 additions & 3 deletions velox/connectors/hive/SplitReader.cpp
@@ -204,9 +204,18 @@ std::vector<TypePtr> SplitReader::adaptColumns(
auto* childSpec = childrenSpecs[i].get();
const std::string& fieldName = childSpec->fieldName();

auto iter = hiveSplit_->partitionKeys.find(fieldName);
if (iter != hiveSplit_->partitionKeys.end()) {
setPartitionValue(childSpec, fieldName, iter->second);
auto partitionKey = hiveSplit_->partitionKeys.find(fieldName);
auto metadataColumn = hiveSplit_->metadataColumns.find(fieldName);
if (partitionKey != hiveSplit_->partitionKeys.end()) {
setPartitionValue(childSpec, fieldName, partitionKey->second);
} else if (metadataColumn != hiveSplit_->metadataColumns.end()) {
auto metadataColumnOutputType =
readerOutputType_->childAt(readerOutputType_->getChildIdx(fieldName));
auto constValue = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
convertFromString,
metadataColumnOutputType->kind(),
std::make_optional(metadataColumn->second));
setConstantValue(childSpec, metadataColumnOutputType, constValue);
} else if (fieldName == kPath) {
setConstantValue(
childSpec, VARCHAR(), velox::variant(hiveSplit_->filePath));
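Note (not part of the diff): the metadata branch above parallels the existing $path handling, except that the constant's type comes from readerOutputType_ and the split's string value is parsed first. A rough reading of that dispatch, assuming convertFromString yields a velox::variant as it does for partition values:

// For a BIGINT output column with metadataColumn->second == "1024", the macro
// instantiates convertFromString<TypeKind::BIGINT>, yielding variant(int64_t{1024});
// setConstantValue then pins that value for every row read from this split.
auto constValue = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
    convertFromString,
    metadataColumnOutputType->kind(),
    std::make_optional(metadataColumn->second));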
1 change: 1 addition & 0 deletions velox/connectors/hive/TableHandle.cpp
@@ -25,6 +25,7 @@ columnTypeNames() {
{HiveColumnHandle::ColumnType::kPartitionKey, "PartitionKey"},
{HiveColumnHandle::ColumnType::kRegular, "Regular"},
{HiveColumnHandle::ColumnType::kSynthesized, "Synthesized"},
{HiveColumnHandle::ColumnType::kMetadata, "Metadata"},
};
}

2 changes: 1 addition & 1 deletion velox/connectors/hive/TableHandle.h
@@ -28,7 +28,7 @@ using SubfieldFilters =

class HiveColumnHandle : public ColumnHandle {
public:
enum class ColumnType { kPartitionKey, kRegular, kSynthesized };
enum class ColumnType { kPartitionKey, kRegular, kSynthesized, kMetadata };

/// NOTE: 'dataType' is the column type in target write table. 'hiveType' is
/// converted type of the corresponding column in source table which might not
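Note (not part of the diff): a hedged sketch of declaring a metadata column on a scan, assuming the existing HiveColumnHandle constructor order (name, columnType, dataType, hiveType); the "file_size" name and BIGINT type are placeholders.

// A handle of ColumnType::kMetadata is resolved from the split's
// metadataColumns map rather than read from the data file.
auto fileSizeColumn = std::make_shared<HiveColumnHandle>(
    "file_size",
    HiveColumnHandle::ColumnType::kMetadata,
    BIGINT(),
    BIGINT());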
17 changes: 14 additions & 3 deletions velox/connectors/hive/tests/HiveConnectorTest.cpp
@@ -87,7 +87,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_multilevel) {
auto rowType = ROW({{"c0", columnType}});
auto subfields = makeSubfields({"c0.c0c1[3][\"foo\"].c0c1c0"});
auto scanSpec = HiveDataSource::makeScanSpec(
rowType, groupSubfields(subfields), {}, nullptr, {}, pool_.get());
rowType, groupSubfields(subfields), {}, nullptr, {}, {}, pool_.get());
auto* c0c0 = scanSpec->childByName("c0")->childByName("c0c0");
validateNullConstant(*c0c0, *BIGINT());
auto* c0c1 = scanSpec->childByName("c0")->childByName("c0c1");
@@ -122,6 +122,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_mergeFields) {
{},
nullptr,
{},
{},
pool_.get());
auto* c0c0 = scanSpec->childByName("c0")->childByName("c0c0");
ASSERT_FALSE(c0c0->childByName("c0c0c0")->isConstant());
@@ -144,6 +145,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_mergeArray) {
{},
nullptr,
{},
{},
pool_.get());
auto* c0 = scanSpec->childByName("c0");
ASSERT_EQ(c0->maxArrayElementsCount(), 2);
@@ -161,7 +163,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_mergeArrayNegative) {
auto groupedSubfields = groupSubfields(subfields);
VELOX_ASSERT_USER_THROW(
HiveDataSource::makeScanSpec(
rowType, groupedSubfields, {}, nullptr, {}, pool_.get()),
rowType, groupedSubfields, {}, nullptr, {}, {}, pool_.get()),
"Non-positive array subscript cannot be push down");
}

@@ -176,6 +178,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_mergeMap) {
{},
nullptr,
{},
{},
pool_.get());
auto* c0 = scanSpec->childByName("c0");
auto* keysFilter = c0->childByName(ScanSpec::kMapKeysFieldName)->filter();
@@ -201,6 +204,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_allSubscripts) {
{},
nullptr,
{},
{},
pool_.get());
auto* c0 = scanSpec->childByName("c0");
ASSERT_FALSE(c0->childByName(ScanSpec::kMapKeysFieldName)->filter());
@@ -219,6 +223,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_allSubscripts) {
{},
nullptr,
{},
{},
pool_.get());
auto* c0 = scanSpec->childByName("c0");
ASSERT_FALSE(c0->childByName(ScanSpec::kMapKeysFieldName)->filter());
@@ -241,6 +246,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_doubleMapKey) {
{},
nullptr,
{},
{},
pool_.get());
auto* keysFilter = scanSpec->childByName("c0")
->childByName(ScanSpec::kMapKeysFieldName)
@@ -268,6 +274,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_doubleMapKey) {
{},
nullptr,
{},
{},
pool_.get());
keysFilter = scanSpec->childByName("c0")
->childByName(ScanSpec::kMapKeysFieldName)
@@ -286,6 +293,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_doubleMapKey) {
{},
nullptr,
{},
{},
pool_.get());
keysFilter = scanSpec->childByName("c0")
->childByName(ScanSpec::kMapKeysFieldName)
@@ -301,6 +309,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_doubleMapKey) {
{},
nullptr,
{},
{},
pool_.get());
keysFilter = scanSpec->childByName("c0")
->childByName(ScanSpec::kMapKeysFieldName)
@@ -336,6 +345,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_filtersNotInRequiredSubfields) {
filters,
ROW({{"c0", c0Type}, {"c1", c1Type}}),
{},
{},
pool_.get());
auto c0 = scanSpec->childByName("c0");
ASSERT_FALSE(c0->isConstant());
@@ -380,6 +390,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_duplicateSubfields) {
{},
nullptr,
{},
{},
pool_.get());
auto* c0 = scanSpec->childByName("c0");
ASSERT_EQ(c0->children().size(), 2);
@@ -393,7 +404,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_filterPartitionKey) {
SubfieldFilters filters;
filters.emplace(Subfield("ds"), exec::equal("2023-10-13"));
auto scanSpec = HiveDataSource::makeScanSpec(
rowType, {}, filters, rowType, {{"ds", nullptr}}, pool_.get());
rowType, {}, filters, rowType, {{"ds", nullptr}}, {}, pool_.get());
ASSERT_TRUE(scanSpec->childByName("c0")->projectOut());
ASSERT_FALSE(scanSpec->childByName("ds")->projectOut());
}