From 61cd05848f30c840b8d4d468aab291cca85c1935 Mon Sep 17 00:00:00 2001 From: Rui Mo Date: Fri, 14 Jul 2023 09:14:10 +0800 Subject: [PATCH] Fix the output type of complex type vector (#359) --- .../common/SelectiveRepeatedColumnReader.cpp | 14 ++++- .../common/SelectiveRepeatedColumnReader.h | 11 ++++ .../common/SelectiveStructColumnReader.cpp | 63 ++++++++++++++++--- .../dwio/common/SelectiveStructColumnReader.h | 9 +++ .../parquet/reader/RepeatedColumnReader.cpp | 2 + .../parquet/reader/StructColumnReader.cpp | 1 + 6 files changed, 90 insertions(+), 10 deletions(-) diff --git a/velox/dwio/common/SelectiveRepeatedColumnReader.cpp b/velox/dwio/common/SelectiveRepeatedColumnReader.cpp index d08f061739de..a6d8408d0f34 100644 --- a/velox/dwio/common/SelectiveRepeatedColumnReader.cpp +++ b/velox/dwio/common/SelectiveRepeatedColumnReader.cpp @@ -192,7 +192,7 @@ void SelectiveListColumnReader::getValues(RowSet rows, VectorPtr* result) { } *result = std::make_shared( &memoryPool_, - requestedType_->type, + outputType_ ? outputType_ : requestedType_->type, anyNulls_ ? resultNulls_ : nullptr, rows.size(), offsets_, @@ -200,6 +200,11 @@ void SelectiveListColumnReader::getValues(RowSet rows, VectorPtr* result) { elements); } +void SelectiveListColumnReader::setOutputType( + const std::shared_ptr& outputType) { + outputType_ = outputType; +} + SelectiveMapColumnReader::SelectiveMapColumnReader( const std::shared_ptr& requestedType, const std::shared_ptr& dataType, @@ -279,7 +284,7 @@ void SelectiveMapColumnReader::getValues(RowSet rows, VectorPtr* result) { } *result = std::make_shared( &memoryPool_, - requestedType_->type, + outputType_ ? outputType_ : requestedType_->type, anyNulls_ ? resultNulls_ : nullptr, rows.size(), offsets_, @@ -288,4 +293,9 @@ void SelectiveMapColumnReader::getValues(RowSet rows, VectorPtr* result) { values); } +void SelectiveMapColumnReader::setOutputType( + const std::shared_ptr& outputType) { + outputType_ = outputType; +} + } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/SelectiveRepeatedColumnReader.h b/velox/dwio/common/SelectiveRepeatedColumnReader.h index 7d1149833f44..7df7f259ad33 100644 --- a/velox/dwio/common/SelectiveRepeatedColumnReader.h +++ b/velox/dwio/common/SelectiveRepeatedColumnReader.h @@ -109,8 +109,13 @@ class SelectiveListColumnReader : public SelectiveRepeatedColumnReader { void getValues(RowSet rows, VectorPtr* FOLLY_NULLABLE result) override; protected: + void setOutputType(const std::shared_ptr& outputType); + std::unique_ptr child_; const std::shared_ptr requestedType_; + + private: + std::shared_ptr outputType_ = nullptr; }; class SelectiveMapColumnReader : public SelectiveRepeatedColumnReader { @@ -138,6 +143,12 @@ class SelectiveMapColumnReader : public SelectiveRepeatedColumnReader { std::unique_ptr keyReader_; std::unique_ptr elementReader_; const std::shared_ptr requestedType_; + + protected: + void setOutputType(const std::shared_ptr& outputType); + + private: + std::shared_ptr outputType_ = nullptr; }; } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/SelectiveStructColumnReader.cpp b/velox/dwio/common/SelectiveStructColumnReader.cpp index 8c88314e5f74..644735ce9209 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.cpp +++ b/velox/dwio/common/SelectiveStructColumnReader.cpp @@ -216,6 +216,7 @@ void fillRowVectorChildren( } } } + } // namespace void SelectiveStructColumnReaderBase::getValues( @@ -228,16 +229,14 @@ void SelectiveStructColumnReaderBase::getValues( VELOX_CHECK( result->get()->type()->isRow(), "Struct reader expects a result of type ROW."); - auto& rowType = result->get()->type()->asRow(); - if (!result->unique() || result->get()->isLazy()) { + checkOutputType(outputType_, asRowType(requestedType_->type)); + const auto& outDataType = outputType_ ? outputType_ : result->get()->type(); + auto& rowType = outDataType->asRow(); + if (outputType_ || !result->unique() || result->get()->isLazy()) { std::vector children(rowType.size()); fillRowVectorChildren(*result->get()->pool(), rowType, children); *result = std::make_unique( - result->get()->pool(), - result->get()->type(), - nullptr, - 0, - std::move(children)); + result->get()->pool(), outDataType, nullptr, 0, std::move(children)); } auto* resultRow = static_cast(result->get()); resultRow->resize(rows.size()); @@ -277,7 +276,7 @@ void SelectiveStructColumnReaderBase::getValues( } resultRow->childAt(channel) = std::make_shared( &memoryPool_, - resultRow->type()->childAt(channel), + outDataType->childAt(channel), rows.size(), std::make_unique(this, children_[index], numReads_)); } else { @@ -287,4 +286,52 @@ void SelectiveStructColumnReaderBase::getValues( } } +void SelectiveStructColumnReaderBase::setOutputType( + const RowTypePtr& outputType) { + outputType_ = outputType; +} + +/** + * Check the output type against requested type on compatibility. + * @param outputType: the output type from user. + * @param requestedType: the type from Parquet. + */ +void SelectiveStructColumnReaderBase::checkOutputType( + const RowTypePtr& outputType, + const RowTypePtr& requestedType) { + if (outputType == nullptr) { + return; + } + VELOX_CHECK_NOT_NULL(requestedType); + for (int i = 0; i < outputType->size(); ++i) { + if (!requestedType->containsChild(outputType->nameOf(i))) + continue; + + bool isPartitionColumn = false; + for (const auto& childSpec : scanSpec_->children()) { + if (childSpec->fieldName() == outputType->nameOf(i) && + childSpec->isConstant()) { + isPartitionColumn = true; + break; + } + } + // Skip the type check for partition column because requested type does not + // contain it. + if (isPartitionColumn) + continue; + + const auto& childOutputType = outputType->childAt(i); + const auto& childRequestedType = + requestedType->findChild(outputType->nameOf(i)); + if (auto rowTypePtr = asRowType(childOutputType)) { + VELOX_CHECK_NOT_NULL(asRowType(childRequestedType)); + checkOutputType( + asRowType(childOutputType), asRowType(childRequestedType)); + continue; + } + VELOX_CHECK(BaseVector::compatibleKind( + childOutputType->kind(), childRequestedType->kind())); + } +} + } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/SelectiveStructColumnReader.h b/velox/dwio/common/SelectiveStructColumnReader.h index 66189b8c3f36..d9b2bee91f9e 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.h +++ b/velox/dwio/common/SelectiveStructColumnReader.h @@ -116,6 +116,8 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { return hasMutation_; } + void setOutputType(const RowTypePtr& outputType); + const std::shared_ptr requestedType_; std::vector children_; @@ -141,6 +143,13 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { // and query. Set at construction, which takes place on first // use. If no ExceptionContext is in effect, this is "". const std::string debugString_; + + private: + void checkOutputType( + const RowTypePtr& outputType, + const RowTypePtr& requestedType); + + RowTypePtr outputType_ = nullptr; }; struct SelectiveStructColumnReader : SelectiveStructColumnReaderBase { diff --git a/velox/dwio/parquet/reader/RepeatedColumnReader.cpp b/velox/dwio/parquet/reader/RepeatedColumnReader.cpp index 9bf5d4e5dff9..83fd68766ac5 100644 --- a/velox/dwio/parquet/reader/RepeatedColumnReader.cpp +++ b/velox/dwio/parquet/reader/RepeatedColumnReader.cpp @@ -125,6 +125,7 @@ MapColumnReader::MapColumnReader( scanSpec) { const std::shared_ptr& mapTypePtr = std::dynamic_pointer_cast(colType); + setOutputType(mapTypePtr); auto& keyChildType = requestedType->childAt(0); auto& elementChildType = requestedType->childAt(1); keyReader_ = ParquetColumnReader::build( @@ -249,6 +250,7 @@ ListColumnReader::ListColumnReader( auto& childType = requestedType->childAt(0); const std::shared_ptr& arrayTypePtr = std::dynamic_pointer_cast(colType); + setOutputType(arrayTypePtr); child_ = ParquetColumnReader::build( childType, params, diff --git a/velox/dwio/parquet/reader/StructColumnReader.cpp b/velox/dwio/parquet/reader/StructColumnReader.cpp index 6914b1bc7b47..456d33809611 100644 --- a/velox/dwio/parquet/reader/StructColumnReader.cpp +++ b/velox/dwio/parquet/reader/StructColumnReader.cpp @@ -47,6 +47,7 @@ StructColumnReader::StructColumnReader( rowTypePtr = asRowType(colType); VELOX_CHECK_NOT_NULL(rowTypePtr); } + setOutputType(rowTypePtr); auto& childSpecs = scanSpec_->children(); if (rowTypePtr && !caseSensitive) {