From fbf06366b865157425eb4bb289eba3213df35b0a Mon Sep 17 00:00:00 2001
From: PHILO-HE
Date: Thu, 7 Mar 2024 17:15:37 +0800
Subject: [PATCH] Revert "Revert "Allow HiveSplit info columns like
 '$file_size' and '$file_modified_time' to be queried in SQL (#8800)""

This reverts commit d3dc172630d532432de3856d7f677b3d3ab91ac8.
---
 velox/connectors/hive/HiveConnectorSplit.h    | 10 ++-
 velox/connectors/hive/HiveConnectorUtil.cpp   | 28 ++++++--
 velox/connectors/hive/HiveConnectorUtil.h     |  7 +-
 velox/connectors/hive/HiveDataSource.cpp      |  7 +-
 velox/connectors/hive/HiveDataSource.h        |  4 ++
 velox/connectors/hive/SplitReader.cpp         | 18 ++++-
 .../connectors/hive/iceberg/IcebergSplit.cpp  | 18 +++--
 velox/connectors/hive/iceberg/IcebergSplit.h  |  6 +-
 .../hive/tests/HiveConnectorTest.cpp          | 17 ++++-
 velox/exec/tests/TableScanTest.cpp            | 70 +++++++++++++++++++
 .../tests/utils/HiveConnectorTestBase.cpp     | 22 +++++-
 .../exec/tests/utils/HiveConnectorTestBase.h  | 39 ++++++++++-
 velox/exec/tests/utils/TempFilePath.h         | 13 ++++
 13 files changed, 235 insertions(+), 24 deletions(-)
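Reviewer note (not part of the diff): this patch threads a per-split string
map, infoColumns, from the split through scan-spec construction down to the
reader, so synthesized columns such as $file_size and $file_modified_time can
be projected and filtered like ordinary columns. A minimal sketch of the
intended usage, built only from APIs added below (literal values are
illustrative):

    // Split side: stamp per-file metadata onto the split.
    auto split = HiveConnectorSplitBuilder("/tmp/data.dwrf")
                     .infoColumn("$file_size", "1024")
                     .infoColumn("$file_modified_time", "1709800000")
                     .build();
    // Query side: the columns then answer SQL such as
    //   SELECT "$file_size" FROM t WHERE "$file_modified_time" > 1709000000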
diff --git a/velox/connectors/hive/HiveConnectorSplit.h b/velox/connectors/hive/HiveConnectorSplit.h
index 10fa9206ec2d..48f39f64bec2 100644
--- a/velox/connectors/hive/HiveConnectorSplit.h
+++ b/velox/connectors/hive/HiveConnectorSplit.h
@@ -39,6 +39,10 @@ struct HiveConnectorSplit : public connector::ConnectorSplit {
   std::shared_ptr<std::string> extraFileInfo;
   std::unordered_map<std::string, std::string> serdeParameters;
 
+  /// These represent columns like $file_size, $file_modified_time that are
+  /// associated with the HiveSplit.
+  std::unordered_map<std::string, std::string> infoColumns;
+
   HiveConnectorSplit(
       const std::string& connectorId,
       const std::string& _filePath,
@@ -51,7 +55,8 @@ struct HiveConnectorSplit : public connector::ConnectorSplit {
       const std::unordered_map<std::string, std::string>& _customSplitInfo = {},
       const std::shared_ptr<std::string>& _extraFileInfo = {},
       const std::unordered_map<std::string, std::string>& _serdeParameters = {},
-      int64_t _splitWeight = 0)
+      int64_t _splitWeight = 0,
+      const std::unordered_map<std::string, std::string>& _infoColumns = {})
       : ConnectorSplit(connectorId, _splitWeight),
         filePath(_filePath),
         fileFormat(_fileFormat),
@@ -61,7 +66,8 @@ struct HiveConnectorSplit : public connector::ConnectorSplit {
         tableBucketNumber(_tableBucketNumber),
         customSplitInfo(_customSplitInfo),
         extraFileInfo(_extraFileInfo),
-        serdeParameters(_serdeParameters) {}
+        serdeParameters(_serdeParameters),
+        infoColumns(_infoColumns) {}
 
   std::string toString() const override {
     if (tableBucketNumber.has_value()) {
diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp
index c43eecd31fc6..3ff34cc50e84 100644
--- a/velox/connectors/hive/HiveConnectorUtil.cpp
+++ b/velox/connectors/hive/HiveConnectorUtil.cpp
@@ -239,6 +239,13 @@ inline uint8_t parseDelimiter(const std::string& delim) {
   return stoi(delim);
 }
 
+inline bool isSynthesizedColumn(
+    const std::string& name,
+    const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
+        infoColumns) {
+  return name == kPath || name == kBucket || infoColumns.count(name) != 0;
+}
+
 } // namespace
 
 const std::string& getColumnName(const common::Subfield& subfield) {
@@ -273,9 +280,13 @@ void checkColumnNameLowerCase(const std::shared_ptr<const Type>& type) {
   }
 }
 
-void checkColumnNameLowerCase(const SubfieldFilters& filters) {
+void checkColumnNameLowerCase(
+    const SubfieldFilters& filters,
+    const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
+        infoColumns) {
   for (auto& pair : filters) {
-    if (auto name = pair.first.toString(); name == kPath || name == kBucket) {
+    if (auto name = pair.first.toString();
+        isSynthesizedColumn(name, infoColumns)) {
       continue;
     }
     auto& path = pair.first.path();
@@ -310,6 +321,8 @@ std::shared_ptr<common::ScanSpec> makeScanSpec(
     const RowTypePtr& dataColumns,
     const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
         partitionKeys,
+    const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
+        infoColumns,
     memory::MemoryPool* pool) {
   auto spec = std::make_shared<common::ScanSpec>("root");
   folly::F14FastMap<std::string, std::vector<const common::Subfield*>>
@@ -317,7 +330,8 @@ std::shared_ptr<common::ScanSpec> makeScanSpec(
   std::vector<SubfieldSpec> subfieldSpecs;
   for (auto& [subfield, _] : filters) {
     if (auto name = subfield.toString();
-        name != kPath && name != kBucket && partitionKeys.count(name) == 0) {
+        !isSynthesizedColumn(name, infoColumns) &&
+        partitionKeys.count(name) == 0) {
       filterSubfields[getColumnName(subfield)].push_back(&subfield);
     }
   }
@@ -364,11 +378,13 @@ std::shared_ptr<common::ScanSpec> makeScanSpec(
     // SelectiveColumnReader doesn't support constant columns with filters,
     // hence, we can't have a filter for a $path or $bucket column.
     //
-    // Unfortunately, Presto happens to specify a filter for $path or
-    // $bucket column. This filter is redundant and needs to be removed.
+    // Unfortunately, Presto happens to specify a filter for $path, $file_size,
+    // $file_modified_time or $bucket column. This filter is redundant and needs
+    // to be removed.
     // TODO Remove this check when Presto is fixed to not specify a filter
     // on $path and $bucket column.
-    if (auto name = pair.first.toString(); name == kPath || name == kBucket) {
+    if (auto name = pair.first.toString();
+        isSynthesizedColumn(name, infoColumns)) {
       continue;
     }
     auto fieldSpec = spec->getOrCreateChild(pair.first);
diff --git a/velox/connectors/hive/HiveConnectorUtil.h b/velox/connectors/hive/HiveConnectorUtil.h
index 4ba219e6b6b7..4c5fe743966f 100644
--- a/velox/connectors/hive/HiveConnectorUtil.h
+++ b/velox/connectors/hive/HiveConnectorUtil.h
@@ -40,7 +40,10 @@ const std::string& getColumnName(const common::Subfield& subfield);
 
 void checkColumnNameLowerCase(const std::shared_ptr<const Type>& type);
 
-void checkColumnNameLowerCase(const SubfieldFilters& filters);
+void checkColumnNameLowerCase(
+    const SubfieldFilters& filters,
+    const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
+        infoColumns);
 
 void checkColumnNameLowerCase(const core::TypedExprPtr& typeExpr);
 
@@ -52,6 +55,8 @@ std::shared_ptr<common::ScanSpec> makeScanSpec(
     const RowTypePtr& dataColumns,
     const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
         partitionKeys,
+    const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
+        infoColumns,
     memory::MemoryPool* pool);
 
 void configureReaderOptions(
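Reviewer note (not part of the diff): isSynthesizedColumn() now decides, in
one place, which filter names must not become ScanSpec filters: $path,
$bucket, and any name present in the infoColumns handle map. A sketch of the
call shape with the new argument (every name here except makeScanSpec() itself
is illustrative):

    // 'fileSizeHandle' is a hypothetical HiveColumnHandle of kSynthesized
    // type; the other arguments come from the surrounding connector state.
    std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>> infoCols;
    infoCols.emplace("$file_size", fileSizeHandle);
    auto scanSpec = makeScanSpec(
        rowType, outputSubfields, filters, dataColumns, partitionKeys,
        infoCols, pool);
    // Filters keyed on "$file_size" are dropped here; the value is instead
    // pinned per split by SplitReader (below) and checked by the remaining
    // filter.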
diff --git a/velox/connectors/hive/HiveDataSource.cpp b/velox/connectors/hive/HiveDataSource.cpp
index 5216113ddeb8..e70fea4e3a71 100644
--- a/velox/connectors/hive/HiveDataSource.cpp
+++ b/velox/connectors/hive/HiveDataSource.cpp
@@ -57,6 +57,10 @@ HiveDataSource::HiveDataSource(
     if (handle->columnType() == HiveColumnHandle::ColumnType::kPartitionKey) {
       partitionKeys_.emplace(handle->name(), handle);
     }
+
+    if (handle->columnType() == HiveColumnHandle::ColumnType::kSynthesized) {
+      infoColumns_.emplace(handle->name(), handle);
+    }
   }
 
   std::vector<std::string> readerRowNames;
@@ -88,7 +92,7 @@ HiveDataSource::HiveDataSource(
   if (hiveConfig_->isFileColumnNamesReadAsLowerCase(
           connectorQueryCtx->sessionProperties())) {
     checkColumnNameLowerCase(outputType_);
-    checkColumnNameLowerCase(hiveTableHandle_->subfieldFilters());
+    checkColumnNameLowerCase(hiveTableHandle_->subfieldFilters(), infoColumns_);
     checkColumnNameLowerCase(hiveTableHandle_->remainingFilter());
   }
 
@@ -152,6 +156,7 @@ HiveDataSource::HiveDataSource(
       filters,
       hiveTableHandle_->dataColumns(),
      partitionKeys_,
+      infoColumns_,
       pool_);
   if (remainingFilter) {
     metadataFilter_ = std::make_shared<common::MetadataFilter>(
diff --git a/velox/connectors/hive/HiveDataSource.h b/velox/connectors/hive/HiveDataSource.h
index 3272f068552a..8674839485c7 100644
--- a/velox/connectors/hive/HiveDataSource.h
+++ b/velox/connectors/hive/HiveDataSource.h
@@ -119,6 +119,10 @@ class HiveDataSource : public DataSource {
   // The row type for the data source output, not including filter-only
   // columns.
   const RowTypePtr outputType_;
+
+  // Column handles for the Split info columns keyed on their column names.
+  std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>
+      infoColumns_;
   std::shared_ptr<common::MetadataFilter> metadataFilter_;
   std::unique_ptr<exec::ExprSet> remainingFilterExprSet_;
   RowVectorPtr emptyOutput_;
diff --git a/velox/connectors/hive/SplitReader.cpp b/velox/connectors/hive/SplitReader.cpp
index 1b8daa58829c..e7c0e8302dd7 100644
--- a/velox/connectors/hive/SplitReader.cpp
+++ b/velox/connectors/hive/SplitReader.cpp
@@ -218,9 +218,9 @@ std::vector<TypePtr> SplitReader::adaptColumns(
     auto* childSpec = childrenSpecs[i].get();
     const std::string& fieldName = childSpec->fieldName();
 
-    auto iter = hiveSplit_->partitionKeys.find(fieldName);
-    if (iter != hiveSplit_->partitionKeys.end()) {
-      setPartitionValue(childSpec, fieldName, iter->second);
+    if (auto it = hiveSplit_->partitionKeys.find(fieldName);
+        it != hiveSplit_->partitionKeys.end()) {
+      setPartitionValue(childSpec, fieldName, it->second);
     } else if (fieldName == kPath) {
       auto constantVec = std::make_shared<ConstantVector<StringView>>(
           connectorQueryCtx_->memoryPool(),
@@ -240,6 +240,18 @@ std::vector<TypePtr> SplitReader::adaptColumns(
             std::move(bucket));
         childSpec->setConstantValue(constantVec);
       }
+    } else if (auto iter = hiveSplit_->infoColumns.find(fieldName);
+               iter != hiveSplit_->infoColumns.end()) {
+      auto infoColumnType =
+          readerOutputType_->childAt(readerOutputType_->getChildIdx(fieldName));
+      auto constant = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH_ALL(
+          newConstantFromString,
+          infoColumnType->kind(),
+          infoColumnType,
+          iter->second,
+          1,
+          connectorQueryCtx_->memoryPool());
+      childSpec->setConstantValue(constant);
     } else {
       auto fileTypeIdx = fileType->getChildIdxIfExists(fieldName);
       if (!fileTypeIdx.has_value()) {
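Reviewer note (not part of the diff): adaptColumns() now resolves an output
column in order: partition key, $path, $bucket, then the split's infoColumns
map, and only then the file schema. The info value travels as a string and is
pinned as a one-row constant vector of the column's declared type, so
downstream operators see it exactly like a partition-key constant. Conceptual
sketch for a BIGINT $file_size of 1024 (standalone names are illustrative and
assume the helpers visible in SplitReader.cpp):

    // The dispatch instantiates newConstantFromString<TypeKind::BIGINT> and
    // yields a constant vector repeated for every row of the split.
    auto type = BIGINT();
    auto constant = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH_ALL(
        newConstantFromString, type->kind(), type, std::string("1024"), 1, pool);
    childSpec->setConstantValue(constant);  // as in the diff above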
diff --git a/velox/connectors/hive/iceberg/IcebergSplit.cpp b/velox/connectors/hive/iceberg/IcebergSplit.cpp
index 7fa9a52f2c69..747d70869f53 100644
--- a/velox/connectors/hive/iceberg/IcebergSplit.cpp
+++ b/velox/connectors/hive/iceberg/IcebergSplit.cpp
@@ -30,7 +30,8 @@ HiveIcebergSplit::HiveIcebergSplit(
         _partitionKeys,
     std::optional<int32_t> _tableBucketNumber,
     const std::unordered_map<std::string, std::string>& _customSplitInfo,
-    const std::shared_ptr<std::string>& _extraFileInfo)
+    const std::shared_ptr<std::string>& _extraFileInfo,
+    const std::unordered_map<std::string, std::string>& _infoColumns)
     : HiveConnectorSplit(
           _connectorId,
           _filePath,
@@ -38,7 +39,12 @@ HiveIcebergSplit::HiveIcebergSplit(
           _start,
           _length,
           _partitionKeys,
-          _tableBucketNumber) {
+          _tableBucketNumber,
+          _customSplitInfo,
+          _extraFileInfo,
+          {},
+          0,
+          _infoColumns) {
   // TODO: Deserialize _extraFileInfo to get deleteFiles;
 }
 
@@ -54,7 +60,8 @@ HiveIcebergSplit::HiveIcebergSplit(
     std::optional<int32_t> _tableBucketNumber,
    const std::unordered_map<std::string, std::string>& _customSplitInfo,
     const std::shared_ptr<std::string>& _extraFileInfo,
-    std::vector<IcebergDeleteFile> _deletes)
+    std::vector<IcebergDeleteFile> _deletes,
+    const std::unordered_map<std::string, std::string>& _infoColumns)
     : HiveConnectorSplit(
           _connectorId,
           _filePath,
@@ -64,6 +71,9 @@ HiveIcebergSplit::HiveIcebergSplit(
           _partitionKeys,
           _tableBucketNumber,
           _customSplitInfo,
-          _extraFileInfo),
+          _extraFileInfo,
+          {},
+          0,
+          _infoColumns),
       deleteFiles(_deletes) {}
 } // namespace facebook::velox::connector::hive::iceberg
diff --git a/velox/connectors/hive/iceberg/IcebergSplit.h b/velox/connectors/hive/iceberg/IcebergSplit.h
index 05bd70f9820a..972a48c8f5e9 100644
--- a/velox/connectors/hive/iceberg/IcebergSplit.h
+++ b/velox/connectors/hive/iceberg/IcebergSplit.h
@@ -36,7 +36,8 @@ struct HiveIcebergSplit : public connector::hive::HiveConnectorSplit {
         _partitionKeys = {},
     std::optional<int32_t> _tableBucketNumber = std::nullopt,
     const std::unordered_map<std::string, std::string>& _customSplitInfo = {},
-    const std::shared_ptr<std::string>& _extraFileInfo = {});
+    const std::shared_ptr<std::string>& _extraFileInfo = {},
+    const std::unordered_map<std::string, std::string>& _infoColumns = {});
 
   // For tests only
   HiveIcebergSplit(
@@ -50,7 +51,8 @@ struct HiveIcebergSplit : public connector::hive::HiveConnectorSplit {
     std::optional<int32_t> _tableBucketNumber = std::nullopt,
     const std::unordered_map<std::string, std::string>& _customSplitInfo = {},
     const std::shared_ptr<std::string>& _extraFileInfo = {},
-    std::vector<IcebergDeleteFile> deletes = {});
+    std::vector<IcebergDeleteFile> deletes = {},
+    const std::unordered_map<std::string, std::string>& _infoColumns = {});
 };
 
 } // namespace facebook::velox::connector::hive::iceberg
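Reviewer note (not part of the diff): besides forwarding the new _infoColumns,
the first constructor now also passes _customSplitInfo and _extraFileInfo to
the base class, which it previously accepted but dropped. The bare {} and 0 in
the middle of the argument lists are the base constructor's _serdeParameters
and _splitWeight, supplied positionally. Sketch of a call against the
test-only constructor (all values illustrative):

    HiveIcebergSplit split(
        "test-hive",                        // connector id
        "s3://bucket/tbl/part-0.parquet",   // file path
        dwio::common::FileFormat::PARQUET,
        0,                                  // start
        1024,                               // length
        {},                                 // partition keys
        std::nullopt,                       // table bucket number
        {},                                 // custom split info
        nullptr,                            // extra file info
        {},                                 // delete files
        {{"$file_size", "1024"}});          // info columns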
diff --git a/velox/connectors/hive/tests/HiveConnectorTest.cpp b/velox/connectors/hive/tests/HiveConnectorTest.cpp
index 23859fff6f6f..c58edffe999f 100644
--- a/velox/connectors/hive/tests/HiveConnectorTest.cpp
+++ b/velox/connectors/hive/tests/HiveConnectorTest.cpp
@@ -87,7 +87,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_multilevel) {
   auto rowType = ROW({{"c0", columnType}});
   auto subfields = makeSubfields({"c0.c0c1[3][\"foo\"].c0c1c0"});
   auto scanSpec = makeScanSpec(
-      rowType, groupSubfields(subfields), {}, nullptr, {}, pool_.get());
+      rowType, groupSubfields(subfields), {}, nullptr, {}, {}, pool_.get());
   auto* c0c0 = scanSpec->childByName("c0")->childByName("c0c0");
   validateNullConstant(*c0c0, *BIGINT());
   auto* c0c1 = scanSpec->childByName("c0")->childByName("c0c1");
@@ -122,6 +122,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_mergeFields) {
       {},
       nullptr,
       {},
+      {},
       pool_.get());
   auto* c0c0 = scanSpec->childByName("c0")->childByName("c0c0");
   ASSERT_FALSE(c0c0->childByName("c0c0c0")->isConstant());
@@ -144,6 +145,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_mergeArray) {
       {},
       nullptr,
       {},
+      {},
       pool_.get());
   auto* c0 = scanSpec->childByName("c0");
   ASSERT_EQ(c0->maxArrayElementsCount(), 2);
@@ -160,7 +162,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_mergeArrayNegative) {
   auto subfields = makeSubfields({"c0[1].c0c0", "c0[-1].c0c2"});
   auto groupedSubfields = groupSubfields(subfields);
   VELOX_ASSERT_USER_THROW(
-      makeScanSpec(rowType, groupedSubfields, {}, nullptr, {}, pool_.get()),
+      makeScanSpec(rowType, groupedSubfields, {}, nullptr, {}, {}, pool_.get()),
       "Non-positive array subscript cannot be push down");
 }
 
@@ -175,6 +177,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_mergeMap) {
       {},
       nullptr,
       {},
+      {},
       pool_.get());
   auto* c0 = scanSpec->childByName("c0");
   auto* keysFilter = c0->childByName(ScanSpec::kMapKeysFieldName)->filter();
@@ -200,6 +203,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_allSubscripts) {
       {},
       nullptr,
       {},
+      {},
       pool_.get());
   auto* c0 = scanSpec->childByName("c0");
   ASSERT_FALSE(c0->childByName(ScanSpec::kMapKeysFieldName)->filter());
@@ -218,6 +222,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_allSubscripts) {
       {},
       nullptr,
       {},
+      {},
       pool_.get());
   auto* c0 = scanSpec->childByName("c0");
   ASSERT_FALSE(c0->childByName(ScanSpec::kMapKeysFieldName)->filter());
@@ -240,6 +245,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_doubleMapKey) {
       {},
       nullptr,
       {},
+      {},
       pool_.get());
   auto* keysFilter = scanSpec->childByName("c0")
                          ->childByName(ScanSpec::kMapKeysFieldName)
@@ -267,6 +273,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_doubleMapKey) {
       {},
       nullptr,
       {},
+      {},
       pool_.get());
   keysFilter = scanSpec->childByName("c0")
                    ->childByName(ScanSpec::kMapKeysFieldName)
                    ->filter();
@@ -285,6 +292,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_doubleMapKey) {
       {},
       nullptr,
       {},
+      {},
       pool_.get());
   keysFilter = scanSpec->childByName("c0")
                    ->childByName(ScanSpec::kMapKeysFieldName)
@@ -300,6 +308,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_doubleMapKey) {
       {},
       nullptr,
       {},
+      {},
       pool_.get());
   keysFilter = scanSpec->childByName("c0")
                    ->childByName(ScanSpec::kMapKeysFieldName)
@@ -335,6 +344,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_filtersNotInRequiredSubfields) {
       filters,
       ROW({{"c0", c0Type}, {"c1", c1Type}}),
       {},
+      {},
       pool_.get());
   auto c0 = scanSpec->childByName("c0");
   ASSERT_FALSE(c0->isConstant());
@@ -379,6 +389,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_duplicateSubfields) {
       {},
       nullptr,
       {},
+      {},
       pool_.get());
   auto* c0 = scanSpec->childByName("c0");
   ASSERT_EQ(c0->children().size(), 2);
@@ -392,7 +403,7 @@ TEST_F(HiveConnectorTest, makeScanSpec_filterPartitionKey) {
   SubfieldFilters filters;
   filters.emplace(Subfield("ds"), exec::equal("2023-10-13"));
   auto scanSpec = makeScanSpec(
-      rowType, {}, filters, rowType, {{"ds", nullptr}}, pool_.get());
+      rowType, {}, filters, rowType, {{"ds", nullptr}}, {}, pool_.get());
   ASSERT_TRUE(scanSpec->childByName("c0")->projectOut());
   ASSERT_FALSE(scanSpec->childByName("ds")->projectOut());
 }
diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp
index 394c733fd445..2d35b347bddd 100644
--- a/velox/exec/tests/TableScanTest.cpp
+++ b/velox/exec/tests/TableScanTest.cpp
@@ -2443,6 +2443,76 @@ TEST_F(TableScanTest, path) {
       op, {filePath}, fmt::format("SELECT '{}', * FROM tmp", pathValue));
 }
 
+TEST_F(TableScanTest, fileSizeAndModifiedTime) {
+  auto rowType = ROW({"a"}, {BIGINT()});
+  auto filePath = makeFilePaths(1)[0];
+  auto vector = makeVectors(1, 10, rowType)[0];
+  writeToFile(filePath->path, vector);
+  createDuckDbTable({vector});
+
+  static const char* kSize = "$file_size";
+  static const char* kModifiedTime = "$file_modified_time";
+
+  auto allColumns =
+      ROW({"a", kSize, kModifiedTime}, {BIGINT(), BIGINT(), BIGINT()});
+
+  auto assignments = allRegularColumns(rowType);
+  assignments[kSize] = synthesizedColumn(kSize, BIGINT());
+  assignments[kModifiedTime] = synthesizedColumn(kModifiedTime, BIGINT());
+
+  auto fileSizeValue = fmt::format("{}", filePath->fileSize());
+  auto fileTimeValue = fmt::format("{}", filePath->fileModifiedTime());
+
+  // Select and project both '$file_size', '$file_modified_time'.
+  auto op = PlanBuilder()
+                .startTableScan()
+                .outputType(allColumns)
+                .dataColumns(allColumns)
+                .assignments(assignments)
+                .endTableScan()
+                .planNode();
+  assertQuery(
+      op,
+      {filePath},
+      fmt::format("SELECT *, {}, {} FROM tmp", fileSizeValue, fileTimeValue));
+
+  auto filterTest = [&](const std::string& filter) {
+    auto tableHandle = makeTableHandle(
+        SubfieldFilters{},
+        parseExpr(filter, allColumns),
+        "hive_table",
+        allColumns);
+
+    // Use synthesized column in a filter but don't project it.
+    op = PlanBuilder()
+             .startTableScan()
+             .outputType(rowType)
+             .dataColumns(allColumns)
+             .tableHandle(tableHandle)
+             .assignments(assignments)
+             .endTableScan()
+             .planNode();
+    assertQuery(op, {filePath}, "SELECT * FROM tmp");
+
+    // Use synthesized column in a filter and project it out.
+    op = PlanBuilder()
+             .startTableScan()
+             .outputType(allColumns)
+             .dataColumns(allColumns)
+             .tableHandle(tableHandle)
+             .assignments(assignments)
+             .endTableScan()
+             .planNode();
+    assertQuery(
+        op,
+        {filePath},
+        fmt::format("SELECT *, {}, {} FROM tmp", fileSizeValue, fileTimeValue));
+  };
+
+  filterTest(fmt::format("\"{}\" = {}", kSize, fileSizeValue));
+  filterTest(fmt::format("\"{}\" = {}", kModifiedTime, fileTimeValue));
+}
+
 TEST_F(TableScanTest, bucket) {
   vector_size_t size = 1'000;
   int numBatches = 5;
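Reviewer note (not part of the diff): the test only passes because every split
handed to assertQuery() carries real values for the synthesized columns it
declares; makeHiveConnectorSplits() below stamps stat(2) results onto each
split. What that amounts to for a single file ('filePath' being a TempFilePath
from makeFilePaths()):

    // Equivalent of the helper added below, spelled out with the builder.
    auto split =
        HiveConnectorSplitBuilder(filePath->path)
            .infoColumn("$file_size", fmt::format("{}", filePath->fileSize()))
            .infoColumn(
                "$file_modified_time",
                fmt::format("{}", filePath->fileModifiedTime()))
            .build();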
diff --git a/velox/exec/tests/utils/HiveConnectorTestBase.cpp b/velox/exec/tests/utils/HiveConnectorTestBase.cpp
index ece0d3545bdf..c3c6ccb2a166 100644
--- a/velox/exec/tests/utils/HiveConnectorTestBase.cpp
+++ b/velox/exec/tests/utils/HiveConnectorTestBase.cpp
@@ -171,7 +171,12 @@ HiveConnectorTestBase::makeHiveConnectorSplits(
     const std::vector<std::shared_ptr<TempFilePath>>& filePaths) {
   std::vector<std::shared_ptr<connector::ConnectorSplit>> splits;
   for (auto filePath : filePaths) {
-    splits.push_back(makeHiveConnectorSplit(filePath->path));
+    splits.push_back(makeHiveConnectorSplit(
+        filePath->path,
+        filePath->fileSize(),
+        filePath->fileModifiedTime(),
+        0,
+        std::numeric_limits<uint64_t>::max()));
   }
   return splits;
 }
@@ -189,6 +194,21 @@ HiveConnectorTestBase::makeHiveConnectorSplit(
       .build();
 }
 
+std::shared_ptr<connector::hive::HiveConnectorSplit>
+HiveConnectorTestBase::makeHiveConnectorSplit(
+    const std::string& filePath,
+    int64_t fileSize,
+    int64_t fileModifiedTime,
+    uint64_t start,
+    uint64_t length) {
+  return HiveConnectorSplitBuilder(filePath)
+      .infoColumn("$file_size", fmt::format("{}", fileSize))
+      .infoColumn("$file_modified_time", fmt::format("{}", fileModifiedTime))
+      .start(start)
+      .length(length)
+      .build();
+}
+
 // static
 std::shared_ptr<connector::hive::HiveInsertTableHandle>
 HiveConnectorTestBase::makeHiveInsertTableHandle(
diff --git a/velox/exec/tests/utils/HiveConnectorTestBase.h b/velox/exec/tests/utils/HiveConnectorTestBase.h
index 8f7a03fe6cff..34019c5d65b2 100644
--- a/velox/exec/tests/utils/HiveConnectorTestBase.h
+++ b/velox/exec/tests/utils/HiveConnectorTestBase.h
@@ -74,6 +74,13 @@ class HiveConnectorTestBase : public OperatorTestBase {
       uint64_t length = std::numeric_limits<uint64_t>::max(),
       int64_t splitWeight = 0);
 
+  static std::shared_ptr<connector::hive::HiveConnectorSplit> makeHiveConnectorSplit(
+      const std::string& filePath,
+      int64_t fileSize,
+      int64_t fileModifiedTime,
+      uint64_t start,
+      uint64_t length);
+
   /// Split file at path 'filePath' into 'splitCount' splits. If not local file,
   /// file size can be given as 'externalSize'.
   static std::vector<std::shared_ptr<connector::hive::HiveConnectorSplit>>
@@ -208,6 +215,13 @@ class HiveConnectorSplitBuilder {
     return *this;
   }
 
+  HiveConnectorSplitBuilder& infoColumn(
+      const std::string& name,
+      const std::string& value) {
+    infoColumns_.emplace(std::move(name), std::move(value));
+    return *this;
+  }
+
   HiveConnectorSplitBuilder& partitionKey(
       std::string name,
       std::optional<std::string> value) {
@@ -220,6 +234,24 @@ class HiveConnectorSplitBuilder {
     return *this;
   }
 
+  HiveConnectorSplitBuilder& customSplitInfo(
+      const std::unordered_map<std::string, std::string>& customSplitInfo) {
+    customSplitInfo_ = customSplitInfo;
+    return *this;
+  }
+
+  HiveConnectorSplitBuilder& extraFileInfo(
+      const std::shared_ptr<std::string>& extraFileInfo) {
+    extraFileInfo_ = extraFileInfo;
+    return *this;
+  }
+
+  HiveConnectorSplitBuilder& serdeParameters(
+      const std::unordered_map<std::string, std::string>& serdeParameters) {
+    serdeParameters_ = serdeParameters;
+    return *this;
+  }
+
   HiveConnectorSplitBuilder& connectorId(const std::string& connectorId) {
     connectorId_ = connectorId;
     return *this;
@@ -240,7 +272,8 @@ class HiveConnectorSplitBuilder {
         customSplitInfo,
         extraFileInfo,
         serdeParameters,
-        splitWeight_);
+        splitWeight_,
+        infoColumns_);
   }
 
  private:
@@ -250,6 +283,10 @@ class HiveConnectorSplitBuilder {
   uint64_t length_{std::numeric_limits<uint64_t>::max()};
   std::unordered_map<std::string, std::optional<std::string>> partitionKeys_;
   std::optional<int32_t> tableBucketNumber_;
+  std::unordered_map<std::string, std::string> customSplitInfo_ = {};
+  std::shared_ptr<std::string> extraFileInfo_ = {};
+  std::unordered_map<std::string, std::string> serdeParameters_ = {};
+  std::unordered_map<std::string, std::string> infoColumns_ = {};
   std::string connectorId_ = kHiveConnectorId;
   int64_t splitWeight_{0};
 };
diff --git a/velox/exec/tests/utils/TempFilePath.h b/velox/exec/tests/utils/TempFilePath.h
index cf615b413138..d993795f1e3a 100644
--- a/velox/exec/tests/utils/TempFilePath.h
+++ b/velox/exec/tests/utils/TempFilePath.h
@@ -15,6 +15,7 @@
  */
 
 #pragma once
+#include <sys/stat.h>
 #include <fstream>
 #include <memory>
 #include <string>
@@ -47,6 +48,18 @@ class TempFilePath {
     file.close();
   }
 
+  const int64_t fileSize() {
+    struct stat st;
+    stat(path.data(), &st);
+    return st.st_size;
+  }
+
+  const int64_t fileModifiedTime() {
+    struct stat st;
+    stat(path.data(), &st);
+    return st.st_mtime;
+  }
+
  private:
   int fd
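Reviewer note (not part of the diff): fileSize() and fileModifiedTime() ignore
the return value of stat(2), which is acceptable for freshly written temp
files but reads an uninitialized struct if the call ever fails; note also that
st_mtime has whole-second resolution. A hardened variant for use outside tests
might look like this (hypothetical helper, not part of the patch):

    #include <sys/stat.h>
    #include <cstdint>
    #include <stdexcept>
    #include <string>

    int64_t fileSizeOrThrow(const std::string& path) {
      struct stat st;
      if (::stat(path.c_str(), &st) != 0) {
        throw std::runtime_error("stat failed for: " + path);
      }
      return st.st_size;
    }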