[GLUTEN-5953][VL] Prevent pushdown filters with unsupported data types to scan node #5954

Closed · wants to merge 9 commits
@@ -18,11 +18,13 @@ package org.apache.gluten.execution

import org.apache.gluten.GlutenConfig
import org.apache.gluten.backendsapi.velox.VeloxBackendSettings
import org.apache.gluten.benchmarks.RandomParquetDataGenerator
import org.apache.gluten.utils.VeloxFileSystemValidationJniWrapper

import org.apache.spark.SparkConf
import org.apache.spark.sql.catalyst.expressions.GreaterThan
import org.apache.spark.sql.execution.ScalarSubquery
import org.apache.spark.sql.types._

class VeloxScanSuite extends VeloxWholeStageTransformerSuite {
protected val rootPath: String = getClass.getResource("/").getPath
@@ -118,4 +120,34 @@ class VeloxScanSuite extends VeloxWholeStageTransformerSuite {
!VeloxFileSystemValidationJniWrapper.allSupportedByRegisteredFileSystems(
Array("file:/test_path/", "unsupported://test_path")))
}

test("unsupported data type scan filter pushdown") {
withTempView("t") {
withTempDir {
dir =>
val path = dir.getAbsolutePath
val schema = StructType(
Array(
StructField("short_decimal_field", DecimalType(5, 2), true),
StructField("long_decimal_field", DecimalType(32, 8), true),
StructField("binary_field", BinaryType, true),
StructField("timestamp_field", TimestampType, true)
))
RandomParquetDataGenerator(0).generateRandomData(spark, schema, 10, Some(path))
spark.catalog.createTable("t", path, "parquet")
runQueryAndCompare(
"""select * from t where long_decimal_field = 3.14""".stripMargin
)(checkGlutenOperatorMatch[FileSourceScanExecTransformer])
runQueryAndCompare(
"""select * from t where short_decimal_field = 3.14""".stripMargin
)(checkGlutenOperatorMatch[FileSourceScanExecTransformer])
runQueryAndCompare(
"""select * from t where binary_field = '3.14'""".stripMargin
)(checkGlutenOperatorMatch[FileSourceScanExecTransformer])
runQueryAndCompare(
"""select * from t where timestamp_field = current_timestamp()""".stripMargin
Contributor: We have made some updates to timestamp support. Could you help check whether the pushdown of timestamps is supported?

Contributor Author: Yes, I think it's already supported on the Velox side. But for now, Gluten can still fall back, since mapToFilters doesn't support it yet. I think we can support it in a follow-up PR.

Contributor: Got it. Would you like to open an issue to track its support? Thanks.

Contributor Author: Here is the issue: #6642

Contributor: Thanks.

)(checkGlutenOperatorMatch[FileSourceScanExecTransformer])
}
}
}
}
52 changes: 30 additions & 22 deletions cpp/velox/substrait/SubstraitToVeloxPlan.cc
@@ -1600,17 +1600,26 @@ bool SubstraitToVeloxPlanConverter::childrenFunctionsOnSameField(
bool SubstraitToVeloxPlanConverter::canPushdownFunction(
    const ::substrait::Expression_ScalarFunction& scalarFunction,
    const std::string& filterName,
-    uint32_t& fieldIdx) {
-  // Condtions can be pushed down.
+    uint32_t& fieldIdx,
+    const std::vector<TypePtr>& veloxTypeList) {
+  // Conditions can be pushed down.
  static const std::unordered_set<std::string> supportedFunctions = {sIsNotNull, sIsNull, sGte, sGt, sLte, sLt, sEqual};

-  bool canPushdown = false;
-  if (supportedFunctions.find(filterName) != supportedFunctions.end() &&
-      fieldOrWithLiteral(scalarFunction.arguments(), fieldIdx)) {
-    // The arg should be field or field with literal.
-    canPushdown = true;
-  }
-  return canPushdown;
+  if (supportedFunctions.find(filterName) == supportedFunctions.end()) {
+    return false;
+  }
+
+  // The arg should be field or field with literal.
+  if (!fieldOrWithLiteral(scalarFunction.arguments(), fieldIdx)) {
+    return false;
+  }
+
+  // Check whether the data type is supported or not.
+  if (!veloxTypeList.empty() && fieldIdx < veloxTypeList.size() && !isPushdownSupported(veloxTypeList.at(fieldIdx))) {

Contributor: Is it a bug if fieldIdx >= veloxTypeList.size()? Shall we just add a check to ensure it does not happen?

Contributor Author: ok

+    return false;
+  }
+
+  return true;
}
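
For intuition, a minimal sketch (not part of this diff) of what an accepted condition later becomes. Assuming Velox's public common::BigintRange filter, the test query "short_decimal_field = 3.14" on a DECIMAL(5, 2) column is representable as a single-point range over the unscaled 64-bit value, since short decimals are stored as unscaled BIGINTs:

// Hedged sketch: the subfield filter for "short_decimal_field = 3.14".
#include "velox/type/Filter.h"

using namespace facebook::velox;

std::unique_ptr<common::BigintRange> shortDecimalEquals314() {
  // 3.14 at scale 2 is the unscaled value 314. Equality is a one-point
  // range, and an equality predicate never matches NULL.
  return std::make_unique<common::BigintRange>(314, 314, /*nullAllowed=*/false);
}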

bool SubstraitToVeloxPlanConverter::canPushdownNot(
@@ -1686,6 +1695,18 @@ bool SubstraitToVeloxPlanConverter::canPushdownOr(
return true;
}

bool SubstraitToVeloxPlanConverter::isPushdownSupported(TypePtr inputType) {
  // Keep in sync with mapToFilters.
  switch (inputType->kind()) {
    case TypeKind::TIMESTAMP:
    case TypeKind::VARBINARY:
    case TypeKind::HUGEINT:
      return false;
    default:
      return true;
  }
}
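
For intuition, a minimal sketch (not part of this diff) of why these three kinds cover the new test cases: Spark's DecimalType(32, 8) maps to a Velox long decimal backed by HUGEINT, BinaryType maps to VARBINARY, and TimestampType maps to TIMESTAMP, while a short decimal such as DecimalType(5, 2) is backed by BIGINT and remains eligible for pushdown. Assuming Velox's public type factories:

// Hedged sketch: the Velox type kinds behind the Spark types used in the test.
#include <cassert>

#include "velox/type/Type.h"

using namespace facebook::velox;

int main() {
  assert(DECIMAL(5, 2)->kind() == TypeKind::BIGINT);   // short decimal: still pushed down
  assert(DECIMAL(32, 8)->kind() == TypeKind::HUGEINT); // long decimal: excluded
  assert(VARBINARY()->kind() == TypeKind::VARBINARY);  // excluded
  assert(TIMESTAMP()->kind() == TypeKind::TIMESTAMP);  // excluded
  return 0;
}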

void SubstraitToVeloxPlanConverter::separateFilters(
std::vector<RangeRecorder>& rangeRecorders,
const std::vector<::substrait::Expression_ScalarFunction>& scalarFunctions,
@@ -1712,19 +1733,6 @@
for (const auto& scalarFunction : scalarFunctions) {
auto filterNameSpec = SubstraitParser::findFunctionSpec(functionMap_, scalarFunction.function_reference());
auto filterName = SubstraitParser::getNameBeforeDelimiter(filterNameSpec);
-    // Add all decimal filters to remaining functions because their pushdown are not supported.
-    if (format == dwio::common::FileFormat::ORC && scalarFunction.arguments().size() > 0) {

Contributor: Looks like the previous logic only excluded the decimal type for the ORC format, but now we are excluding HUGEINT for all file formats. Could you confirm whether this is expected?

Contributor: The ORC format already supports short-decimal predicate pushdown. @jiangjiangtian Can you help test this?

Contributor: I verified that short-decimal predicates can be pushed down, as follows:
(screenshot omitted)

Contributor Author: The main idea for these three types (Timestamp/VarBinary/HugeInt) in this PR is that if we don't put them in remainingFilter, then when we call mapToFilters, the exception "Subfield filters creation not supported for input type '{}' in mapToFilters" will be thrown, which will cause the scan to fall back.

The reason mapToFilters doesn't support these three types is that there is no MultiRange type or related type traits defined, which has nothing to do with the file format. cc @rui-mo @jiangjiangtian @kecookier

Contributor: That means we don't actually support pushdown of long decimals in Parquet and ORC. Is that correct?

Contributor Author: @rui-mo Currently, yes. But supporting a HugeintMultiRange on the Velox side is not a big issue; I can work on it later.

-      auto value = scalarFunction.arguments().at(0).value();
-      if (value.has_selection()) {
-        uint32_t fieldIndex;
-        bool parsed = SubstraitParser::parseReferenceSegment(value.selection().direct_reference(), fieldIndex);
-        if (!parsed || (!veloxTypeList.empty() && veloxTypeList.at(fieldIndex)->isDecimal())) {
-          remainingFunctions.emplace_back(scalarFunction);
-          continue;
-        }
-      }
-    }

// Check whether NOT and OR functions can be pushed down.
// If yes, the scalar function will be added into the subfield functions.
if (filterName == sNot) {
@@ -1742,7 +1750,7 @@
} else {
// Check if the condition is supported to be pushed down.
uint32_t fieldIdx;
-      if (canPushdownFunction(scalarFunction, filterName, fieldIdx) &&
+      if (canPushdownFunction(scalarFunction, filterName, fieldIdx, veloxTypeList) &&
rangeRecorders.at(fieldIdx).setCertainRangeForFunction(filterName)) {
subfieldFunctions.emplace_back(scalarFunction);
} else {
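
To make the fallback path discussed above concrete, a minimal sketch (not Gluten's actual mapToFilters code; the function name and signature are illustrative, and only the error text is quoted from the review thread) of the guard that throws when a filter on one of the excluded kinds reaches subfield-filter creation:

// Hedged sketch of the failure mode this PR avoids by routing such filters
// into remainingFunctions, where they are evaluated as a generic remaining
// filter instead of a subfield filter.
#include "velox/common/base/Exceptions.h"
#include "velox/type/Type.h"

using namespace facebook::velox;

void checkSubfieldFilterSupported(const TypePtr& type) {
  switch (type->kind()) {
    case TypeKind::TIMESTAMP:
    case TypeKind::VARBINARY:
    case TypeKind::HUGEINT:
      // No MultiRange filter type exists for these kinds, so creation fails
      // and the whole scan falls back to vanilla Spark.
      VELOX_FAIL(
          "Subfield filters creation not supported for input type '{}' in mapToFilters",
          type->toString());
    default:
      break; // supported kinds proceed to subfield-filter creation
  }
}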
6 changes: 5 additions & 1 deletion cpp/velox/substrait/SubstraitToVeloxPlan.h
@@ -451,7 +451,8 @@ class SubstraitToVeloxPlanConverter {
static bool canPushdownFunction(
const ::substrait::Expression_ScalarFunction& scalarFunction,
const std::string& filterName,
-      uint32_t& fieldIdx);
+      uint32_t& fieldIdx,
+      const std::vector<TypePtr>& veloxTypeList);

/// Returns whether a NOT function can be pushed down.
bool canPushdownNot(
@@ -473,6 +474,9 @@
/// 'or' expression are effective on the same column.
static bool childrenFunctionsOnSameField(const ::substrait::Expression_ScalarFunction& function);

  /// Check whether the data type is supported for pushdown.
  static bool isPushdownSupported(TypePtr inputType);

/// Extract the scalar function, and set the filter info for different types
/// of columns. If reverse is true, the opposite filter info will be set.
void setFilterInfo(
@@ -22,7 +22,7 @@ import org.apache.spark.sql.types._

import com.github.javafaker.Faker

-import java.sql.Date
+import java.sql.{Date, Timestamp}
import java.util.Random

case class RandomParquetDataGenerator(initialSeed: Long = 0L) extends Logging {
@@ -67,7 +67,7 @@ case class RandomParquetDataGenerator(initialSeed: Long = 0L) extends Logging {
case DoubleType =>
faker.number().randomDouble(2, Double.MinValue.toLong, Double.MaxValue.toLong)
case DateType => new Date(faker.date().birthday().getTime)
-      // case TimestampType => new Timestamp(faker.date().birthday().getTime)
+      case TimestampType => new Timestamp(faker.date().birthday().getTime)
case t: DecimalType =>
BigDecimal(
faker.number().randomDouble(t.scale, 0, Math.pow(10, t.precision - t.scale).toLong))
@@ -124,7 +124,7 @@ case class RandomParquetDataGenerator(initialSeed: Long = 0L) extends Logging {
() => StructField(fieldName, FloatType, nullable = true),
() => StructField(fieldName, DoubleType, nullable = true),
() => StructField(fieldName, DateType, nullable = true),
-      // () => StructField(fieldName, TimestampType, nullable = true),
+      () => StructField(fieldName, TimestampType, nullable = true),
() => StructField(fieldName, DecimalType(10, 2), nullable = true),
() => StructField(fieldName, DecimalType(30, 10), nullable = true)
)