[VL] Enable more tests of GlutenParquetIOSuite for Spark32/33/34 (#4823)
Yohahaha authored Mar 1, 2024
1 parent 96e1c3a commit 731c84c
Showing 5 changed files with 9 additions and 71 deletions.
@@ -838,23 +838,11 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenParquetInteroperabilitySuite]
.exclude("parquet timestamp conversion")
enableSuite[GlutenParquetIOSuite]
// Disable Spark's vectorized reading tests.
.exclude("Standard mode - fixed-length decimals")
.exclude("Legacy mode - fixed-length decimals")
.exclude("SPARK-34167: read LongDecimals with precision < 10, VectorizedReader true")
.exclude("read dictionary encoded decimals written as FIXED_LEN_BYTE_ARRAY")
.exclude("read dictionary encoded decimals written as INT64")
.exclude("read dictionary encoded decimals written as INT32")
.exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
// Spark plans scan schema as (i16/i32/i64) so the fallback does not take effect.
// But Velox reads data based on the schema acquired from file metadata,
// while i8 is not supported, so an error occurs.
.exclude("SPARK-34817: Read UINT_8/UINT_16/UINT_32 from parquet")
// Exception.
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
// Timestamp is read as INT96.
// Velox only supports reading Timestamp as INT96 for now.
.exclude("read dictionary and plain encoded timestamp_millis written as INT64")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
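
For context on the pattern above: enableSuite registers a Gluten test suite and each .exclude opts an individual test name back out, so this commit enables tests simply by deleting exclude entries. A minimal sketch of that builder follows; the class and method internals here are assumptions for illustration, not Gluten's actual BackendTestSettings code.

import scala.collection.mutable
import scala.reflect.ClassTag

// Hedged sketch of the enable/exclude builder used above. Names and
// internals are assumptions, not Gluten's real implementation.
abstract class BackendTestSettingsSketch {
  private val excludedTests = mutable.Map.empty[String, mutable.Set[String]]

  final class SuiteSettings(suiteName: String) {
    // Each .exclude call records a test name to skip for this suite.
    def exclude(testName: String): SuiteSettings = {
      excludedTests.getOrElseUpdate(suiteName, mutable.Set.empty[String]) += testName
      this
    }
  }

  // enableSuite[SomeSuite] turns the suite class into a settings builder.
  def enableSuite[T: ClassTag]: SuiteSettings =
    new SuiteSettings(implicitly[ClassTag[T]].runtimeClass.getName)

  // A test harness would consult this before running each test.
  def shouldRun(suiteName: String, testName: String): Boolean =
    !excludedTests.get(suiteName).exists(_.contains(testName))
}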
@@ -658,30 +658,20 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenParquetInteroperabilitySuite]
.exclude("parquet timestamp conversion")
enableSuite[GlutenParquetIOSuite]
// Disable Spark's vectorized reading tests.
.exclude("Standard mode - fixed-length decimals")
.exclude("Legacy mode - fixed-length decimals")
.exclude("SPARK-34167: read LongDecimals with precision < 10, VectorizedReader true")
.exclude("read dictionary encoded decimals written as FIXED_LEN_BYTE_ARRAY")
.exclude("read dictionary encoded decimals written as INT64")
.exclude("read dictionary encoded decimals written as INT32")
.exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
// Spark plans scan schema as (i16/i32/i64) so the fallback does not take effect.
// But Velox reads data based on the schema acquired from file metadata,
// while i8 is not supported, so an error occurs.
.exclude("SPARK-34817: Read UINT_8/UINT_16/UINT_32 from parquet")
// Exception.
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
// Timestamp is read as INT96.
// Velox only supports reading Timestamp as INT96 for now.
.exclude("read dictionary and plain encoded timestamp_millis written as INT64")
// TODO: Unsupported Array schema in Parquet.
.exclude("vectorized reader: optional array with required elements")
.exclude("vectorized reader: required array with required elements")
.exclude("vectorized reader: required array with optional elements")
.exclude("vectorized reader: required array with legacy format")
.exclude("SPARK-36726: test incorrect Parquet row group file offset")
// add support in native reader
.exclude("SPARK-41096: FIXED_LEN_BYTE_ARRAY support")
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
// Timezone is not supported yet.
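
The "vectorized reader: ... array ..." exclusions above guard tests that round-trip array columns through Parquet. A plain Spark sketch of that shape of test follows; the path, object name, and session setup are illustrative, not Gluten test code.

import org.apache.spark.sql.SparkSession

object ArrayRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("array-roundtrip").master("local[1]").getOrCreate()
    import spark.implicits._

    // Write an array column to Parquet, then read it back; the excluded
    // tests do this under Spark's vectorized reader with various
    // required/optional element nullabilities.
    val path = "/tmp/array_roundtrip.parquet" // illustrative location
    Seq(Seq(1, 2, 3), Seq(4, 5)).toDF("ids").write.mode("overwrite").parquet(path)
    spark.read.parquet(path).show()

    spark.stop()
  }
}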
@@ -17,26 +17,13 @@
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.spark.sql._
import org.apache.spark.sql.internal.SQLConf

/** A test suite that tests basic Parquet I/O. */
class GlutenParquetIOSuite extends ParquetIOSuite with GlutenSQLTestsBaseTrait {
override protected val vectorizedReaderEnabledKey: String =
SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key + "_DISABLED"
override protected val vectorizedReaderNestedEnabledKey: String =
SQLConf.PARQUET_VECTORIZED_READER_NESTED_COLUMN_ENABLED.key + "_DISABLED"

override protected def testFile(fileName: String): String = {
getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString + "/" + fileName
}
override def withAllParquetReaders(code: => Unit): Unit = {
// test the row-based reader
withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
withClue("Parquet-mr reader") {
code
}
}
}

override protected def readResourceParquetFile(name: String): DataFrame = {
spark.read.parquet(testFile(name))
}
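
For contrast with the withAllParquetReaders override shown above, which pins the suite to the parquet-mr branch only: upstream, Spark's test helper runs the body once per reader. The sketch below is paraphrased from memory of Spark's test utilities, not copied verbatim.

import org.apache.spark.sql.internal.SQLConf

trait AllParquetReadersSketch {
  // Provided by Spark's SQLTestUtils in the real suites.
  protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit

  // Approximately what Spark's upstream helper does: exercise the body
  // under both reader implementations.
  protected def withAllParquetReaders(code: => Unit): Unit = {
    // parquet-mr (row-based) reader
    withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false")(code)
    // Spark's vectorized reader
    withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true")(code)
  }
}

Judging by this hunk's line counts (26 old lines to 13 new, i.e. 14 deletions and a single addition) and the commit's stated goal, the override and the "_DISABLED" key hack above it appear to be what is being removed, letting the suite exercise both reader configurations again.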
@@ -642,35 +642,21 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenParquetIOSuite]
// Velox doesn't write file metadata into parquet file.
.exclude("Write Spark version into Parquet metadata")
// Disable Spark's vectorized reading tests.
.exclude("Standard mode - fixed-length decimals")
.exclude("Legacy mode - fixed-length decimals")
.exclude("SPARK-34167: read LongDecimals with precision < 10, VectorizedReader true")
.exclude("read dictionary encoded decimals written as FIXED_LEN_BYTE_ARRAY")
.exclude("read dictionary encoded decimals written as INT64")
.exclude("read dictionary encoded decimals written as INT32")
.exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
// Spark plans scan schema as (i16/i32/i64) so the fallback does not take effect.
// But Velox reads data based on the schema acquired from file metadata,
// while i8 is not supported, so an error occurs.
.exclude("SPARK-34817: Read UINT_8/UINT_16/UINT_32 from parquet")
// Exception.
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
// Timestamp is read as INT96.
// Velox only supports reading Timestamp as INT96 for now.
.exclude("read dictionary and plain encoded timestamp_millis written as INT64")
.exclude("Read TimestampNTZ and TimestampLTZ for various logical TIMESTAMP types")
// TODO: Unsupported Array schema in Parquet.
.exclude("vectorized reader: optional array with required elements")
.exclude("vectorized reader: required array with required elements")
.exclude("vectorized reader: required array with optional elements")
.exclude("vectorized reader: required array with legacy format")
.exclude("SPARK-36726: test incorrect Parquet row group file offset")
// add support in native reader
.exclude("SPARK-41096: FIXED_LEN_BYTE_ARRAY support")
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
// Timestamp is read as INT96
.exclude("Read TimestampNTZ and TimestampLTZ for various logical TIMESTAMP types")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
// Timezone is not supported yet.
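
On the INT96 exclusions above: Spark picks the physical timestamp encoding via spark.sql.parquet.outputTimestampType (INT96, TIMESTAMP_MICROS, or TIMESTAMP_MILLIS), and the excluded tests cover the INT64-based variants that the comments say Velox cannot read yet. A minimal sketch of forcing the INT96 encoding on write follows; the path and object name are illustrative.

import java.sql.Timestamp
import org.apache.spark.sql.SparkSession

object Int96WriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("int96-write").master("local[1]").getOrCreate()
    import spark.implicits._

    // INT96 is the legacy Impala-style encoding; the alternatives,
    // TIMESTAMP_MICROS and TIMESTAMP_MILLIS, are stored as INT64.
    spark.conf.set("spark.sql.parquet.outputTimestampType", "INT96")

    val path = "/tmp/ts_int96.parquet" // illustrative location
    Seq(Timestamp.valueOf("2024-03-01 00:00:00")).toDF("ts")
      .write.mode("overwrite").parquet(path)
    spark.read.parquet(path).show(false)

    spark.stop()
  }
}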
@@ -17,26 +17,13 @@
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.spark.sql._
import org.apache.spark.sql.internal.SQLConf

/** A test suite that tests basic Parquet I/O. */
class GlutenParquetIOSuite extends ParquetIOSuite with GlutenSQLTestsBaseTrait {
override protected val vectorizedReaderEnabledKey: String =
SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key + "_DISABLED"
override protected val vectorizedReaderNestedEnabledKey: String =
SQLConf.PARQUET_VECTORIZED_READER_NESTED_COLUMN_ENABLED.key + "_DISABLED"

override protected def testFile(fileName: String): String = {
getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString + "/" + fileName
}
override def withAllParquetReaders(code: => Unit): Unit = {
// test the row-based reader
withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
withClue("Parquet-mr reader") {
code
}
}
}

override protected def readResourceParquetFile(name: String): DataFrame = {
spark.read.parquet(testFile(name))
}