[VL] Enable more tests of GlutenParquetIOSuite for Spark32/33/34 (#4823)
Yohahaha authored Mar 1, 2024
1 parent 96e1c3a commit 731c84c
Showing 5 changed files with 9 additions and 71 deletions.
@@ -838,23 +838,11 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenParquetInteroperabilitySuite]
.exclude("parquet timestamp conversion")
enableSuite[GlutenParquetIOSuite]
// Disable Spark's vectorized reading tests.
.exclude("Standard mode - fixed-length decimals")
.exclude("Legacy mode - fixed-length decimals")
.exclude("SPARK-34167: read LongDecimals with precision < 10, VectorizedReader true")
.exclude("read dictionary encoded decimals written as FIXED_LEN_BYTE_ARRAY")
.exclude("read dictionary encoded decimals written as INT64")
.exclude("read dictionary encoded decimals written as INT32")
.exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
// Spark plans scan schema as (i16/i32/i64) so the fallback does not take effect.
// But Velox reads data based on the schema acquired from file metadata,
// while i8 is not supported, so an error occurs.
.exclude("SPARK-34817: Read UINT_8/UINT_16/UINT_32 from parquet")
// Exception.
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
// Timestamp is read as INT96.
// Velox only supports reading Timestamp as INT96 for now.
.exclude("read dictionary and plain encoded timestamp_millis written as INT64")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
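
For context on the pattern above: enableSuite registers a Gluten test suite and each .exclude opts an individual test name back out, so this commit enables tests simply by deleting exclude entries. A minimal sketch of that builder follows; the class and method internals here are assumptions for illustration, not Gluten's actual BackendTestSettings code.

import scala.collection.mutable
import scala.reflect.ClassTag

// Hedged sketch of the enable/exclude builder used above. Names and
// internals are assumptions, not Gluten's real implementation.
abstract class BackendTestSettingsSketch {
  private val excludedTests = mutable.Map.empty[String, mutable.Set[String]]

  final class SuiteSettings(suiteName: String) {
    // Each .exclude call records a test name to skip for this suite.
    def exclude(testName: String): SuiteSettings = {
      excludedTests.getOrElseUpdate(suiteName, mutable.Set.empty[String]) += testName
      this
    }
  }

  // enableSuite[SomeSuite] turns the suite class into a settings builder.
  def enableSuite[T: ClassTag]: SuiteSettings =
    new SuiteSettings(implicitly[ClassTag[T]].runtimeClass.getName)

  // A test harness would consult this before running each test.
  def shouldRun(suiteName: String, testName: String): Boolean =
    !excludedTests.get(suiteName).exists(_.contains(testName))
}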
@@ -658,30 +658,20 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenParquetInteroperabilitySuite]
.exclude("parquet timestamp conversion")
enableSuite[GlutenParquetIOSuite]
// Disable Spark's vectorized reading tests.
.exclude("Standard mode - fixed-length decimals")
.exclude("Legacy mode - fixed-length decimals")
.exclude("SPARK-34167: read LongDecimals with precision < 10, VectorizedReader true")
.exclude("read dictionary encoded decimals written as FIXED_LEN_BYTE_ARRAY")
.exclude("read dictionary encoded decimals written as INT64")
.exclude("read dictionary encoded decimals written as INT32")
.exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
// Spark plans scan schema as (i16/i32/i64) so the fallback does not take effect.
// But Velox reads data based on the schema acquired from file metadata,
// while i8 is not supported, so an error occurs.
.exclude("SPARK-34817: Read UINT_8/UINT_16/UINT_32 from parquet")
// Exception.
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
// Timestamp is read as INT96.
// Velox only supports reading Timestamp as INT96 for now.
.exclude("read dictionary and plain encoded timestamp_millis written as INT64")
// TODO: Unsupported Array schema in Parquet.
.exclude("vectorized reader: optional array with required elements")
.exclude("vectorized reader: required array with required elements")
.exclude("vectorized reader: required array with optional elements")
.exclude("vectorized reader: required array with legacy format")
.exclude("SPARK-36726: test incorrect Parquet row group file offset")
// add support in native reader
.exclude("SPARK-41096: FIXED_LEN_BYTE_ARRAY support")
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
// Timezone is not supported yet.
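
The "vectorized reader: ... array ..." exclusions above guard tests that round-trip array columns through Parquet. A plain Spark sketch of that shape of test follows; the path, object name, and session setup are illustrative, not Gluten test code.

import org.apache.spark.sql.SparkSession

object ArrayRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("array-roundtrip").master("local[1]").getOrCreate()
    import spark.implicits._

    // Write an array column to Parquet, then read it back; the excluded
    // tests do this under Spark's vectorized reader with various
    // required/optional element nullabilities.
    val path = "/tmp/array_roundtrip.parquet" // illustrative location
    Seq(Seq(1, 2, 3), Seq(4, 5)).toDF("ids").write.mode("overwrite").parquet(path)
    spark.read.parquet(path).show()

    spark.stop()
  }
}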
@@ -17,26 +17,13 @@
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.spark.sql._
import org.apache.spark.sql.internal.SQLConf

/** A test suite that tests basic Parquet I/O. */
class GlutenParquetIOSuite extends ParquetIOSuite with GlutenSQLTestsBaseTrait {
override protected val vectorizedReaderEnabledKey: String =
SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key + "_DISABLED"
override protected val vectorizedReaderNestedEnabledKey: String =
SQLConf.PARQUET_VECTORIZED_READER_NESTED_COLUMN_ENABLED.key + "_DISABLED"

override protected def testFile(fileName: String): String = {
getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString + "/" + fileName
}
override def withAllParquetReaders(code: => Unit): Unit = {
// test the row-based reader
withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
withClue("Parquet-mr reader") {
code
}
}
}

override protected def readResourceParquetFile(name: String): DataFrame = {
spark.read.parquet(testFile(name))
}
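
For contrast with the withAllParquetReaders override shown above, which pins the suite to the parquet-mr branch only: upstream, Spark's test helper runs the body once per reader. The sketch below is paraphrased from memory of Spark's test utilities, not copied verbatim.

import org.apache.spark.sql.internal.SQLConf

trait AllParquetReadersSketch {
  // Provided by Spark's SQLTestUtils in the real suites.
  protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit

  // Approximately what Spark's upstream helper does: exercise the body
  // under both reader implementations.
  protected def withAllParquetReaders(code: => Unit): Unit = {
    // parquet-mr (row-based) reader
    withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false")(code)
    // Spark's vectorized reader
    withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true")(code)
  }
}

Judging by this hunk's line counts (26 old lines to 13 new, i.e. 14 deletions and a single addition) and the commit's stated goal, the override and the "_DISABLED" key hack above it appear to be what is being removed, letting the suite exercise both reader configurations again.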
@@ -642,35 +642,21 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenParquetIOSuite]
// Velox doesn't write file metadata into parquet file.
.exclude("Write Spark version into Parquet metadata")
// Disable Spark's vectorized reading tests.
.exclude("Standard mode - fixed-length decimals")
.exclude("Legacy mode - fixed-length decimals")
.exclude("SPARK-34167: read LongDecimals with precision < 10, VectorizedReader true")
.exclude("read dictionary encoded decimals written as FIXED_LEN_BYTE_ARRAY")
.exclude("read dictionary encoded decimals written as INT64")
.exclude("read dictionary encoded decimals written as INT32")
.exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
// Spark plans scan schema as (i16/i32/i64) so the fallback does not take effect.
// But Velox reads data based on the schema acquired from file metadata,
// while i8 is not supported, so an error occurs.
.exclude("SPARK-34817: Read UINT_8/UINT_16/UINT_32 from parquet")
// Exception.
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
// Timestamp is read as INT96.
// Velox only supports reading Timestamp as INT96 for now.
.exclude("read dictionary and plain encoded timestamp_millis written as INT64")
.exclude("Read TimestampNTZ and TimestampLTZ for various logical TIMESTAMP types")
// TODO: Unsupported Array schema in Parquet.
.exclude("vectorized reader: optional array with required elements")
.exclude("vectorized reader: required array with required elements")
.exclude("vectorized reader: required array with optional elements")
.exclude("vectorized reader: required array with legacy format")
.exclude("SPARK-36726: test incorrect Parquet row group file offset")
// add support in native reader
.exclude("SPARK-41096: FIXED_LEN_BYTE_ARRAY support")
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
// Timestamp is read as INT96
.exclude("Read TimestampNTZ and TimestampLTZ for various logical TIMESTAMP types")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
// Timezone is not supported yet.
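
On the INT96 exclusions above: Spark picks the physical timestamp encoding via spark.sql.parquet.outputTimestampType (INT96, TIMESTAMP_MICROS, or TIMESTAMP_MILLIS), and the excluded tests cover the INT64-based variants that the comments say Velox cannot read yet. A minimal sketch of forcing the INT96 encoding on write follows; the path and object name are illustrative.

import java.sql.Timestamp
import org.apache.spark.sql.SparkSession

object Int96WriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("int96-write").master("local[1]").getOrCreate()
    import spark.implicits._

    // INT96 is the legacy Impala-style encoding; the alternatives,
    // TIMESTAMP_MICROS and TIMESTAMP_MILLIS, are stored as INT64.
    spark.conf.set("spark.sql.parquet.outputTimestampType", "INT96")

    val path = "/tmp/ts_int96.parquet" // illustrative location
    Seq(Timestamp.valueOf("2024-03-01 00:00:00")).toDF("ts")
      .write.mode("overwrite").parquet(path)
    spark.read.parquet(path).show(false)

    spark.stop()
  }
}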
@@ -17,26 +17,13 @@
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.spark.sql._
import org.apache.spark.sql.internal.SQLConf

/** A test suite that tests basic Parquet I/O. */
class GlutenParquetIOSuite extends ParquetIOSuite with GlutenSQLTestsBaseTrait {
override protected val vectorizedReaderEnabledKey: String =
SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key + "_DISABLED"
override protected val vectorizedReaderNestedEnabledKey: String =
SQLConf.PARQUET_VECTORIZED_READER_NESTED_COLUMN_ENABLED.key + "_DISABLED"

override protected def testFile(fileName: String): String = {
getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString + "/" + fileName
}
override def withAllParquetReaders(code: => Unit): Unit = {
// test the row-based reader
withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
withClue("Parquet-mr reader") {
code
}
}
}

override protected def readResourceParquetFile(name: String): DataFrame = {
spark.read.parquet(testFile(name))
}