diff --git a/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala b/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala
index 2cae6bdfe404..dbc1e1e471c1 100644
--- a/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala
+++ b/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala
@@ -470,7 +470,7 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla
     checkOperatorMatch[HashAggregateExecTransformer](result)
   }
 
-  ignore("orc scan") {
+  test("orc scan") {
     val df = spark.read
       .format("orc")
       .load("../cpp/velox/benchmarks/data/bm_lineitem/orc/lineitem.orc")
diff --git a/ep/build-velox/src/get_velox.sh b/ep/build-velox/src/get_velox.sh
index 73bc546b2c18..d26e1f6cad34 100755
--- a/ep/build-velox/src/get_velox.sh
+++ b/ep/build-velox/src/get_velox.sh
@@ -16,8 +16,8 @@
 
 set -exu
 
-VELOX_REPO=https://github.com/oap-project/velox.git
-VELOX_BRANCH=update
+VELOX_REPO=https://github.com/chenxu14/velox.git
+VELOX_BRANCH=chenxu14_dev
 VELOX_HOME=""
 
 #Set on run gluten on HDFS
diff --git a/gluten-ut/spark32/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark32/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
index c5a01d08ea74..def4ed70681e 100644
--- a/gluten-ut/spark32/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark32/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
@@ -328,8 +328,6 @@ class VeloxTestSettings extends BackendTestSettings {
     // Not useful and time consuming.
     .exclude("SPARK-33084: Add jar support Ivy URI in SQL")
     .exclude("SPARK-33084: Add jar support Ivy URI in SQL -- jar contains udf class")
-    // ReaderFactory is not registered for format orc.
-    .exclude("SPARK-33593: Vector reader got incorrect data with binary partition value")
   enableSuite[GlutenDatasetAggregatorSuite]
   enableSuite[GlutenDatasetOptimizationSuite]
   enableSuite[GlutenDatasetPrimitiveSuite]
@@ -364,8 +362,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Return correct results when data columns overlap with partition " +
       "columns (nested data)")
     .exclude("SPARK-31116: Select nested schema with case insensitive mode")
-    // ReaderFactory is not registered for format orc.
-    .exclude("SPARK-15474 Write and read back non-empty schema with empty dataframe - orc")
     .exclude("SPARK-23271 empty RDD when saved should write a metadata only file - orc")
     .exclude("SPARK-22146 read files containing special characters using orc")
     .exclude("Do not use cache on overwrite")
@@ -407,13 +403,11 @@ class VeloxTestSettings extends BackendTestSettings {
   enableSuite[GlutenOrcPartitionDiscoverySuite]
     .exclude("read partitioned table - normal case")
     .exclude("read partitioned table - with nulls")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcV1PartitionDiscoverySuite]
     .exclude("read partitioned table - normal case")
     .exclude("read partitioned table - with nulls")
     .exclude("read partitioned table - partition key included in orc file")
     .exclude("read partitioned table - with nulls and partition keys are included in Orc file")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcV1QuerySuite]
     // Rewrite to disable Spark's columnar reader.
.exclude("Simple selection form ORC table") @@ -456,7 +450,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time zone") .exclude("SPARK-39381: Make vectorized orc columar writer batch size configurable") .exclude("SPARK-39830: Reading ORC table that requires type promotion may throw AIOOBE") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV2QuerySuite] .exclude("Read/write binary data") .exclude("Read/write all types with non-primitive type") @@ -498,7 +491,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-5309 strings stored using dictionary compression in orc") // For exception test. .exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcSourceSuite] // Rewrite to disable Spark's columnar reader. .exclude("SPARK-31238: compatibility with Spark 2.4 in reading dates") @@ -516,7 +508,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("Gluten - SPARK-31238: compatibility with Spark 2.4 in reading dates") .exclude("Gluten - SPARK-31238, SPARK-31423: rebasing dates in write") .exclude("Gluten - SPARK-34862: Support ORC vectorized reader for nested column") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV1FilterSuite] .exclude("SPARK-32622: case sensitivity in predicate pushdown") enableSuite[GlutenOrcV1SchemaPruningSuite] @@ -659,7 +650,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("Spark vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") .exclude("Non-vectorized reader - without partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") .exclude("Non-vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV2SchemaPruningSuite] .exclude( "Spark vectorized reader - without partition data column - select only top-level fields") @@ -795,7 +785,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("Spark vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") .exclude("Non-vectorized reader - without partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") .exclude("Non-vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenParquetColumnIndexSuite] // Rewrite by just removing test timestamp. .exclude("test reading unaligned pages - test all types") @@ -925,7 +914,6 @@ class VeloxTestSettings extends BackendTestSettings { // Unsupported compression codec. .exclude("write and read - file source parquet - codec: lz4") enableSuite[GlutenOrcCodecSuite] - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenFileSourceStrategySuite] // Plan comparison. 
.exclude("partitioned table - after scan filters") @@ -959,7 +947,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("hide a nested column in the middle of the leaf struct column") .exclude("hide a nested column at the end of the middle struct column") .exclude("hide a nested column in the middle of the middle struct column") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenVectorizedOrcReadSchemaSuite] // Rewrite to disable Spark's vectorized reading. .exclude("change column position") @@ -981,7 +968,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("change column type from float to double") .exclude("Gluten - read byte, int, short, long together") .exclude("Gluten - read float and double together") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenMergedOrcReadSchemaSuite] .exclude("append column into middle") .exclude("add a nested column at the end of the leaf struct column") @@ -999,7 +985,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("read byte, int, short, long together") .exclude("change column type from float to double") .exclude("read float and double together") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenParquetReadSchemaSuite] enableSuite[GlutenVectorizedParquetReadSchemaSuite] enableSuite[GlutenMergedParquetReadSchemaSuite] @@ -1089,7 +1074,5 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-33687: analyze all tables in a specific database") enableSuite[FallbackStrategiesSuite] enableSuite[GlutenHiveSQLQuerySuite] - // ReaderFactory is not registered for format orc. - .exclude("hive orc scan") } // scalastyle:on line.size.limit diff --git a/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala index b1607ad6198d..531d873dfe0a 100644 --- a/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala @@ -229,13 +229,11 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenOrcPartitionDiscoverySuite] .exclude("read partitioned table - normal case") .exclude("read partitioned table - with nulls") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV1PartitionDiscoverySuite] .exclude("read partitioned table - normal case") .exclude("read partitioned table - with nulls") .exclude("read partitioned table - partition key included in orc file") .exclude("read partitioned table - with nulls and partition keys are included in Orc file") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV1QuerySuite] // Rewrite to disable Spark's columnar reader. 
.exclude("Simple selection form ORC table") @@ -278,7 +276,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time zone") .exclude("SPARK-39381: Make vectorized orc columar writer batch size configurable") .exclude("SPARK-39830: Reading ORC table that requires type promotion may throw AIOOBE") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV2QuerySuite] .exclude("Read/write binary data") .exclude("Read/write all types with non-primitive type") @@ -320,7 +317,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-5309 strings stored using dictionary compression in orc") // For exception test. .exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcSourceSuite] // Rewrite to disable Spark's columnar reader. .exclude("SPARK-31238: compatibility with Spark 2.4 in reading dates") @@ -344,7 +340,6 @@ class VeloxTestSettings extends BackendTestSettings { // rewrite .exclude("SPARK-36931: Support reading and writing ANSI intervals (spark.sql.orc.enableVectorizedReader=true, spark.sql.orc.enableNestedColumnVectorizedReader=true)") .exclude("SPARK-36931: Support reading and writing ANSI intervals (spark.sql.orc.enableVectorizedReader=true, spark.sql.orc.enableNestedColumnVectorizedReader=false)") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV1FilterSuite] .exclude("SPARK-32622: case sensitivity in predicate pushdown") enableSuite[GlutenOrcV1SchemaPruningSuite] @@ -487,7 +482,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("Spark vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") .exclude("Non-vectorized reader - without partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") .exclude("Non-vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV2SchemaPruningSuite] .exclude( "Spark vectorized reader - without partition data column - select only top-level fields") @@ -623,7 +617,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("Spark vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") .exclude("Non-vectorized reader - without partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") .exclude("Non-vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenParquetColumnIndexSuite] // Rewrite by just removing test timestamp. .exclude("test reading unaligned pages - test all types") @@ -788,14 +781,11 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenParquetV1AggregatePushDownSuite] enableSuite[GlutenParquetV2AggregatePushDownSuite] enableSuite[GlutenOrcV1AggregatePushDownSuite] - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV2AggregatePushDownSuite] - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenParquetCodecSuite] // Unsupported compression codec. 
.exclude("write and read - file source parquet - codec: lz4") enableSuite[GlutenOrcCodecSuite] - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenFileSourceStrategySuite] // Plan comparison. .exclude("partitioned table - after scan filters") @@ -829,7 +819,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("hide a nested column in the middle of the leaf struct column") .exclude("hide a nested column at the end of the middle struct column") .exclude("hide a nested column in the middle of the middle struct column") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenVectorizedOrcReadSchemaSuite] // Rewrite to disable Spark's vectorized reading. .exclude("change column position") @@ -851,7 +840,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("change column type from float to double") .exclude("Gluten - read byte, int, short, long together") .exclude("Gluten - read float and double together") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenMergedOrcReadSchemaSuite] .exclude("append column into middle") .exclude("add a nested column at the end of the leaf struct column") @@ -869,7 +857,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("read byte, int, short, long together") .exclude("change column type from float to double") .exclude("read float and double together") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenParquetReadSchemaSuite] enableSuite[GlutenVectorizedParquetReadSchemaSuite] enableSuite[GlutenMergedParquetReadSchemaSuite] @@ -1099,8 +1086,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-31116: Select nested schema with case insensitive mode") // exclude as original metric not correct when task offloaded to velox .exclude("SPARK-37585: test input metrics for DSV2 with output limits") - // ReaderFactory is not registered for format orc. - .exclude("SPARK-15474 Write and read back non-empty schema with empty dataframe - orc") .exclude("SPARK-23271 empty RDD when saved should write a metadata only file - orc") .exclude("SPARK-22146 read files containing special characters using orc") .exclude("SPARK-30362: test input metrics for DSV2") @@ -1152,8 +1137,6 @@ class VeloxTestSettings extends BackendTestSettings { // Not useful and time consuming. .exclude("SPARK-33084: Add jar support Ivy URI in SQL") .exclude("SPARK-33084: Add jar support Ivy URI in SQL -- jar contains udf class") - // ReaderFactory is not registered for format orc. - .exclude("SPARK-33593: Vector reader got incorrect data with binary partition value") enableSuite[GlutenSQLQueryTestSuite] enableSuite[GlutenStatisticsCollectionSuite] .exclude("SPARK-33687: analyze all tables in a specific database") @@ -1170,8 +1153,6 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenXPathFunctionsSuite] enableSuite[GlutenFallbackSuite] enableSuite[GlutenHiveSQLQuerySuite] - // ReaderFactory is not registered for format orc. 
- .exclude("hive orc scan") enableSuite[GlutenImplicitsTest] } // scalastyle:on line.size.limit diff --git a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala index 44d577d88158..40b068dc0aed 100644 --- a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala @@ -245,13 +245,11 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenOrcPartitionDiscoverySuite] .exclude("read partitioned table - normal case") .exclude("read partitioned table - with nulls") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV1PartitionDiscoverySuite] .exclude("read partitioned table - normal case") .exclude("read partitioned table - with nulls") .exclude("read partitioned table - partition key included in orc file") .exclude("read partitioned table - with nulls and partition keys are included in Orc file") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV1QuerySuite] // Rewrite to disable Spark's columnar reader. .exclude("Simple selection form ORC table") @@ -294,7 +292,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time zone") .exclude("SPARK-39381: Make vectorized orc columar writer batch size configurable") .exclude("SPARK-39830: Reading ORC table that requires type promotion may throw AIOOBE") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV2QuerySuite] .exclude("Read/write binary data") .exclude("Read/write all types with non-primitive type") @@ -336,7 +333,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-5309 strings stored using dictionary compression in orc") // For exception test. .exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcSourceSuite] // Rewrite to disable Spark's columnar reader. 
.exclude("SPARK-31238: compatibility with Spark 2.4 in reading dates") @@ -360,7 +356,6 @@ class VeloxTestSettings extends BackendTestSettings { // rewrite .exclude("SPARK-36931: Support reading and writing ANSI intervals (spark.sql.orc.enableVectorizedReader=true, spark.sql.orc.enableNestedColumnVectorizedReader=true)") .exclude("SPARK-36931: Support reading and writing ANSI intervals (spark.sql.orc.enableVectorizedReader=true, spark.sql.orc.enableNestedColumnVectorizedReader=false)") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV1FilterSuite] .exclude("SPARK-32622: case sensitivity in predicate pushdown") enableSuite[GlutenOrcV1SchemaPruningSuite] @@ -503,7 +498,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("Spark vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") .exclude("Non-vectorized reader - without partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") .exclude("Non-vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV2SchemaPruningSuite] .exclude( "Spark vectorized reader - without partition data column - select only top-level fields") @@ -639,7 +633,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("Spark vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") .exclude("Non-vectorized reader - without partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") .exclude("Non-vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenParquetColumnIndexSuite] // Rewrite by just removing test timestamp. .exclude("test reading unaligned pages - test all types") @@ -813,14 +806,11 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenParquetV1AggregatePushDownSuite] enableSuite[GlutenParquetV2AggregatePushDownSuite] enableSuite[GlutenOrcV1AggregatePushDownSuite] - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenOrcV2AggregatePushDownSuite] - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenParquetCodecSuite] // Unsupported compression codec. .exclude("write and read - file source parquet - codec: lz4") enableSuite[GlutenOrcCodecSuite] - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenFileSourceStrategySuite] // Plan comparison. .exclude("partitioned table - after scan filters") @@ -854,7 +844,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("hide a nested column in the middle of the leaf struct column") .exclude("hide a nested column at the end of the middle struct column") .exclude("hide a nested column in the middle of the middle struct column") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenVectorizedOrcReadSchemaSuite] // Rewrite to disable Spark's vectorized reading. 
.exclude("change column position") @@ -876,7 +865,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("change column type from float to double") .exclude("Gluten - read byte, int, short, long together") .exclude("Gluten - read float and double together") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenMergedOrcReadSchemaSuite] .exclude("append column into middle") .exclude("add a nested column at the end of the leaf struct column") @@ -894,7 +882,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("read byte, int, short, long together") .exclude("change column type from float to double") .exclude("read float and double together") - .disableByReason("Blocked by ORC Velox upstream not ready") enableSuite[GlutenParquetReadSchemaSuite] enableSuite[GlutenVectorizedParquetReadSchemaSuite] enableSuite[GlutenMergedParquetReadSchemaSuite] @@ -1139,8 +1126,6 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-31116: Select nested schema with case insensitive mode") // exclude as original metric not correct when task offloaded to velox .exclude("SPARK-37585: test input metrics for DSV2 with output limits") - // ReaderFactory is not registered for format orc. - .exclude("SPARK-15474 Write and read back non-empty schema with empty dataframe - orc") .exclude("SPARK-23271 empty RDD when saved should write a metadata only file - orc") .exclude("SPARK-22146 read files containing special characters using orc") .exclude("SPARK-30362: test input metrics for DSV2") @@ -1198,8 +1183,6 @@ class VeloxTestSettings extends BackendTestSettings { // Not useful and time consuming. .exclude("SPARK-33084: Add jar support Ivy URI in SQL") .exclude("SPARK-33084: Add jar support Ivy URI in SQL -- jar contains udf class") - // ReaderFactory is not registered for format orc. - .exclude("SPARK-33593: Vector reader got incorrect data with binary partition value") .exclude("SPARK-38548: try_sum should return null if overflow happens before merging") .exclude("the escape character is not allowed to end with") .exclude("SPARK-40245: Fix FileScan canonicalization when partition or data filter columns are not read") @@ -1219,7 +1202,5 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenXPathFunctionsSuite] enableSuite[GlutenFallbackSuite] enableSuite[GlutenHiveSQLQuerySuite] - // ReaderFactory is not registered for format orc. - .exclude("hive orc scan") } // scalastyle:on line.size.limit