diff --git a/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala b/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala
index ca5118089755..16361850b051 100644
--- a/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala
+++ b/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala
@@ -465,7 +465,7 @@ class TestOperator extends VeloxWholeStageTransformerSuite {
     checkOperatorMatch[HashAggregateExecTransformer](result)
   }
 
-  ignore("orc scan") {
+  test("orc scan") {
     val df = spark.read
       .format("orc")
       .load("../cpp/velox/benchmarks/data/bm_lineitem/orc/lineitem.orc")
diff --git a/cpp/velox/compute/VeloxBackend.cc b/cpp/velox/compute/VeloxBackend.cc
index a4c5ffdf3aa0..4a0f98e01d8a 100644
--- a/cpp/velox/compute/VeloxBackend.cc
+++ b/cpp/velox/compute/VeloxBackend.cc
@@ -239,6 +239,7 @@ void VeloxBackend::init(const std::unordered_map<std::string, std::string>& conf
   registerConnector(hiveConnector);
   velox::parquet::registerParquetReaderFactory();
   velox::dwrf::registerDwrfReaderFactory();
+  velox::dwrf::registerOrcReaderFactory();
   // Register Velox functions
   registerAllFunctions();
   if (!facebook::velox::isRegisteredVectorSerde()) {
diff --git a/ep/build-velox/src/get_velox.sh b/ep/build-velox/src/get_velox.sh
index c74dcf5f16e1..aead841e987b 100755
--- a/ep/build-velox/src/get_velox.sh
+++ b/ep/build-velox/src/get_velox.sh
@@ -16,8 +16,8 @@
 
 set -exu
 
-VELOX_REPO=https://github.com/oap-project/velox.git
-VELOX_BRANCH=update
+VELOX_REPO=https://github.com/chenxu14/velox.git
+VELOX_BRANCH=chenxu14_dev
 VELOX_HOME=""
 
 #Set on run gluten on HDFS
diff --git a/gluten-ut/spark32/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark32/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
index 51256bb390e5..a73dca710bd5 100644
--- a/gluten-ut/spark32/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark32/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
@@ -322,8 +322,6 @@ class VeloxTestSettings extends BackendTestSettings {
     // Not useful and time consuming.
     .exclude("SPARK-33084: Add jar support Ivy URI in SQL")
     .exclude("SPARK-33084: Add jar support Ivy URI in SQL -- jar contains udf class")
-    // ReaderFactory is not registered for format orc.
-    .exclude("SPARK-33593: Vector reader got incorrect data with binary partition value")
   enableSuite[GlutenDatasetAggregatorSuite]
   enableSuite[GlutenDatasetOptimizationSuite]
   enableSuite[GlutenDatasetPrimitiveSuite]
@@ -358,8 +356,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Return correct results when data columns overlap with partition " +
       "columns (nested data)")
     .exclude("SPARK-31116: Select nested schema with case insensitive mode")
-    // ReaderFactory is not registered for format orc.
-    .exclude("SPARK-15474 Write and read back non-empty schema with empty dataframe - orc")
     .exclude("SPARK-23271 empty RDD when saved should write a metadata only file - orc")
     .exclude("SPARK-22146 read files containing special characters using orc")
     .exclude("Do not use cache on overwrite")
@@ -403,13 +399,11 @@ class VeloxTestSettings extends BackendTestSettings {
   enableSuite[GlutenOrcPartitionDiscoverySuite]
     .exclude("read partitioned table - normal case")
     .exclude("read partitioned table - with nulls")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcV1PartitionDiscoverySuite]
     .exclude("read partitioned table - normal case")
     .exclude("read partitioned table - with nulls")
     .exclude("read partitioned table - partition key included in orc file")
     .exclude("read partitioned table - with nulls and partition keys are included in Orc file")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcV1QuerySuite]
     // Rewrite to disable Spark's columnar reader.
     .exclude("Simple selection form ORC table")
@@ -452,7 +446,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time zone")
     .exclude("SPARK-39381: Make vectorized orc columar writer batch size configurable")
     .exclude("SPARK-39830: Reading ORC table that requires type promotion may throw AIOOBE")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcV2QuerySuite]
     .exclude("Read/write binary data")
     .exclude("Read/write all types with non-primitive type")
@@ -494,7 +487,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-5309 strings stored using dictionary compression in orc")
     // For exception test.
     .exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcSourceSuite]
     // Rewrite to disable Spark's columnar reader.
     .exclude("SPARK-31238: compatibility with Spark 2.4 in reading dates")
@@ -512,7 +504,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Gluten - SPARK-31238: compatibility with Spark 2.4 in reading dates")
     .exclude("Gluten - SPARK-31238, SPARK-31423: rebasing dates in write")
     .exclude("Gluten - SPARK-34862: Support ORC vectorized reader for nested column")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcV1FilterSuite]
     .exclude("SPARK-32622: case sensitivity in predicate pushdown")
   enableSuite[GlutenOrcV1SchemaPruningSuite]
@@ -655,7 +646,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Spark vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery")
     .exclude("Non-vectorized reader - without partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery")
     .exclude("Non-vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcV2SchemaPruningSuite]
     .exclude(
       "Spark vectorized reader - without partition data column - select only top-level fields")
@@ -791,7 +781,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Spark vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery")
     .exclude("Non-vectorized reader - without partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery")
     .exclude("Non-vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenParquetColumnIndexSuite]
   enableSuite[GlutenParquetCompressionCodecPrecedenceSuite]
   enableSuite[GlutenParquetEncodingSuite]
@@ -915,7 +904,6 @@ class VeloxTestSettings extends BackendTestSettings {
     // Unsupported compression codec.
     .exclude("write and read - file source parquet - codec: lz4")
   enableSuite[GlutenOrcCodecSuite]
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenFileSourceStrategySuite]
     // Plan comparison.
     .exclude("partitioned table - after scan filters")
@@ -949,7 +937,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("hide a nested column in the middle of the leaf struct column")
     .exclude("hide a nested column at the end of the middle struct column")
     .exclude("hide a nested column in the middle of the middle struct column")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenVectorizedOrcReadSchemaSuite]
     // Rewrite to disable Spark's vectorized reading.
     .exclude("change column position")
@@ -971,7 +958,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("change column type from float to double")
     .exclude("Gluten - read byte, int, short, long together")
     .exclude("Gluten - read float and double together")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenMergedOrcReadSchemaSuite]
     .exclude("append column into middle")
     .exclude("add a nested column at the end of the leaf struct column")
@@ -989,7 +975,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("read byte, int, short, long together")
     .exclude("change column type from float to double")
     .exclude("read float and double together")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenParquetReadSchemaSuite]
   enableSuite[GlutenVectorizedParquetReadSchemaSuite]
   enableSuite[GlutenMergedParquetReadSchemaSuite]
@@ -1079,7 +1064,5 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-33687: analyze all tables in a specific database")
   enableSuite[FallbackStrategiesSuite]
   enableSuite[GlutenHiveSQLQuerySuite]
-    // ReaderFactory is not registered for format orc.
-    .exclude("hive orc scan")
 }
 // scalastyle:on line.size.limit
diff --git a/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
index 4cf2768369f7..2bd2070ed260 100644
--- a/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
@@ -227,13 +227,11 @@ class VeloxTestSettings extends BackendTestSettings {
   enableSuite[GlutenOrcPartitionDiscoverySuite]
     .exclude("read partitioned table - normal case")
     .exclude("read partitioned table - with nulls")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcV1PartitionDiscoverySuite]
     .exclude("read partitioned table - normal case")
     .exclude("read partitioned table - with nulls")
     .exclude("read partitioned table - partition key included in orc file")
     .exclude("read partitioned table - with nulls and partition keys are included in Orc file")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcV1QuerySuite]
     // Rewrite to disable Spark's columnar reader.
     .exclude("Simple selection form ORC table")
@@ -276,7 +274,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time zone")
     .exclude("SPARK-39381: Make vectorized orc columar writer batch size configurable")
     .exclude("SPARK-39830: Reading ORC table that requires type promotion may throw AIOOBE")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcV2QuerySuite]
     .exclude("Read/write binary data")
     .exclude("Read/write all types with non-primitive type")
@@ -318,7 +315,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-5309 strings stored using dictionary compression in orc")
     // For exception test.
     .exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcSourceSuite]
     // Rewrite to disable Spark's columnar reader.
     .exclude("SPARK-31238: compatibility with Spark 2.4 in reading dates")
@@ -342,7 +338,6 @@ class VeloxTestSettings extends BackendTestSettings {
     // rewrite
     .exclude("SPARK-36931: Support reading and writing ANSI intervals (spark.sql.orc.enableVectorizedReader=true, spark.sql.orc.enableNestedColumnVectorizedReader=true)")
     .exclude("SPARK-36931: Support reading and writing ANSI intervals (spark.sql.orc.enableVectorizedReader=true, spark.sql.orc.enableNestedColumnVectorizedReader=false)")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcV1FilterSuite]
     .exclude("SPARK-32622: case sensitivity in predicate pushdown")
   enableSuite[GlutenOrcV1SchemaPruningSuite]
@@ -485,7 +480,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Spark vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery")
     .exclude("Non-vectorized reader - without partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery")
     .exclude("Non-vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcV2SchemaPruningSuite]
     .exclude(
       "Spark vectorized reader - without partition data column - select only top-level fields")
@@ -621,7 +615,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Spark vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery")
     .exclude("Non-vectorized reader - without partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery")
     .exclude("Non-vectorized reader - with partition data column - SPARK-38977: schema pruning with correlated NOT IN subquery")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenParquetColumnIndexSuite]
   enableSuite[GlutenParquetCompressionCodecPrecedenceSuite]
   enableSuite[GlutenParquetDeltaByteArrayEncodingSuite]
@@ -780,14 +773,11 @@ class VeloxTestSettings extends BackendTestSettings {
   enableSuite[GlutenParquetV1AggregatePushDownSuite]
   enableSuite[GlutenParquetV2AggregatePushDownSuite]
   enableSuite[GlutenOrcV1AggregatePushDownSuite]
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenOrcV2AggregatePushDownSuite]
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenParquetCodecSuite]
     // Unsupported compression codec.
     .exclude("write and read - file source parquet - codec: lz4")
   enableSuite[GlutenOrcCodecSuite]
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenFileSourceStrategySuite]
     // Plan comparison.
     .exclude("partitioned table - after scan filters")
@@ -821,7 +811,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("hide a nested column in the middle of the leaf struct column")
     .exclude("hide a nested column at the end of the middle struct column")
     .exclude("hide a nested column in the middle of the middle struct column")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenVectorizedOrcReadSchemaSuite]
     // Rewrite to disable Spark's vectorized reading.
     .exclude("change column position")
@@ -843,7 +832,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("change column type from float to double")
     .exclude("Gluten - read byte, int, short, long together")
     .exclude("Gluten - read float and double together")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenMergedOrcReadSchemaSuite]
     .exclude("append column into middle")
     .exclude("add a nested column at the end of the leaf struct column")
@@ -861,7 +849,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("read byte, int, short, long together")
     .exclude("change column type from float to double")
     .exclude("read float and double together")
-    .disableByReason("Blocked by ORC Velox upstream not ready")
   enableSuite[GlutenParquetReadSchemaSuite]
   enableSuite[GlutenVectorizedParquetReadSchemaSuite]
   enableSuite[GlutenMergedParquetReadSchemaSuite]
@@ -1089,8 +1076,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-31116: Select nested schema with case insensitive mode")
     // exclude as original metric not correct when task offloaded to velox
     .exclude("SPARK-37585: test input metrics for DSV2 with output limits")
-    // ReaderFactory is not registered for format orc.
-    .exclude("SPARK-15474 Write and read back non-empty schema with empty dataframe - orc")
     .exclude("SPARK-23271 empty RDD when saved should write a metadata only file - orc")
     .exclude("SPARK-22146 read files containing special characters using orc")
     .exclude("SPARK-30362: test input metrics for DSV2")
@@ -1142,8 +1127,6 @@ class VeloxTestSettings extends BackendTestSettings {
     // Not useful and time consuming.
     .exclude("SPARK-33084: Add jar support Ivy URI in SQL")
     .exclude("SPARK-33084: Add jar support Ivy URI in SQL -- jar contains udf class")
-    // ReaderFactory is not registered for format orc.
-    .exclude("SPARK-33593: Vector reader got incorrect data with binary partition value")
   enableSuite[GlutenSQLQueryTestSuite]
   enableSuite[GlutenStatisticsCollectionSuite]
     .exclude("SPARK-33687: analyze all tables in a specific database")
@@ -1160,7 +1143,5 @@ class VeloxTestSettings extends BackendTestSettings {
   enableSuite[GlutenXPathFunctionsSuite]
   enableSuite[GlutenFallbackSuite]
   enableSuite[GlutenHiveSQLQuerySuite]
-    // ReaderFactory is not registered for format orc.
-    .exclude("hive orc scan")
 }
 // scalastyle:on line.size.limit