[GLUTEN-4424] Explore adding Spark 35 w/ Scala 2.12 only (WIP) #4425

Closed
wants to merge 23 commits

Commits (23)
0d38e60
Use sudo when git clean fails for perms.
holdenk Nov 26, 2023
2bb9a91
Try and build w/3.5 profile.
holdenk Dec 29, 2023
dcb1140
Add advisoryPartitionSize for 3.5
holdenk Dec 29, 2023
624eefb
Add spark-3.5 profile to shims pom.
holdenk Dec 29, 2023
5d7119a
Copy the 34 to 35 shims
holdenk Dec 29, 2023
00b6453
Use simplified partitioned file API
holdenk Nov 13, 2023
c79f4c6
Make some more progress towards 3.5 support.
holdenk Feb 21, 2024
21dcb3a
Work a bit on getting 3.4 working again.
holdenk Nov 17, 2023
cf0cf3f
Add shim for fromAttributes
holdenk Nov 17, 2023
d2f8797
Make the 3.5 substrait based on the previous.
holdenk Nov 11, 2023
b4a10a0
Remove unused imports
holdenk Jan 16, 2024
4da60d4
Remove more unused imports.
holdenk Jan 16, 2024
02c64fa
Add spark 3.5 profile to root build.
holdenk Jan 16, 2024
f38ccc8
Fix spark35 shim artifact name
holdenk Jan 16, 2024
3adb80a
To array breaks in 3.4
holdenk Jan 17, 2024
abec8d5
Try and support local running of tests better.
holdenk Jan 17, 2024
4eb46d4
Move the Spark35Shims to match the naming convention.
holdenk Jan 17, 2024
16514fb
Ignore when we can't make a direct memory buffer and don't test OnHea…
holdenk Jan 17, 2024
129bebc
A little bit of refactoring so the test args are present if needed (e…
holdenk Jan 17, 2024
0b1699a
Bring over getLimit and createTestTaskContext from the 34 to 35 shims.
holdenk Feb 21, 2024
7bc3564
Spotless update 3.4
holdenk Feb 21, 2024
13d0c05
Update 3.5 version to match.
holdenk Feb 21, 2024
19225d8
Port over more 3.4 changes to 3.5. Note: HashPartitioningLike did not…
holdenk Feb 21, 2024
@@ -23,6 +23,8 @@
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

import static org.junit.Assume.*;

// FIXME our checkstyle config doesn't allow "Suite" as suffix of Java tests
public class OnHeapFileSystemTest {
private final JniFilesystem fs = OnHeapFileSystem.INSTANCE;
@@ -33,6 +35,13 @@ public void testRoundTrip() {
final String text = "HELLO WORLD";
final long fileSize;
JniFilesystem.WriteFile writeFile = fs.openFileForWrite(path);

try {
ByteBuffer buf = PlatformDependent.allocateDirectNoCleaner(1);
PlatformDependent.freeDirectNoCleaner(buf);
} catch (java.lang.AssertionError e) {
assumeTrue("We are in a JVM which does not support allocateDirectNoCleaner.", false);
}
try {
byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
ByteBuffer buf = PlatformDependent.allocateDirectNoCleaner(bytes.length);
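This hunk probes Netty's PlatformDependent.allocateDirectNoCleaner once and, if the probe throws, converts the failure into a skipped test via org.junit.Assume instead of a red build. Below is a minimal sketch of the same guard pattern, written in Scala to match the rest of the changes on this page; the class, method, and probe used here are illustrative only and are not part of the PR.

import org.junit.Assume.assumeNoException
import org.junit.Test

class DirectBufferProbeTest {
  @Test def roundTrip(): Unit = {
    // Probe the capability once; if the JVM refuses, report an "assumption
    // failure" so the test is skipped rather than failed.
    try {
      java.nio.ByteBuffer.allocateDirect(1)
    } catch {
      case e: Throwable => assumeNoException("direct buffers unavailable on this JVM", e)
    }
    // ...the real round-trip assertions would follow here...
  }
}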
1 change: 1 addition & 0 deletions dev/buildbundle-veloxbe.sh
@@ -7,3 +7,4 @@ cd $GLUTEN_DIR
mvn clean package -Pbackends-velox -Prss -Pspark-3.2 -DskipTests
mvn clean package -Pbackends-velox -Prss -Pspark-3.3 -DskipTests
mvn clean package -Pbackends-velox -Prss -Pspark-3.4 -DskipTests
mvn clean package -Pbackends-velox -Prss -Pspark-3.5 -DskipTests
6 changes: 4 additions & 2 deletions ep/build-velox/src/build_velox.sh
@@ -207,8 +207,10 @@ function check_commit {
fi
fi
else
# Branch-new build requires all untracked files to be deleted. We only need the source code.
git clean -dffx :/
# On Linux some of the scripts use sudo during the build, so the clean can fail
# on root-owned files. Yes, the right solution is having the build scripts not
# use sudo, but that's a huge change and depends on sub-modules.
git clean -dffx :/ || sudo git clean -dffx :/
fi

if [ -f ${VELOX_HOME}/velox-build.cache ]; then
@@ -155,35 +155,6 @@ case class WholeStageTransformer(child: SparkPlan, materializeInput: Boolean = f
override def outputOrdering: Seq[SortOrder] = child.outputOrdering

override def otherCopyArgs: Seq[AnyRef] = Seq(transformStageId.asInstanceOf[Integer])

override def generateTreeString(
depth: Int,
lastChildren: Seq[Boolean],
append: String => Unit,
verbose: Boolean,
prefix: String = "",
addSuffix: Boolean = false,
maxFields: Int,
printNodeId: Boolean,
indent: Int = 0): Unit = {
val prefix = if (printNodeId) "^ " else s"^($transformStageId) "
child.generateTreeString(
depth,
lastChildren,
append,
verbose,
prefix,
addSuffix = false,
maxFields,
printNodeId = printNodeId,
indent)
if (verbose && wholeStageTransformerContext.isDefined) {
append(prefix + "Substrait plan:\n")
append(substraitPlanJson)
append("\n")
}
}

// It's misleading with "Codegen" used. But we have to keep "WholeStageCodegen" prefixed to
// make whole stage transformer clearly plotted in UI, like spark's whole stage codegen.
// See buildSparkPlanGraphNode in SparkPlanGraph.scala of Spark.
@@ -192,7 +192,7 @@ object ConverterUtils extends Logging {
typ =>
val (field, nullable) = parseFromSubstraitType(typ)
StructField("", field, nullable)
}
}.asJava
(StructType(fields), isNullable(substraitType.getStruct.getNullability))
case Type.KindCase.LIST =>
val list = substraitType.getList
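The only change in this hunk is appending .asJava to the mapped fields before they reach StructType. A plausible reading (my assumption, not stated in the PR) is that this makes the call resolve against StructType's java.util.List overload the same way on every targeted Spark build. A tiny self-contained illustration of that converter:

import scala.collection.JavaConverters._
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object AsJavaSketch {
  // asJava wraps the Scala Seq as a java.util.List without copying, which
  // selects StructType's java.util.List apply overload.
  val fields = Seq(StructField("", StringType, nullable = true)).asJava
  val struct: StructType = StructType(fields)
}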
@@ -21,7 +21,6 @@ import io.glutenproject.sql.shims.SparkShimLoader
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.connector.read.InputPartition
import org.apache.spark.sql.execution.PartitionedFileUtil
import org.apache.spark.sql.execution.datasources.{FilePartition, HadoopFsRelation, PartitionDirectory}
import org.apache.spark.util.collection.BitSet

@@ -60,7 +59,7 @@ case class InputPartitionsUtil(
val filePath = file.getPath
val isSplitable =
relation.fileFormat.isSplitable(relation.sparkSession, relation.options, filePath)
PartitionedFileUtil.splitFiles(
SparkShimLoader.getSparkShims.splitFiles(
sparkSession = relation.sparkSession,
file = file,
filePath = filePath,
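This hunk and the HivePartitionConverter hunk further down both replace a direct call to PartitionedFileUtil.splitFiles with SparkShimLoader.getSparkShims.splitFiles, presumably because the utility's shape is not identical across the Spark versions this PR targets. A rough sketch of what such a shim indirection can look like; the trait name and parameter list below are assumptions for illustration, not the actual Gluten SparkShims API.

import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.PartitionedFile

// A common interface compiled against every supported Spark version; each
// spark-3.x shim module would implement it by delegating to that version's
// own PartitionedFileUtil, so callers never touch the moving API directly.
trait SparkShimsSketch {
  def splitFiles(
      sparkSession: SparkSession,
      file: FileStatus,
      filePath: Path,
      isSplitable: Boolean,
      maxSplitBytes: Long,
      partitionValues: InternalRow): Seq[PartitionedFile]
}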
@@ -66,6 +66,9 @@ case class ColumnarShuffleExchangeExec(
}
}

// Added in 3.5
def advisoryPartitionSize: Option[Long] = None

/**
* A [[ShuffleDependency]] that will partition rows of its child based on the partitioning scheme
* defined in `newPartitioning`. Those partitions of the returned ShuffleDependency will be the
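The "// Added in 3.5" member mirrors a method that newer Spark versions expect on shuffle-exchange-like nodes. Declaring it as a plain def (without override) is what lets one source file build against both 3.4 and 3.5: Scala does not require the override modifier when implementing an abstract member, so the same line is an extra method on 3.4 and an implementation on 3.5. A toy demonstration of that compilation behavior, with made-up trait names standing in for the real parents:

// Stand-ins for the 3.4 and 3.5 parent traits -- not Spark's real classes.
trait ShuffleLike34 { /* no advisoryPartitionSize member */ }
trait ShuffleLike35 { def advisoryPartitionSize: Option[Long] }

// The exact same member body compiles in both worlds:
class Exec34 extends ShuffleLike34 { def advisoryPartitionSize: Option[Long] = None }
class Exec35 extends ShuffleLike35 { def advisoryPartitionSize: Option[Long] = None }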
@@ -205,8 +205,8 @@ object GlutenImplicits {
FallbackSummary(
totalNumGlutenNodes,
totalNumFallbackNodes,
totalPhysicalPlanDescription,
totalFallbackNodeToReason
totalPhysicalPlanDescription.toSeq,
totalFallbackNodeToReason.toSeq
)
}

@@ -17,12 +17,12 @@
package org.apache.spark.sql.hive

import io.glutenproject.backendsapi.BackendsApiManager
import io.glutenproject.sql.shims.SparkShimLoader

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper}
import org.apache.spark.sql.catalyst.analysis.CastSupport
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.execution.PartitionedFileUtil
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionDirectory}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.DataType
@@ -137,13 +137,13 @@ class HivePartitionConverter(hadoopConf: Configuration, session: SparkSession)
partition.files
.flatMap {
f =>
PartitionedFileUtil.splitFiles(
SparkShimLoader.getSparkShims.splitFiles(
session,
f,
f.getPath,
isSplitable = canBeSplit(f.getPath),
maxSplitBytes,
partition.values
maxSplitBytes = maxSplitBytes,
partitionValues = partition.values
)
}
.sortBy(_.length)(implicitly[Ordering[Long]].reverse)
@@ -107,21 +107,7 @@ abstract class GlutenQueryTest extends PlanTest {
}

private def getResult[T](ds: => Dataset[T]): Array[T] = {
val analyzedDS =
try ds
catch {
case ae: AnalysisException =>
if (ae.plan.isDefined) {
fail(s"""
|Failed to analyze query: $ae
|${ae.plan.get}
|
|${stackTraceToString(ae)}
""".stripMargin)
} else {
throw ae
}
}
val analyzedDS = ds
assertEmptyMissingInput(analyzedDS)

try ds.collect()
@@ -148,21 +134,7 @@
* the expected result in a [[Seq]] of [[Row]]s.
*/
protected def checkAnswer(df: => DataFrame, expectedAnswer: Seq[Row]): Unit = {
val analyzedDF =
try df
catch {
case ae: AnalysisException =>
if (ae.plan.isDefined) {
fail(s"""
|Failed to analyze query: $ae
|${ae.plan.get}
|
|${stackTraceToString(ae)}
|""".stripMargin)
} else {
throw ae
}
}
val analyzedDF = df

assertEmptyMissingInput(analyzedDF)

@@ -20,14 +20,14 @@ import io.glutenproject.columnarbatch.ColumnarBatches
import io.glutenproject.exec.Runtimes
import io.glutenproject.memory.arrowalloc.ArrowBufferAllocators
import io.glutenproject.memory.nmm.NativeMemoryManagers
import io.glutenproject.sql.shims._
import io.glutenproject.utils.{ArrowAbiUtil, Iterators}
import io.glutenproject.vectorized.{ColumnarBatchSerializerJniWrapper, NativeColumnarToRowJniWrapper}

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BoundReference, Expression, UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.execution.joins.BuildSideRelation
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.utils.SparkArrowUtil
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.util.TaskResources
@@ -45,7 +45,7 @@ case class ColumnarBuildSideRelation(output: Seq[Attribute], batches: Array[Arra
val allocator = ArrowBufferAllocators.contextInstance()
val cSchema = ArrowSchema.allocateNew(allocator)
val arrowSchema = SparkArrowUtil.toArrowSchema(
StructType.fromAttributes(output),
SparkShimLoader.getSparkShims.structFromAttributes(output),
SQLConf.get.sessionLocalTimeZone)
ArrowAbiUtil.exportSchema(allocator, arrowSchema, cSchema)
val handle = jniWrapper
@@ -96,7 +96,7 @@ case class ColumnarBuildSideRelation(output: Seq[Attribute], batches: Array[Arra
val allocator = ArrowBufferAllocators.contextInstance()
val cSchema = ArrowSchema.allocateNew(allocator)
val arrowSchema = SparkArrowUtil.toArrowSchema(
StructType.fromAttributes(output),
SparkShimLoader.getSparkShims.structFromAttributes(output),
SQLConf.get.sessionLocalTimeZone)
ArrowAbiUtil.exportSchema(allocator, arrowSchema, cSchema)
val handle = serializerJniWrapper
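Both hunks in this file swap StructType.fromAttributes(output) for SparkShimLoader.getSparkShims.structFromAttributes(output), which suggests the helper is not reachable in the same place on every targeted Spark version. If a shim needs a version-independent body, one safe way to write it is to rebuild the struct from the attributes directly. A minimal sketch under that assumption; the method name mirrors the shim call above, but the body is mine, not the PR's:

import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.types.{StructField, StructType}

object StructShimSketch {
  // Equivalent to what StructType.fromAttributes produces, built only from
  // stable public pieces so the same body compiles across the Spark versions
  // in the build matrix.
  def structFromAttributes(attrs: Seq[Attribute]): StructType =
    StructType(attrs.map(a => StructField(a.name, a.dataType, a.nullable, a.metadata)))
}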