apache · marin-ma · Jul 29, 2024 · Jul 24, 2024 · Jul 24, 2024 · Jul 24, 2024
diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxMetricsApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxMetricsApi.scala
@@ -272,6 +272,7 @@ class VeloxMetricsApi extends MetricsApi with Logging {
       "compressTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to compress"),
       "decompressTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to decompress"),
       "deserializeTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to deserialize"),
+      "shuffleWallTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "shuffle wall time"),
       // For hash shuffle writer, the peak bytes represents the maximum split buffer size.
       // For sort shuffle writer, the peak bytes represents the maximum
       // row buffer + sort buffer size.

diff --git a/cpp/core/CMakeLists.txt b/cpp/core/CMakeLists.txt
@@ -191,7 +191,6 @@ set(SPARK_COLUMNAR_PLUGIN_SRCS
     shuffle/FallbackRangePartitioner.cc
     shuffle/HashPartitioner.cc
     shuffle/LocalPartitionWriter.cc
-    shuffle/Options.cc
     shuffle/Partitioner.cc
     shuffle/Partitioning.cc
     shuffle/Payload.cc

diff --git a/cpp/core/jni/JniWrapper.cc b/cpp/core/jni/JniWrapper.cc
@@ -755,6 +755,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe
     jint compressionLevel,
     jint compressionThreshold,
     jstring compressionModeJstr,
+    jint sortBufferInitialSize,
+    jboolean useRadixSort,
     jstring dataFileJstr,
     jint numSubDirs,
     jstring localDirsJstr,
@@ -780,7 +782,9 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe
       .partitioning = gluten::toPartitioning(jStringToCString(env, partitioningNameJstr)),
       .taskAttemptId = (int64_t)taskAttemptId,
       .startPartitionId = startPartitionId,
-      .shuffleWriterType = gluten::ShuffleWriter::stringToType(jStringToCString(env, shuffleWriterTypeJstr))};
+      .shuffleWriterType = gluten::ShuffleWriter::stringToType(jStringToCString(env, shuffleWriterTypeJstr)),
+      .sortBufferInitialSize = sortBufferInitialSize,
+      .useRadixSort = static_cast<bool>(useRadixSort)};
 
   // Build PartitionWriterOptions.
   auto partitionWriterOptions = PartitionWriterOptions{

diff --git a/cpp/core/shuffle/Options.cc b/cpp/core/shuffle/Options.cc
diff --git a/cpp/core/shuffle/Options.h b/cpp/core/shuffle/Options.h
@@ -35,9 +35,12 @@ static constexpr int32_t kDefaultBufferAlignment = 64;
 static constexpr double kDefaultBufferReallocThreshold = 0.25;
 static constexpr double kDefaultMergeBufferThreshold = 0.25;
 static constexpr bool kEnableBufferedWrite = true;
+static constexpr bool kDefaultUseRadixSort = true;
+static constexpr int32_t kDefaultSortBufferSize = 4096;
 
 enum ShuffleWriterType { kHashShuffle, kSortShuffle, kRssSortShuffle };
 enum PartitionWriterType { kLocal, kRss };
+enum SortAlgorithm { kRadixSort, kQuickSort };
 
 struct ShuffleReaderOptions {
   arrow::Compression::type compressionType = arrow::Compression::type::LZ4_FRAME;
@@ -56,6 +59,10 @@ struct ShuffleWriterOptions {
   int32_t startPartitionId = 0;
   int64_t threadId = -1;
   ShuffleWriterType shuffleWriterType = kHashShuffle;
+
+  // Sort shuffle writer.
+  int32_t sortBufferInitialSize = kDefaultSortBufferSize;
+  bool useRadixSort = kDefaultUseRadixSort;
 };
 
 struct PartitionWriterOptions {

diff --git a/cpp/velox/shuffle/RadixSort.h b/cpp/velox/shuffle/RadixSort.h
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <vector>
+
+namespace gluten {
+
+template <typename Element>
+class RadixSort {
+ public:
+  /**
+   * Sorts a given array of longs using least-significant-digit radix sort. This routine assumes
+   * you have extra space at the end of the array at least equal to the number of records. The
+   * sort is destructive and may relocate the data positioned within the array.
+   *
+   * @param array array of long elements followed by at least that many empty slots.
+   * @param numRecords number of data records in the array.
+   * @param startByteIndex the first byte (in range [0, 7]) to sort each long by, counting from the
+   *                       least significant byte.
+   * @param endByteIndex the last byte (in range [0, 7]) to sort each long by, counting from the
+   *                     least significant byte. Must be greater than startByteIndex.
+   *
+   * @return The starting index of the sorted data within the given array. We return this instead
+   *         of always copying the data back to position zero for efficiency.
+   */
+  static int32_t sort(Element* array, size_t size, int64_t numRecords, int32_t startByteIndex, int32_t endByteIndex) {
+    assert(startByteIndex >= 0 && "startByteIndex should >= 0");
+    assert(endByteIndex <= 7 && "endByteIndex should <= 7");
+    assert(endByteIndex > startByteIndex);
+    assert(numRecords * 2 <= size);
+
+    int64_t inIndex = 0;
+    int64_t outIndex = numRecords;
+
+    if (numRecords > 0) {
+      auto counts = getCounts(array, numRecords, startByteIndex, endByteIndex);
+
+      for (auto i = startByteIndex; i <= endByteIndex; i++) {
+        if (!counts[i].empty()) {
+          sortAtByte(array, numRecords, counts[i], i, inIndex, outIndex);
+          std::swap(inIndex, outIndex);
+        }
+      }
+    }
+
+    return static_cast<int32_t>(inIndex);
+  }
+
+ private:
+  /**
+   * Performs a partial sort by copying data into destination offsets for each byte value at the
+   * specified byte offset.
+   *
+   * @param array array to partially sort.
+   * @param numRecords number of data records in the array.
+   * @param counts counts for each byte value. This routine destructively modifies this array.
+   * @param byteIdx the byte in a long to sort at, counting from the least significant byte.
+   * @param inIndex the starting index in the array where input data is located.
+   * @param outIndex the starting index where sorted output data should be written.
+   */
+  static void sortAtByte(
+      Element* array,
+      int64_t numRecords,
+      std::vector<int64_t>& counts,
+      int32_t byteIdx,
+      int64_t inIndex,
+      int64_t outIndex) {
+    assert(counts.size() == 256);
+
+    auto offsets = transformCountsToOffsets(counts, outIndex);
+
+    for (auto offset = inIndex; offset < inIndex + numRecords; ++offset) {
+      auto bucket = (array[offset] >> (byteIdx * 8)) & 0xff;
+      array[offsets[bucket]++] = array[offset];
+    }
+  }
+
+  /**
+   * Computes a value histogram for each byte in the given array.
+   *
+   * @param array array to count records in.
+   * @param numRecords number of data records in the array.
+   * @param startByteIndex the first byte to compute counts for (the prior are skipped).
+   * @param endByteIndex the last byte to compute counts for.
+   *
+   * @return a vector of eight 256-element count arrays, one for each byte starting from the least
+   *         significant byte. If the byte does not need sorting the vector entry will be empty.
+   */
+  static std::vector<std::vector<int64_t>>
+  getCounts(Element* array, int64_t numRecords, int32_t startByteIndex, int32_t endByteIndex) {
+    std::vector<std::vector<int64_t>> counts;
+    counts.resize(8);
+
+    // Optimization: do a fast pre-pass to determine which byte indices we can skip for sorting.
+    // If all the byte values at a particular index are the same we don't need to count it.
+    int64_t bitwiseMax = 0;
+    int64_t bitwiseMin = -1L;
+    for (auto offset = 0; offset < numRecords; ++offset) {
+      auto value = array[offset];
+      bitwiseMax |= value;
+      bitwiseMin &= value;
+    }
+    auto bitsChanged = bitwiseMin ^ bitwiseMax;
+
+    // Compute counts for each byte index.
+    for (auto i = startByteIndex; i <= endByteIndex; i++) {
+      if (((bitsChanged >> (i * 8)) & 0xff) != 0) {
+        counts[i].resize(256);
+        for (auto offset = 0; offset < numRecords; ++offset) {
+          counts[i][(array[offset] >> (i * 8)) & 0xff]++;
+        }
+      }
+    }
+
+    return counts;
+  }
+
+  /**
+   * Transforms counts into the proper output offsets for the sort type.
+   *
+   * @param counts counts for each byte value. This routine destructively modifies this vector.
+   * @param numRecords number of data records in the original data array.
+   * @param outputOffset output offset in bytes from the base array object.
+   *
+   * @return the input counts vector.
+   */
+  static std::vector<int64_t>& transformCountsToOffsets(std::vector<int64_t>& counts, int64_t outputOffset) {
+    assert(counts.size() == 256);
+
+    int64_t pos = outputOffset;
+    for (auto i = 0; i < 256; i++) {
+      auto tmp = counts[i & 0xff];
+      counts[i & 0xff] = pos;
+      pos += tmp;
+    }
+
+    return counts;
+  }
+};
+
+} // namespace gluten
diff --git a/cpp/velox/shuffle/VeloxShuffleReader.cc b/cpp/velox/shuffle/VeloxShuffleReader.cc
@@ -428,8 +428,8 @@ std::shared_ptr<ColumnarBatch> VeloxSortShuffleReaderDeserializer::deserializeTo
     auto buffer = cur->second;
     const auto* rawBuffer = buffer->as<char>();
     while (rowOffset_ < cur->first && readRows < batchSize_) {
-      auto rowSize = *(uint32_t*)(rawBuffer + byteOffset_);
-      byteOffset_ += sizeof(uint32_t);
+      auto rowSize = *(RowSizeType*)(rawBuffer + byteOffset_) - sizeof(RowSizeType);
+      byteOffset_ += sizeof(RowSizeType);
       data.push_back(std::string_view(rawBuffer + byteOffset_, rowSize));
       byteOffset_ += rowSize;
       ++rowOffset_;

diff --git a/cpp/velox/shuffle/VeloxShuffleReader.h b/cpp/velox/shuffle/VeloxShuffleReader.h
@@ -20,6 +20,7 @@
 #include "operators/serializer/VeloxColumnarBatchSerializer.h"
 #include "shuffle/Payload.h"
 #include "shuffle/ShuffleReader.h"
+#include "shuffle/VeloxSortShuffleWriter.h"
 #include "velox/type/Type.h"
 #include "velox/vector/ComplexVector.h"
 
@@ -64,6 +65,8 @@ class VeloxHashShuffleReaderDeserializer final : public ColumnarBatchIterator {
 
 class VeloxSortShuffleReaderDeserializer final : public ColumnarBatchIterator {
  public:
+  using RowSizeType = VeloxSortShuffleWriter::RowSizeType;
+
   VeloxSortShuffleReaderDeserializer(
       std::shared_ptr<arrow::io::InputStream> in,
       const std::shared_ptr<arrow::Schema>& schema,