Add support for smallint to PrefixSort (facebookincubator#10946)

Summary: According to the benchmark, for data sets of 0.5k and larger, PrefixSort outperforms std::sort, with performance improvements ranging from approximately 214% to over 500%. Here is a summary of the benchmark results:

| Dataset Size | PrefixSort Improvement (No Payload) | PrefixSort Improvement (With Payload) |
|--------------|-------------------------------------|---------------------------------------|
| 0.5k | 248.97% - 287.43% | 249.71% - 289.74% |
| 1k | 214.44% - 310.92% | 215.03% - 315.14% |
| 10k | 216.21% - 255.38% | 217.88% - 256.88% |
| 100k | 279.81% - 318.26% | 284.89% - 295.21% |
| 1000k | 304.36% - 351.31% | 454.04% - 514.28% |

Follow-up to facebookincubator#8350.
Part of facebookincubator#6766

Pull Request resolved: facebookincubator#10946
Reviewed By: Yuhta
Differential Revision: D62373593
Pulled By: mbasmanova
fbshipit-source-id: b8594e05cc6aee736d09db1695db770b84e9d4bd
Joe-Abraham · Sep 12, 2024 · 98bbb73 · 98bbb73
1 parent 741876f
commit 98bbb73
Show file tree

Hide file tree

Showing 4 changed files with 116 additions and 2 deletions.
diff --git a/velox/exec/PrefixSort.cpp b/velox/exec/PrefixSort.cpp
@@ -51,6 +51,10 @@ FOLLY_ALWAYS_INLINE void extractRowColumnToPrefix(
     char* const row,
     char* const prefix) {
   switch (typeKind) {
+    case TypeKind::SMALLINT: {
+      encodeRowColumn<int16_t>(prefixSortLayout, index, rowColumn, row, prefix);
+      return;
+    }
     case TypeKind::INTEGER: {
       encodeRowColumn<int32_t>(prefixSortLayout, index, rowColumn, row, prefix);
       return;

diff --git a/velox/exec/benchmarks/PrefixSortBenchmark.cpp b/velox/exec/benchmarks/PrefixSortBenchmark.cpp
@@ -238,6 +238,30 @@ class PrefixSortBenchmark {
     }
   }
 
+  /// Builds the row types for the SMALLINT benchmarks: four row types with one
+  /// to four leading SMALLINT columns.
+  /// @param noPayload If true, the row types consist of the SMALLINT columns
+  /// only; otherwise two VARCHAR columns are appended to each row type.
+  std::vector<RowTypePtr> smallintRowTypes(bool noPayload) {
+    if (!noPayload) {
+      // Payload variants: SMALLINT columns followed by two VARCHAR columns.
+      return {
+          ROW({SMALLINT(), VARCHAR(), VARCHAR()}),
+          ROW({SMALLINT(), SMALLINT(), VARCHAR(), VARCHAR()}),
+          ROW({SMALLINT(), SMALLINT(), SMALLINT(), VARCHAR(), VARCHAR()}),
+          ROW(
+              {SMALLINT(),
+               SMALLINT(),
+               SMALLINT(),
+               SMALLINT(),
+               VARCHAR(),
+               VARCHAR()}),
+      };
+    }
+
+    // No-payload variants: SMALLINT columns only.
+    return {
+        ROW({SMALLINT()}),
+        ROW({SMALLINT(), SMALLINT()}),
+        ROW({SMALLINT(), SMALLINT(), SMALLINT()}),
+        ROW({SMALLINT(), SMALLINT(), SMALLINT(), SMALLINT()}),
+    };
+  }
+
   void bigint(
       bool noPayload,
       int numIterations,
@@ -296,6 +320,49 @@ class PrefixSortBenchmark {
         "no-payloads", "varchar", batchSizes, rowTypes, numKeys, iterations);
   }
 
+  /// Forwards to benchmark() for the SMALLINT case, using the row types from
+  /// smallintRowTypes() together with key counts 1 through 4.
+  /// @param noPayload Selects the row types without VARCHAR columns and the
+  /// "no-payload" label; otherwise the "payload" label and row types are used.
+  /// @param numIterations Iteration count handed to benchmark().
+  /// @param batchSizes Batch sizes handed to benchmark().
+  void smallint(
+      bool noPayload,
+      int numIterations,
+      const std::vector<vector_size_t>& batchSizes) {
+    const char* const label = noPayload ? "no-payload" : "payload";
+    std::vector<RowTypePtr> types = smallintRowTypes(noPayload);
+    std::vector<int> keyCounts = {1, 2, 3, 4};
+    benchmark(label, "smallint", batchSizes, types, keyCounts, numIterations);
+  }
+
+  /// SMALLINT benchmark without payload columns for small batch sizes
+  /// (10, 50, 100 and 500).
+  void smallSmallint() {
+    // Small datasets need a large iteration count so that the benchmark runs
+    // for long enough.
+    constexpr int kIterations = 100'000;
+    const std::vector<vector_size_t> sizes{10, 50, 100, 500};
+    smallint(/*noPayload=*/true, kIterations, sizes);
+  }
+
+  /// SMALLINT benchmark with payload columns for small batch sizes
+  /// (10, 50, 100 and 500), using the same iteration count as smallSmallint().
+  void smallSmallintWithPayload() {
+    constexpr int kIterations = 100'000;
+    const std::vector<vector_size_t> sizes{10, 50, 100, 500};
+    smallint(/*noPayload=*/false, kIterations, sizes);
+  }
+
+  /// SMALLINT benchmark without payload columns for large batch sizes
+  /// (1'000 up to 1'000'000), run with 10 iterations.
+  void largeSmallint() {
+    constexpr int kIterations = 10;
+    const std::vector<vector_size_t> sizes{
+        1'000, 10'000, 100'000, 1'000'000};
+    smallint(/*noPayload=*/true, kIterations, sizes);
+  }
+
+  /// SMALLINT benchmark with payload columns for large batch sizes
+  /// (1'000 up to 1'000'000), run with 10 iterations.
+  void largeSmallintWithPayloads() {
+    constexpr int kIterations = 10;
+    const std::vector<vector_size_t> sizes{
+        1'000, 10'000, 100'000, 1'000'000};
+    smallint(/*noPayload=*/false, kIterations, sizes);
+  }
+
  private:
   std::vector<std::unique_ptr<TestCase>> testCases_;
   memory::MemoryPool* pool_;
@@ -316,7 +383,11 @@ int main(int argc, char** argv) {
   bm.largeBigintWithPayloads();
   bm.smallBigintWithPayload();
   bm.largeVarchar();
-  folly::runBenchmarks();
+  bm.smallSmallint();
+  bm.largeSmallint();
+  bm.smallSmallintWithPayload();
+  bm.largeSmallintWithPayloads();
 
+  folly::runBenchmarks();
   return 0;
 }
diff --git a/velox/exec/prefixsort/PrefixSortEncoder.h b/velox/exec/prefixsort/PrefixSortEncoder.h
@@ -54,7 +54,7 @@ class PrefixSortEncoder {
   }
 
   /// @tparam T Type of value. Supported type are: uint64_t, int64_t, uint32_t,
-  /// int32_t, float, double, Timestamp. TODO Add support for int16_t, uint16_t.
+  /// int32_t, int16_t, uint16_t, float, double, Timestamp.
   template <typename T>
   FOLLY_ALWAYS_INLINE void encodeNoNulls(T value, char* dest) const;
 
@@ -71,6 +71,9 @@ class PrefixSortEncoder {
   FOLLY_ALWAYS_INLINE static std::optional<uint32_t> encodedSize(
       TypeKind typeKind) {
     switch ((typeKind)) {
+      case ::facebook::velox::TypeKind::SMALLINT: {
+        return 3;
+      }
       case ::facebook::velox::TypeKind::INTEGER: {
         return 5;
       }
@@ -147,6 +150,25 @@ FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encodeNoNulls(
   encodeNoNulls((uint64_t)(value ^ (1ull << 63)), dest);
 }
 
+/// Encodes 'value' into the two bytes at 'dest', most significant byte first,
+/// so that comparing the encoded bytes in order ranks them like the original
+/// unsigned values. When the encoder is descending, every bit is inverted,
+/// which reverses that ranking.
+///
+/// The bytes are written one at a time rather than through a uint16_t*:
+/// 'dest' is a char* with no 2-byte alignment guarantee (a SMALLINT entry is
+/// 3 bytes wide), so a uint16_t store would be a misaligned access. Writing
+/// bytes explicitly also makes the layout independent of host byte order.
+template <>
+FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encodeNoNulls(
+    uint16_t value,
+    char* dest) const {
+  const uint16_t encoded = ascending_ ? value : static_cast<uint16_t>(~value);
+  dest[0] = static_cast<char>(encoded >> 8);
+  dest[1] = static_cast<char>(encoded & 0xff);
+}
+
+/// Encodes a signed 16-bit value by flipping its sign bit, which maps the
+/// int16_t range onto uint16_t while keeping the order of the values, and
+/// then delegating to the uint16_t encoder.
+template <>
+FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encodeNoNulls(
+    int16_t value,
+    char* dest) const {
+  constexpr uint16_t kSignBit = 0x8000;
+  const auto flipped =
+      static_cast<uint16_t>(static_cast<uint16_t>(value) ^ kSignBit);
+  encodeNoNulls(flipped, dest);
+}
+
 namespace detail {
 /// Convert double to a uint64_t, their value comparison semantics remain
 /// consistent.

diff --git a/velox/exec/prefixsort/tests/PrefixEncoderTest.cpp b/velox/exec/prefixsort/tests/PrefixEncoderTest.cpp
@@ -303,6 +303,17 @@ TEST_F(PrefixEncoderTest, encode) {
     testEncode<int32_t>(0x11223344, (char*)&ascExpected, (char*)&descExpected);
   }
 
+  {
+    uint16_t ascExpected = 0x2211;
+    uint16_t descExpected = 0xddee;
+    testEncode<uint16_t>(0x1122, (char*)&ascExpected, (char*)&descExpected);
+  }
+  {
+    // int16_t input 0x1122: the sign bit is flipped (0x9122) before the bytes
+    // are written. The expected values are only 2-byte buffers, so they are
+    // held in uint16_t; 0xdd6e does not fit in int16_t and would otherwise be
+    // an out-of-range implicit conversion.
+    uint16_t ascExpected = 0x2291;
+    uint16_t descExpected = 0xdd6e;
+    testEncode<int16_t>(0x1122, (char*)&ascExpected, (char*)&descExpected);
+  }
+
   {
     uint32_t ascExpected = 0x0050c3c7;
     uint32_t descExpected = 0xffaf3c38;
@@ -330,13 +341,19 @@ TEST_F(PrefixEncoderTest, encode) {
+// Runs the shared testCompare<T>() check once for every value type listed
+// below, now including the 16-bit integer types uint16_t and int16_t.
 TEST_F(PrefixEncoderTest, compare) {
   testCompare<uint64_t>();
   testCompare<uint32_t>();
+  testCompare<uint16_t>();
   testCompare<int64_t>();
   testCompare<int32_t>();
+  testCompare<int16_t>();
   testCompare<float>();
   testCompare<double>();
   testCompare<Timestamp>();
 }
 
+// Runs the shared testFuzz<>() driver instantiated for TypeKind::SMALLINT.
+TEST_F(PrefixEncoderTest, fuzzySmallInt) {
+  testFuzz<TypeKind::SMALLINT>();
+}
+
+// Runs the shared testFuzz<>() driver instantiated for TypeKind::INTEGER.
 TEST_F(PrefixEncoderTest, fuzzyInteger) {
   testFuzz<TypeKind::INTEGER>();
 }