From 98bbb73ec9ee73853cc74668040990b5f85f3c92 Mon Sep 17 00:00:00 2001
From: kevincmchen <kc4163568@gmail.com>
Date: Thu, 12 Sep 2024 16:10:26 -0700
Subject: [PATCH] Add support for smallint to PrefixSort (#10946)

Summary:
According to the benchmark, for data sets of 0.5k rows and larger, PrefixSort outperforms std::sort, with performance improvements ranging from approximately 214% to over 500%. Here's a summary of the benchmark results:

| Dataset Size | PrefixSort Improvement (No Payload) | PrefixSort Improvement (With Payload) |
|--------------|-------------------------------------|---------------------------------------|
| 0.5k         | 248.97% - 287.43%                   | 249.71% - 289.74%                     |
| 1k           | 214.44% - 310.92%                   | 215.03% - 315.14%                     |
| 10k          | 216.21% - 255.38%                   | 217.88% - 256.88%                     |
| 100k         | 279.81% - 318.26%                   | 284.89% - 295.21%                     |
| 1000k        | 304.36% - 351.31%                   | 454.04% - 514.28%                     |

follow-up https://github.com/facebookincubator/velox/issues/8350
Part of https://github.com/facebookincubator/velox/issues/6766

Pull Request resolved: https://github.com/facebookincubator/velox/pull/10946

Reviewed By: Yuhta

Differential Revision: D62373593

Pulled By: mbasmanova

fbshipit-source-id: b8594e05cc6aee736d09db1695db770b84e9d4bd
---
 velox/exec/PrefixSort.cpp                     |  4 +
 velox/exec/benchmarks/PrefixSortBenchmark.cpp | 73 ++++++++++++++++++-
 velox/exec/prefixsort/PrefixSortEncoder.h     | 24 +++++-
 .../prefixsort/tests/PrefixEncoderTest.cpp    | 17 +++++
 4 files changed, 116 insertions(+), 2 deletions(-)
diff --git a/velox/exec/PrefixSort.cpp b/velox/exec/PrefixSort.cpp
index 7fc0dff392c4..5d58ee9b032e 100644
--- a/velox/exec/PrefixSort.cpp
+++ b/velox/exec/PrefixSort.cpp
@@ -51,6 +51,10 @@ FOLLY_ALWAYS_INLINE void extractRowColumnToPrefix(
     char* const row,
     char* const prefix) {
   switch (typeKind) {
+    case TypeKind::SMALLINT: {
+      encodeRowColumn<int16_t>(prefixSortLayout, index, rowColumn, row, prefix);
+      return;
+    }
     case TypeKind::INTEGER: {
       encodeRowColumn<int32_t>(prefixSortLayout, index, rowColumn, row, prefix);
       return;
diff --git a/velox/exec/benchmarks/PrefixSortBenchmark.cpp b/velox/exec/benchmarks/PrefixSortBenchmark.cpp
index bb81eaa4c6aa..0af6a9f90bdf 100644
--- a/velox/exec/benchmarks/PrefixSortBenchmark.cpp
+++ b/velox/exec/benchmarks/PrefixSortBenchmark.cpp
@@ -238,6 +238,30 @@ class PrefixSortBenchmark {
     }
   }
 
+  std::vector<RowTypePtr> smallintRowTypes(bool noPayload) {
+    if (noPayload) {
+      return {
+          ROW({SMALLINT()}),
+          ROW({SMALLINT(), SMALLINT()}),
+          ROW({SMALLINT(), SMALLINT(), SMALLINT()}),
+          ROW({SMALLINT(), SMALLINT(), SMALLINT(), SMALLINT()}),
+      };
+    } else {
+      return {
+          ROW({SMALLINT(), VARCHAR(), VARCHAR()}),
+          ROW({SMALLINT(), SMALLINT(), VARCHAR(), VARCHAR()}),
+          ROW({SMALLINT(), SMALLINT(), SMALLINT(), VARCHAR(), VARCHAR()}),
+          ROW(
+              {SMALLINT(),
+               SMALLINT(),
+               SMALLINT(),
+               SMALLINT(),
+               VARCHAR(),
+               VARCHAR()}),
+      };
+    }
+  }
+
   void bigint(
       bool noPayload,
       int numIterations,
@@ -296,6 +320,49 @@ class PrefixSortBenchmark {
         "no-payloads", "varchar", batchSizes, rowTypes, numKeys, iterations);
   }
 
+  void smallint(
+      bool noPayload,
+      int numIterations,
+      const std::vector<vector_size_t>& batchSizes) {
+    std::vector<RowTypePtr> rowTypes = smallintRowTypes(noPayload);
+    std::vector<int> numKeys = {1, 2, 3, 4};
+    benchmark(
+        noPayload ? "no-payload" : "payload",
+        "smallint",
+        batchSizes,
+        rowTypes,
+        numKeys,
+        numIterations);
+  }
+
+  void smallSmallint() {
+    // For small datasets, iterations need to be large enough to ensure that the
+    // benchmark runs for enough time.
+    const auto iterations = 100'000;
+    const std::vector<vector_size_t> batchSizes = {10, 50, 100, 500};
+    smallint(true, iterations, batchSizes);
+  }
+
+  void smallSmallintWithPayload() {
+    const auto iterations = 100'000;
+    const std::vector<vector_size_t> batchSizes = {10, 50, 100, 500};
+    smallint(false, iterations, batchSizes);
+  }
+
+  void largeSmallint() {
+    const auto iterations = 10;
+    const std::vector<vector_size_t> batchSizes = {
+        1'000, 10'000, 100'000, 1'000'000};
+    smallint(true, iterations, batchSizes);
+  }
+
+  void largeSmallintWithPayloads() {
+    const auto iterations = 10;
+    const std::vector<vector_size_t> batchSizes = {
+        1'000, 10'000, 100'000, 1'000'000};
+    smallint(false, iterations, batchSizes);
+  }
+
  private:
   std::vector<std::unique_ptr<TestCase>> testCases_;
   memory::MemoryPool* pool_;
@@ -316,7 +383,11 @@ int main(int argc, char** argv) {
   bm.largeBigintWithPayloads();
   bm.smallBigintWithPayload();
   bm.largeVarchar();
-  folly::runBenchmarks();
+  bm.smallSmallint();
+  bm.largeSmallint();
+  bm.smallSmallintWithPayload();
+  bm.largeSmallintWithPayloads();
 
+  folly::runBenchmarks();
   return 0;
 }
diff --git a/velox/exec/prefixsort/PrefixSortEncoder.h b/velox/exec/prefixsort/PrefixSortEncoder.h
index 1323c43a4eb9..dec4b6166a4f 100644
--- a/velox/exec/prefixsort/PrefixSortEncoder.h
+++ b/velox/exec/prefixsort/PrefixSortEncoder.h
@@ -54,7 +54,7 @@ class PrefixSortEncoder {
   }
 
   /// @tparam T Type of value. Supported type are: uint64_t, int64_t, uint32_t,
-  /// int32_t, float, double, Timestamp. TODO Add support for int16_t, uint16_t.
+  /// int32_t, int16_t, uint16_t, float, double, Timestamp.
   template <typename T>
   FOLLY_ALWAYS_INLINE void encodeNoNulls(T value, char* dest) const;
 
@@ -71,6 +71,9 @@ class PrefixSortEncoder {
   FOLLY_ALWAYS_INLINE static std::optional<uint32_t> encodedSize(
       TypeKind typeKind) {
     switch ((typeKind)) {
+      case ::facebook::velox::TypeKind::SMALLINT: {
+        return 3;
+      }
       case ::facebook::velox::TypeKind::INTEGER: {
         return 5;
       }
@@ -147,6 +150,25 @@ FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encodeNoNulls(
   encodeNoNulls((uint64_t)(value ^ (1ull << 63)), dest);
 }
 
+/// The logic is the same as for int32_t.
+template <>
+FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encodeNoNulls(
+    uint16_t value,
+    char* dest) const {
+  auto& v = *reinterpret_cast<uint16_t*>(dest);
+  v = __builtin_bswap16(value);
+  if (!ascending_) {
+    v = ~v;
+  }
+}
+
+template <>
+FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encodeNoNulls(
+    int16_t value,
+    char* dest) const {
+  encodeNoNulls(static_cast<uint16_t>(value ^ (1u << 15)), dest);
+}
+
 namespace detail {
 /// Convert double to a uint64_t, their value comparison semantics remain
 /// consistent.
diff --git a/velox/exec/prefixsort/tests/PrefixEncoderTest.cpp b/velox/exec/prefixsort/tests/PrefixEncoderTest.cpp
index 11efd8ebc8a0..7f25b4d5cae2 100644
--- a/velox/exec/prefixsort/tests/PrefixEncoderTest.cpp
+++ b/velox/exec/prefixsort/tests/PrefixEncoderTest.cpp
@@ -303,6 +303,17 @@ TEST_F(PrefixEncoderTest, encode) {
     testEncode<int32_t>(0x11223344, (char*)&ascExpected, (char*)&descExpected);
   }
 
+  {
+    uint16_t ascExpected = 0x2211;
+    uint16_t descExpected = 0xddee;
+    testEncode<uint16_t>(0x1122, (char*)&ascExpected, (char*)&descExpected);
+  }
+  {
+    int16_t ascExpected = 0x2291;
+    int16_t descExpected = 0xdd6e;
+    testEncode<int16_t>(0x1122, (char*)&ascExpected, (char*)&descExpected);
+  }
+
   {
     uint32_t ascExpected = 0x0050c3c7;
     uint32_t descExpected = 0xffaf3c38;
@@ -330,13 +341,19 @@ TEST_F(PrefixEncoderTest, encode) {
 TEST_F(PrefixEncoderTest, compare) {
   testCompare<uint64_t>();
   testCompare<uint32_t>();
+  testCompare<uint16_t>();
   testCompare<int64_t>();
   testCompare<int32_t>();
+  testCompare<int16_t>();
   testCompare<float>();
   testCompare<double>();
   testCompare<Timestamp>();
 }
 
+TEST_F(PrefixEncoderTest, fuzzySmallInt) {
+  testFuzz<TypeKind::SMALLINT>();
+}
+
 TEST_F(PrefixEncoderTest, fuzzyInteger) {
   testFuzz<TypeKind::INTEGER>();
 }