diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index e8a68eda92f5b..8e619be32d6e0 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -91,7 +91,7 @@ jobs: - name: Make Release Build env: MAKEFLAGS: 'NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4' - CUDA_ARCHITECTURES: 60 + CUDA_ARCHITECTURES: 70 CUDA_COMPILER: /usr/local/cuda-${CUDA_VERSION}/bin/nvcc # Without that, nvcc picks /usr/bin/c++ which is GCC 8 CUDA_FLAGS: "-ccbin /opt/rh/gcc-toolset-9/root/usr/bin" diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml index cfc0742056692..68814db1ad74d 100644 --- a/.github/workflows/scheduled.yml +++ b/.github/workflows/scheduled.yml @@ -264,14 +264,14 @@ jobs: uses: actions/upload-artifact@v4 with: name: presto - path: velox/_build/debug/velox/expression/tests/velox_expression_fuzzer_test + path: velox/_build/debug/velox/expression/fuzzer/velox_expression_fuzzer_test retention-days: "${{ env.RETENTION }}" - name: Upload spark expression fuzzer uses: actions/upload-artifact@v4 with: name: spark_expression_fuzzer - path: velox/_build/debug/velox/expression/tests/spark_expression_fuzzer_test + path: velox/_build/debug/velox/expression/fuzzer/spark_expression_fuzzer_test retention-days: "${{ env.RETENTION }}" - name: Upload spark aggregation fuzzer diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index c4d716cbc4f56..0000000000000 --- a/.gitmodules +++ /dev/null @@ -1,6 +0,0 @@ -[submodule "third_party/googletest"] - path = third_party/googletest - url = https://github.com/google/googletest.git -[submodule "third_party/xsimd"] - path = third_party/xsimd - url = https://github.com/xtensor-stack/xsimd.git diff --git a/Makefile b/Makefile index 3e8ff17397bdf..b604d560adf64 100644 --- a/Makefile +++ b/Makefile @@ -161,7 +161,7 @@ unittest: debug #: Build with debugging and run unit tests # Build with debugging and run expression fuzzer test. Use a fixed seed to # ensure the tests are reproducible. fuzzertest: debug - $(BUILD_BASE_DIR)/debug/velox/expression/tests/velox_expression_fuzzer_test \ + $(BUILD_BASE_DIR)/debug/velox/expression/fuzzer/velox_expression_fuzzer_test \ --seed $(FUZZER_SEED) \ --duration_sec $(FUZZER_DURATION_SEC) \ --repro_persist_path $(FUZZER_REPRO_PERSIST_PATH) \ diff --git a/README.md b/README.md index 69fd642a1d38a..3e1dbc3cade3d 100644 --- a/README.md +++ b/README.md @@ -72,11 +72,8 @@ Blog posts are available [here](https://velox-lib.io/blog). ### Get the Velox Source ``` -git clone --recursive https://github.com/facebookincubator/velox.git +git clone https://github.com/facebookincubator/velox.git cd velox -# if you are updating an existing checkout -git submodule sync --recursive -git submodule update --init --recursive ``` Once Velox is checked out, the first step is to install the dependencies. Details on the dependencies and how Velox manages some of them for you @@ -90,7 +87,7 @@ dependencies for a given platform. 
On an Intel MacOS machine you can setup and then build like so:

```shell
-$ ./scripts/setup-macos.sh 
+$ ./scripts/setup-macos.sh
 $ make
```

@@ -117,7 +114,7 @@ $ CPU_TARGET="aarch64" make
 Once you have checked out Velox, you can setup and build like so:
 
 ```shell
-$ ./scripts/setup-ubuntu.sh 
+$ ./scripts/setup-ubuntu.sh
 $ make
```

@@ -135,7 +132,7 @@ Note that,
   * f16c
 * Velox tries to use the following (or equivalent) instruction sets where available:
   * On Intel CPUs
-    * avx 
+    * avx
     * avx2
     * sse
   * On ARM
@@ -167,7 +164,7 @@ contribute to the project.
 ## Community
 The main communication channel with the Velox OSS community is through the
-[the Velox-OSS Slack workspace](http://velox-oss.slack.com). 
+[Velox-OSS Slack workspace](http://velox-oss.slack.com).
 Please reach out to **velox@meta.com** to get access to Velox Slack Channel.
diff --git a/velox/common/base/CMakeLists.txt b/velox/common/base/CMakeLists.txt
index 37ccfe6f8f894..d70afb59d639a 100644
--- a/velox/common/base/CMakeLists.txt
+++ b/velox/common/base/CMakeLists.txt
@@ -22,6 +22,7 @@ add_library(
   BitUtil.cpp
   Counters.cpp
   Fs.cpp
+  PeriodicStatsReporter.cpp
   RandomUtil.cpp
   RawVector.cpp
   RuntimeMetrics.cpp
diff --git a/velox/common/base/Counters.cpp b/velox/common/base/Counters.cpp
index 6cf0a8fcf8c82..f7e8ab9951fdc 100644
--- a/velox/common/base/Counters.cpp
+++ b/velox/common/base/Counters.cpp
@@ -48,6 +48,24 @@ void registerVeloxMetrics() {
   /// ================== Memory Arbitration Counters =================
 
+  // The number of arbitration requests.
+  DEFINE_METRIC(
+      kMetricArbitratorRequestsCount, facebook::velox::StatType::COUNT);
+
+  // The number of times a query level memory pool is aborted as a result of a
+  // memory arbitration process. The memory pool aborted will eventually result
+  // in the cancellation of the original query.
+  DEFINE_METRIC(
+      kMetricArbitratorAbortedCount, facebook::velox::StatType::COUNT);
+
+  // The number of times a memory arbitration request failed. This may occur
+  // either because the requester was terminated during the processing of its
+  // request, the arbitration request would surpass the maximum allowed capacity
+  // for the requester, or the arbitration process couldn't release the
+  // requested amount of memory.
+  DEFINE_METRIC(
+      kMetricArbitratorFailuresCount, facebook::velox::StatType::COUNT);
+
   // Tracks the memory reclaim count on an operator.
   DEFINE_METRIC(kMetricMemoryReclaimCount, facebook::velox::StatType::COUNT);
 
@@ -82,10 +100,6 @@ void registerVeloxMetrics() {
   DEFINE_METRIC(
       kMetricMemoryNonReclaimableCount, facebook::velox::StatType::COUNT);
 
-  // The number of arbitration requests.
-  DEFINE_METRIC(
-      kMetricArbitratorRequestsCount, facebook::velox::StatType::COUNT);
-
   // The number of arbitration that reclaims the used memory from the query
   // which initiates the memory arbitration request itself. It ensures the
   // memory arbitration request won't exceed its per-query memory capacity
@@ -103,20 +117,6 @@
       kMetricArbitratorGlobalArbitrationCount, facebook::velox::StatType::COUNT);
 
-  // The number of times a query level memory pool is aborted as a result of a
-  // memory arbitration process. The memory pool aborted will eventually result
-  // in a cancelling the original query.
-  DEFINE_METRIC(
-      kMetricArbitratorAbortedCount, facebook::velox::StatType::COUNT);
-
-  // The number of times a memory arbitration request failed.
This may occur - // either because the requester was terminated during the processing of its - // request, the arbitration request would surpass the maximum allowed capacity - // for the requester, or the arbitration process couldn't release the - // requested amount of memory. - DEFINE_METRIC( - kMetricArbitratorFailuresCount, facebook::velox::StatType::COUNT); - // The distribution of the amount of time an arbitration request stays queued // in range of [0, 600s] with 20 buckets. It is configured to report the // latency at P50, P90, P99, and P100 percentiles. diff --git a/velox/common/base/Counters.h b/velox/common/base/Counters.h index 67fd967c938ef..f1fca90d8ea31 100644 --- a/velox/common/base/Counters.h +++ b/velox/common/base/Counters.h @@ -70,21 +70,12 @@ constexpr folly::StringPiece kMetricMemoryPoolReservationLeakBytes{ constexpr folly::StringPiece kMetricMemoryAllocatorDoubleFreeCount{ "velox.memory_allocator_double_free_count"}; -constexpr folly::StringPiece kMetricArbitratorRequestsCount{ - "velox.arbitrator_requests_count"}; - constexpr folly::StringPiece kMetricArbitratorLocalArbitrationCount{ "velox.arbitrator_local_arbitration_count"}; constexpr folly::StringPiece kMetricArbitratorGlobalArbitrationCount{ "velox.arbitrator_global_arbitration_count"}; -constexpr folly::StringPiece kMetricArbitratorAbortedCount{ - "velox.arbitrator_aborted_count"}; - -constexpr folly::StringPiece kMetricArbitratorFailuresCount{ - "velox.arbitrator_failures_count"}; - constexpr folly::StringPiece kMetricArbitratorQueueTimeMs{ "velox.arbitrator_queue_time_ms"}; @@ -128,4 +119,13 @@ constexpr folly::StringPiece kMetricSpillWriteTimeMs{ constexpr folly::StringPiece kMetricFileWriterEarlyFlushedRawBytes{ "velox.file_writer_early_flushed_raw_bytes"}; + +constexpr folly::StringPiece kMetricArbitratorRequestsCount{ + "velox.arbitrator_requests_count"}; + +constexpr folly::StringPiece kMetricArbitratorAbortedCount{ + "velox.arbitrator_aborted_count"}; + +constexpr folly::StringPiece kMetricArbitratorFailuresCount{ + "velox.arbitrator_failures_count"}; } // namespace facebook::velox diff --git a/velox/common/base/PeriodicStatsReporter.cpp b/velox/common/base/PeriodicStatsReporter.cpp new file mode 100644 index 0000000000000..f3c3b6e5b6086 --- /dev/null +++ b/velox/common/base/PeriodicStatsReporter.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/base/PeriodicStatsReporter.h" +#include "velox/common/base/Counters.h" +#include "velox/common/base/StatsReporter.h" +#include "velox/common/memory/Memory.h" + +namespace facebook::velox { + +namespace { +#define REPORT_IF_NOT_ZERO(name, counter) \ + if ((counter) != 0) { \ + RECORD_METRIC_VALUE((name), (counter)); \ + } + +std::mutex& instanceMutex() { + static std::mutex instanceMu; + return instanceMu; +} + +// Global instance. Must be called while holding a lock over instanceMutex(). 
+std::unique_ptr<PeriodicStatsReporter>& instance() {
+  static std::unique_ptr<PeriodicStatsReporter> reporter;
+  return reporter;
+}
+} // namespace
+
+void startPeriodicStatsReporter(const PeriodicStatsReporter::Options& options) {
+  std::lock_guard<std::mutex> l(instanceMutex());
+  auto& instanceRef = instance();
+  VELOX_CHECK_NULL(
+      instanceRef, "The periodic stats reporter has already started.");
+  instanceRef = std::make_unique<PeriodicStatsReporter>(options);
+  instanceRef->start();
+}
+
+void stopPeriodicStatsReporter() {
+  std::lock_guard<std::mutex> l(instanceMutex());
+  auto& instanceRef = instance();
+  VELOX_CHECK_NOT_NULL(instanceRef, "No periodic stats reporter to stop.");
+  instanceRef->stop();
+  instanceRef.reset();
+}
+
+PeriodicStatsReporter::PeriodicStatsReporter(const Options& options)
+    : arbitrator_(options.arbitrator), options_(options) {}
+
+void PeriodicStatsReporter::start() {
+  LOG(INFO) << "Starting PeriodicStatsReporter with options "
+            << options_.toString();
+  addTask(
+      "report_arbitrator_stats",
+      [this]() { reportArbitratorStats(); },
+      options_.arbitratorStatsIntervalMs);
+}
+
+void PeriodicStatsReporter::stop() {
+  LOG(INFO) << "Stopping PeriodicStatsReporter";
+  scheduler_.stop();
+}
+
+void PeriodicStatsReporter::reportArbitratorStats() {
+  if (arbitrator_ == nullptr) {
+    return;
+  }
+
+  const auto stats = arbitrator_->stats();
+  RECORD_METRIC_VALUE(
+      kMetricArbitratorFreeCapacityBytes,
+      stats.freeCapacityBytes + stats.freeReservedCapacityBytes);
+  RECORD_METRIC_VALUE(
+      kMetricArbitratorFreeReservedCapacityBytes,
+      stats.freeReservedCapacityBytes);
+}
+
+} // namespace facebook::velox
diff --git a/velox/common/base/PeriodicStatsReporter.h b/velox/common/base/PeriodicStatsReporter.h
new file mode 100644
index 0000000000000..7621ac99a01cb
--- /dev/null
+++ b/velox/common/base/PeriodicStatsReporter.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <folly/experimental/ThreadedRepeatingFunctionRunner.h>
+#include "velox/common/memory/MemoryArbitrator.h"
+
+namespace folly {
+class CPUThreadPoolExecutor;
+}
+
+namespace facebook::velox {
+
+namespace memory {
+class MemoryAllocator;
+}
+
+namespace cache {
+class AsyncDataCache;
+}
+
+/// Manages a background daemon thread to report stats through 'StatsReporter'.
+class PeriodicStatsReporter {
+ public:
+  struct Options {
+    Options() {}
+
+    const memory::MemoryArbitrator* arbitrator{nullptr};
+
+    uint64_t arbitratorStatsIntervalMs{60'000};
+
+    std::string toString() const {
+      return fmt::format(
+          "arbitratorStatsIntervalMs:{}", arbitratorStatsIntervalMs);
+    }
+  };
+
+  PeriodicStatsReporter(const Options& options = Options());
+
+  /// Invoked to start the report daemon in background.
+  void start();
+
+  /// Invoked to stop the report daemon in background.
+  void stop();
+
+ private:
+  // Add a task to run periodically.
+  template <typename TFunc>
+  void addTask(const std::string& taskName, TFunc&& func, size_t intervalMs) {
+    scheduler_.add(
+        taskName,
+        [taskName,
+         intervalMs,
+         func = std::forward<TFunc>(func)]() mutable noexcept {
+          try {
+            func();
+          } catch (const std::exception& e) {
+            LOG(ERROR) << "Error running periodic task " << taskName << ": "
+                       << e.what();
+          }
+          return std::chrono::milliseconds(intervalMs);
+        });
+  }
+
+  void reportArbitratorStats();
+
+  const velox::memory::MemoryArbitrator* const arbitrator_{nullptr};
+  const Options options_;
+
+  folly::ThreadedRepeatingFunctionRunner scheduler_;
+};
+
+/// Initializes and starts the process-wide periodic stats reporter. Before
+/// 'stopPeriodicStatsReporter()' is called, this method can only be called once
+/// process-wide, and additional calls to this method will throw.
+void startPeriodicStatsReporter(const PeriodicStatsReporter::Options& options);
+
+/// Stops the process-wide periodic stats reporter.
+void stopPeriodicStatsReporter();
+
+} // namespace facebook::velox
diff --git a/velox/common/base/VeloxException.cpp b/velox/common/base/VeloxException.cpp
index 701edb16c5197..12371def75f2e 100644
--- a/velox/common/base/VeloxException.cpp
+++ b/velox/common/base/VeloxException.cpp
@@ -43,29 +43,31 @@ ExceptionContext& getExceptionContext() {
   return context;
 }
 
-// Retrieves the message of the top-level ancestor of the current exception
-// context. If the top-level context message is not empty and is the same as the
-// current one, returns a string indicating they are the same.
-std::string getTopLevelExceptionContextString(
+// Traverses the context hierarchy and appends messages from all contexts that
+// are marked as essential.
+std::string getAdditionalExceptionContextString(
     VeloxException::Type exceptionType,
     const std::string& currentMessage) {
   auto* context = &getExceptionContext();
-  if (context->parent && context->parent->parent) {
-    while (context->parent && context->parent->parent) {
-      context = context->parent;
-    }
-    auto topLevelMessage = context->message(exceptionType);
-    if (!topLevelMessage.empty() && topLevelMessage == currentMessage) {
-      return "Same as context.";
-    } else {
-      return topLevelMessage;
+  std::string additionalMessage = "";
+  if (!context->parent || !context->parent->parent) {
+    return additionalMessage;
+  }
+  context = context->parent;
+  while (context->parent) {
+    if (context->isEssential) {
+      auto message = context->message(exceptionType);
+      if (!message.empty()) {
+        additionalMessage += message + " ";
+      }
     }
+    context = context->parent;
  }
-
-  if (!currentMessage.empty()) {
-    return "Same as context.";
+  if (!additionalMessage.empty()) {
+    // Get rid of the extra space at the end.
+    additionalMessage.pop_back();
   }
-  return "";
+  return additionalMessage;
 }
 
 VeloxException::VeloxException(
@@ -90,8 +92,8 @@ VeloxException::VeloxException(
     state.errorSource = errorSource;
     state.errorCode = errorCode;
     state.context = getExceptionContext().message(exceptionType);
-    state.topLevelContext =
-        getTopLevelExceptionContextString(exceptionType, state.context);
+    state.additionalContext =
+        getAdditionalExceptionContextString(exceptionType, state.context);
     state.isRetriable = isRetriable;
   })) {}
 
@@ -114,8 +116,8 @@ VeloxException::VeloxException(
     state.errorSource = errorSource;
     state.errorCode = errorCode;
     state.context = getExceptionContext().message(exceptionType);
-    state.topLevelContext =
-        getTopLevelExceptionContextString(exceptionType, state.context);
+    state.additionalContext =
+        getAdditionalExceptionContextString(exceptionType, state.context);
     state.isRetriable = isRetriable;
     state.wrappedException = e;
   })) {}
 
@@ -223,8 +225,8 @@ void VeloxException::State::finalize() const {
     elaborateMessage += "Context: " + context + "\n";
   }
 
-  if (!topLevelContext.empty()) {
-    elaborateMessage += "Top-Level Context: " + topLevelContext + "\n";
+  if (!additionalContext.empty()) {
+    elaborateMessage += "Additional Context: " + additionalContext + "\n";
   }
 
   if (function) {
diff --git a/velox/common/base/VeloxException.h b/velox/common/base/VeloxException.h
index 32e96b9a166e8..ae7b8fdab46b3 100644
--- a/velox/common/base/VeloxException.h
+++ b/velox/common/base/VeloxException.h
@@ -207,8 +207,8 @@ class VeloxException : public std::exception {
     return state_->context;
   }
 
-  const std::string& topLevelContext() const {
-    return state_->topLevelContext;
+  const std::string& additionalContext() const {
+    return state_->additionalContext;
   }
 
   const std::exception_ptr& wrappedException() const {
@@ -230,7 +230,7 @@ class VeloxException : public std::exception {
     // The current exception context.
     std::string context;
-    // The top-level ancestor of the current exception context.
-    std::string topLevelContext;
+    // The additional context of the current exception.
+    std::string additionalContext;
     bool isRetriable;
     // The original std::exception.
     std::exception_ptr wrappedException;
@@ -353,6 +353,10 @@ struct ExceptionContext {
   /// Value to pass to `messageFunc`. Can be null.
   void* arg{nullptr};
 
+  /// If true, then the additional context in 'this' is always included when
+  /// there are hierarchical exception contexts.
+  bool isEssential{false};
+
   /// Pointer to the parent context when there are hierarchical exception
   /// contexts.
   ExceptionContext* parent{nullptr};
diff --git a/velox/common/base/tests/CMakeLists.txt b/velox/common/base/tests/CMakeLists.txt
index ebab3d8f75504..22c173c99e3d4 100644
--- a/velox/common/base/tests/CMakeLists.txt
+++ b/velox/common/base/tests/CMakeLists.txt
@@ -23,9 +23,9 @@ add_executable(
   FsTest.cpp
   RangeTest.cpp
   RawVectorTest.cpp
-  ScratchTest.cpp
   RuntimeMetricsTest.cpp
   ScopedLockTest.cpp
+  ScratchTest.cpp
   SemaphoreTest.cpp
   SimdUtilTest.cpp
   SpillConfigTest.cpp
@@ -38,7 +38,9 @@ add_test(velox_base_test velox_base_test)
 
 target_link_libraries(
   velox_base_test
-  PRIVATE velox_common_base
+  PRIVATE velox_caching
+          velox_common_base
+          velox_memory
           velox_time
           velox_status
           velox_exception
diff --git a/velox/common/base/tests/ExceptionTest.cpp b/velox/common/base/tests/ExceptionTest.cpp
index 4e5dd6dbaa54d..9386b8cb672e0 100644
--- a/velox/common/base/tests/ExceptionTest.cpp
+++ b/velox/common/base/tests/ExceptionTest.cpp
@@ -583,11 +583,13 @@ TEST(ExceptionTest, context) {
   };
 
   {
-    // Create multi-layer contexts.
+ // Create multi-layer contexts with top level marked as essential. MessageFunctionArg topLevelTroubleshootingAid{ "Top-level troubleshooting aid.", &callCount}; - facebook::velox::ExceptionContextSetter topLevelContext( - {messageFunction, &topLevelTroubleshootingAid}); + facebook::velox::ExceptionContextSetter additionalContext( + {.messageFunc = messageFunction, + .arg = &topLevelTroubleshootingAid, + .isEssential = true}); MessageFunctionArg midLevelTroubleshootingAid{ "Mid-level troubleshooting aid.", &callCount}; @@ -608,7 +610,7 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: System error: Inner-level troubleshooting aid." - "\nTop-Level Context: System error: Top-level troubleshooting aid." + "\nAdditional Context: System error: Top-level troubleshooting aid." "\nFunction: operator()" "\nFile: "); @@ -623,13 +625,164 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: User error: Inner-level troubleshooting aid." - "\nTop-Level Context: User error: Top-level troubleshooting aid." + "\nAdditional Context: User error: Top-level troubleshooting aid." "\nFunction: operator()" "\nFile: "); EXPECT_EQ(4, callCount); } + { + callCount = 0; + // Create multi-layer contexts with middle level marked as essential. + MessageFunctionArg topLevelTroubleshootingAid{ + "Top-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter additionalContext( + {.messageFunc = messageFunction, .arg = &topLevelTroubleshootingAid}); + + MessageFunctionArg midLevelTroubleshootingAid{ + "Mid-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter midLevelContext( + {.messageFunc = messageFunction, + .arg = &midLevelTroubleshootingAid, + .isEssential = true}); + + MessageFunctionArg innerLevelTroubleshootingAid{ + "Inner-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter innerLevelContext( + {messageFunction, &innerLevelTroubleshootingAid}); + + verifyVeloxException( + [&]() { VELOX_CHECK_EQ(1, 3); }, + "Exception: VeloxRuntimeError" + "\nError Source: RUNTIME" + "\nError Code: INVALID_STATE" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: System error: Inner-level troubleshooting aid." + "\nAdditional Context: System error: Mid-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(2, callCount); + + verifyVeloxException( + [&]() { VELOX_USER_CHECK_EQ(1, 3); }, + "Exception: VeloxUserError" + "\nError Source: USER" + "\nError Code: INVALID_ARGUMENT" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: User error: Inner-level troubleshooting aid." + "\nAdditional Context: User error: Mid-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(4, callCount); + } + + { + callCount = 0; + // Create multi-layer contexts with none marked as essential. 
+ MessageFunctionArg topLevelTroubleshootingAid{ + "Top-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter additionalContext( + {.messageFunc = messageFunction, .arg = &topLevelTroubleshootingAid}); + + MessageFunctionArg midLevelTroubleshootingAid{ + "Mid-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter midLevelContext( + {.messageFunc = messageFunction, .arg = &midLevelTroubleshootingAid}); + + MessageFunctionArg innerLevelTroubleshootingAid{ + "Inner-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter innerLevelContext( + {messageFunction, &innerLevelTroubleshootingAid}); + + verifyVeloxException( + [&]() { VELOX_CHECK_EQ(1, 3); }, + "Exception: VeloxRuntimeError" + "\nError Source: RUNTIME" + "\nError Code: INVALID_STATE" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: System error: Inner-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(1, callCount); + + verifyVeloxException( + [&]() { VELOX_USER_CHECK_EQ(1, 3); }, + "Exception: VeloxUserError" + "\nError Source: USER" + "\nError Code: INVALID_ARGUMENT" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: User error: Inner-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(2, callCount); + } + + { + callCount = 0; + // Create multi-layer contexts with all ancestors marked as essential. + MessageFunctionArg topLevelTroubleshootingAid{ + "Top-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter additionalContext( + {.messageFunc = messageFunction, + .arg = &topLevelTroubleshootingAid, + .isEssential = true}); + + MessageFunctionArg midLevelTroubleshootingAid{ + "Mid-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter midLevelContext( + {.messageFunc = messageFunction, + .arg = &midLevelTroubleshootingAid, + .isEssential = true}); + + MessageFunctionArg innerLevelTroubleshootingAid{ + "Inner-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter innerLevelContext( + {messageFunction, &innerLevelTroubleshootingAid}); + + verifyVeloxException( + [&]() { VELOX_CHECK_EQ(1, 3); }, + "Exception: VeloxRuntimeError" + "\nError Source: RUNTIME" + "\nError Code: INVALID_STATE" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: System error: Inner-level troubleshooting aid." + "\nAdditional Context: System error: Mid-level troubleshooting aid. System error: Top-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(3, callCount); + + verifyVeloxException( + [&]() { VELOX_USER_CHECK_EQ(1, 3); }, + "Exception: VeloxUserError" + "\nError Source: USER" + "\nError Code: INVALID_ARGUMENT" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: User error: Inner-level troubleshooting aid." + "\nAdditional Context: User error: Mid-level troubleshooting aid. User error: Top-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(6, callCount); + } + // Different context. { callCount = 0; @@ -649,7 +802,6 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: System error: Debugging info." - "\nTop-Level Context: Same as context." 
"\nFunction: operator()" "\nFile: "); @@ -664,7 +816,6 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: User error: Debugging info." - "\nTop-Level Context: Same as context." "\nFunction: operator()" "\nFile: "); @@ -709,7 +860,6 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: Failed to produce additional context." - "\nTop-Level Context: Same as context." "\nFunction: operator()" "\nFile: "); @@ -743,7 +893,7 @@ TEST(ExceptionTest, wrappedException) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_TRUE(ve.isUserError()); ASSERT_EQ(ve.context(), ""); - ASSERT_EQ(ve.topLevelContext(), ""); + ASSERT_EQ(ve.additionalContext(), ""); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -755,7 +905,7 @@ TEST(ExceptionTest, wrappedException) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_FALSE(ve.isUserError()); ASSERT_EQ(ve.context(), ""); - ASSERT_EQ(ve.topLevelContext(), ""); + ASSERT_EQ(ve.additionalContext(), ""); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -784,7 +934,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { std::string data = "lakes"; facebook::velox::ExceptionContextSetter context( - {messageFunction, data.data()}); + {messageFunction, data.data(), true}); try { throw std::invalid_argument("This is a test."); @@ -793,7 +943,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_TRUE(ve.isUserError()); ASSERT_EQ(ve.context(), "User error: lakes"); - ASSERT_EQ(ve.topLevelContext(), "Same as context."); + ASSERT_EQ(ve.additionalContext(), ""); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -805,7 +955,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_FALSE(ve.isUserError()); ASSERT_EQ(ve.context(), "System error: lakes"); - ASSERT_EQ(ve.topLevelContext(), "Same as context."); + ASSERT_EQ(ve.additionalContext(), ""); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -821,7 +971,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_TRUE(ve.isUserError()); ASSERT_EQ(ve.context(), "User error: mountains"); - ASSERT_EQ(ve.topLevelContext(), "User error: lakes"); + ASSERT_EQ(ve.additionalContext(), "User error: lakes"); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -833,7 +983,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_FALSE(ve.isUserError()); ASSERT_EQ(ve.context(), "System error: mountains"); - ASSERT_EQ(ve.topLevelContext(), "System error: lakes"); + ASSERT_EQ(ve.additionalContext(), "System error: lakes"); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } diff --git a/velox/common/base/tests/StatsReporterTest.cpp b/velox/common/base/tests/StatsReporterTest.cpp index b405f72513839..fd7e89a6f35fe 100644 --- a/velox/common/base/tests/StatsReporterTest.cpp +++ b/velox/common/base/tests/StatsReporterTest.cpp @@ -20,15 +20,13 @@ #include #include #include +#include +#include "velox/common/base/Counters.h" +#include "velox/common/base/PeriodicStatsReporter.h" +#include "velox/common/base/tests/GTestUtils.h" namespace facebook::velox { -class StatsReporterTest : public testing::Test { - protected: - void SetUp() override {} - void 
TearDown() override {}
-};
-
 class TestReporter : public BaseStatsReporter {
  public:
  mutable std::unordered_map<std::string, size_t> counterMap;
@@ -36,6 +34,12 @@ class TestReporter : public BaseStatsReporter {
  mutable std::unordered_map<std::string, StatType> statTypeMap;
  mutable std::unordered_map<std::string, std::vector<int32_t>>
      histogramPercentilesMap;
 
+  void clear() {
+    counterMap.clear();
+    statTypeMap.clear();
+    histogramPercentilesMap.clear();
+  }
+
   void registerMetricExportType(const char* key, StatType statType)
       const override {
     statTypeMap[key] = statType;
@@ -92,22 +96,32 @@
   }
 };
 
-TEST_F(StatsReporterTest, trivialReporter) {
-  auto reporter = std::dynamic_pointer_cast<TestReporter>(
-      folly::Singleton<BaseStatsReporter>::try_get());
+class StatsReporterTest : public testing::Test {
+ protected:
+  void SetUp() override {
+    reporter_ = std::dynamic_pointer_cast<TestReporter>(
+        folly::Singleton<BaseStatsReporter>::try_get());
+  }
 
+  void TearDown() override {
+    reporter_->clear();
+  }
+
+  std::shared_ptr<TestReporter> reporter_;
+};
 
+TEST_F(StatsReporterTest, trivialReporter) {
   DEFINE_METRIC("key1", StatType::COUNT);
   DEFINE_METRIC("key2", StatType::SUM);
   DEFINE_METRIC("key3", StatType::RATE);
   DEFINE_HISTOGRAM_METRIC("key4", 10, 0, 100, 50, 99, 100);
 
-  EXPECT_EQ(StatType::COUNT, reporter->statTypeMap["key1"]);
-  EXPECT_EQ(StatType::SUM, reporter->statTypeMap["key2"]);
-  EXPECT_EQ(StatType::RATE, reporter->statTypeMap["key3"]);
+  EXPECT_EQ(StatType::COUNT, reporter_->statTypeMap["key1"]);
+  EXPECT_EQ(StatType::SUM, reporter_->statTypeMap["key2"]);
+  EXPECT_EQ(StatType::RATE, reporter_->statTypeMap["key3"]);
   std::vector<int32_t> expected = {50, 99, 100};
-  EXPECT_EQ(expected, reporter->histogramPercentilesMap["key4"]);
+  EXPECT_EQ(expected, reporter_->histogramPercentilesMap["key4"]);
   EXPECT_TRUE(
-      reporter->statTypeMap.find("key5") == reporter->statTypeMap.end());
+      reporter_->statTypeMap.find("key5") == reporter_->statTypeMap.end());
 
   RECORD_METRIC_VALUE("key1", 10);
   RECORD_METRIC_VALUE("key1", 11);
@@ -119,12 +133,101 @@
   RECORD_HISTOGRAM_METRIC_VALUE("key4", 50);
   RECORD_HISTOGRAM_METRIC_VALUE("key4", 100);
 
-  EXPECT_EQ(36, reporter->counterMap["key1"]);
-  EXPECT_EQ(2201, reporter->counterMap["key2"]);
-  EXPECT_EQ(1101, reporter->counterMap["key3"]);
-  EXPECT_EQ(100, reporter->counterMap["key4"]);
+  EXPECT_EQ(36, reporter_->counterMap["key1"]);
+  EXPECT_EQ(2201, reporter_->counterMap["key2"]);
+  EXPECT_EQ(1101, reporter_->counterMap["key3"]);
+  EXPECT_EQ(100, reporter_->counterMap["key4"]);
+};
+
+class PeriodicStatsReporterTest : public StatsReporterTest {};
+
+class TestStatsReportMemoryArbitrator : public memory::MemoryArbitrator {
+ public:
+  explicit TestStatsReportMemoryArbitrator(
+      memory::MemoryArbitrator::Stats stats)
+      : memory::MemoryArbitrator({}), stats_(stats) {}
+
+  ~TestStatsReportMemoryArbitrator() override = default;
+
+  void updateStats(memory::MemoryArbitrator::Stats stats) {
+    stats_ = stats;
+  }
+
+  std::string kind() const override {
+    return "test";
+  }
+
+  uint64_t growCapacity(memory::MemoryPool* /*unused*/, uint64_t /*unused*/)
+      override {
+    return 0;
+  }
+
+  bool growCapacity(
+      memory::MemoryPool* /*unused*/,
+      const std::vector<std::shared_ptr<memory::MemoryPool>>& /*unused*/,
+      uint64_t /*unused*/) override {
+    return false;
+  }
+
+  uint64_t shrinkCapacity(memory::MemoryPool* /*unused*/, uint64_t /*unused*/)
+      override {
+    return 0;
+  }
+
+  uint64_t shrinkCapacity(
+      const std::vector<std::shared_ptr<memory::MemoryPool>>& /*unused*/,
+      uint64_t /*unused*/,
+      bool /*unused*/,
+      bool /*unused*/) override {
+    return 0;
+  }
+
+  Stats stats() const override {
+    return stats_;
+  }
+
+  std::string toString() const override {
+    return
"TestStatsReportMemoryArbitrator::toString()"; + } + + private: + memory::MemoryArbitrator::Stats stats_; }; +TEST_F(PeriodicStatsReporterTest, basic) { + TestStatsReportMemoryArbitrator arbitrator({}); + PeriodicStatsReporter::Options options; + options.arbitrator = &arbitrator; + options.arbitratorStatsIntervalMs = 4'000; + PeriodicStatsReporter periodicReporter(options); + + periodicReporter.start(); + std::this_thread::sleep_for(std::chrono::milliseconds(2'000)); + // Stop right after sufficient wait to ensure the following reads from main + // thread does not trigger TSAN failures. + periodicReporter.stop(); + + const auto& counterMap = reporter_->counterMap; + ASSERT_EQ(counterMap.size(), 2); + ASSERT_EQ(counterMap.count(kMetricArbitratorFreeCapacityBytes.str()), 1); + ASSERT_EQ( + counterMap.count(kMetricArbitratorFreeReservedCapacityBytes.str()), 1); +} + +TEST_F(PeriodicStatsReporterTest, globalInstance) { + TestStatsReportMemoryArbitrator arbitrator({}); + PeriodicStatsReporter::Options options; + options.arbitrator = &arbitrator; + options.arbitratorStatsIntervalMs = 4'000; + VELOX_ASSERT_THROW( + stopPeriodicStatsReporter(), "No periodic stats reporter to stop."); + ASSERT_NO_THROW(startPeriodicStatsReporter(options)); + VELOX_ASSERT_THROW( + startPeriodicStatsReporter(options), + "The periodic stats reporter has already started."); + ASSERT_NO_THROW(stopPeriodicStatsReporter()); +} + // Registering to folly Singleton with intended reporter type folly::Singleton reporter([]() { return new TestReporter(); diff --git a/velox/common/caching/AsyncDataCache.h b/velox/common/caching/AsyncDataCache.h index dd09e560543fa..b77a05952c8d6 100644 --- a/velox/common/caching/AsyncDataCache.h +++ b/velox/common/caching/AsyncDataCache.h @@ -716,7 +716,7 @@ class AsyncDataCache : public memory::Cache { /// Returns snapshot of the aggregated stats from all shards and the stats of /// SSD cache if used. - CacheStats refreshStats() const; + virtual CacheStats refreshStats() const; /// If 'details' is true, returns the stats of the backing memory allocator /// and ssd cache. Otherwise, only returns the cache stats. 
diff --git a/velox/common/caching/SsdCache.cpp b/velox/common/caching/SsdCache.cpp
index 5b11fe89eebfc..d48eea9ae254a 100644
--- a/velox/common/caching/SsdCache.cpp
+++ b/velox/common/caching/SsdCache.cpp
@@ -60,7 +60,8 @@ SsdCache::SsdCache(
         i,
         fileMaxRegions,
         checkpointIntervalBytes / numShards,
-        disableFileCow));
+        disableFileCow,
+        executor_));
   }
 }
diff --git a/velox/common/caching/SsdCache.h b/velox/common/caching/SsdCache.h
index 2370bf00d68d4..20c5d6e0b87cf 100644
--- a/velox/common/caching/SsdCache.h
+++ b/velox/common/caching/SsdCache.h
@@ -105,6 +105,10 @@ class SsdCache {
 
   std::string toString() const;
 
+  const std::string& filePrefix() const {
+    return filePrefix_;
+  }
+
  private:
   const std::string filePrefix_;
   const int32_t numShards_;
diff --git a/velox/common/caching/SsdFile.cpp b/velox/common/caching/SsdFile.cpp
index 4c9483de2564f..e977f610ebe3a 100644
--- a/velox/common/caching/SsdFile.cpp
+++ b/velox/common/caching/SsdFile.cpp
@@ -135,6 +135,7 @@ SsdFile::SsdFile(
     folly::Executor* executor)
     : fileName_(filename),
       maxRegions_(maxRegions),
+      disableFileCow_(disableFileCow),
       shardId_(shardId),
       checkpointIntervalBytes_(checkpointIntervalBytes),
       executor_(executor) {
@@ -155,7 +156,7 @@ SsdFile::SsdFile(
       filename,
       folly::errnoStr(errno));
 
-  if (disableFileCow) {
+  if (disableFileCow_) {
     disableCow(fd_);
   }
 
@@ -346,6 +347,7 @@ bool SsdFile::growOrEvictLocked() {
 
   logEviction(candidates);
   clearRegionEntriesLocked(candidates);
+  stats_.regionsEvicted += candidates.size();
   writableRegions_ = std::move(candidates);
   suspended_ = false;
   return true;
@@ -531,6 +533,7 @@ void SsdFile::updateStats(SsdCacheStats& stats) const {
   }
   stats.entriesAgedOut += stats_.entriesAgedOut;
   stats.regionsAgedOut += stats_.regionsAgedOut;
+  stats.regionsEvicted += stats_.regionsEvicted;
   for (auto pins : regionPins_) {
     stats.numPins += pins;
   }
@@ -711,15 +714,6 @@ void SsdFile::checkpoint(bool force) {
   checkpointDeleted_ = false;
   bytesAfterCheckpoint_ = 0;
   try {
-    // We schedule the potentially long fsync of the cache file on another
-    // thread of the cache write executor, if available. If there is none, we do
-    // the sync on this thread at the end.
-    auto fileSync = std::make_shared<AsyncSource<int>>(
-        [fd = fd_]() { return std::make_unique<int>(::fsync(fd)); });
-    if (executor_ != nullptr) {
-      executor_->add([fileSync]() { fileSync->prepare(); });
-    }
-
     const auto checkRc = [&](int32_t rc, const std::string& errMsg) {
       if (rc < 0) {
         VELOX_FAIL("{} with rc {} :{}", errMsg, rc, folly::errnoStr(errno));
@@ -769,6 +763,15 @@ void SsdFile::checkpoint(bool force) {
       state.write(asChar(&offsetAndSize), sizeof(offsetAndSize));
     }
 
+    // We schedule the potentially long fsync of the cache file on another
+    // thread of the cache write executor, if available. If there is none, we do
+    // the sync on this thread at the end.
+    auto fileSync = std::make_shared<AsyncSource<int>>(
+        [fd = fd_]() { return std::make_unique<int>(::fsync(fd)); });
+    if (executor_ != nullptr) {
+      executor_->add([fileSync]() { fileSync->prepare(); });
+    }
+
     // NOTE: we need to ensure cache file data sync update completes before
     // updating checkpoint file.
     const auto fileSyncRc = fileSync->move();
@@ -790,6 +793,11 @@ void SsdFile::checkpoint(bool force) {
     const auto checkpointFd = checkRc(
         ::open(checkpointPath.c_str(), O_WRONLY),
         "Open of checkpoint file for sync");
+    // TODO: add this as file open option after we migrate to use velox
+    // filesystem for ssd file access.
+ if (disableFileCow_) { + disableCow(checkpointFd); + } VELOX_CHECK_GE(checkpointFd, 0); checkRc(::fsync(checkpointFd), "Sync of checkpoint file"); ::close(checkpointFd); @@ -822,6 +830,9 @@ void SsdFile::initializeCheckpoint() { } const auto logPath = fileName_ + kLogExtension; evictLogFd_ = ::open(logPath.c_str(), O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); + if (disableFileCow_) { + disableCow(evictLogFd_); + } if (evictLogFd_ < 0) { ++stats_.openLogErrors; // Failure to open the log at startup is a process terminating error. diff --git a/velox/common/caching/SsdFile.h b/velox/common/caching/SsdFile.h index 77f03534a6a14..9b027cf9cb604 100644 --- a/velox/common/caching/SsdFile.h +++ b/velox/common/caching/SsdFile.h @@ -138,6 +138,7 @@ struct SsdCacheStats { bytesCached = tsanAtomicValue(other.bytesCached); entriesAgedOut = tsanAtomicValue(other.entriesAgedOut); regionsAgedOut = tsanAtomicValue(other.regionsAgedOut); + regionsEvicted = tsanAtomicValue(other.regionsEvicted); numPins = tsanAtomicValue(other.numPins); openFileErrors = tsanAtomicValue(other.openFileErrors); @@ -162,6 +163,7 @@ struct SsdCacheStats { tsan_atomic bytesCached{0}; tsan_atomic entriesAgedOut{0}; tsan_atomic regionsAgedOut{0}; + tsan_atomic regionsEvicted{0}; tsan_atomic numPins{0}; tsan_atomic openFileErrors{0}; @@ -272,6 +274,11 @@ class SsdFile { /// Returns true if copy on write is disabled for this file. Used in testing. bool testingIsCowDisabled() const; + /// Return the SSD file path. + const std::string& fileName() const { + return fileName_; + } + private: // 4 first bytes of a checkpoint file. Allows distinguishing between format // versions. @@ -350,6 +357,9 @@ class SsdFile { // Maximum size of the backing file in kRegionSize units. const int32_t maxRegions_; + // True if copy on write should be disabled. + const bool disableFileCow_; + // Serializes access to all private data members. mutable std::shared_mutex mutex_; diff --git a/velox/common/encode/Base64.cpp b/velox/common/encode/Base64.cpp index 85fd843b86a83..4135935189baf 100644 --- a/velox/common/encode/Base64.cpp +++ b/velox/common/encode/Base64.cpp @@ -22,6 +22,13 @@ namespace facebook::velox::encoding { +// Constants defining the size in bytes of binary and encoded blocks for Base64 +// encoding. 
+// Size of a binary block in bytes (3 bytes = 24 bits)
+constexpr static int kBinaryBlockByteSize = 3;
+// Size of an encoded block in bytes (4 bytes = 24 bits)
+constexpr static int kEncodedBlockByteSize = 4;
+
 constexpr const Base64::Charset kBase64Charset = {
     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
     'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
@@ -298,10 +305,9 @@ std::string Base64::decode(folly::StringPiece encoded) {
 void Base64::decode(
     const std::pair<const char*, int32_t>& payload,
     std::string& output) {
-  size_t out_len = payload.second / 4 * 3;
-  output.resize(out_len, '\0');
-  out_len = Base64::decode(payload.first, payload.second, &output[0], out_len);
-  output.resize(out_len);
+  size_t inputSize = payload.second;
+  output.resize(calculateDecodedSize(payload.first, inputSize));
+  decode(payload.first, inputSize, output.data(), output.size());
 }
 
 // static
@@ -324,51 +330,50 @@ uint8_t Base64::Base64ReverseLookup(
 size_t
 Base64::decode(const char* src, size_t src_len, char* dst, size_t dst_len) {
-  return decodeImpl(src, src_len, dst, dst_len, kBase64ReverseIndexTable, true);
+  return decodeImpl(src, src_len, dst, dst_len, kBase64ReverseIndexTable);
 }
 
 // static
-size_t
-Base64::calculateDecodedSize(const char* data, size_t& size, bool withPadding) {
+size_t Base64::calculateDecodedSize(const char* data, size_t& size) {
   if (size == 0) {
     return 0;
   }
 
-  auto needed = (size / 4) * 3;
-  if (withPadding) {
-    // If the pad characters are included then the source string must be a
-    // multiple of 4 and we can query the end of the string to see how much
-    // padding exists.
-    if (size % 4 != 0) {
+  // Check if the input data is padded
+  if (isPadded(data, size)) {
+    // If padded, ensure that the string length is a multiple of the encoded
+    // block size
+    if (size % kEncodedBlockByteSize != 0) {
       throw Base64Exception(
           "Base64::decode() - invalid input string: "
-          "string length is not multiple of 4.");
+          "string length is not a multiple of 4.");
     }
 
+    auto needed = (size * kBinaryBlockByteSize) / kEncodedBlockByteSize;
     auto padding = countPadding(data, size);
     size -= padding;
-    return needed - padding;
+
+    // Adjust the needed size by deducting the bytes corresponding to the
+    // padding from the calculated size.
+    return needed -
+        ((padding * kBinaryBlockByteSize) + (kEncodedBlockByteSize - 1)) /
+        kEncodedBlockByteSize;
   }
+  // If not padded, calculate extra bytes, if any
+  auto extra = size % kEncodedBlockByteSize;
+  auto needed = (size / kEncodedBlockByteSize) * kBinaryBlockByteSize;
 
-  // If padding doesn't exist we need to calculate it from the size - if the
-  // size % 4 is 0 then we have an even multiple 3 byte chunks in the result
-  // if it is 2 then we need 1 more byte in the output. If it is 3 then we
-  // need 2 more bytes in the output. It should never be 1.
-  auto extra = size % 4;
+  // Adjust the needed size for extra bytes, if present
   if (extra) {
     if (extra == 1) {
       throw Base64Exception(
          "Base64::decode() - invalid input string: "
          "string length cannot be 1 more than a multiple of 4.");
    }
-    return needed + extra - 1;
+    needed += (extra * kBinaryBlockByteSize) / kEncodedBlockByteSize;
  }
 
-  // Just because we don't need the pad, doesn't mean it is not there. The
-  // URL decoder should be able to handle the original encoding.
-  auto padding = countPadding(data, size);
-  size -= padding;
-  return needed - padding;
+  return needed;
 }
 
 size_t Base64::decodeImpl(
@@ -376,13 +381,12 @@
     size_t src_len,
     char* dst,
     size_t dst_len,
-    const Base64::ReverseIndex& reverse_lookup,
-    bool include_pad) {
+    const ReverseIndex& reverse_lookup) {
   if (!src_len) {
     return 0;
   }
 
-  auto needed = calculateDecodedSize(src, src_len, include_pad);
+  auto needed = calculateDecodedSize(src, src_len);
   if (dst_len < needed) {
     throw Base64Exception(
         "Base64::decode() - invalid output string: "
@@ -437,9 +441,8 @@ void Base64::decodeUrl(
     const char* src,
     size_t src_len,
     char* dst,
-    size_t dst_len,
-    bool hasPad) {
-  decodeImpl(src, src_len, dst, dst_len, kBase64UrlReverseIndexTable, hasPad);
+    size_t dst_len) {
+  decodeImpl(src, src_len, dst, dst_len, kBase64UrlReverseIndexTable);
 }
 
 std::string Base64::decodeUrl(folly::StringPiece encoded) {
@@ -458,8 +461,7 @@ void Base64::decodeUrl(
       payload.second,
       &output[0],
       out_len,
-      kBase64UrlReverseIndexTable,
-      false);
+      kBase64UrlReverseIndexTable);
   output.resize(out_len);
 }
 } // namespace facebook::velox::encoding
diff --git a/velox/common/encode/Base64.h b/velox/common/encode/Base64.h
index 9888d97e67c54..2c7de463ea6fa 100644
--- a/velox/common/encode/Base64.h
+++ b/velox/common/encode/Base64.h
@@ -57,10 +57,9 @@ class Base64 {
 
   static std::string decode(folly::StringPiece encoded);
 
-  /// Returns decoded size for the specified input. Adjusts the 'size' to
-  /// subtract the length of the padding, if exists.
-  static size_t
-  calculateDecodedSize(const char* data, size_t& size, bool withPadding = true);
+  /// Returns the actual size of the decoded data. Also subtracts the length of
+  /// the padding, if any, from the input data 'size'.
+  static size_t calculateDecodedSize(const char* data, size_t& size);
 
   /// Decodes the specified number of characters from the 'data' and writes the
   /// result to the 'output'. The output must have enough space, e.g. as
@@ -69,7 +68,7 @@
 
   static void decode(
       const std::pair<const char*, int32_t>& payload,
-      std::string& outp);
+      std::string& output);
 
   /// Encodes the specified number of characters from the 'data' and writes the
   /// result to the 'output'. The output must have enough space, e.g. as
@@ -89,19 +88,24 @@
 
   static size_t
   decode(const char* src, size_t src_len, char* dst, size_t dst_len);
 
-  static void decodeUrl(
-      const char* src,
-      size_t src_len,
-      char* dst,
-      size_t dst_len,
-      bool pad);
+  static void
+  decodeUrl(const char* src, size_t src_len, char* dst, size_t dst_len);
 
   constexpr static char kBase64Pad = '=';
 
  private:
+  static inline bool isPadded(const char* data, size_t len) {
+    return (len > 0 && data[len - 1] == kBase64Pad);
+  }
+
   static inline size_t countPadding(const char* src, size_t len) {
-    DCHECK_GE(len, 2);
-    return src[len - 1] != kBase64Pad ? 0 : src[len - 2] != kBase64Pad ?
1 : 2;
+    size_t numPadding{0};
+    while (len > 0 && src[len - 1] == kBase64Pad) {
+      numPadding++;
+      len--;
+    }
+
+    return numPadding;
   }
 
   static uint8_t Base64ReverseLookup(char p, const ReverseIndex& table);
@@ -122,8 +126,7 @@
       size_t src_len,
       char* dst,
      size_t dst_len,
-      const ReverseIndex& table,
-      bool include_pad);
+      const ReverseIndex& table);
 };
 
 } // namespace facebook::velox::encoding
diff --git a/velox/common/encode/CMakeLists.txt b/velox/common/encode/CMakeLists.txt
index d9918d53b59c5..bc27527e14ace 100644
--- a/velox/common/encode/CMakeLists.txt
+++ b/velox/common/encode/CMakeLists.txt
@@ -12,5 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+if(${VELOX_BUILD_TESTING})
+  add_subdirectory(tests)
+endif()
+
 add_library(velox_encode Base64.cpp)
 target_link_libraries(velox_encode PUBLIC Folly::folly)
diff --git a/velox/common/encode/tests/Base64Test.cpp b/velox/common/encode/tests/Base64Test.cpp
new file mode 100644
index 0000000000000..15556583c7519
--- /dev/null
+++ b/velox/common/encode/tests/Base64Test.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/common/encode/Base64.h"
+#include <gtest/gtest.h>
+#include "velox/common/base/tests/GTestUtils.h"
+
+namespace facebook::velox::encoding {
+class Base64Test : public ::testing::Test {};
+
+TEST_F(Base64Test, fromBase64) {
+  EXPECT_EQ(
+      "Hello, World!",
+      Base64::decode(folly::StringPiece("SGVsbG8sIFdvcmxkIQ==")));
+  EXPECT_EQ(
+      "Base64 encoding is fun.",
+      Base64::decode(folly::StringPiece("QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4=")));
+  EXPECT_EQ(
+      "Simple text", Base64::decode(folly::StringPiece("U2ltcGxlIHRleHQ=")));
+  EXPECT_EQ(
+      "1234567890", Base64::decode(folly::StringPiece("MTIzNDU2Nzg5MA==")));
+
+  // Check encoded strings without padding
+  EXPECT_EQ(
+      "Hello, World!",
+      Base64::decode(folly::StringPiece("SGVsbG8sIFdvcmxkIQ")));
+  EXPECT_EQ(
+      "Base64 encoding is fun.",
+      Base64::decode(folly::StringPiece("QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4")));
+  EXPECT_EQ(
+      "Simple text", Base64::decode(folly::StringPiece("U2ltcGxlIHRleHQ")));
+  EXPECT_EQ("1234567890", Base64::decode(folly::StringPiece("MTIzNDU2Nzg5MA")));
+}
+
+TEST_F(Base64Test, calculateDecodedSizeProperSize) {
+  size_t encoded_size{0};
+
+  encoded_size = 20;
+  EXPECT_EQ(
+      13, Base64::calculateDecodedSize("SGVsbG8sIFdvcmxkIQ==", encoded_size));
+  EXPECT_EQ(18, encoded_size);
+
+  encoded_size = 18;
+  EXPECT_EQ(
+      13, Base64::calculateDecodedSize("SGVsbG8sIFdvcmxkIQ", encoded_size));
+  EXPECT_EQ(18, encoded_size);
+
+  encoded_size = 21;
+  EXPECT_THROW(
+      Base64::calculateDecodedSize("SGVsbG8sIFdvcmxkIQ==", encoded_size),
+      facebook::velox::encoding::Base64Exception);
+
+  encoded_size = 32;
+  EXPECT_EQ(
+      23,
+      Base64::calculateDecodedSize(
+          "QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4=", encoded_size));
+  EXPECT_EQ(31, encoded_size);
+
+  encoded_size = 31;
+  EXPECT_EQ(
+      23,
+      Base64::calculateDecodedSize(
"QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4", encoded_size)); + EXPECT_EQ(31, encoded_size); + + encoded_size = 16; + EXPECT_EQ(10, Base64::calculateDecodedSize("MTIzNDU2Nzg5MA==", encoded_size)); + EXPECT_EQ(14, encoded_size); + + encoded_size = 14; + EXPECT_EQ(10, Base64::calculateDecodedSize("MTIzNDU2Nzg5MA", encoded_size)); + EXPECT_EQ(14, encoded_size); +} + +} // namespace facebook::velox::encoding diff --git a/velox/expression/tests/utils/CMakeLists.txt b/velox/common/encode/tests/CMakeLists.txt similarity index 70% rename from velox/expression/tests/utils/CMakeLists.txt rename to velox/common/encode/tests/CMakeLists.txt index afdae1b2789d1..e3268cb7f1b96 100644 --- a/velox/expression/tests/utils/CMakeLists.txt +++ b/velox/common/encode/tests/CMakeLists.txt @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_expression_test_utility ArgumentTypeFuzzer.cpp - FuzzerToolkit.cpp) - -target_link_libraries(velox_expression_test_utility velox_type - velox_expression_functions gtest) +add_executable(velox_common_encode_test Base64Test.cpp) +add_test(velox_common_encode_test velox_common_encode_test) +target_link_libraries( + velox_common_encode_test + PUBLIC Folly::folly + PRIVATE velox_encode velox_exception gtest gtest_main) diff --git a/velox/common/memory/MemoryAllocator.cpp b/velox/common/memory/MemoryAllocator.cpp index 3995b18419299..e4dd46457b3a4 100644 --- a/velox/common/memory/MemoryAllocator.cpp +++ b/velox/common/memory/MemoryAllocator.cpp @@ -364,15 +364,18 @@ std::string Stats::toString() const { std::stringstream out; int64_t totalClocks = 0; int64_t totalBytes = 0; + int64_t totalAllocations = 0; for (auto i = 0; i < sizes.size(); ++i) { totalClocks += sizes[i].clocks(); totalBytes += sizes[i].totalBytes; + totalAllocations += sizes[i].numAllocations; } out << fmt::format( - "Alloc: {}MB {} Gigaclocks, {}MB advised\n", + "Alloc: {}MB {} Gigaclocks {} Allocations, {}MB advised\n", totalBytes >> 20, totalClocks >> 30, - numAdvise >> 8); + numAdvise >> 8, + totalAllocations); // Sort the size classes by decreasing clocks. 
 std::vector<int32_t> indices(sizes.size());
@@ -386,10 +389,11 @@
       break;
     }
     out << fmt::format(
-        "Size {}K: {}MB {} Megaclocks\n",
+        "Size {}K: {}MB {} Megaclocks {} Allocations\n",
         sizes[i].size * 4,
         sizes[i].totalBytes >> 20,
-        sizes[i].clocks() >> 20);
+        sizes[i].clocks() >> 20,
+        sizes[i].numAllocations);
   }
   return out.str();
 }
diff --git a/velox/common/memory/SharedArbitrator.cpp b/velox/common/memory/SharedArbitrator.cpp
index 01bd3802f6dea..ffd31daa52ca8 100644
--- a/velox/common/memory/SharedArbitrator.cpp
+++ b/velox/common/memory/SharedArbitrator.cpp
@@ -150,7 +150,6 @@ SharedArbitrator::SharedArbitrator(const MemoryArbitrator::Config& config)
       freeReservedCapacity_(reservedCapacity_),
       freeNonReservedCapacity_(capacity_ - freeReservedCapacity_) {
   VELOX_CHECK_EQ(kind_, config.kind);
-  updateFreeCapacityMetrics();
 }
 
 std::string SharedArbitrator::Candidate::toString() const {
@@ -192,14 +191,6 @@ std::vector<SharedArbitrator::Candidate> SharedArbitrator::getCandidateStats(
   return candidates;
 }
 
-void SharedArbitrator::updateFreeCapacityMetrics() const {
-  RECORD_METRIC_VALUE(
-      kMetricArbitratorFreeCapacityBytes,
-      freeNonReservedCapacity_ + freeReservedCapacity_);
-  RECORD_METRIC_VALUE(
-      kMetricArbitratorFreeReservedCapacityBytes, freeReservedCapacity_);
-}
-
 int64_t SharedArbitrator::maxReclaimableCapacity(const MemoryPool& pool) const {
   return std::max(0, pool.capacity() - memoryPoolReservedCapacity_);
 }
@@ -226,8 +217,6 @@ int64_t SharedArbitrator::minGrowCapacity(const MemoryPool& pool) const {
 uint64_t SharedArbitrator::growCapacity(
     MemoryPool* pool,
     uint64_t targetBytes) {
-  const auto freeCapacityMetricUpdateCb =
-      folly::makeGuard([this]() { updateFreeCapacityMetrics(); });
   uint64_t reservedBytes{0};
   {
     std::lock_guard<std::mutex> l(mutex_);
@@ -276,9 +265,6 @@ uint64_t SharedArbitrator::decrementFreeCapacityLocked(
 uint64_t SharedArbitrator::shrinkCapacity(
     MemoryPool* pool,
     uint64_t targetBytes) {
-  const auto freeCapacityUpdateCb =
-      folly::makeGuard([this]() { updateFreeCapacityMetrics(); });
-
   uint64_t freedBytes{0};
   {
     std::lock_guard<std::mutex> l(mutex_);
@@ -294,9 +280,6 @@ uint64_t SharedArbitrator::shrinkCapacity(
     uint64_t targetBytes,
     bool allowSpill,
     bool allowAbort) {
-  const auto freeCapacityUpdateCb =
-      folly::makeGuard([this]() { updateFreeCapacityMetrics(); });
-
   ScopedArbitration scopedArbitration(this);
   if (targetBytes == 0) {
     targetBytes = capacity_;
@@ -345,9 +328,6 @@ bool SharedArbitrator::growCapacity(
     MemoryPool* pool,
     const std::vector<std::shared_ptr<MemoryPool>>& candidatePools,
     uint64_t targetBytes) {
-  const auto freeCapacityUpdateCb =
-      folly::makeGuard([this]() { updateFreeCapacityMetrics(); });
-
   ScopedArbitration scopedArbitration(pool, this);
   MemoryPool* requestor = pool->root();
   if (requestor->aborted()) {
diff --git a/velox/common/memory/SharedArbitrator.h b/velox/common/memory/SharedArbitrator.h
index 50dc8c015d188..865e44ac269a3 100644
--- a/velox/common/memory/SharedArbitrator.h
+++ b/velox/common/memory/SharedArbitrator.h
@@ -239,11 +239,6 @@ class SharedArbitrator : public memory::MemoryArbitrator {
   // the reserved capacity as specified by 'memoryPoolReservedCapacity_'.
   int64_t minGrowCapacity(const MemoryPool& pool) const;
 
-  // Updates the free capacity metrics on capacity changes.
-  //
-  // TODO: move this update to velox runtime monitoring service once available.
-  void updateFreeCapacityMetrics() const;
-
   mutable std::mutex mutex_;
   tsan_atomic<uint64_t> freeReservedCapacity_{0};
   tsan_atomic<uint64_t> freeNonReservedCapacity_{0};
diff --git a/velox/common/memory/tests/MemoryAllocatorTest.cpp b/velox/common/memory/tests/MemoryAllocatorTest.cpp
index 86133a88e2bf3..4bfff3f4ffada 100644
--- a/velox/common/memory/tests/MemoryAllocatorTest.cpp
+++ b/velox/common/memory/tests/MemoryAllocatorTest.cpp
@@ -632,10 +632,42 @@ TEST_P(MemoryAllocatorTest, allocationClass2) {
   allocation->clear();
 }
 
+TEST_P(MemoryAllocatorTest, stats) {
+  const std::vector<MachinePageCount>& sizes = instance_->sizeClasses();
+  MachinePageCount capacity = kCapacityPages;
+  for (auto i = 0; i < sizes.size(); ++i) {
+    std::unique_ptr<Allocation> allocation = std::make_unique<Allocation>();
+    auto size = sizes[i];
+    ASSERT_TRUE(allocate(size, *allocation));
+    ASSERT_GT(instance_->numAllocated(), 0);
+    instance_->freeNonContiguous(*allocation);
+    auto stats = instance_->stats();
+    ASSERT_EQ(0, stats.sizes[i].clocks());
+    ASSERT_EQ(stats.sizes[i].totalBytes, 0);
+    ASSERT_EQ(stats.sizes[i].numAllocations, 0);
+  }
+
+  gflags::FlagSaver flagSaver;
+  FLAGS_velox_time_allocations = true;
+  for (auto i = 0; i < sizes.size(); ++i) {
+    std::unique_ptr<Allocation> allocation = std::make_unique<Allocation>();
+    auto size = sizes[i];
+    ASSERT_TRUE(allocate(size, *allocation));
+    ASSERT_GT(instance_->numAllocated(), 0);
+    instance_->freeNonContiguous(*allocation);
+    auto stats = instance_->stats();
+    ASSERT_LT(0, stats.sizes[i].clocks());
+    ASSERT_GE(stats.sizes[i].totalBytes, size * AllocationTraits::kPageSize);
+    ASSERT_GE(stats.sizes[i].numAllocations, 1);
+  }
+}
+
 TEST_P(MemoryAllocatorTest, singleAllocation) {
   if (!useMmap_ && enableReservation_) {
     return;
   }
+
+  gflags::FlagSaver flagSaver;
+  FLAGS_velox_time_allocations = true;
   const std::vector<MachinePageCount>& sizes = instance_->sizeClasses();
   MachinePageCount capacity = kCapacityPages;
   for (auto i = 0; i < sizes.size(); ++i) {
diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp
index 2fadb1fd8acdd..d8cfe275fd71a 100644
--- a/velox/connectors/hive/HiveConnectorUtil.cpp
+++ b/velox/connectors/hive/HiveConnectorUtil.cpp
@@ -436,12 +436,16 @@ std::unique_ptr<dwio::common::SerDeOptions> parseSerdeParameters(
   auto mapKeyIt =
       serdeParameters.find(dwio::common::SerDeOptions::kMapKeyDelim);
 
+  auto escapeCharIt =
+      serdeParameters.find(dwio::common::SerDeOptions::kEscapeChar);
+
   auto nullStringIt = tableParameters.find(
       dwio::common::TableParameter::kSerializationNullFormat);
 
   if (fieldIt == serdeParameters.end() &&
       collectionIt == serdeParameters.end() &&
       mapKeyIt == serdeParameters.end() &&
+      escapeCharIt == serdeParameters.end() &&
       nullStringIt == tableParameters.end()) {
     return nullptr;
   }
@@ -458,8 +462,19 @@
   if (mapKeyIt != serdeParameters.end()) {
     mapKeyDelim = parseDelimiter(mapKeyIt->second);
   }
-  auto serDeOptions = std::make_unique<dwio::common::SerDeOptions>(
-      fieldDelim, collectionDelim, mapKeyDelim);
+
+  uint8_t escapeChar;
+  bool hasEscapeChar = false;
+  if (escapeCharIt != serdeParameters.end() && !escapeCharIt->second.empty()) {
+    hasEscapeChar = true;
+    escapeChar = escapeCharIt->second[0];
+  }
+
+  auto serDeOptions = hasEscapeChar
+      ?
std::make_unique<dwio::common::SerDeOptions>(
+          fieldDelim, collectionDelim, mapKeyDelim, escapeChar, true)
+      : std::make_unique<dwio::common::SerDeOptions>(
+            fieldDelim, collectionDelim, mapKeyDelim);
   if (nullStringIt != tableParameters.end()) {
     serDeOptions->nullString = nullStringIt->second;
   }
@@ -553,7 +568,10 @@ void configureRowReaderOptions(
   } else {
     cs = std::make_shared<dwio::common::ColumnSelector>(rowType, columnNames);
   }
-  rowReaderOptions.select(cs).range(hiveSplit->start, hiveSplit->length);
+  rowReaderOptions.select(cs);
+  if (hiveSplit) {
+    rowReaderOptions.range(hiveSplit->start, hiveSplit->length);
+  }
 }

 namespace {
diff --git a/velox/core/Config.cpp b/velox/core/Config.cpp
index 8ccd7a8ccbd28..4465bca74f8ed 100644
--- a/velox/core/Config.cpp
+++ b/velox/core/Config.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 #include "velox/core/Config.h"
+#include "velox/core/QueryConfig.h"
+#include "velox/type/tz/TimeZoneMap.h"

 namespace facebook::velox::core {

@@ -46,4 +48,11 @@ bool MemConfigMutable::isValueExists(const std::string& key) const {
   return lockedValues->find(key) != lockedValues->end();
 }

+void MemConfig::validateConfig() {
+  // Validate that the timezone name can be recognized.
+  if (isValueExists(QueryConfig::kSessionTimezone)) {
+    util::getTimeZoneID(values_[QueryConfig::kSessionTimezone]);
+  }
+}
+
 } // namespace facebook::velox::core
diff --git a/velox/core/Config.h b/velox/core/Config.h
index 11ccea060588d..2dc705e937363 100644
--- a/velox/core/Config.h
+++ b/velox/core/Config.h
@@ -70,12 +70,16 @@ namespace core {
 class MemConfig : public Config {
  public:
   explicit MemConfig(const std::unordered_map<std::string, std::string>& values)
-      : values_(values) {}
+      : values_(values) {
+    validateConfig();
+  }

   explicit MemConfig() : values_{} {}

   explicit MemConfig(std::unordered_map<std::string, std::string>&& values)
-      : values_(std::move(values)) {}
+      : values_(std::move(values)) {
+    validateConfig();
+  }

   folly::Optional<std::string> get(const std::string& key) const override;

@@ -90,6 +94,9 @@ class MemConfig : public Config {
   }

  private:
+  // Validates that the configuration values are valid.
+  void validateConfig();
+
   std::unordered_map<std::string, std::string> values_;
 };

diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h
index c14a596322566..dfba399d967e7 100644
--- a/velox/core/PlanNode.h
+++ b/velox/core/PlanNode.h
@@ -158,7 +158,7 @@ class PlanNode : public ISerializable {
   /// 'addContext' is not null.
   ///
   /// @param addContext Optional lambda to add context for a given plan node.
-  /// Receives plan node ID, indentation and std::stringstring where to append
+  /// Receives plan node ID, indentation and std::stringstream where to append
   /// the context. Use indentation for second and subsequent lines of a
   /// multi-line context. Do not use indentation for single-line context. Do not
   /// add trailing new-line character for the last or only line of context.
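For illustration, the eager validation added to MemConfig above means an invalid session timezone now fails when the config object is constructed rather than later during query execution. A minimal sketch of the resulting behavior (the function and the bogus timezone name are illustrative, not part of the patch):

    #include "velox/common/base/Exceptions.h"
    #include "velox/core/Config.h"
    #include "velox/core/QueryConfig.h"

    using namespace facebook::velox;

    void sessionTimezoneValidationSketch() {
      // A recognized timezone passes validateConfig() inside the constructor.
      core::MemConfig ok({{core::QueryConfig::kSessionTimezone, "UTC"}});

      // An unknown name makes util::getTimeZoneID() throw a Velox user error
      // ("Unknown time zone: ...") before the config can be used anywhere.
      try {
        core::MemConfig bad({{core::QueryConfig::kSessionTimezone, "Not/AZone"}});
      } catch (const VeloxUserError&) {
        // Construction failed eagerly; the invalid value never reaches a query.
      }
    }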
diff --git a/velox/core/SimpleFunctionMetadata.h b/velox/core/SimpleFunctionMetadata.h index ccf80e43c3c3b..9fcb0ad60ac3e 100644 --- a/velox/core/SimpleFunctionMetadata.h +++ b/velox/core/SimpleFunctionMetadata.h @@ -20,6 +20,7 @@ #include #include "velox/common/base/Exceptions.h" +#include "velox/common/base/Status.h" #include "velox/core/CoreTypeSystem.h" #include "velox/core/Metaprogramming.h" #include "velox/core/QueryConfig.h" @@ -687,17 +688,33 @@ class UDFHolder { bool, exec_return_type, const exec_arg_type&...>::value; + static constexpr bool udf_has_call_return_void = util::has_method< Fun, call_method_resolver, void, exec_return_type, const exec_arg_type&...>::value; - static constexpr bool udf_has_call = - udf_has_call_return_bool | udf_has_call_return_void; + + static constexpr bool udf_has_call_return_status = util::has_method< + Fun, + call_method_resolver, + Status, + exec_return_type, + const exec_arg_type&...>::value; + + static constexpr bool udf_has_call = udf_has_call_return_bool | + udf_has_call_return_void | udf_has_call_return_status; + static_assert( !(udf_has_call_return_bool && udf_has_call_return_void), - "Provided call() methods need to return either void OR bool."); + "Provided call() methods need to return either void OR bool OR status."); + static_assert( + !(udf_has_call_return_bool && udf_has_call_return_status), + "Provided call() methods need to return either void OR bool OR status."); + static_assert( + !(udf_has_call_return_void && udf_has_call_return_status), + "Provided call() methods need to return either void OR bool OR status."); // callNullable(): static constexpr bool udf_has_callNullable_return_bool = util::has_method< @@ -863,13 +880,14 @@ class UDFHolder { } } - FOLLY_ALWAYS_INLINE bool call( + FOLLY_ALWAYS_INLINE Status call( exec_return_type& out, + bool& notNull, const typename exec_resolver::in_type&... args) { if constexpr (udf_has_call) { - return callImpl(out, args...); + return callImpl(out, notNull, args...); } else if constexpr (udf_has_callNullable) { - return callNullableImpl(out, (&args)...); + return callNullableImpl(out, notNull, (&args)...); } else { VELOX_UNREACHABLE( "call should never be called if the UDF does not " @@ -877,18 +895,20 @@ class UDFHolder { } } - FOLLY_ALWAYS_INLINE bool callNullable( + FOLLY_ALWAYS_INLINE Status callNullable( exec_return_type& out, + bool& notNull, const typename exec_resolver::in_type*... args) { if constexpr (udf_has_callNullable) { - return callNullableImpl(out, args...); + return callNullableImpl(out, notNull, args...); } else if constexpr (udf_has_call) { // Default null behavior. const bool isAllSet = (args && ...); if (LIKELY(isAllSet)) { - return callImpl(out, (*args)...); + return callImpl(out, notNull, (*args)...); } else { - return false; + notNull = false; + return Status::OK(); } } else { VELOX_UNREACHABLE( @@ -897,21 +917,23 @@ class UDFHolder { } } - FOLLY_ALWAYS_INLINE bool callAscii( + FOLLY_ALWAYS_INLINE Status callAscii( exec_return_type& out, + bool& notNull, const typename exec_resolver::in_type&... args) { if constexpr (udf_has_callAscii) { - return callAsciiImpl(out, args...); + return callAsciiImpl(out, notNull, args...); } else { - return call(out, args...); + return call(out, notNull, args...); } } - FOLLY_ALWAYS_INLINE bool callNullFree( + FOLLY_ALWAYS_INLINE Status callNullFree( exec_return_type& out, + bool& notNull, const exec_no_nulls_arg_type&... 
args) { if constexpr (udf_has_callNullFree) { - return callNullFreeImpl(out, args...); + return callNullFreeImpl(out, notNull, args...); } else { VELOX_UNREACHABLE( "callNullFree should never be called if the UDF does not implement callNullFree."); @@ -920,52 +942,66 @@ class UDFHolder { // Helper functions to handle void vs bool return type. - FOLLY_ALWAYS_INLINE bool callImpl( + FOLLY_ALWAYS_INLINE Status callImpl( typename Exec::template resolver::out_type& out, + bool& notNull, const typename Exec::template resolver::in_type&... args) { static_assert(udf_has_call); - if constexpr (udf_has_call_return_bool) { + + if constexpr (udf_has_call_return_status) { + notNull = true; return instance_.call(out, args...); + } else if constexpr (udf_has_call_return_bool) { + notNull = instance_.call(out, args...); + return Status::OK(); } else { instance_.call(out, args...); - return true; + notNull = true; + return Status::OK(); } } - FOLLY_ALWAYS_INLINE bool callNullableImpl( + FOLLY_ALWAYS_INLINE Status callNullableImpl( exec_return_type& out, + bool& notNull, const typename Exec::template resolver::in_type*... args) { static_assert(udf_has_callNullable); if constexpr (udf_has_callNullable_return_bool) { - return instance_.callNullable(out, args...); + notNull = instance_.callNullable(out, args...); + return Status::OK(); } else { instance_.callNullable(out, args...); - return true; + notNull = true; + return Status::OK(); } } - FOLLY_ALWAYS_INLINE bool callAsciiImpl( + FOLLY_ALWAYS_INLINE Status callAsciiImpl( typename Exec::template resolver::out_type& out, + bool& notNull, const typename Exec::template resolver::in_type&... args) { static_assert(udf_has_callAscii); if constexpr (udf_has_callAscii_return_bool) { - return instance_.callAscii(out, args...); + notNull = instance_.callAscii(out, args...); } else { instance_.callAscii(out, args...); - return true; + notNull = true; } + return Status::OK(); } - FOLLY_ALWAYS_INLINE bool callNullFreeImpl( + FOLLY_ALWAYS_INLINE Status callNullFreeImpl( typename Exec::template resolver::out_type& out, + bool& notNull, const exec_no_nulls_arg_type&... 
args) { static_assert(udf_has_callNullFree); if constexpr (udf_has_callNullFree_return_bool) { - return instance_.callNullFree(out, args...); + notNull = instance_.callNullFree(out, args...); } else { instance_.callNullFree(out, args...); - return true; + notNull = true; } + return Status::OK(); } }; diff --git a/velox/core/tests/QueryConfigTest.cpp b/velox/core/tests/QueryConfigTest.cpp index 67e87fe5cd349..9955bbf34cef1 100644 --- a/velox/core/tests/QueryConfigTest.cpp +++ b/velox/core/tests/QueryConfigTest.cpp @@ -46,8 +46,23 @@ TEST_F(QueryConfigTest, setConfig) { ASSERT_TRUE(config.isLegacyCast()); } +TEST_F(QueryConfigTest, invalidConfig) { + std::unordered_map configData( + {{QueryConfig::kSessionTimezone, "Invalid"}}); + VELOX_ASSERT_USER_THROW( + std::make_shared(nullptr, std::move(configData)), + "Unknown time zone: 'Invalid'"); + + auto queryCtx = std::make_shared(nullptr); + VELOX_ASSERT_USER_THROW( + queryCtx->testingOverrideConfigUnsafe({ + {core::QueryConfig::kSessionTimezone, ""}, + }), + "Unknown time zone: ''"); +} + TEST_F(QueryConfigTest, memConfig) { - const std::string tz = "timezone1"; + const std::string tz = "UTC"; const std::unordered_map configData( {{QueryConfig::kSessionTimezone, tz}}); @@ -72,7 +87,7 @@ TEST_F(QueryConfigTest, memConfig) { tz, cfg.Config::get(QueryConfig::kSessionTimezone).value()); ASSERT_FALSE(cfg.Config::get("missing-entry").has_value()); - const std::string tz2 = "timezone2"; + const std::string tz2 = "PST"; ASSERT_NO_THROW(cfg.setValue(QueryConfig::kSessionTimezone, tz2)); ASSERT_EQ( tz2, diff --git a/velox/docs/develop/testing/fuzzer.rst b/velox/docs/develop/testing/fuzzer.rst index 9aa7630f07a4b..639acda6ee5e5 100644 --- a/velox/docs/develop/testing/fuzzer.rst +++ b/velox/docs/develop/testing/fuzzer.rst @@ -141,6 +141,8 @@ tested: Total aggregations verified against DuckDB: 2537 (44.63%) Total failed aggregations: 1061 (18.67%) +.. _window-fuzzer: + Window Fuzzer ------------- @@ -284,7 +286,7 @@ When Fuzzer test fails, a seed number and the evaluated expression are printed to the log. An example is given below. Developers can use ``--seed`` with this seed number to rerun the exact same expression with the same inputs, and use a debugger to investigate the issue. For the example below, the command -to reproduce the error would be ``velox/expression/tests/velox_expression_fuzzer_test --seed 1188545576``. +to reproduce the error would be ``velox/expression/fuzzer/velox_expression_fuzzer_test --seed 1188545576``. :: diff --git a/velox/docs/develop/testing/join-fuzzer.rst b/velox/docs/develop/testing/join-fuzzer.rst index be7d61a467bfc..1bbfbfc7df41b 100644 --- a/velox/docs/develop/testing/join-fuzzer.rst +++ b/velox/docs/develop/testing/join-fuzzer.rst @@ -42,7 +42,7 @@ Use velox_join_fuzzer_test binary to run join fuzzer: velox/exec/tests/velox_join_fuzzer_test -By default, the fuzzer will go through 10 interations. Use --steps +By default, the fuzzer will go through 10 iterations. Use --steps or --duration-sec flag to run fuzzer for longer. Use --seed to reproduce fuzzer failures. 
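The UDFHolder changes above add a third accepted return type for a simple function's call() methods: Status, alongside the existing void and bool. On success the output is treated as non-null, and a non-OK Status reports a row-level error without throwing. A minimal sketch of a simple function using the new signature (the function itself is illustrative, not part of the patch):

    #include <limits>
    #include "velox/functions/Udf.h"

    namespace facebook::velox {

    template <typename TExec>
    struct CheckedNegateFunction {
      VELOX_DEFINE_FUNCTION_TYPES(TExec);

      // Status-returning call(): 'result' is considered set and non-null
      // whenever Status::OK() is returned.
      Status call(out_type<int64_t>& result, const arg_type<int64_t>& input) {
        if (input == std::numeric_limits<int64_t>::min()) {
          return Status::UserError("Cannot negate the minimum int64 value");
        }
        result = -input;
        return Status::OK();
      }
    };

    } // namespace facebook::velox

Registration is unchanged, e.g. registerFunction<CheckedNegateFunction, int64_t, int64_t>({"checked_negate"}) with a name of your choosing.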
diff --git a/velox/docs/develop/testing/row-number-fuzzer.rst b/velox/docs/develop/testing/row-number-fuzzer.rst
new file mode 100644
index 0000000000000..6f304a50f72b8
--- /dev/null
+++ b/velox/docs/develop/testing/row-number-fuzzer.rst
@@ -0,0 +1,55 @@
+================
+RowNumber Fuzzer
+================
+
+The RowNumberFuzzer is a testing tool that automatically generates equivalent query plans and then executes these plans
+to validate the consistency of the results. It works as follows:
+
+1. Data Generation: It starts by generating a random set of input data, also known as a vector. This data can
+   have a variety of encodings and data layouts to ensure thorough testing.
+2. Plan Generation: Generate two equivalent query plans: one is row-number over ValuesNode as the base plan,
+   and the other is row-number over TableScanNode as the alternate plan.
+3. Query Execution: Executes those equivalent query plans using the generated data and asserts that the results are
+   consistent across different plans.
+   i. Execute the base plan, compare the result with the reference (DuckDB or Presto) and use it as the expected result.
+   #. Execute the alternate plan multiple times with and without spill, and compare each result with the
+      expected result.
+4. Iteration: This process is repeated multiple times to ensure reliability and robustness.
+
+How to run
+----------
+
+Use the velox_row_number_fuzzer_test binary to run the row-number fuzzer:
+
+::
+
+    velox/exec/tests/velox_row_number_fuzzer_test --seed 123 --duration_sec 60
+
+By default, the fuzzer will go through 10 iterations. Use --steps
+or --duration_sec flag to run the fuzzer for longer. Use --seed to
+reproduce fuzzer failures.
+
+Here is a full list of supported command line arguments.
+
+* ``--steps``: How many iterations to run. Each iteration generates and
+  evaluates one expression or aggregation. Default is 10.
+
+* ``--duration_sec``: For how long to run in seconds. If both ``--steps``
+  and ``--duration_sec`` are specified, ``--duration_sec`` takes precedence.
+
+* ``--seed``: The seed to generate random expressions and input vectors with.
+
+* ``--v=1``: Verbose logging (from `Google Logging Library `_).
+
+* ``--batch_size``: The size of input vectors to generate. Default is 100.
+
+* ``--num_batches``: The number of input vectors of size `--batch_size` to
+  generate. Default is 5.
+
+* ``--enable_spill``: Whether to test with spilling or not. Default is true.
+
+* ``--presto_url``: The PrestoQueryRunner URL along with its port number.
+
+* ``--req_timeout_ms``: Timeout in milliseconds of an HTTP request to the PrestoQueryRunner.
+
+If running from CLion IDE, add ``--logtostderr=1`` to see the full output.
diff --git a/velox/docs/functions/presto/binary.rst b/velox/docs/functions/presto/binary.rst
index e1d3615c7283e..52cf9969f0a3c 100644
--- a/velox/docs/functions/presto/binary.rst
+++ b/velox/docs/functions/presto/binary.rst
@@ -8,7 +8,26 @@ Binary Functions

 .. function:: from_base64(string) -> varbinary

-    Decodes binary data from the base64 encoded ``string``.
+    Decodes a Base64-encoded ``string`` back into its original binary form.
+    This function is capable of handling both fully padded and non-padded Base64 encoded strings.
+    Partially padded Base64 strings are not supported and will result in an error.
+
+    Examples
+    --------
+    Query with padded Base64 string:
+    ::
+        SELECT from_base64('SGVsbG8gV29ybGQ='); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]
+
+    Query with non-padded Base64 string:
+    ::
+        SELECT from_base64('SGVsbG8gV29ybGQ'); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]
+
+    Query with a partially padded Base64 string:
+    ::
+        SELECT from_base64('SGVsbG8gV29ybGQgZm9yIHZlbG94IQ='); -- Error : Base64::decode() - invalid input string: string length is not a multiple of 4.
+
+    In the above examples, both the fully padded and non-padded Base64 strings ('SGVsbG8gV29ybGQ=' and 'SGVsbG8gV29ybGQ') decode to the binary representation of the text 'Hello World'.
+    The partially padded Base64 string 'SGVsbG8gV29ybGQgZm9yIHZlbG94IQ=', however, results in a Velox error.

 .. function:: from_base64url(string) -> varbinary
diff --git a/velox/docs/functions/spark/datetime.rst b/velox/docs/functions/spark/datetime.rst
index 5472b5a2c8dbf..708a3349b93b1 100644
--- a/velox/docs/functions/spark/datetime.rst
+++ b/velox/docs/functions/spark/datetime.rst
@@ -217,10 +217,24 @@ These functions support TIMESTAMP and DATE input types.

 .. spark:function:: second(timestamp) -> integer

-    Returns the seconds of ``timestamp``.::
+    Returns the seconds of ``timestamp``. ::

         SELECT second('2009-07-30 12:58:59'); -- 59

+.. spark:function:: timestamp_micros(x) -> timestamp
+
+    Returns timestamp from the number of microseconds since UTC epoch.
+    Supported types are: TINYINT, SMALLINT, INTEGER and BIGINT. ::
+
+        SELECT timestamp_micros(1230219000123123); -- '2008-12-25 15:30:00.123123'
+
+.. spark:function:: timestamp_millis(x) -> timestamp
+
+    Returns timestamp from the number of milliseconds since UTC epoch.
+    Supported types are: TINYINT, SMALLINT, INTEGER and BIGINT. ::
+
+        SELECT timestamp_millis(1230219000123); -- '2008-12-25 15:30:00.123'
+
 .. spark:function:: to_unix_timestamp(string) -> integer

     Alias for ``unix_timestamp(string) -> integer``.
@@ -238,12 +252,31 @@ These functions support TIMESTAMP and DATE input types.

 .. spark:function:: unix_date(date) -> integer

-    Returns the number of days since 1970-01-01.::
+    Returns the number of days since 1970-01-01. ::

         SELECT unix_date('1970-01-01'); -- '0'
         SELECT unix_date('1970-01-02'); -- '1'
         SELECT unix_date('1969-12-31'); -- '-1'

+.. spark:function:: unix_micros(timestamp) -> bigint
+
+    Returns the number of microseconds since 1970-01-01 00:00:00 UTC. ::
+
+        SELECT unix_micros('1970-01-01 00:00:01'); -- 1000000
+
+.. spark:function:: unix_millis(timestamp) -> bigint
+
+    Returns the number of milliseconds since 1970-01-01 00:00:00 UTC. Truncates
+    higher levels of precision. ::
+
+        SELECT unix_millis('1970-01-01 00:00:01'); -- 1000
+
+.. spark:function:: unix_seconds(timestamp) -> bigint
+
+    Returns the number of seconds since 1970-01-01 00:00:00 UTC. ::
+
+        SELECT unix_seconds('1970-01-01 00:00:01'); -- 1
+
 .. spark:function:: unix_timestamp() -> integer

     Returns the current UNIX timestamp in seconds.
@@ -272,7 +305,7 @@ These functions support TIMESTAMP and DATE input types.

 .. function:: weekday(date) -> integer

-    Returns the day of the week for date (0 = Monday, 1 = Tuesday, …, 6 = Sunday).::
+    Returns the day of the week for date (0 = Monday, 1 = Tuesday, …, 6 = Sunday).
:: SELECT weekday('2015-04-08'); -- 2 SELECT weekday('2024-02-10'); -- 5 diff --git a/velox/docs/monthly-updates.rst b/velox/docs/monthly-updates.rst index 5945ec03d90a6..a34d51fedcff9 100644 --- a/velox/docs/monthly-updates.rst +++ b/velox/docs/monthly-updates.rst @@ -5,6 +5,9 @@ Monthly Updates .. toctree:: :maxdepth: 1 + monthly-updates/april-2024 + monthly-updates/march-2024 + monthly-updates/february-2024 monthly-updates/january-2024 monthly-updates/2023/index monthly-updates/2022/index diff --git a/velox/docs/monthly-updates/april-2024.rst b/velox/docs/monthly-updates/april-2024.rst new file mode 100644 index 0000000000000..919e8551bdbb1 --- /dev/null +++ b/velox/docs/monthly-updates/april-2024.rst @@ -0,0 +1,63 @@ +***************** +April 2024 Update +***************** + +Documentation +============= + +* Document operations on decimals for :doc:`Presto ` + and :doc:`Spark `. +* Document spill write stats. :pr:`9326` + +Core Library +============ + +* Fix bugs in Window operator. :pr:`9476`, :pr:`9271`, :pr:`9257` + +Presto Functions +================ + +* Add :func:`word_stem` and :func:`to_iso8601` scalar functions. +* Add support for DECIMAL inputs to :func:`arbitrary`, :func:`min` and :func:`max` aggregate functions. +* Fix :func:`json_extract` for paths with wildcards. + +Spark Functions +=============== + +* Add :spark:func:`array_size`, :spark:func:`flatten`, :spark:func:`year_of_week` scalar functions. +* Add :spark:func:`collect_list` and :spark:func:`regr_replacement` aggregate functions. + +Hive Connector +============== + +* Add support for storing decimal as integer in Parquet writer. +* Add hive.s3.connect-timeout, hive.s3.socket-timeout and hive.s3.max-connections configs. :pr:`9472` +* Fix complex type handing in Parquet reader. :pr:`9187` +* Fix DWRF reader to skip null map keys. + +Performance and Correctness +=========================== + +* Add aggregation and window fuzzer runs to every PR. +* Add nightly run of window fuzzer. +* Add check for aggregate function signature changes to every PR. +* Add biased aggregation fuzzer run for newly added aggregate functions to every PR. + +Build System +============ + +* Add nightly job to track build metrics. + +Credits +======= + +Andres Suarez, Andrii Rosa, Ankita Victor, Ashwin Krishna Kumar, Bikramjeet Vig, +Christian Zentgraf, Daniel Munoz, David McKnight, Deepak Majeti, Hengzhi Chen, +Huameng (Michael) Jiang, Jacob Wujciak-Jens, Jeongseok Lee, Jialiang Tan, Jimmy +Lu, Karteekmurthys, Ke, Kevin Wilfong, Krishna Pai, Lu Niu, Ludovic Henry, Ma, +Rong, Mahadevuni Naveen Kumar, Masha Basmanova, Mike Lui, Minhan Cao, PHILO-HE, +Pedro Eugenio Rocha Pedreira, Pedro Pedreira, Pramod, Qian Sun, Richard Barnes, +Sergey Pershin, Shabab Ayub, Tengfei Huang, Terry Wang, Wei He, Weitao Wan, +Wills Feng, Yang Zhang, Yihong Wang, Yoav Helfman, Zac Wen, Zhenyuan Zhao, +aditi-pandit, chliang, cindyyyang, duanmeng, jay.narale, joey.ljy, mohsaka, +rui-mo, svm1, willsfeng, wutiangan, wypb, xiaoxmeng, yingsu00, zhli1142015 diff --git a/velox/docs/monthly-updates/february-2024.rst b/velox/docs/monthly-updates/february-2024.rst new file mode 100644 index 0000000000000..18704d2bb5ac4 --- /dev/null +++ b/velox/docs/monthly-updates/february-2024.rst @@ -0,0 +1,68 @@ +******************** +February 2024 Update +******************** + +Core Library +============ + +* Add support for aggregations over distinct inputs to StreamingAggregation. +* Add support for deserializing a single column in Presto page format. 
+* Add support for deserializing an all-null column serialized as UNKNOWN type in Presto page format. +* Add stats for null skew in join operator. +* Convert TIMESTAMP_WITH_TIME_ZONE type to a primitive type. +* Add background profiler that starts Linux perf on the Velox process. +* Fix ``out of range in dynamic array`` error in Task::toJson. +* Delete unused ``max_arbitrary_buffer_size`` config. + +Presto Functions +================ + +* Add :func:`typeof`, :func:`from_iso8601_date` scalar functions. +* Add support for DECIMAL input type to :func:`set_agg` and :func:`set_union` aggregate functions. +* Add support for UNKNOWN input type to :func:`checksum` aggregate function. +* Add support for DATE +/- INTERVAL YEAR MONTH functions. +* Add support for ``UCT|UCT|GMT|GMT0`` as ``Z`` to :func:`parse_datetime` scalar function. + +Spark Functions +=============== + +* Add :spark:func:`array_repeat`, :spark:func:`date_from_unix_date`, :spark:func:`weekday`, :spark:func:`minute`, :spark:func:`second` scalar functions. +* Add :spark:func:`ntile` window function. + +Hive Connector +============== + +* Add ``ignore_missing_files`` config. +* Add write support to ABFS file system. +* Add support for proxy to S3 file system. + +Arrow +===== + +* Add support to export UNKNOWN type to Arrow array. +* Add support to convert Arrow REE arrays to Velox Vectors. + +Performance and Correctness +=========================== + +* Add FieldReference benchmark. +* Add :ref:`Window fuzzer `. +* Fix ``Too many open files`` error in Join fuzzer. + +Build System +============ + +* Add ``VELOX_BUILD_MINIMAL_WITH_DWIO`` CMake option. +* Move documentation, header and format check to Github Action. + +Credits +======= + +Aaron Feldman, Ankita Victor, Bikramjeet Vig, Christian Zentgraf, Daniel Munoz, +David McKnight, Deepak Majeti, Ge Gao, Hongze Zhang, Jacob Wujciak-Jens, Jia Ke, +Jialiang Tan, Jimmy Lu, Kevin Wilfong, Krishna Pai, Lu Niu, Masha Basmanova, +Nick Terrell, Orri Erling, PHILO-HE, Pedro Pedreira, Pramod, Pranjal Shankhdhar, +Richard Barnes, Schierbeck, Cody, Sergey Pershin, Wei He, Yedidya Feldblum, +Zac Wen, Zhenyuan Zhao, aditi-pandit, duanmeng, gayangya, hengjiang.ly, hitarth, +lingbin, mwish, rrando901, rui-mo, xiaodou, xiaoxmeng, xumingming, yingsu00, +zhli1142015, 高阳阳 diff --git a/velox/docs/monthly-updates/march-2024.rst b/velox/docs/monthly-updates/march-2024.rst new file mode 100644 index 0000000000000..636f52ed3a4f4 --- /dev/null +++ b/velox/docs/monthly-updates/march-2024.rst @@ -0,0 +1,79 @@ +***************** +March 2024 Update +***************** + +Documentation +============= + +* Document `design philosophy `_ +* Document custom input generators and verifiers supported in the Aggregation Fuzzer. +* Document runtime stats reported by the HashTable. :pr:`9255` +* Document usage of generic types in Simple Function API. :pr:`9084` + +Core Library +============ + +* Add prefix-sort for fixed width sorting keys. +* Add null behavior and determinism scalar function metadata to the registry. :pr:`9209` +* Add order-sensitive aggregate function metadata to the registry. :pr:`9050` +* Add support for DECIMAL type to Simple Function API. :pr:`9096` +* Add support for lambda functions (reduce_agg) to StreamingAggregation. +* Deprecate threshold based spilling in Aggregation and OrderBy. +* Optimize Exchange protocol used by Presto for latency. 
:pr:`8845` + +Presto Functions +================ + +* Add :func:`day`, :func:`from_ieee754_32`, :func:`hamming_distance`, :func:`map_normalize`, + :func:`map_top_n` scalar functions. +* Add support for DECIMAL input type to :func:`floor` function. +* Add support for timestamp +/- IntervalYearMonth. +* Add :func:`regr_avgx`, :func:`regr_avgy`, :func:`regr_count`, :func:`regr_r2`, + :func:`regr_sxx`, :func:`regr_sxy`, and :func:`regr_syy` aggregation functions. + +Spark Functions +=============== + +* Add :spark:func:`array_remove`, :spark:func:`bit_length`, :spark:func:`bitwise_xor`, + :spark:func:`bitwise_not`, :spark:func:`make_ym_interval`, :spark:func:`from_utc_timestamp`, + :spark:func:`to_utc_timestamp`, :spark:func:`make_timestamp`, :spark:func:`map_subset`, + :spark:func:`unhex`, :spark:func:`unix_date`, :spark:func:`uuid` functions. +* Add :spark:func:`regexp_replace` function. +* Add :spark:func:`monotonically_increasing_id`, :spark:func:`spark_partition_id` functions. +* Add :spark:func:`kurtosis` and :spark:func:`skewness` aggregation functions. +* Add support for DECIMAL inputs to :spark:func:`sum` aggregation function. +* Add CAST(real as decimal). +* Add configuration property 'spark.partition_id'. + +Hive Connector +============== + +* Add support for S3 client no_proxy CIDR expression. :pr:`9160` +* Add support for synthetic columns '$file_size' and '$file_modified_time'. +* Optimize reading a small sample of rows. :pr:`8920`. +* Fix Parquet reader for files with different encodings across row groups. :pr:`9129` + +Performance and Correctness +=========================== + +* Add nightly run of Aggregation fuzzer using Presto as source of truth. +* Add nightly run of Exchange fuzzer. +* Add utility to randomly trigger OOMs and integrate it into Aggregation and Join fuzzers. +* Add group execution mode to Join fuzzer. +* Add support for random frame clause generation to Window fuzzer. +* Add custom input generator for map_union_sum Presto aggregation function. +* Add custom result verifier for arbitrary Presto aggregation function. + +Credits +======= + +8dukongjian, Amit Dutta, Ankita Victor, Bikramjeet Vig, Christian Zentgraf, +Daniel Munoz, Deepak Majeti, Ge Gao, InitialZJ, Jacob Wujciak-Jens, Jake Jung, +Jialiang Tan, Jimmy Lu, Karteekmurthys, Kevin Wilfong, Krishna Pai, Ma, Rong, +Mahadevuni Naveen Kumar, Marcus D. 
Hanwell, Masha Basmanova, Nicholas Ormrod,
+Nick Terrell, Orri Erling, PHILO-HE, Patrick Sullivan, Pedro Pedreira, Pramod,
+Pratik Joseph Dabre, Qian Sun, Richard Barnes, Sandino Flores, Schierbeck,
+Cody, Sergey Pershin, Ubuntu, Wei He, Yang Zhang, Zac Wen, aditi-pandit,
+duanmeng, f0rest9999, hengjiang.ly, joey.ljy, lingbin, mwish, rexan, rui-mo,
+willsfeng, wypb, xiaodai1002, xiaoxmeng, xumingming, youxiduo, yuling.sh,
+zhli1142015, zky.zhoukeyong
diff --git a/velox/dwio/common/CacheInputStream.cpp b/velox/dwio/common/CacheInputStream.cpp
index 98c36e457b606..462a6ba8b87e9 100644
--- a/velox/dwio/common/CacheInputStream.cpp
+++ b/velox/dwio/common/CacheInputStream.cpp
@@ -127,7 +127,13 @@ void CacheInputStream::seekToPosition(PositionProvider& seekPosition) {
 }

 std::string CacheInputStream::getName() const {
-  return fmt::format("CacheInputStream {} of {}", position_, region_.length);
+  std::string result =
+      fmt::format("CacheInputStream {} of {}", position_, region_.length);
+  auto ssdFile = ssdFileName();
+  if (!ssdFile.empty()) {
+    result += fmt::format(" ssdFile={}", ssdFile);
+  }
+  return result;
 }

 size_t CacheInputStream::positionSize() {
@@ -285,6 +291,14 @@ bool CacheInputStream::loadFromSsd(
   return true;
 }

+std::string CacheInputStream::ssdFileName() const {
+  auto ssdCache = cache_->ssdCache();
+  if (!ssdCache) {
+    return "";
+  }
+  return ssdCache->file(fileNum_).fileName();
+}
+
 void CacheInputStream::loadPosition() {
   auto offset = region_.offset;
   if (pin_.empty()) {
diff --git a/velox/dwio/common/CacheInputStream.h b/velox/dwio/common/CacheInputStream.h
index 5a99f5b35c829..6b95bf3713bba 100644
--- a/velox/dwio/common/CacheInputStream.h
+++ b/velox/dwio/common/CacheInputStream.h
@@ -109,6 +109,10 @@ class CacheInputStream : public SeekableInputStream {
       velox::common::Region region,
       cache::AsyncDataCacheEntry& entry);

+  // Returns the SSD cache file path if it exists; returns an empty string if
+  // there is no SSD cache file.
+ std::string ssdFileName() const; + CachedBufferedInput* const bufferedInput_; cache::AsyncDataCache* const cache_; IoStatistics* ioStats_; diff --git a/velox/dwio/common/ColumnVisitors.h b/velox/dwio/common/ColumnVisitors.h index 3ac3d5e219f74..cc81f4505fee0 100644 --- a/velox/dwio/common/ColumnVisitors.h +++ b/velox/dwio/common/ColumnVisitors.h @@ -49,29 +49,32 @@ struct DropValues { } }; -template struct ExtractToReader { using HookType = dwio::common::NoHook; static constexpr bool kSkipNulls = false; - explicit ExtractToReader(TReader* readerIn) : reader(readerIn) {} + explicit ExtractToReader(SelectiveColumnReader* readerIn) + : reader_(readerIn) {} bool acceptsNulls() const { return true; } template - void addNull(vector_size_t rowIndex); + void addNull(vector_size_t /*rowIndex*/) { + reader_->template addNull(); + } template void addValue(vector_size_t /*rowIndex*/, V value) { - reader->addValue(value); + reader_->addValue(value); } - TReader* reader; - dwio::common::NoHook& hook() { return noHook(); } + + private: + SelectiveColumnReader* reader_; }; template @@ -150,6 +153,7 @@ class ColumnVisitor { using DataType = T; static constexpr bool dense = isDense; static constexpr bool kHasBulkPath = true; + ColumnVisitor( TFilter& filter, SelectiveColumnReader* reader, @@ -163,6 +167,20 @@ class ColumnVisitor { rowIndex_(0), values_(values) {} + template = 0> + ColumnVisitor( + TFilter& filter, + SelectiveColumnReader* reader, + vector_size_t numRows, + ExtractValues values) + : filter_(filter), + reader_(reader), + allowNulls_(!TFilter::deterministic || filter.testNull()), + rows_(nullptr), + numRows_(numRows), + rowIndex_(0), + values_(values) {} + bool allowNulls() { if (ExtractValues::kSkipNulls && TFilter::deterministic) { return false; @@ -269,7 +287,7 @@ class ColumnVisitor { } if (++rowIndex_ >= numRows_) { atEnd = true; - return rows_[numRows_ - 1] - previous; + return rowAt(numRows_ - 1) - previous; } if (TFilter::deterministic && isDense) { return 0; @@ -301,7 +319,7 @@ class ColumnVisitor { if (isDense) { return 0; } - return currentRow() - rows_[rowIndex_ - 1] - 1; + return currentRow() - rowAt(rowIndex_ - 1) - 1; } FOLLY_ALWAYS_INLINE vector_size_t process(T value, bool& atEnd) { @@ -314,7 +332,7 @@ class ColumnVisitor { } if (++rowIndex_ >= numRows_) { atEnd = true; - return rows_[numRows_ - 1] - previous; + return rowAt(numRows_ - 1) - previous; } return currentRow() - previous - 1; } @@ -331,7 +349,7 @@ class ColumnVisitor { if (isDense) { return 0; } - return currentRow() - rows_[rowIndex_ - 1] - 1; + return currentRow() - rowAt(rowIndex_ - 1) - 1; } // Returns space for 'size' items of T for a scan to fill. 
The scan @@ -341,26 +359,30 @@ class ColumnVisitor { return reader_->mutableValues(size); } - int32_t numRows() const { - return reader_->numRows(); - } - SelectiveColumnReader& reader() const { return *reader_; } - inline vector_size_t rowAt(vector_size_t index) { + inline vector_size_t rowAt(vector_size_t index) const { if (isDense) { return index; } return rows_[index]; } - bool atEnd() { + vector_size_t rowIndex() const { + return rowIndex_; + } + + void setRowIndex(vector_size_t index) { + rowIndex_ = index; + } + + bool atEnd() const { return rowIndex_ >= numRows_; } - vector_size_t currentRow() { + vector_size_t currentRow() const { if (isDense) { return rowIndex_; } @@ -371,7 +393,7 @@ class ColumnVisitor { return rows_; } - vector_size_t numRows() { + vector_size_t numRows() const { return numRows_; } @@ -504,12 +526,6 @@ inline void ColumnVisitor::addOutputRow( reader_->addOutputRow(row); } -template -template -void ExtractToReader::addNull(vector_size_t /*rowIndex*/) { - reader->template addNull(); -} - enum FilterResult { kUnknown = 0x40, kSuccess = 0x80, kFailure = 0 }; namespace detail { @@ -1390,13 +1406,6 @@ class DirectRleColumnVisitor rows, values) {} - // Use for replacing all rows with non-null rows for fast path with - // processRun and processRle. - void setRows(folly::Range newRows) { - super::rows_ = newRows.data(); - super::numRows_ = newRows.size(); - } - // Processes 'numInput' T's in 'input'. Sets 'values' and // 'numValues'' to the resulting values. 'scatterRows' may be // non-null if there is no filter and the decoded values should be @@ -1479,4 +1488,113 @@ class DirectRleColumnVisitor } }; +template +class StringColumnReadWithVisitorHelper { + public: + StringColumnReadWithVisitorHelper(SelectiveColumnReader& reader, RowSet rows) + : reader_(reader), rows_(rows) {} + + template + auto operator()(F&& readWithVisitor) { + const bool isDense = rows_.back() == rows_.size() - 1; + if (reader_.scanSpec()->keepValues()) { + if (auto* hook = reader_.scanSpec()->valueHook()) { + if (isDense) { + readHelper( + &alwaysTrue(), + ExtractToGenericHook(hook), + std::forward(readWithVisitor)); + } else { + readHelper( + &alwaysTrue(), + ExtractToGenericHook(hook), + std::forward(readWithVisitor)); + } + } else { + if (isDense) { + processFilter( + ExtractToReader(&reader_), std::forward(readWithVisitor)); + } else { + processFilter( + ExtractToReader(&reader_), std::forward(readWithVisitor)); + } + } + } else { + if (isDense) { + processFilter(DropValues(), std::forward(readWithVisitor)); + } else { + processFilter(DropValues(), std::forward(readWithVisitor)); + } + } + } + + private: + template + void readHelper( + velox::common::Filter* filter, + ExtractValues extractValues, + F readWithVisitor) { + readWithVisitor( + ColumnVisitor( + *static_cast(filter), &reader_, rows_, extractValues)); + } + + template + void processFilter(ExtractValues extractValues, F&& readWithVisitor) { + auto* filter = reader_.scanSpec()->filter(); + if (filter == nullptr) { + readHelper( + &alwaysTrue(), extractValues, std::forward(readWithVisitor)); + return; + } + switch (filter->kind()) { + case velox::common::FilterKind::kAlwaysTrue: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + case velox::common::FilterKind::kIsNull: + if constexpr (kEncodingHasNulls) { + reader_.filterNulls( + rows_, true, !std::is_same_v); + } else { + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + } + break; + case velox::common::FilterKind::kIsNotNull: + if 
constexpr ( + kEncodingHasNulls && std::is_same_v) { + reader_.filterNulls(rows_, false, false); + } else { + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + } + break; + case velox::common::FilterKind::kBytesRange: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + case velox::common::FilterKind::kNegatedBytesRange: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + case velox::common::FilterKind::kBytesValues: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + case velox::common::FilterKind::kNegatedBytesValues: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + default: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + } + } + + SelectiveColumnReader& reader_; + const RowSet rows_; +}; + } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/FormatData.h b/velox/dwio/common/FormatData.h index 0348604606465..1f0b5d4426bb8 100644 --- a/velox/dwio/common/FormatData.h +++ b/velox/dwio/common/FormatData.h @@ -34,7 +34,7 @@ class FormatData { template T& as() { - return *reinterpret_cast(this); + return *static_cast(this); } /// Reads nulls if the format has nulls separate from the encoded diff --git a/velox/dwio/common/OnDemandUnitLoader.cpp b/velox/dwio/common/OnDemandUnitLoader.cpp index a15998002cc8c..ee21a2b442338 100644 --- a/velox/dwio/common/OnDemandUnitLoader.cpp +++ b/velox/dwio/common/OnDemandUnitLoader.cpp @@ -58,10 +58,18 @@ class OnDemandUnitLoader : public UnitLoader { return *loadUnits_[unit]; } - void onRead( - uint32_t /* unit */, - uint64_t /* rowOffsetInUnit */, - uint64_t /* rowCount */) override {} + void onRead(uint32_t unit, uint64_t rowOffsetInUnit, uint64_t /* rowCount */) + override { + VELOX_CHECK_LT(unit, loadUnits_.size(), "Unit out of range"); + VELOX_CHECK_LT( + rowOffsetInUnit, loadUnits_[unit]->getNumRows(), "Row out of range"); + } + + void onSeek(uint32_t unit, uint64_t rowOffsetInUnit) override { + VELOX_CHECK_LT(unit, loadUnits_.size(), "Unit out of range"); + VELOX_CHECK_LE( + rowOffsetInUnit, loadUnits_[unit]->getNumRows(), "Row out of range"); + } private: std::vector> loadUnits_; diff --git a/velox/dwio/common/Options.h b/velox/dwio/common/Options.h index 64566f3a1af95..5f806f22b08fe 100644 --- a/velox/dwio/common/Options.h +++ b/velox/dwio/common/Options.h @@ -45,11 +45,7 @@ enum class FileFormat { TEXT = 5, JSON = 6, PARQUET = 7, -#ifdef VELOX_ENABLE_BACKWARD_COMPATIBILITY - ALPHA = 8, -#else NIMBLE = 8, -#endif ORC = 9, }; @@ -83,6 +79,7 @@ class SerDeOptions { inline static const std::string kFieldDelim{"field.delim"}; inline static const std::string kCollectionDelim{"collection.delim"}; inline static const std::string kMapKeyDelim{"mapkey.delim"}; + inline static const std::string kEscapeChar{"escape.delim"}; explicit SerDeOptions( uint8_t fieldDelim = '\1', diff --git a/velox/dwio/common/SelectiveByteRleColumnReader.h b/velox/dwio/common/SelectiveByteRleColumnReader.h index 06aae1c4986b7..67537ea8d8b8f 100644 --- a/velox/dwio/common/SelectiveByteRleColumnReader.h +++ b/velox/dwio/common/SelectiveByteRleColumnReader.h @@ -39,7 +39,11 @@ class SelectiveByteRleColumnReader : public SelectiveColumnReader { void getValues(RowSet rows, VectorPtr* result) override; - template + template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void processFilter( velox::common::Filter* filter, ExtractValues extractValues, @@ 
-58,7 +62,7 @@ class SelectiveByteRleColumnReader : public SelectiveColumnReader { RowSet rows, ExtractValues extractValues); - template + template void readCommon(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls); }; @@ -78,7 +82,11 @@ void SelectiveByteRleColumnReader::readHelper( *reinterpret_cast(filter), this, rows, extractValues)); } -template +template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void SelectiveByteRleColumnReader::processFilter( velox::common::Filter* filter, ExtractValues extractValues, @@ -90,13 +98,20 @@ void SelectiveByteRleColumnReader::processFilter( filter, rows, extractValues); break; case FilterKind::kIsNull: - filterNulls( - rows, - true, - !std::is_same_v); + if constexpr (kEncodingHasNulls) { + filterNulls( + rows, + true, + !std::is_same_v); + } else { + readHelper( + filter, rows, extractValues); + } break; case FilterKind::kIsNotNull: - if (std::is_same_v) { + if constexpr ( + kEncodingHasNulls && + std::is_same_v) { filterNulls(rows, false, false); } else { readHelper( @@ -148,7 +163,7 @@ void SelectiveByteRleColumnReader::processValueHook( } } -template +template void SelectiveByteRleColumnReader::readCommon( vector_size_t offset, RowSet rows, @@ -167,17 +182,19 @@ void SelectiveByteRleColumnReader::readCommon( return; } if (isDense) { - processFilter( + processFilter( filter, dwio::common::ExtractToReader(this), rows); } else { - processFilter( + processFilter( filter, dwio::common::ExtractToReader(this), rows); } } else { if (isDense) { - processFilter(filter, dwio::common::DropValues(), rows); + processFilter( + filter, dwio::common::DropValues(), rows); } else { - processFilter(filter, dwio::common::DropValues(), rows); + processFilter( + filter, dwio::common::DropValues(), rows); } } } diff --git a/velox/dwio/common/SelectiveColumnReader.h b/velox/dwio/common/SelectiveColumnReader.h index 4b97f4c6f4652..1fdcbc2e2a762 100644 --- a/velox/dwio/common/SelectiveColumnReader.h +++ b/velox/dwio/common/SelectiveColumnReader.h @@ -294,9 +294,8 @@ class SelectiveColumnReader { template inline void addNull() { VELOX_DCHECK_NE(valueSize_, kNoValueSize); - VELOX_DCHECK_LE( - rawResultNulls_ && rawValues_ && (numValues_ + 1) * valueSize_, - values_->capacity()); + VELOX_DCHECK(rawResultNulls_ && rawValues_); + VELOX_DCHECK_LE((numValues_ + 1) * valueSize_, values_->capacity()); anyNulls_ = true; bits::setNull(rawResultNulls_, numValues_); @@ -441,12 +440,12 @@ class SelectiveColumnReader { isFlatMapValue_ = value; } - protected: // Filters 'rows' according to 'is_null'. Only applies to cases where // scanSpec_->readsNullsOnly() is true. 
template void filterNulls(RowSet rows, bool isNull, bool extractValues); + protected: template void prepareRead(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls); @@ -670,6 +669,8 @@ inline void SelectiveColumnReader::addValue(const folly::StringPiece value) { addStringValue(value); } +velox::common::AlwaysTrue& alwaysTrue(); + } // namespace facebook::velox::dwio::common namespace facebook::velox::dwio::common { diff --git a/velox/dwio/common/SelectiveColumnReaderInternal.h b/velox/dwio/common/SelectiveColumnReaderInternal.h index 61bcf2b5befa9..ed38c6551fd55 100644 --- a/velox/dwio/common/SelectiveColumnReaderInternal.h +++ b/velox/dwio/common/SelectiveColumnReaderInternal.h @@ -31,8 +31,6 @@ namespace facebook::velox::dwio::common { -velox::common::AlwaysTrue& alwaysTrue(); - class Timer { public: Timer() : startClocks_{folly::hardware_timestamp()} {} diff --git a/velox/dwio/common/SelectiveFloatingPointColumnReader.h b/velox/dwio/common/SelectiveFloatingPointColumnReader.h index 61ccd7e4d8b6d..ea2455afa0c8c 100644 --- a/velox/dwio/common/SelectiveFloatingPointColumnReader.h +++ b/velox/dwio/common/SelectiveFloatingPointColumnReader.h @@ -40,7 +40,7 @@ class SelectiveFloatingPointColumnReader : public SelectiveColumnReader { return std::is_same_v; } - template + template void readCommon(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls); @@ -57,7 +57,11 @@ class SelectiveFloatingPointColumnReader : public SelectiveColumnReader { void readHelper(velox::common::Filter* filter, RowSet rows, ExtractValues values); - template + template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void processFilter( velox::common::Filter* filter, RowSet rows, @@ -84,7 +88,11 @@ void SelectiveFloatingPointColumnReader::readHelper( } template -template +template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void SelectiveFloatingPointColumnReader::processFilter( velox::common::Filter* filter, RowSet rows, @@ -101,11 +109,18 @@ void SelectiveFloatingPointColumnReader::processFilter( filter, rows, extractValues); break; case velox::common::FilterKind::kIsNull: - filterNulls( - rows, true, !std::is_same_v); + if constexpr (kEncodingHasNulls) { + filterNulls( + rows, true, !std::is_same_v); + } else { + readHelper( + filter, rows, extractValues); + } break; case velox::common::FilterKind::kIsNotNull: - if (std::is_same_v) { + if constexpr ( + kEncodingHasNulls && + std::is_same_v) { filterNulls(rows, false, false); } else { readHelper( @@ -163,7 +178,7 @@ void SelectiveFloatingPointColumnReader::processValueHook( } template -template +template void SelectiveFloatingPointColumnReader::readCommon( vector_size_t offset, RowSet rows, @@ -179,18 +194,20 @@ void SelectiveFloatingPointColumnReader::readCommon( } } else { if (isDense) { - processFilter( + processFilter( scanSpec_->filter(), rows, ExtractToReader(this)); } else { - processFilter( + processFilter( scanSpec_->filter(), rows, ExtractToReader(this)); } } } else { if (isDense) { - processFilter(scanSpec_->filter(), rows, DropValues()); + processFilter( + scanSpec_->filter(), rows, DropValues()); } else { - processFilter(scanSpec_->filter(), rows, DropValues()); + processFilter( + scanSpec_->filter(), rows, DropValues()); } } } diff --git a/velox/dwio/common/SelectiveIntegerColumnReader.h b/velox/dwio/common/SelectiveIntegerColumnReader.h index ba4b63e168ca0..444f341e06b9f 100644 --- a/velox/dwio/common/SelectiveIntegerColumnReader.h +++ 
b/velox/dwio/common/SelectiveIntegerColumnReader.h @@ -41,7 +41,11 @@ class SelectiveIntegerColumnReader : public SelectiveColumnReader { protected: // Switches based on filter type between different readHelper instantiations. - template + template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void processFilter( velox::common::Filter* filter, ExtractValues extractValues, @@ -66,7 +70,7 @@ class SelectiveIntegerColumnReader : public SelectiveColumnReader { // The common part of integer reading. calls the appropriate // instantiation of processValueHook or processFilter based on // possible value hook, filter and denseness. - template + template void readCommon(RowSet rows); }; @@ -113,7 +117,11 @@ void SelectiveIntegerColumnReader::readHelper( } } -template +template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void SelectiveIntegerColumnReader::processFilter( velox::common::Filter* filter, ExtractValues extractValues, @@ -130,11 +138,18 @@ void SelectiveIntegerColumnReader::processFilter( filter, rows, extractValues); break; case velox::common::FilterKind::kIsNull: - filterNulls( - rows, true, !std::is_same_v); + if constexpr (kEncodingHasNulls) { + filterNulls( + rows, true, !std::is_same_v); + } else { + readHelper( + filter, rows, extractValues); + } break; case velox::common::FilterKind::kIsNotNull: - if (std::is_same_v) { + if constexpr ( + kEncodingHasNulls && + std::is_same_v) { filterNulls(rows, false, false); } else { readHelper( @@ -211,7 +226,7 @@ void SelectiveIntegerColumnReader::processValueHook( } } -template +template void SelectiveIntegerColumnReader::readCommon(RowSet rows) { bool isDense = rows.back() == rows.size() - 1; velox::common::Filter* filter = @@ -225,16 +240,20 @@ void SelectiveIntegerColumnReader::readCommon(RowSet rows) { } } else { if (isDense) { - processFilter(filter, ExtractToReader(this), rows); + processFilter( + filter, ExtractToReader(this), rows); } else { - processFilter(filter, ExtractToReader(this), rows); + processFilter( + filter, ExtractToReader(this), rows); } } } else { if (isDense) { - processFilter(filter, DropValues(), rows); + processFilter( + filter, DropValues(), rows); } else { - processFilter(filter, DropValues(), rows); + processFilter( + filter, DropValues(), rows); } } } diff --git a/velox/dwio/common/TypeWithId.cpp b/velox/dwio/common/TypeWithId.cpp index 6b803a6249da4..c4c9b4f8d36fd 100644 --- a/velox/dwio/common/TypeWithId.cpp +++ b/velox/dwio/common/TypeWithId.cpp @@ -86,4 +86,37 @@ std::unique_ptr TypeWithId::create( type, std::move(children), myId, maxId, column); } +std::string TypeWithId::fullName() const { + std::vector path; + auto* child = this; + while (child->parent_) { + switch (child->parent_->type()->kind()) { + case TypeKind::ROW: + VELOX_CHECK( + child == child->parent_->children_.at(child->column_).get()); + path.push_back( + '.' 
+ child->parent_->type()->asRow().nameOf(child->column_));
+        break;
+      case TypeKind::ARRAY:
+        break;
+      case TypeKind::MAP:
+        if (child == child->parent_->children_.at(0).get()) {
+          path.push_back(".<keys>");
+        } else {
+          VELOX_CHECK(child == child->parent_->children_.at(1).get());
+          path.push_back(".<values>");
+        }
+        break;
+      default:
+        VELOX_UNREACHABLE();
+    }
+    child = child->parent_;
+  }
+  std::string ans = "";
+  for (int i = path.size() - 1; i >= 0; --i) {
+    ans += path[i];
+  }
+  return ans;
+}
+
 } // namespace facebook::velox::dwio::common
diff --git a/velox/dwio/common/TypeWithId.h b/velox/dwio/common/TypeWithId.h
index 39a988f9936a9..5c5fbc5d070c7 100644
--- a/velox/dwio/common/TypeWithId.h
+++ b/velox/dwio/common/TypeWithId.h
@@ -73,6 +73,8 @@ class TypeWithId : public velox::Tree<std::shared_ptr<const TypeWithId>> {
     return children_;
   }

+  std::string fullName() const;
+
  private:
   static std::unique_ptr<TypeWithId> create(
       const std::shared_ptr<const Type>& type,
diff --git a/velox/dwio/common/UnitLoader.h b/velox/dwio/common/UnitLoader.h
index f536a2d5eef16..3ea9653d521f9 100644
--- a/velox/dwio/common/UnitLoader.h
+++ b/velox/dwio/common/UnitLoader.h
@@ -49,8 +49,13 @@ class UnitLoader {
   virtual LoadUnit& getLoadedUnit(uint32_t unit) = 0;

   // Reader reports progress calling this method
+  // The call must be done **after** getLoadedUnit for the unit
   virtual void onRead(uint32_t unit, uint64_t rowOffsetInUnit, uint64_t rowCount) = 0;
+
+  // Reader reports a seek by calling this method.
+  // The call must be done **before** getLoadedUnit for the new unit
+  virtual void onSeek(uint32_t unit, uint64_t rowOffsetInUnit) = 0;
 };

 class UnitLoaderFactory {
diff --git a/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp b/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp
index 241d91117e060..2a07b45746465 100644
--- a/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp
+++ b/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp
@@ -38,7 +38,7 @@ using RowSet = folly::Range<const int32_t*>;

 static const uint64_t kNumValues = 1024768 * 8;

-namespace duckdb {
+namespace facebook::velox::parquet {

 class ByteBuffer { // on to the 10 thousandth impl
  public:
@@ -65,7 +65,7 @@ class ByteBuffer { // on to the 10 thousandth impl
   template <class T>
   T get() {
     available(sizeof(T));
-    T val = Load<T>((data_ptr_t)ptr);
+    T val = duckdb::Load<T>((duckdb::data_ptr_t)ptr);
     return val;
   }

@@ -104,7 +104,7 @@ class ParquetDecodeUtils {
       uint32_t count,
       uint8_t width) {
     if (width >= ParquetDecodeUtils::BITPACK_MASKS_SIZE) {
-      throw InvalidInputException(
+      throw duckdb::InvalidInputException(
           "The width (%d) of the bitpacked data exceeds the supported max width (%d), "
           "the file might be corrupted.",
           width,
@@ -145,9 +145,9 @@ class ParquetDecodeUtils {
     return result;
   }
 };
-} // namespace duckdb
+} // namespace facebook::velox::parquet

-const uint64_t duckdb::ParquetDecodeUtils::BITPACK_MASKS[] = {
+const uint64_t facebook::velox::parquet::ParquetDecodeUtils::BITPACK_MASKS[] = {
     0,
     1,
     3,
@@ -214,10 +214,11 @@ const uint64_t duckdb::ParquetDecodeUtils::BITPACK_MASKS[] = {
     9223372036854775807,
     18446744073709551615ULL};

-const uint64_t duckdb::ParquetDecodeUtils::BITPACK_MASKS_SIZE =
-    sizeof(ParquetDecodeUtils::BITPACK_MASKS) / sizeof(uint64_t);
+const uint64_t
+    facebook::velox::parquet::ParquetDecodeUtils::BITPACK_MASKS_SIZE =
+        sizeof(ParquetDecodeUtils::BITPACK_MASKS) / sizeof(uint64_t);

-const uint8_t duckdb::ParquetDecodeUtils::BITPACK_DLEN = 8;
+const uint8_t facebook::velox::parquet::ParquetDecodeUtils::BITPACK_DLEN = 8;

 // Array of bit packed representations of randomInts_u32. The array at index i
// is packed i bits wide and the values come from the low bits of
@@ -316,11 +317,11 @@ void arrowBitUnpack(uint8_t bitWidth, T* result) {

 template <typename T>
 void duckdbBitUnpack(uint8_t bitWidth, T* result) {
-  duckdb::ByteBuffer duckInputBuffer(
+  facebook::velox::parquet::ByteBuffer duckInputBuffer(
       reinterpret_cast<char*>(bitPackedData[bitWidth].data()),
       BYTES(kNumValues, bitWidth));
   uint8_t bitpack_pos = 0;
-  duckdb::ParquetDecodeUtils::BitUnpack<T>(
+  facebook::velox::parquet::ParquetDecodeUtils::BitUnpack<T>(
       duckInputBuffer, bitpack_pos, result, kNumValues, bitWidth);
 }

diff --git a/velox/dwio/common/tests/OnDemandUnitLoaderTests.cpp b/velox/dwio/common/tests/OnDemandUnitLoaderTests.cpp
index 4775303057cbb..492b2517712be 100644
--- a/velox/dwio/common/tests/OnDemandUnitLoaderTests.cpp
+++ b/velox/dwio/common/tests/OnDemandUnitLoaderTests.cpp
@@ -95,6 +95,72 @@ TEST(OnDemandUnitLoaderTests, LoadsCorrectlyWithNoCallback) {
   EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, true}));
 }

+TEST(OnDemandUnitLoaderTests, CanSeek) {
+  size_t blockedOnIoCount = 0;
+  OnDemandUnitLoaderFactory factory([&](auto) { ++blockedOnIoCount; });
+  ReaderMock readerMock{{10, 20, 30}, {0, 0, 0}, factory};
+  EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, false}));
+  EXPECT_EQ(blockedOnIoCount, 0);
+
+  EXPECT_NO_THROW(readerMock.seek(10););
+
+  EXPECT_TRUE(readerMock.read(3)); // Unit: 1, rows: 0-2, load(1)
+  EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, true, false}));
+  EXPECT_EQ(blockedOnIoCount, 1);
+
+  EXPECT_NO_THROW(readerMock.seek(0););
+
+  EXPECT_TRUE(readerMock.read(3)); // Unit: 0, rows: 0-2, load(0), unload(1)
+  EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({true, false, false}));
+  EXPECT_EQ(blockedOnIoCount, 2);
+
+  EXPECT_NO_THROW(readerMock.seek(30););
+
+  EXPECT_TRUE(readerMock.read(3)); // Unit: 2, rows: 0-2, load(2), unload(0)
+  EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, true}));
+  EXPECT_EQ(blockedOnIoCount, 3);
+
+  EXPECT_NO_THROW(readerMock.seek(5););
+
+  EXPECT_TRUE(readerMock.read(5)); // Unit: 0, rows: 5-9, load(0), unload(1)
+  EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({true, false, false}));
+  EXPECT_EQ(blockedOnIoCount, 4);
+}
+
+TEST(OnDemandUnitLoaderTests, SeekOutOfRangeReaderError) {
+  size_t blockedOnIoCount = 0;
+  OnDemandUnitLoaderFactory factory([&](auto) { ++blockedOnIoCount; });
+  ReaderMock readerMock{{10, 20, 30}, {0, 0, 0}, factory};
+  EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, false}));
+  EXPECT_EQ(blockedOnIoCount, 0);
+  readerMock.seek(59);
+
+  readerMock.seek(60);
+
+  EXPECT_THAT(
+      [&]() { readerMock.seek(61); },
+      Throws(Property(
+          &facebook::velox::VeloxRuntimeError::message,
+          HasSubstr("Can't seek to position 61 in file. Must be up to 60."))));
+}
+
+TEST(OnDemandUnitLoaderTests, SeekOutOfRange) {
+  OnDemandUnitLoaderFactory factory(nullptr);
+  std::vector<std::atomic_bool> unitsLoaded(getUnitsLoadedWithFalse(1));
+  std::vector<std::unique_ptr<LoadUnit>> units;
+  units.push_back(std::make_unique<LoadUnitMock>(10, 0, unitsLoaded, 0));
+
+  auto unitLoader = factory.create(std::move(units));
+
+  unitLoader->onSeek(0, 10);
+
+  EXPECT_THAT(
+      [&]() { unitLoader->onSeek(0, 11); },
+      Throws(Property(
+          &facebook::velox::VeloxRuntimeError::message,
+          HasSubstr("Row out of range"))));
+}
+
 TEST(OnDemandUnitLoaderTests, UnitOutOfRange) {
   OnDemandUnitLoaderFactory factory(nullptr);
   std::vector<std::atomic_bool> unitsLoaded(getUnitsLoadedWithFalse(1));
diff --git a/velox/dwio/common/tests/utils/UnitLoaderTestTools.cpp b/velox/dwio/common/tests/utils/UnitLoaderTestTools.cpp
index 716327e301490..ed1e9417d48bf 100644
--- a/velox/dwio/common/tests/utils/UnitLoaderTestTools.cpp
+++ b/velox/dwio/common/tests/utils/UnitLoaderTestTools.cpp
@@ -42,6 +42,28 @@ bool ReaderMock::read(uint64_t maxRows) {
   return true;
 }

+void ReaderMock::seek(uint64_t rowNumber) {
+  uint64_t totalRows = 0;
+  uint64_t rowsLeft = rowNumber;
+  for (size_t unit = 0; unit < rowsPerUnit_.size(); ++unit) {
+    const uint64_t rowCount = rowsPerUnit_[unit];
+    if (rowsLeft < rowCount) {
+      currentUnit_ = unit;
+      currentRowInUnit_ = rowsLeft;
+      loader_->onSeek(currentUnit_, currentRowInUnit_);
+      return;
+    }
+    rowsLeft -= rowCount;
+    totalRows += rowCount;
+  }
+  VELOX_CHECK_EQ(
+      rowsLeft,
+      0,
+      "Can't seek to position {} in file. Must be up to {}.",
+      rowNumber,
+      totalRows);
+}
+
 bool ReaderMock::loadUnit() {
   VELOX_CHECK(currentRowInUnit_ <= rowsPerUnit_[currentUnit_]);
   if (currentRowInUnit_ == rowsPerUnit_[currentUnit_]) {
@@ -51,11 +73,9 @@ bool ReaderMock::loadUnit() {
       return false;
     }
   }
-  if (currentRowInUnit_ == 0) {
-    auto& unit = loader_->getLoadedUnit(currentUnit_);
-    auto& unitMock = dynamic_cast<LoadUnitMock&>(unit);
-    VELOX_CHECK(unitMock.isLoaded());
-  }
+  auto& unit = loader_->getLoadedUnit(currentUnit_);
+  auto& unitMock = dynamic_cast<LoadUnitMock&>(unit);
+  VELOX_CHECK(unitMock.isLoaded());
   return true;
 }
diff --git a/velox/dwio/common/tests/utils/UnitLoaderTestTools.h b/velox/dwio/common/tests/utils/UnitLoaderTestTools.h
index e7760e75fd493..f606d7db71f15 100644
--- a/velox/dwio/common/tests/utils/UnitLoaderTestTools.h
+++ b/velox/dwio/common/tests/utils/UnitLoaderTestTools.h
@@ -78,6 +78,8 @@ class ReaderMock {

   bool read(uint64_t maxRows);

+  void seek(uint64_t rowNumber);
+
   std::vector<bool> unitsLoaded() const {
     return {unitsLoaded_.begin(), unitsLoaded_.end()};
   }
diff --git a/velox/dwio/dwrf/reader/DwrfReader.cpp b/velox/dwio/dwrf/reader/DwrfReader.cpp
index 258c09bb7f1ba..5ca3c2b041df7 100644
--- a/velox/dwio/dwrf/reader/DwrfReader.cpp
+++ b/velox/dwio/dwrf/reader/DwrfReader.cpp
@@ -316,6 +316,7 @@ uint64_t DwrfRowReader::seekToRow(uint64_t rowNumber) {
   if (isEmptyFile()) {
     return 0;
   }
+  nextRowNumber_.reset();

   // If we are reading only a portion of the file
   // (bounded by firstStripe_ and stripeCeiling_),
@@ -357,6 +358,9 @@ uint64_t DwrfRowReader::seekToRow(uint64_t rowNumber) {
   currentRowInStripe_ = rowNumber - firstRowOfStripe_[currentStripe_];
   previousRow_ = rowNumber;

+  const auto loadUnitIdx = currentStripe_ - firstStripe_;
+  unitLoader_->onSeek(loadUnitIdx, currentRowInStripe_);
+
   if (currentStripe_ != previousStripe) {
     // Different stripe. Let's load the new stripe.
currentUnit_ = nullptr; @@ -583,6 +587,9 @@ void DwrfRowReader::readWithRowNumber( } int64_t DwrfRowReader::nextRowNumber() { + if (nextRowNumber_.has_value()) { + return *nextRowNumber_; + } auto strideSize = getReader().getFooter().rowIndexStride(); while (currentStripe_ < stripeCeiling_) { if (currentRowInStripe_ == 0) { @@ -601,20 +608,21 @@ int64_t DwrfRowReader::nextRowNumber() { } checkSkipStrides(strideSize); if (currentRowInStripe_ < rowsInCurrentStripe_) { - return firstRowOfStripe_[currentStripe_] + currentRowInStripe_; + nextRowNumber_ = firstRowOfStripe_[currentStripe_] + currentRowInStripe_; + return *nextRowNumber_; } advanceToNextStripe: ++currentStripe_; currentRowInStripe_ = 0; currentUnit_ = nullptr; } - atEnd_ = true; + nextRowNumber_ = kAtEnd; return kAtEnd; } int64_t DwrfRowReader::nextReadSize(uint64_t size) { VELOX_DCHECK_GT(size, 0); - if (atEnd_) { + if (nextRowNumber() == kAtEnd) { return kAtEnd; } auto rowsToRead = std::min(size, rowsInCurrentStripe_ - currentRowInStripe_); @@ -643,6 +651,7 @@ uint64_t DwrfRowReader::next( return 0; } auto rowsToRead = nextReadSize(size); + nextRowNumber_.reset(); previousRow_ = nextRow; // Record strideIndex for use by the columnReader_ which may delay actual // reading of the data. diff --git a/velox/dwio/dwrf/reader/DwrfReader.h b/velox/dwio/dwrf/reader/DwrfReader.h index acd496e414de2..549746776fa16 100644 --- a/velox/dwio/dwrf/reader/DwrfReader.h +++ b/velox/dwio/dwrf/reader/DwrfReader.h @@ -160,7 +160,7 @@ class DwrfRowReader : public StrideIndexProvider, dwio::common::ColumnReaderStatistics columnReaderStatistics_; - bool atEnd_{false}; + std::optional nextRowNumber_; std::unique_ptr unitLoader_; DwrfUnit* currentUnit_; diff --git a/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h b/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h index 363082eccd64b..8684842c6fa27 100644 --- a/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h @@ -79,7 +79,7 @@ class SelectiveByteRleColumnReader void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) override { - readCommon(offset, rows, incomingNulls); + readCommon(offset, rows, incomingNulls); readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h b/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h index 5498ea77e8c52..6cce3ad3ec75f 100644 --- a/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h @@ -49,7 +49,7 @@ class SelectiveFloatingPointColumnReader void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) override { using T = SelectiveFloatingPointColumnReader; - this->template readCommon(offset, rows, incomingNulls); + this->template readCommon(offset, rows, incomingNulls); this->readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp index 649319a6f9dac..eaf99cafb9374 100644 --- a/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp @@ -105,7 +105,7 @@ void SelectiveIntegerDictionaryColumnReader::read( // lazy load dictionary only when it's needed ensureInitialized(); - readCommon(rows); + readCommon(rows); readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.cpp 
b/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.cpp index 57dc53090953f..c7bd41bda3bea 100644 --- a/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.cpp @@ -34,7 +34,7 @@ void SelectiveIntegerDirectColumnReader::read( offset, rows, incomingNulls); - readCommon(rows); + readCommon(rows); readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp index 76f17361e6a6f..adc775d2d1018 100644 --- a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp @@ -422,9 +422,7 @@ void SelectiveStringDirectColumnReader::readWithVisitor( int32_t current = visitor.start(); constexpr bool isExtract = std::is_same_v && - std::is_same_v< - typename TVisitor::Extract, - dwio::common::ExtractToReader>; + std::is_same_v; auto nulls = nullsInReadRange_ ? nullsInReadRange_->as() : nullptr; if (process::hasAvx2() && isExtract) { @@ -465,73 +463,11 @@ void SelectiveStringDirectColumnReader::readWithVisitor( } } -template -void SelectiveStringDirectColumnReader::readHelper( - common::Filter* filter, - RowSet rows, - ExtractValues extractValues) { - readWithVisitor( - rows, - dwio::common:: - ColumnVisitor( - *reinterpret_cast(filter), this, rows, extractValues)); -} - -template -void SelectiveStringDirectColumnReader::processFilter( - common::Filter* filter, - RowSet rows, - ExtractValues extractValues) { - if (filter == nullptr) { - readHelper( - &dwio::common::alwaysTrue(), rows, extractValues); - return; - } - - switch (filter->kind()) { - case common::FilterKind::kAlwaysTrue: - readHelper(filter, rows, extractValues); - break; - case common::FilterKind::kIsNull: - filterNulls( - rows, - true, - !std::is_same_v); - break; - case common::FilterKind::kIsNotNull: - if (std::is_same_v) { - filterNulls(rows, false, false); - } else { - readHelper(filter, rows, extractValues); - } - break; - case common::FilterKind::kBytesRange: - readHelper(filter, rows, extractValues); - break; - case common::FilterKind::kNegatedBytesRange: - readHelper( - filter, rows, extractValues); - break; - case common::FilterKind::kBytesValues: - readHelper(filter, rows, extractValues); - break; - case common::FilterKind::kNegatedBytesValues: - readHelper( - filter, rows, extractValues); - break; - default: - readHelper(filter, rows, extractValues); - break; - } -} - void SelectiveStringDirectColumnReader::read( vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) { prepareRead(offset, rows, incomingNulls); - bool isDense = rows.back() == rows.size() - 1; - auto numRows = rows.back() + 1; auto numNulls = nullsInReadRange_ ? 
BaseVector::countNulls(nullsInReadRange_, 0, numRows) @@ -542,38 +478,8 @@ void SelectiveStringDirectColumnReader::read( lengths_->asMutable(), numRows - numNulls); rawLengths_ = lengths_->as(); lengthIndex_ = 0; - if (scanSpec_->keepValues()) { - if (scanSpec_->valueHook()) { - if (isDense) { - readHelper( - &dwio::common::alwaysTrue(), - rows, - dwio::common::ExtractToGenericHook(scanSpec_->valueHook())); - } else { - readHelper( - &dwio::common::alwaysTrue(), - rows, - dwio::common::ExtractToGenericHook(scanSpec_->valueHook())); - } - } else { - if (isDense) { - processFilter( - scanSpec_->filter(), rows, dwio::common::ExtractToReader(this)); - } else { - processFilter( - scanSpec_->filter(), rows, dwio::common::ExtractToReader(this)); - } - } - } else { - if (isDense) { - processFilter( - scanSpec_->filter(), rows, dwio::common::DropValues()); - } else { - processFilter( - scanSpec_->filter(), rows, dwio::common::DropValues()); - } - } - + dwio::common::StringColumnReadWithVisitorHelper( + *this, rows)([&](auto visitor) { readWithVisitor(rows, visitor); }); readOffset_ += numRows; } diff --git a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h index 21fe4a3a25e53..cfa8a7350136b 100644 --- a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h @@ -66,15 +66,6 @@ class SelectiveStringDirectColumnReader template void readWithVisitor(RowSet rows, TVisitor visitor); - template - void readHelper(common::Filter* filter, RowSet rows, ExtractValues values); - - template - void processFilter( - common::Filter* filter, - RowSet rows, - ExtractValues extractValues); - void extractCrossBuffers( const int32_t* lengths, const int32_t* starts, diff --git a/velox/dwio/dwrf/test/CacheInputTest.cpp b/velox/dwio/dwrf/test/CacheInputTest.cpp index 7830de27502d1..dd15964972e2c 100644 --- a/velox/dwio/dwrf/test/CacheInputTest.cpp +++ b/velox/dwio/dwrf/test/CacheInputTest.cpp @@ -201,8 +201,15 @@ class CacheTest : public testing::Test { (1 << 20) - 11, (streamStarts_[streamIndex + 1] - streamStarts_[streamIndex]) / 2)}; - data->streams.push_back( - data->input->enqueue(region, streamIds_[streamIndex].get())); + auto stream = data->input->enqueue(region, streamIds_[streamIndex].get()); + if (cache_->ssdCache()) { + auto name = static_cast(*stream).getName(); + EXPECT_TRUE( + name.find("ssdFile=" + cache_->ssdCache()->filePrefix()) != + name.npos) + << name; + } + data->streams.push_back(std::move(stream)); data->regions.push_back(region); } return data; @@ -424,6 +431,7 @@ TEST_F(CacheTest, window) { auto stream = input->read(begin, end - begin, LogType::TEST); auto cacheInput = dynamic_cast(stream.get()); EXPECT_TRUE(cacheInput != nullptr); + ASSERT_EQ(cacheInput->getName(), "CacheInputStream 0 of 13631488"); auto maxSize = allocator_->sizeClasses().back() * memory::AllocationTraits::kPageSize; const void* buffer; @@ -501,8 +509,6 @@ TEST_F(CacheTest, ssd) { readFiles( "prefix1_", 0, kSsdBytes / bytesPerFile, 30, 100, 1, kStripesPerFile, 4); - LOG(INFO) << cache_->toString(); - waitForWrite(); cache_->clear(); // Read double this to get some eviction from SSD. @@ -523,7 +529,6 @@ TEST_F(CacheTest, ssd) { // issued. Also, the head of each file does not get prefetched // because each file has its own tracker. 
EXPECT_LE(kSsdBytes / 8, ioStats_->prefetch().sum()); - LOG(INFO) << cache_->toString(); readFiles( "prefix1_", @@ -534,7 +539,6 @@ TEST_F(CacheTest, ssd) { 1, kStripesPerFile, 4); - LOG(INFO) << cache_->toString(); } TEST_F(CacheTest, singleFileThreads) { diff --git a/velox/dwio/parquet/reader/BooleanColumnReader.h b/velox/dwio/parquet/reader/BooleanColumnReader.h index 41d3405abd548..73126f4679888 100644 --- a/velox/dwio/parquet/reader/BooleanColumnReader.h +++ b/velox/dwio/parquet/reader/BooleanColumnReader.h @@ -49,7 +49,7 @@ class BooleanColumnReader : public dwio::common::SelectiveByteRleColumnReader { void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) override { - readCommon(offset, rows, incomingNulls); + readCommon(offset, rows, incomingNulls); readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/parquet/reader/FloatingPointColumnReader.h b/velox/dwio/parquet/reader/FloatingPointColumnReader.h index be4c2cd843631..ed91e67a739ff 100644 --- a/velox/dwio/parquet/reader/FloatingPointColumnReader.h +++ b/velox/dwio/parquet/reader/FloatingPointColumnReader.h @@ -48,7 +48,7 @@ class FloatingPointColumnReader void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) override { using T = FloatingPointColumnReader; - this->template readCommon(offset, rows, incomingNulls); + this->template readCommon(offset, rows, incomingNulls); this->readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/parquet/reader/IntegerColumnReader.h b/velox/dwio/parquet/reader/IntegerColumnReader.h index 59a9fc12bf919..d7b458c739534 100644 --- a/velox/dwio/parquet/reader/IntegerColumnReader.h +++ b/velox/dwio/parquet/reader/IntegerColumnReader.h @@ -75,7 +75,7 @@ class IntegerColumnReader : public dwio::common::SelectiveIntegerColumnReader { offset, rows, nullptr); - readCommon(rows); + readCommon(rows); readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/parquet/reader/StringColumnReader.cpp b/velox/dwio/parquet/reader/StringColumnReader.cpp index 334c3c02a7e20..2dd0250159c3e 100644 --- a/velox/dwio/parquet/reader/StringColumnReader.cpp +++ b/velox/dwio/parquet/reader/StringColumnReader.cpp @@ -31,104 +31,15 @@ uint64_t StringColumnReader::skip(uint64_t numValues) { return numValues; } -template -void StringColumnReader::readHelper( - common::Filter* filter, - RowSet rows, - ExtractValues extractValues) { - formatData_->as().readWithVisitor( - dwio::common:: - ColumnVisitor( - *reinterpret_cast(filter), this, rows, extractValues)); -} - -template -void StringColumnReader::processFilter( - common::Filter* filter, - RowSet rows, - ExtractValues extractValues) { - if (filter == nullptr) { - readHelper( - &dwio::common::alwaysTrue(), rows, extractValues); - return; - } - - switch (filter->kind()) { - case common::FilterKind::kAlwaysTrue: - readHelper(filter, rows, extractValues); - break; - case common::FilterKind::kIsNull: - filterNulls( - rows, - true, - !std::is_same:: - value); - break; - case common::FilterKind::kIsNotNull: - if (std::is_same:: - value) { - filterNulls(rows, false, false); - } else { - readHelper(filter, rows, extractValues); - } - break; - case common::FilterKind::kBytesRange: - readHelper(filter, rows, extractValues); - break; - case common::FilterKind::kNegatedBytesRange: - readHelper( - filter, rows, extractValues); - break; - case common::FilterKind::kBytesValues: - readHelper(filter, rows, extractValues); - break; - case common::FilterKind::kNegatedBytesValues: - readHelper( - filter, rows, extractValues); - break; - default: - 
readHelper(filter, rows, extractValues); - break; - } -} - void StringColumnReader::read( vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) { prepareRead(offset, rows, incomingNulls); - bool isDense = rows.back() == rows.size() - 1; - if (scanSpec_->keepValues()) { - if (scanSpec_->valueHook()) { - if (isDense) { - readHelper( - &dwio::common::alwaysTrue(), - rows, - dwio::common::ExtractToGenericHook(scanSpec_->valueHook())); - } else { - readHelper( - &dwio::common::alwaysTrue(), - rows, - dwio::common::ExtractToGenericHook(scanSpec_->valueHook())); - } - return; - } - if (isDense) { - processFilter( - scanSpec_->filter(), rows, dwio::common::ExtractToReader(this)); - } else { - processFilter( - scanSpec_->filter(), rows, dwio::common::ExtractToReader(this)); - } - } else { - if (isDense) { - processFilter( - scanSpec_->filter(), rows, dwio::common::DropValues()); - } else { - processFilter( - scanSpec_->filter(), rows, dwio::common::DropValues()); - } - } + dwio::common::StringColumnReadWithVisitorHelper( + *this, rows)([&](auto visitor) { + formatData_->as().readWithVisitor(visitor); + }); readOffset_ += rows.back() + 1; } diff --git a/velox/dwio/parquet/reader/StringColumnReader.h b/velox/dwio/parquet/reader/StringColumnReader.h index 23269fda84462..e9bedc2365fc8 100644 --- a/velox/dwio/parquet/reader/StringColumnReader.h +++ b/velox/dwio/parquet/reader/StringColumnReader.h @@ -49,27 +49,6 @@ class StringColumnReader : public dwio::common::SelectiveColumnReader { void getValues(RowSet rows, VectorPtr* result) override; void dedictionarize() override; - - private: - template - void skipInDecode(int32_t numValues, int32_t current, const uint64_t* nulls); - - folly::StringPiece readValue(int32_t length); - - template - void decode(const uint64_t* nulls, Visitor visitor); - - template - void readWithVisitor(RowSet rows, TVisitor visitor); - - template - void readHelper(common::Filter* filter, RowSet rows, ExtractValues values); - - template - void processFilter( - common::Filter* filter, - RowSet rows, - ExtractValues extractValues); }; } // namespace facebook::velox::parquet diff --git a/velox/exec/Driver.cpp b/velox/exec/Driver.cpp index f6e466cb5a5ab..fa732e12cba09 100644 --- a/velox/exec/Driver.cpp +++ b/velox/exec/Driver.cpp @@ -110,6 +110,19 @@ inline void checkIsBlockFutureValid( op->operatorType()); } +// Used to generate context for exceptions that are thrown while executing an +// operator. 
Eg output: 'Operator: FilterProject(1) PlanNodeId: 1 TaskId: +// test_cursor 1 PipelineId: 0 DriverId: 0 OperatorAddress: 0x61a000003c80' +std::string addContextOnException( + VeloxException::Type exceptionType, + void* arg) { + if (exceptionType != VeloxException::Type::kSystem) { + return ""; + } + auto* op = static_cast(arg); + return fmt::format("Operator: {}", op->toString()); +} + } // namespace DriverCtx::DriverCtx( @@ -293,11 +306,12 @@ void Driver::initializeOperators() { } void Driver::pushdownFilters(int operatorIndex) { - auto op = operators_[operatorIndex].get(); + auto* op = operators_[operatorIndex].get(); const auto& filters = op->getDynamicFilters(); if (filters.empty()) { return; } + const auto& planNodeId = op->planNodeId(); op->addRuntimeStat("dynamicFiltersProduced", RuntimeCounter(filters.size())); @@ -313,7 +327,7 @@ void Driver::pushdownFilters(int operatorIndex) { prevOp->canAddDynamicFilter(), "Cannot push down dynamic filters produced by {}", op->toString()); - prevOp->addDynamicFilter(channel, entry.second); + prevOp->addDynamicFilter(planNodeId, channel, entry.second); prevOp->addRuntimeStat("dynamicFiltersAccepted", RuntimeCounter(1)); break; } @@ -327,7 +341,7 @@ void Driver::pushdownFilters(int operatorIndex) { prevOp->canAddDynamicFilter(), "Cannot push down dynamic filters produced by {}", op->toString()); - prevOp->addDynamicFilter(channel, entry.second); + prevOp->addDynamicFilter(planNodeId, channel, entry.second); prevOp->addRuntimeStat("dynamicFiltersAccepted", RuntimeCounter(1)); break; } @@ -374,10 +388,12 @@ void Driver::enqueueInternal() { RuntimeStatWriterScopeGuard statsWriterGuard(operatorPtr); \ threadNumVeloxThrow() = 0; \ opCallStatus_.start(operatorId, operatorMethod); \ + ExceptionContextSetter exceptionContext( \ + {addContextOnException, operatorPtr, true}); \ auto stopGuard = folly::makeGuard([&]() { opCallStatus_.stop(); }); \ call; \ recordSilentThrows(*operatorPtr); \ - } catch (const VeloxException& e) { \ + } catch (const VeloxException&) { \ throw; \ } catch (const std::exception& e) { \ VELOX_FAIL( \ diff --git a/velox/exec/ExchangeClient.cpp b/velox/exec/ExchangeClient.cpp index 0ca18e1fcf700..65eab7842e675 100644 --- a/velox/exec/ExchangeClient.cpp +++ b/velox/exec/ExchangeClient.cpp @@ -117,6 +117,11 @@ ExchangeClient::next(uint32_t maxBytes, bool* atEnd, ContinueFuture* future) { std::vector> pages; { std::lock_guard l(queue_->mutex()); + if (closed_) { + *atEnd = true; + return pages; + } + *atEnd = false; pages = queue_->dequeueLocked(maxBytes, atEnd, future); if (*atEnd) { diff --git a/velox/exec/HashProbe.cpp b/velox/exec/HashProbe.cpp index 318ff7b1df2da..3ff7e6c5d2e53 100644 --- a/velox/exec/HashProbe.cpp +++ b/velox/exec/HashProbe.cpp @@ -478,6 +478,7 @@ void HashProbe::prepareInputIndicesBuffers( VELOX_DCHECK(spillEnabled()); const auto maxIndicesBufferBytes = numInput * sizeof(vector_size_t); if (nonSpillInputIndicesBuffer_ == nullptr || + !nonSpillInputIndicesBuffer_->isMutable() || nonSpillInputIndicesBuffer_->size() < maxIndicesBufferBytes) { nonSpillInputIndicesBuffer_ = allocateIndices(numInput, pool()); rawNonSpillInputIndicesBuffer_ = @@ -1052,7 +1053,7 @@ bool HashProbe::maybeReadSpillOutput() { return true; } -void HashProbe::fillFilterInput(vector_size_t size) { +RowVectorPtr HashProbe::createFilterInput(vector_size_t size) { std::vector filterColumns(filterInputType_->size()); for (auto projection : filterInputProjections_) { ensureLoadedIfNotAtEnd(projection.inputChannel); @@ -1068,11 +1069,12 @@ void 
HashProbe::fillFilterInput(vector_size_t size) { filterInputType_->children(), filterColumns); - filterInput_ = std::make_shared( + return std::make_shared( pool(), filterInputType_, nullptr, size, std::move(filterColumns)); } void HashProbe::prepareFilterRowsForNullAwareJoin( + RowVectorPtr& filterInput, vector_size_t numRows, bool filterPropagateNulls) { VELOX_CHECK_LE(numRows, kBatchSize); @@ -1086,7 +1088,7 @@ void HashProbe::prepareFilterRowsForNullAwareJoin( auto* rawNullRows = nullFilterInputRows_.asMutableRange().bits(); for (auto& projection : filterInputProjections_) { filterInputColumnDecodedVector_.decode( - *filterInput_->childAt(projection.outputChannel), filterInputRows_); + *filterInput->childAt(projection.outputChannel), filterInputRows_); if (filterInputColumnDecodedVector_.mayHaveNulls()) { SelectivityVector nullsInActiveRows(numRows); memcpy( @@ -1285,13 +1287,14 @@ int32_t HashProbe::evalFilter(int32_t numRows) { filterInputRows_.updateBounds(); } - fillFilterInput(numRows); + RowVectorPtr filterInput = createFilterInput(numRows); if (nullAware_) { - prepareFilterRowsForNullAwareJoin(numRows, filterPropagateNulls); + prepareFilterRowsForNullAwareJoin( + filterInput, numRows, filterPropagateNulls); } - EvalCtx evalCtx(operatorCtx_->execCtx(), filter_.get(), filterInput_.get()); + EvalCtx evalCtx(operatorCtx_->execCtx(), filter_.get(), filterInput.get()); filter_->eval(0, 1, true, filterInputRows_, evalCtx, filterResult_); decodedFilterResult_.decode(*filterResult_[0], filterInputRows_); @@ -1668,7 +1671,7 @@ void HashProbe::spillOutput(const std::vector& operators) { // this runs. try { spillTask->move(); - } catch (const std::exception& e) { + } catch (const std::exception&) { } } }); @@ -1770,7 +1773,7 @@ SpillPartitionSet HashProbe::spillTable() { // this runs. try { spillTask->move(); - } catch (const std::exception& e) { + } catch (const std::exception&) { } } }); diff --git a/velox/exec/HashProbe.h b/velox/exec/HashProbe.h index 997847fbb057d..3b9af4fd3d8e8 100644 --- a/velox/exec/HashProbe.h +++ b/velox/exec/HashProbe.h @@ -128,14 +128,17 @@ class HashProbe : public Operator { decodedFilterResult_.valueAt(row); } - // Populate filter input columns. - void fillFilterInput(vector_size_t size); + // Create a temporary input vector to be passed to the filter. This ensures it + // gets destroyed in case it's wrapping an unloaded vector which eventually + // needs to be wrapped in fillOutput(). + RowVectorPtr createFilterInput(vector_size_t size); // Prepare filter row selectivity for null-aware join. 'numRows' // specifies the number of rows in 'filterInputRows_' to process. If // 'filterPropagateNulls' is true, the probe input row which has null in any // probe filter column can't pass the filter. void prepareFilterRowsForNullAwareJoin( + RowVectorPtr& filterInput, vector_size_t numRows, bool filterPropagateNulls); @@ -372,7 +375,7 @@ class HashProbe : public Operator { // side. Used by right semi project join. bool probeSideHasNullKeys_{false}; - // Rows in 'filterInput_' to apply 'filter_' to. + // Rows in the filter columns to apply 'filter_' to. SelectivityVector filterInputRows_; // Join filter. @@ -390,11 +393,6 @@ class HashProbe : public Operator { // Maps from column index in hash table to channel in 'filterInputType_'. std::vector filterTableProjections_; - // Temporary projection from probe and build for evaluating - // 'filter_'. This can always be reused since this does not escape - // this operator. 
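The motivation for the createFilterInput() change in this hunk, as the new header comment says, is lifetime: the filter input may wrap lazy (unloaded) vectors, and keeping it in the filterInput_ member pinned them until the next batch. Returning a local RowVectorPtr scopes them to one evalFilter() call. A minimal sketch of the pattern, with simplified, hypothetical names; the constructor arguments mirror the hunk above:

#include <memory>
#include <vector>
#include "velox/vector/ComplexVector.h"

using namespace facebook::velox;

RowVectorPtr makeTemporaryFilterInput(
    memory::MemoryPool* pool,
    const RowTypePtr& type,
    vector_size_t size,
    std::vector<VectorPtr> children) {
  // Returned by value: once the caller's local goes out of scope, any lazy
  // children wrapped here can be released promptly instead of lingering in
  // an operator member until the next batch overwrites it.
  return std::make_shared<RowVector>(
      pool, type, /*nulls=*/nullptr, size, std::move(children));
}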
- RowVectorPtr filterInput_; - // The following six fields are used in null-aware anti join filter // processing. diff --git a/velox/exec/Operator.cpp b/velox/exec/Operator.cpp index 45c83098b561c..18483e565414a 100644 --- a/velox/exec/Operator.cpp +++ b/velox/exec/Operator.cpp @@ -395,9 +395,10 @@ std::string Operator::toString() const { std::stringstream out; if (auto task = operatorCtx_->task()) { auto driverCtx = operatorCtx_->driverCtx(); - out << operatorType() << "(" << operatorId() << ")<" << task->taskId() - << ":" << driverCtx->pipelineId << "." << driverCtx->driverId << " " - << this; + out << operatorType() << "(" << operatorId() << ")" + << " PlanNodeId: " << planNodeId() << " TaskId: " << task->taskId() + << " PipelineId: " << driverCtx->pipelineId + << " DriverId: " << driverCtx->driverId << " OperatorAddress: " << this; } else { out << ""; } @@ -504,6 +505,8 @@ void OperatorStats::add(const OperatorStats& other) { spilledFiles += other.spilledFiles; numNullKeys += other.numNullKeys; + + dynamicFilterStats.add(other.dynamicFilterStats); } void OperatorStats::clear() { @@ -537,6 +540,8 @@ void OperatorStats::clear() { spilledRows = 0; spilledPartitions = 0; spilledFiles = 0; + + dynamicFilterStats.clear(); } std::unique_ptr Operator::MemoryReclaimer::create( diff --git a/velox/exec/Operator.h b/velox/exec/Operator.h index c48862c60b63a..3519b2f2d6fbe 100644 --- a/velox/exec/Operator.h +++ b/velox/exec/Operator.h @@ -82,6 +82,26 @@ struct MemoryStats { } }; +/// Records the dynamic filter stats of an operator. +struct DynamicFilterStats { + /// The set of plan node ids that produce the dynamic filter added to an + /// operator. If it is empty, then there is no dynamic filter added. + std::unordered_set producerNodeIds; + + void clear() { + producerNodeIds.clear(); + } + + void add(const DynamicFilterStats& other) { + producerNodeIds.insert( + other.producerNodeIds.begin(), other.producerNodeIds.end()); + } + + bool empty() const { + return producerNodeIds.empty(); + } +}; + struct OperatorStats { /// Initial ordinal position in the operator's pipeline. int32_t operatorId = 0; @@ -106,6 +126,9 @@ struct OperatorStats { uint64_t inputBytes = 0; uint64_t inputPositions = 0; + /// Contains the dynamic filters stats if applied. + DynamicFilterStats dynamicFilterStats; + /// Number of input batches / vectors. Allows to compute an average batch /// size. uint64_t inputVectors = 0; @@ -161,7 +184,7 @@ struct OperatorStats { int numDrivers = 0; - OperatorStats() {} + OperatorStats() = default; OperatorStats( int32_t _operatorId, @@ -422,6 +445,7 @@ class Operator : public BaseRuntimeStatWriter { /// Adds a filter dynamically generated by a downstream operator. Called only /// if canAddFilter() returns true. 
virtual void addDynamicFilter( + const core::PlanNodeId& /*producer*/, column_index_t /*outputChannel*/, const std::shared_ptr& /*filter*/) { VELOX_UNSUPPORTED( diff --git a/velox/exec/PlanNodeStats.cpp b/velox/exec/PlanNodeStats.cpp index c536f31b00b32..caab8f7648728 100644 --- a/velox/exec/PlanNodeStats.cpp +++ b/velox/exec/PlanNodeStats.cpp @@ -39,6 +39,8 @@ void PlanNodeStats::addTotals(const OperatorStats& stats) { rawInputRows += stats.rawInputPositions; rawInputBytes += stats.rawInputBytes; + dynamicFilterStats.add(stats.dynamicFilterStats); + outputRows += stats.outputPositions; outputBytes += stats.outputBytes; outputVectors += stats.outputVectors; @@ -112,6 +114,11 @@ std::string PlanNodeStats::toString(bool includeInputStats) const { << succinctBytes(spilledBytes) << ", " << spilledFiles << " files)"; } + if (!dynamicFilterStats.empty()) { + out << ", DynamicFilter producer plan nodes: " + << folly::join(',', dynamicFilterStats.producerNodeIds); + } + return out.str(); } diff --git a/velox/exec/PlanNodeStats.h b/velox/exec/PlanNodeStats.h index a53a7fa3ea007..4d10b9d608759 100644 --- a/velox/exec/PlanNodeStats.h +++ b/velox/exec/PlanNodeStats.h @@ -62,6 +62,9 @@ struct PlanNodeStats { /// Sum of raw input bytes for all corresponding operators. uint64_t rawInputBytes{0}; + /// Contains the dynamic filters stats if applied. + DynamicFilterStats dynamicFilterStats; + /// Sum of output rows for all corresponding operators. When /// plan node corresponds to multiple operator types, operators of only one of /// these types report non-zero output rows. diff --git a/velox/exec/TableScan.cpp b/velox/exec/TableScan.cpp index 1b4674cf50018..f1c24d97e6c48 100644 --- a/velox/exec/TableScan.cpp +++ b/velox/exec/TableScan.cpp @@ -356,12 +356,14 @@ bool TableScan::isFinished() { } void TableScan::addDynamicFilter( + const core::PlanNodeId& producer, column_index_t outputChannel, const std::shared_ptr& filter) { if (dataSource_) { dataSource_->addDynamicFilter(outputChannel, filter); } dynamicFilters_.emplace(outputChannel, filter); + stats_.wlock()->dynamicFilterStats.producerNodeIds.emplace(producer); } } // namespace facebook::velox::exec diff --git a/velox/exec/TableScan.h b/velox/exec/TableScan.h index 973821a3a45c8..516e377516ba3 100644 --- a/velox/exec/TableScan.h +++ b/velox/exec/TableScan.h @@ -46,6 +46,7 @@ class TableScan : public SourceOperator { } void addDynamicFilter( + const core::PlanNodeId& producer, column_index_t outputChannel, const std::shared_ptr& filter) override; diff --git a/velox/exec/Task.cpp b/velox/exec/Task.cpp index c0ac2f53283ce..01fdd52266bff 100644 --- a/velox/exec/Task.cpp +++ b/velox/exec/Task.cpp @@ -259,6 +259,7 @@ std::shared_ptr Task::create( std::move(onError)); } +// static std::shared_ptr Task::create( const std::string& taskId, core::PlanFragment planFragment, @@ -279,42 +280,6 @@ std::shared_ptr Task::create( return task; } -std::shared_ptr Task::create( - const std::string& taskId, - core::PlanFragment planFragment, - int destination, - std::shared_ptr queryCtx, - Consumer consumer, - std::function onError) { - return Task::create( - taskId, - std::move(planFragment), - destination, - std::move(queryCtx), - (consumer ? 
[c = std::move(consumer)]() { return c; } - : ConsumerSupplier{}), - std::move(onError)); -} - -std::shared_ptr Task::create( - const std::string& taskId, - core::PlanFragment planFragment, - int destination, - std::shared_ptr queryCtx, - ConsumerSupplier consumerSupplier, - std::function onError) { - auto task = std::shared_ptr(new Task( - taskId, - std::move(planFragment), - destination, - std::move(queryCtx), - Task::ExecutionMode::kParallel, - std::move(consumerSupplier), - std::move(onError))); - task->initTaskPool(); - return task; -} - Task::Task( const std::string& taskId, core::PlanFragment planFragment, diff --git a/velox/exec/Task.h b/velox/exec/Task.h index 09716455d18a9..3945103a0f326 100644 --- a/velox/exec/Task.h +++ b/velox/exec/Task.h @@ -84,24 +84,6 @@ class Task : public std::enable_shared_from_this { ConsumerSupplier consumerSupplier, std::function onError = nullptr); - /// TODO: Delete following two overloads once all callers are migrated to the - /// above ones - static std::shared_ptr create( - const std::string& taskId, - core::PlanFragment planFragment, - int destination, - std::shared_ptr queryCtx, - Consumer consumer = nullptr, - std::function onError = nullptr); - - static std::shared_ptr create( - const std::string& taskId, - core::PlanFragment planFragment, - int destination, - std::shared_ptr queryCtx, - ConsumerSupplier consumerSupplier, - std::function onError = nullptr); - ~Task(); /// Specify directory to which data will be spilled if spilling is enabled and diff --git a/velox/exec/benchmarks/HashJoinListResultBenchmark.cpp b/velox/exec/benchmarks/HashJoinListResultBenchmark.cpp index ce030bff359f4..0f2f9c0d76af0 100644 --- a/velox/exec/benchmarks/HashJoinListResultBenchmark.cpp +++ b/velox/exec/benchmarks/HashJoinListResultBenchmark.cpp @@ -197,6 +197,9 @@ struct HashTableBenchmarkResult { class HashTableListJoinResultBenchmark : public VectorTestBase { public: + HashTableListJoinResultBenchmark() + : randomEngine_((std::random_device{}())) {} + HashTableBenchmarkResult run(HashTableBenchmarkParams params) { params_ = params; HashTableBenchmarkResult result; @@ -260,7 +263,8 @@ class HashTableListJoinResultBenchmark : public VectorTestBase { if (addExtraValue) { data[0] = params_.extraValue; } - std::random_shuffle(data.begin(), data.end()); + + std::shuffle(data.begin(), data.end(), randomEngine_); std::vector children; children.push_back(makeFlatVector(data)); for (int32_t i = 0; i < params_.numDependentFields; ++i) { @@ -462,6 +466,7 @@ class HashTableListJoinResultBenchmark : public VectorTestBase { eraseTime_ += eraseClock.timeToDropValue(); } + std::default_random_engine randomEngine_; std::unique_ptr> topTable_; HashTableBenchmarkParams params_; diff --git a/velox/exec/fuzzer/AggregationFuzzer.cpp b/velox/exec/fuzzer/AggregationFuzzer.cpp index e6d15548e62c3..cde7098bc1250 100644 --- a/velox/exec/fuzzer/AggregationFuzzer.cpp +++ b/velox/exec/fuzzer/AggregationFuzzer.cpp @@ -25,7 +25,7 @@ #include "velox/exec/PartitionFunction.h" #include "velox/exec/fuzzer/AggregationFuzzerBase.h" -#include "velox/expression/tests/utils/FuzzerToolkit.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/vector/VectorSaver.h" #include "velox/vector/fuzzer/VectorFuzzer.h" @@ -39,8 +39,8 @@ DEFINE_bool( false, "When true, the results of the window aggregation are compared to reference DB results"); -using facebook::velox::test::CallableSignature; -using facebook::velox::test::SignatureTemplate; +using facebook::velox::fuzzer::CallableSignature; 
+using facebook::velox::fuzzer::SignatureTemplate; namespace facebook::velox::exec::test { @@ -153,7 +153,7 @@ class AggregationFuzzer : public AggregationFuzzerBase { const std::vector& plans, bool customVerification, const std::vector>& customVerifiers, - const velox::test::ResultOrError& expected, + const velox::fuzzer::ResultOrError& expected, int32_t maxDrivers = 2, bool testWithSpilling = true) { for (auto i = 0; i < plans.size(); ++i) { @@ -1074,7 +1074,7 @@ bool AggregationFuzzer::compareEquivalentPlanResults( stats_.updateReferenceQueryStats(referenceResult.second); if (referenceResult.first) { - velox::test::ResultOrError expected; + velox::fuzzer::ResultOrError expected; expected.result = mergeRowVectors(referenceResult.first.value(), pool_.get()); diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.cpp b/velox/exec/fuzzer/AggregationFuzzerBase.cpp index 462ecf9932ba8..6f6e42dd7bb69 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.cpp +++ b/velox/exec/fuzzer/AggregationFuzzerBase.cpp @@ -25,7 +25,7 @@ #include "velox/exec/fuzzer/PrestoQueryRunner.h" #include "velox/exec/tests/utils/TempDirectoryPath.h" #include "velox/expression/SignatureBinder.h" -#include "velox/expression/tests/utils/ArgumentTypeFuzzer.h" +#include "velox/expression/fuzzer/ArgumentTypeFuzzer.h" #include "velox/vector/VectorSaver.h" #include "velox/vector/fuzzer/VectorFuzzer.h" @@ -198,7 +198,7 @@ AggregationFuzzerBase::pickSignature() { const auto& signatureTemplate = signatureTemplates_[idx - signatures_.size()]; signature.name = signatureTemplate.name; - velox::test::ArgumentTypeFuzzer typeFuzzer( + velox::fuzzer::ArgumentTypeFuzzer typeFuzzer( *signatureTemplate.signature, rng_); VELOX_CHECK(typeFuzzer.fuzzArgumentTypes(FLAGS_max_num_varargs)); signature.args = typeFuzzer.argumentTypes(); @@ -386,7 +386,7 @@ void AggregationFuzzerBase::printSignatureStats() { } } -velox::test::ResultOrError AggregationFuzzerBase::execute( +velox::fuzzer::ResultOrError AggregationFuzzerBase::execute( const core::PlanNodePtr& plan, const std::vector& splits, bool injectSpill, @@ -395,7 +395,7 @@ velox::test::ResultOrError AggregationFuzzerBase::execute( LOG(INFO) << "Executing query plan: " << std::endl << plan->toString(true, true); - velox::test::ResultOrError resultOrError; + velox::fuzzer::ResultOrError resultOrError; try { std::shared_ptr spillDirectory; AssertQueryBuilder builder(plan); @@ -511,7 +511,7 @@ void AggregationFuzzerBase::testPlan( bool abandonPartial, bool customVerification, const std::vector>& customVerifiers, - const velox::test::ResultOrError& expected, + const velox::fuzzer::ResultOrError& expected, int32_t maxDrivers) { auto actual = execute( planWithSplits.plan, @@ -523,10 +523,10 @@ void AggregationFuzzerBase::testPlan( } void AggregationFuzzerBase::compare( - const velox::test::ResultOrError& actual, + const velox::fuzzer::ResultOrError& actual, bool customVerification, const std::vector>& customVerifiers, - const velox::test::ResultOrError& expected) { + const velox::fuzzer::ResultOrError& expected) { // Compare results or exceptions (if any). Fail if anything is different. if (FLAGS_enable_oom_injection) { // If OOM injection is enabled and we've made it this far and the test @@ -537,7 +537,8 @@ void AggregationFuzzerBase::compare( // Compare results or exceptions (if any). Fail if anything is different. if (expected.exceptionPtr || actual.exceptionPtr) { // Throws in case exceptions are not compatible. 
- velox::test::compareExceptions(expected.exceptionPtr, actual.exceptionPtr); + velox::fuzzer::compareExceptions( + expected.exceptionPtr, actual.exceptionPtr); return; } diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.h b/velox/exec/fuzzer/AggregationFuzzerBase.h index 9b9c0530dcaa1..f2f9b6fb632fd 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.h +++ b/velox/exec/fuzzer/AggregationFuzzerBase.h @@ -23,7 +23,7 @@ #include "velox/exec/fuzzer/ReferenceQueryRunner.h" #include "velox/exec/fuzzer/ResultVerifier.h" #include "velox/exec/tests/utils/AssertQueryBuilder.h" -#include "velox/expression/tests/utils/FuzzerToolkit.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/vector/fuzzer/VectorFuzzer.h" #include "velox/vector/tests/utils/VectorMaker.h" @@ -47,8 +47,8 @@ DECLARE_bool(log_signature_stats); namespace facebook::velox::exec::test { -using facebook::velox::test::CallableSignature; -using facebook::velox::test::SignatureTemplate; +using facebook::velox::fuzzer::CallableSignature; +using facebook::velox::fuzzer::SignatureTemplate; constexpr const std::string_view kPlanNodeFileName = "plan_nodes"; @@ -219,7 +219,7 @@ class AggregationFuzzerBase { const core::PlanNodePtr& plan, const std::vector& input); - velox::test::ResultOrError execute( + velox::fuzzer::ResultOrError execute( const core::PlanNodePtr& plan, const std::vector& splits = {}, bool injectSpill = false, @@ -236,10 +236,10 @@ class AggregationFuzzerBase { const std::vector& input); void compare( - const velox::test::ResultOrError& actual, + const velox::fuzzer::ResultOrError& actual, bool customVerification, const std::vector>& customVerifiers, - const velox::test::ResultOrError& expected); + const velox::fuzzer::ResultOrError& expected); /// Returns false if the type or its children are unsupported. /// Currently returns false if type is Date,IntervalDayTime or Unknown. 
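The velox::test to velox::fuzzer renames across these fuzzer files are a pure namespace move of the fuzzer toolkit (CallableSignature, SignatureTemplate, ResultOrError, compareExceptions, ArgumentTypeFuzzer); behavior is unchanged. The comparison contract stays as before; roughly, and assuming compareExceptions throws on incompatible exceptions while assertEqualResults returns true on equal result sets, a condensed, hypothetical restatement of compare() looks like:

// Sketch only; not the actual AggregationFuzzerBase::compare() body.
void compareOutcome(
    const facebook::velox::fuzzer::ResultOrError& expected,
    const facebook::velox::fuzzer::ResultOrError& actual) {
  if (expected.exceptionPtr || actual.exceptionPtr) {
    // Throws if the two exceptions are not compatible.
    facebook::velox::fuzzer::compareExceptions(
        expected.exceptionPtr, actual.exceptionPtr);
    return;
  }
  // Neither side threw: the result vectors must match row-for-row.
  VELOX_CHECK(facebook::velox::exec::test::assertEqualResults(
      {expected.result}, {actual.result}));
}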
@@ -258,7 +258,7 @@ class AggregationFuzzerBase { bool abandonPartial, bool customVerification, const std::vector>& customVerifiers, - const velox::test::ResultOrError& expected, + const velox::fuzzer::ResultOrError& expected, int32_t maxDrivers = 2); void printSignatureStats(); diff --git a/velox/exec/fuzzer/AggregationFuzzerRunner.h b/velox/exec/fuzzer/AggregationFuzzerRunner.h index 0697576551992..f82a2120189cb 100644 --- a/velox/exec/fuzzer/AggregationFuzzerRunner.h +++ b/velox/exec/fuzzer/AggregationFuzzerRunner.h @@ -94,7 +94,7 @@ class AggregationFuzzerRunner { exit(1); } - auto filteredSignatures = velox::test::filterSignatures( + auto filteredSignatures = velox::fuzzer::filterSignatures( signatures, options.onlyFunctions, options.skipFunctions); if (filteredSignatures.empty()) { LOG(ERROR) diff --git a/velox/exec/fuzzer/CMakeLists.txt b/velox/exec/fuzzer/CMakeLists.txt index 96ca34cb9cc4e..a169100a6bdf1 100644 --- a/velox/exec/fuzzer/CMakeLists.txt +++ b/velox/exec/fuzzer/CMakeLists.txt @@ -57,3 +57,9 @@ target_link_libraries( velox_expression_test_utility velox_aggregation_fuzzer_base velox_temp_path) + +add_library(velox_row_number_fuzzer RowNumberFuzzer.cpp) + +target_link_libraries( + velox_row_number_fuzzer velox_fuzzer_util velox_type velox_vector_fuzzer + velox_exec_test_lib velox_expression_test_utility) diff --git a/velox/exec/fuzzer/DuckQueryRunner.cpp b/velox/exec/fuzzer/DuckQueryRunner.cpp index d926addfd9211..e19b1d33a7b08 100644 --- a/velox/exec/fuzzer/DuckQueryRunner.cpp +++ b/velox/exec/fuzzer/DuckQueryRunner.cpp @@ -133,21 +133,26 @@ std::optional DuckQueryRunner::toSql( } } - if (auto projectNode = + if (const auto projectNode = std::dynamic_pointer_cast(plan)) { return toSql(projectNode); } - if (auto windowNode = + if (const auto windowNode = std::dynamic_pointer_cast(plan)) { return toSql(windowNode); } - if (auto aggregationNode = + if (const auto aggregationNode = std::dynamic_pointer_cast(plan)) { return toSql(aggregationNode); } + if (const auto rowNumberNode = + std::dynamic_pointer_cast(plan)) { + return toSql(rowNumberNode); + } + VELOX_NYI(); } @@ -297,4 +302,31 @@ std::optional DuckQueryRunner::toSql( return sql.str(); } + +std::optional DuckQueryRunner::toSql( + const std::shared_ptr& rowNumberNode) { + std::stringstream sql; + sql << "SELECT "; + + const auto& inputType = rowNumberNode->sources()[0]->outputType(); + for (auto i = 0; i < inputType->size(); ++i) { + appendComma(i, sql); + sql << inputType->nameOf(i); + } + + sql << ", row_number() OVER ("; + + const auto& partitionKeys = rowNumberNode->partitionKeys(); + if (!partitionKeys.empty()) { + sql << "partition by "; + for (auto i = 0; i < partitionKeys.size(); ++i) { + appendComma(i, sql); + sql << partitionKeys[i]->name(); + } + } + + sql << ") as row_number FROM tmp"; + + return sql.str(); +} } // namespace facebook::velox::exec::test diff --git a/velox/exec/fuzzer/DuckQueryRunner.h b/velox/exec/fuzzer/DuckQueryRunner.h index a683652946a4d..a5dc3f785716a 100644 --- a/velox/exec/fuzzer/DuckQueryRunner.h +++ b/velox/exec/fuzzer/DuckQueryRunner.h @@ -49,6 +49,9 @@ class DuckQueryRunner : public ReferenceQueryRunner { std::optional toSql( const std::shared_ptr& projectNode); + std::optional toSql( + const std::shared_ptr& rowNumberNode); + std::unordered_set aggregateFunctionNames_; }; diff --git a/velox/exec/fuzzer/PrestoQueryRunner.cpp b/velox/exec/fuzzer/PrestoQueryRunner.cpp index bc81a452ae5e5..7831bf350a7a5 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.cpp +++ 
b/velox/exec/fuzzer/PrestoQueryRunner.cpp @@ -159,21 +159,26 @@ PrestoQueryRunner::PrestoQueryRunner( std::optional PrestoQueryRunner::toSql( const core::PlanNodePtr& plan) { - if (auto projectNode = + if (const auto projectNode = std::dynamic_pointer_cast(plan)) { return toSql(projectNode); } - if (auto windowNode = + if (const auto windowNode = std::dynamic_pointer_cast(plan)) { return toSql(windowNode); } - if (auto aggregationNode = + if (const auto aggregationNode = std::dynamic_pointer_cast(plan)) { return toSql(aggregationNode); } + if (const auto rowNumberNode = + std::dynamic_pointer_cast(plan)) { + return toSql(rowNumberNode); + } + VELOX_NYI(); } @@ -500,6 +505,37 @@ std::optional PrestoQueryRunner::toSql( return sql.str(); } +std::optional PrestoQueryRunner::toSql( + const std::shared_ptr& rowNumberNode) { + if (!isSupportedDwrfType(rowNumberNode->sources()[0]->outputType())) { + return std::nullopt; + } + + std::stringstream sql; + sql << "SELECT "; + + const auto& inputType = rowNumberNode->sources()[0]->outputType(); + for (auto i = 0; i < inputType->size(); ++i) { + appendComma(i, sql); + sql << inputType->nameOf(i); + } + + sql << ", row_number() OVER ("; + + const auto& partitionKeys = rowNumberNode->partitionKeys(); + if (!partitionKeys.empty()) { + sql << "partition by "; + for (auto i = 0; i < partitionKeys.size(); ++i) { + appendComma(i, sql); + sql << partitionKeys[i]->name(); + } + } + + sql << ") as row_number FROM tmp"; + + return sql.str(); +} + std::multiset> PrestoQueryRunner::execute( const std::string& sql, const std::vector& input, diff --git a/velox/exec/fuzzer/PrestoQueryRunner.h b/velox/exec/fuzzer/PrestoQueryRunner.h index dfa8fabea93f4..7490e91a03c25 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.h +++ b/velox/exec/fuzzer/PrestoQueryRunner.h @@ -86,6 +86,9 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner { std::optional toSql( const std::shared_ptr& projectNode); + std::optional toSql( + const std::shared_ptr& rowNumberNode); + std::string startQuery(const std::string& sql); std::string fetchNext(const std::string& nextUri); diff --git a/velox/exec/fuzzer/RowNumberFuzzer.cpp b/velox/exec/fuzzer/RowNumberFuzzer.cpp new file mode 100644 index 0000000000000..c7a482859c336 --- /dev/null +++ b/velox/exec/fuzzer/RowNumberFuzzer.cpp @@ -0,0 +1,550 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/exec/fuzzer/RowNumberFuzzer.h" +#include +#include +#include "velox/common/file/FileSystems.h" +#include "velox/connectors/hive/HiveConnector.h" +#include "velox/connectors/hive/HiveConnectorSplit.h" +#include "velox/dwio/dwrf/reader/DwrfReader.h" +#include "velox/dwio/dwrf/writer/Writer.h" +#include "velox/exec/fuzzer/ReferenceQueryRunner.h" +#include "velox/exec/tests/utils/AssertQueryBuilder.h" +#include "velox/exec/tests/utils/PlanBuilder.h" +#include "velox/exec/tests/utils/TempDirectoryPath.h" +#include "velox/vector/fuzzer/VectorFuzzer.h" + +DEFINE_int32(steps, 10, "Number of plans to generate and test."); + +DEFINE_int32( + duration_sec, + 0, + "For how long it should run (in seconds). If zero, " + "it executes exactly --steps iterations and exits."); + +DEFINE_int32( + batch_size, + 100, + "The number of elements on each generated vector."); + +DEFINE_int32(num_batches, 10, "The number of generated vectors."); + +DEFINE_double( + null_ratio, + 0.1, + "Chance of adding a null value in a vector " + "(expressed as double from 0 to 1)."); + +DEFINE_bool(enable_spill, true, "Whether to test plans with spilling enabled."); + +DEFINE_bool( + enable_oom_injection, + false, + "When enabled OOMs will randomly be triggered while executing query " + "plans. The goal of this mode is to ensure unexpected exceptions " + "aren't thrown and the process isn't killed in the process of cleaning " + "up after failures. Therefore, results are not compared when this is " + "enabled. Note that this option only works in debug builds."); + +namespace facebook::velox::exec::test { +namespace { + +class RowNumberFuzzer { + public: + explicit RowNumberFuzzer( + size_t initialSeed, + std::unique_ptr); + + void go(); + + struct PlanWithSplits { + core::PlanNodePtr plan; + std::vector> splits; + + explicit PlanWithSplits( + core::PlanNodePtr _plan, + const std::vector>& _splits = + {}) + : plan(std::move(_plan)), splits(_splits) {} + }; + + private: + static VectorFuzzer::Options getFuzzerOptions() { + VectorFuzzer::Options opts; + opts.vectorSize = FLAGS_batch_size; + opts.stringVariableLength = true; + opts.stringLength = 100; + opts.nullRatio = FLAGS_null_ratio; + return opts; + } + + static inline const std::string kHiveConnectorId = "test-hive"; + + // Makes a connector split from a file path on storage. + static std::shared_ptr makeSplit( + const std::string& filePath); + + void seed(size_t seed) { + currentSeed_ = seed; + vectorFuzzer_.reSeed(seed); + rng_.seed(currentSeed_); + } + + void reSeed() { + seed(rng_()); + } + + // Runs one test iteration: generates a query plan, executes it and + // verifies the results. + void verify(); + + int32_t randInt(int32_t min, int32_t max) { + return boost::random::uniform_int_distribution(min, max)(rng_); + } + + std::pair, std::vector> + generatePartitionKeys(); + + std::vector generateInput( + const std::vector& keyNames, + const std::vector& keyTypes); + + std::optional computeReferenceResults( + core::PlanNodePtr& plan, + const std::vector& input); + + RowVectorPtr execute(const PlanWithSplits& plan, bool injectSpill); + + void addPlansWithTableScan( + const std::string& tableDir, + const std::vector& partitionKeys, + const std::vector& input, + std::vector& altPlans); + + // Makes the query plan with default settings in RowNumberFuzzer, using + // 'input' as a values source. + // + // NOTE: 'input' could be either input rows with lazy + // vectors or flattened ones. 
+ static PlanWithSplits makeDefaultPlan( + const std::vector& partitionKeys, + const std::vector& input); + + static PlanWithSplits makePlanWithTableScan( + const RowTypePtr& type, + const std::vector& partitionKeys, + const std::vector>& splits); + + FuzzerGenerator rng_; + size_t currentSeed_{0}; + + std::shared_ptr rootPool_{ + memory::memoryManager()->addRootPool( + "rowNumberFuzzer", + memory::kMaxMemory, + memory::MemoryReclaimer::create())}; + std::shared_ptr pool_{rootPool_->addLeafChild( + "rowNumberFuzzerLeaf", + true, + exec::MemoryReclaimer::create())}; + std::shared_ptr writerPool_{rootPool_->addAggregateChild( + "rowNumberFuzzerWriter", + exec::MemoryReclaimer::create())}; + VectorFuzzer vectorFuzzer_; + std::unique_ptr referenceQueryRunner_; +}; + +RowNumberFuzzer::RowNumberFuzzer( + size_t initialSeed, + std::unique_ptr referenceQueryRunner) + : vectorFuzzer_{getFuzzerOptions(), pool_.get()}, + referenceQueryRunner_{std::move(referenceQueryRunner)} { + filesystems::registerLocalFileSystem(); + + // Make sure not to run out of open file descriptors. + const std::unordered_map hiveConfig = { + {connector::hive::HiveConfig::kNumCacheFileHandles, "1000"}}; + auto hiveConnector = + connector::getConnectorFactory( + connector::hive::HiveConnectorFactory::kHiveConnectorName) + ->newConnector( + kHiveConnectorId, std::make_shared(hiveConfig)); + connector::registerConnector(hiveConnector); + + seed(initialSeed); +} + +void writeToFile( + const std::string& path, + const VectorPtr& vector, + memory::MemoryPool* pool) { + dwrf::WriterOptions options; + options.schema = vector->type(); + options.memoryPool = pool; + auto writeFile = std::make_unique(path, true, false); + auto sink = + std::make_unique(std::move(writeFile), path); + dwrf::Writer writer(std::move(sink), options); + writer.write(vector); + writer.close(); +} + +// static +std::shared_ptr RowNumberFuzzer::makeSplit( + const std::string& filePath) { + return std::make_shared( + kHiveConnectorId, filePath, dwio::common::FileFormat::DWRF); +} + +template +bool isDone(size_t i, T startTime) { + if (FLAGS_duration_sec > 0) { + std::chrono::duration elapsed = + std::chrono::system_clock::now() - startTime; + return elapsed.count() >= FLAGS_duration_sec; + } + return i >= FLAGS_steps; +} + +std::vector flatten(const std::vector& vectors) { + std::vector flatVectors; + for (const auto& vector : vectors) { + auto flat = BaseVector::create( + vector->type(), vector->size(), vector->pool()); + flat->copy(vector.get(), 0, 0, vector->size()); + flatVectors.push_back(flat); + } + + return flatVectors; +} + +std::pair, std::vector> +RowNumberFuzzer::generatePartitionKeys() { + const auto numKeys = randInt(1, 3); + std::vector names; + std::vector types; + for (auto i = 0; i < numKeys; ++i) { + names.push_back(fmt::format("c{}", i)); + types.push_back(vectorFuzzer_.randType(/*maxDepth=*/1)); + } + return std::make_pair(names, types); +} + +std::vector RowNumberFuzzer::generateInput( + const std::vector& keyNames, + const std::vector& keyTypes) { + std::vector names = keyNames; + std::vector types = keyTypes; + // Add up to 3 payload columns. 
+ const auto numPayload = randInt(0, 3); + for (auto i = 0; i < numPayload; ++i) { + names.push_back(fmt::format("c{}", i + keyNames.size())); + types.push_back(vectorFuzzer_.randType(/*maxDepth=*/2)); + } + + const auto inputType = ROW(std::move(names), std::move(types)); + std::vector input; + input.reserve(FLAGS_num_batches); + for (auto i = 0; i < FLAGS_num_batches; ++i) { + input.push_back(vectorFuzzer_.fuzzInputRow(inputType)); + } + + return input; +} + +RowNumberFuzzer::PlanWithSplits RowNumberFuzzer::makeDefaultPlan( + const std::vector& partitionKeys, + const std::vector& input) { + auto planNodeIdGenerator = std::make_shared(); + std::vector projectFields = partitionKeys; + projectFields.emplace_back("row_number"); + auto plan = PlanBuilder() + .values(input) + .rowNumber(partitionKeys) + .project(projectFields) + .planNode(); + return PlanWithSplits{std::move(plan)}; +} + +bool containsType(const TypePtr& type, const TypePtr& search) { + if (type->equivalent(*search)) { + return true; + } + + for (auto i = 0; i < type->size(); ++i) { + if (containsType(type->childAt(i), search)) { + return true; + } + } + return false; +} + +bool containsTypeKind(const TypePtr& type, const TypeKind& search) { + if (type->kind() == search) { + return true; + } + + for (auto i = 0; i < type->size(); ++i) { + if (containsTypeKind(type->childAt(i), search)) { + return true; + } + } + + return false; +} + +bool containsUnsupportedTypes(const TypePtr& type) { + // Skip queries that use Timestamp, Varbinary, and IntervalDayTime types. + // DuckDB doesn't support nanosecond precision for timestamps or casting from + // Bigint to Interval. + // TODO Investigate mismatches reported when comparing Varbinary. + return containsTypeKind(type, TypeKind::TIMESTAMP) || + containsTypeKind(type, TypeKind::VARBINARY) || + containsType(type, INTERVAL_DAY_TIME()); +} + +std::optional RowNumberFuzzer::computeReferenceResults( + core::PlanNodePtr& plan, + const std::vector& input) { + if (containsUnsupportedTypes(input[0]->type())) { + return std::nullopt; + } + + if (auto sql = referenceQueryRunner_->toSql(plan)) { + return referenceQueryRunner_->execute( + sql.value(), input, plan->outputType()); + } + + LOG(INFO) << "Query not supported by the reference DB"; + return std::nullopt; +} + +RowVectorPtr RowNumberFuzzer::execute( + const PlanWithSplits& plan, + bool injectSpill) { + LOG(INFO) << "Executing query plan: " << plan.plan->toString(true, true); + + AssertQueryBuilder builder(plan.plan); + if (!plan.splits.empty()) { + builder.splits(plan.splits); + } + + std::shared_ptr spillDirectory; + int32_t spillPct{0}; + if (injectSpill) { + spillDirectory = exec::test::TempDirectoryPath::create(); + builder.config(core::QueryConfig::kSpillEnabled, true) + .config(core::QueryConfig::kRowNumberSpillEnabled, true) + .spillDirectory(spillDirectory->getPath()); + spillPct = 10; + } + + ScopedOOMInjector oomInjector( + []() -> bool { return folly::Random::oneIn(10); }, + 10); // Check the condition every 10 ms. + if (FLAGS_enable_oom_injection) { + oomInjector.enable(); + } + + // Wait for the task to be destroyed before starting the next query execution + // to avoid potential interference from background activities across query + // executions. 
+ auto stopGuard = folly::makeGuard([&]() { waitForAllTasksToBeDeleted(); }); + + TestScopedSpillInjection scopedSpillInjection(spillPct); + RowVectorPtr result; + try { + result = builder.copyResults(pool_.get()); + } catch (VeloxRuntimeError& e) { + if (FLAGS_enable_oom_injection && + e.errorCode() == facebook::velox::error_code::kMemCapExceeded && + e.message() == ScopedOOMInjector::kErrorMessage) { + // If we enabled OOM injection we expect the exception thrown by the + // ScopedOOMInjector. + return nullptr; + } + + // Rethrow to preserve the original exception type. + throw; + } + + if (VLOG_IS_ON(1)) { + VLOG(1) << std::endl << result->toString(0, result->size()); + } + + return result; +} + +RowNumberFuzzer::PlanWithSplits RowNumberFuzzer::makePlanWithTableScan( + const RowTypePtr& type, + const std::vector& partitionKeys, + const std::vector>& splits) { + std::vector projectFields = partitionKeys; + projectFields.emplace_back("row_number"); + + auto planNodeIdGenerator = std::make_shared(); + core::PlanNodeId scanId; + auto plan = PlanBuilder(planNodeIdGenerator) + .tableScan(type) + .rowNumber(partitionKeys) + .project(projectFields) + .planNode(); + return PlanWithSplits{plan, splits}; +} + +bool isTableScanSupported(const TypePtr& type) { + if (type->kind() == TypeKind::ROW && type->size() == 0) { + return false; + } + if (type->kind() == TypeKind::UNKNOWN) { + return false; + } + if (type->kind() == TypeKind::HUGEINT) { + return false; + } + // Disable testing with TableScan when input contains TIMESTAMP type, due to + // the issue #8127. + if (type->kind() == TypeKind::TIMESTAMP) { + return false; + } + + for (auto i = 0; i < type->size(); ++i) { + if (!isTableScanSupported(type->childAt(i))) { + return false; + } + } + + return true; +} + +void RowNumberFuzzer::addPlansWithTableScan( + const std::string& tableDir, + const std::vector& partitionKeys, + const std::vector& input, + std::vector& altPlans) { + VELOX_CHECK(!tableDir.empty()); + + if (!isTableScanSupported(input[0]->type())) { + return; + } + + std::vector> inputSplits; + for (auto i = 0; i < input.size(); ++i) { + const std::string filePath = fmt::format("{}/row_number/{}", tableDir, i); + writeToFile(filePath, input[i], writerPool_.get()); + inputSplits.push_back(makeSplit(filePath)); + } + + altPlans.push_back(makePlanWithTableScan( + asRowType(input[0]->type()), partitionKeys, inputSplits)); +} + +void RowNumberFuzzer::verify() { + const auto [keyNames, keyTypes] = generatePartitionKeys(); + const auto input = generateInput(keyNames, keyTypes); + // Flatten inputs. 
+ const auto flatInput = flatten(input); + + if (VLOG_IS_ON(1)) { + VLOG(1) << "Input: " << input[0]->toString(); + for (const auto& v : flatInput) { + VLOG(1) << std::endl << v->toString(0, v->size()); + } + } + + auto defaultPlan = makeDefaultPlan(keyNames, input); + const auto expected = execute(defaultPlan, /*injectSpill=*/false); + + if (expected != nullptr) { + if (const auto referenceResult = + computeReferenceResults(defaultPlan.plan, input)) { + VELOX_CHECK( + assertEqualResults( + referenceResult.value(), + defaultPlan.plan->outputType(), + {expected}), + "Velox and Reference results don't match"); + } + } + + std::vector altPlans; + altPlans.push_back(std::move(defaultPlan)); + + const auto tableScanDir = exec::test::TempDirectoryPath::create(); + addPlansWithTableScan(tableScanDir->getPath(), keyNames, input, altPlans); + + for (auto i = 0; i < altPlans.size(); ++i) { + LOG(INFO) << "Testing plan #" << i; + auto actual = execute(altPlans[i], /*injectSpill=*/false); + if (actual != nullptr && expected != nullptr) { + VELOX_CHECK( + assertEqualResults({expected}, {actual}), + "Logically equivalent plans produced different results"); + } else { + VELOX_CHECK( + FLAGS_enable_oom_injection, "Got unexpected nullptr for results"); + } + + if (FLAGS_enable_spill) { + LOG(INFO) << "Testing plan #" << i << " with spilling"; + actual = execute(altPlans[i], /*injectSpill=*/true); + if (actual != nullptr && expected != nullptr) { + try { + VELOX_CHECK( + assertEqualResults({expected}, {actual}), + "Logically equivalent plans produced different results"); + } catch (const VeloxException& e) { + LOG(ERROR) << "Expected\n" + << expected->toString(0, expected->size()) << "\nActual\n" + << actual->toString(0, actual->size()); + throw; + } + } else { + VELOX_CHECK( + FLAGS_enable_oom_injection, "Got unexpected nullptr for results"); + } + } + } +} + +void RowNumberFuzzer::go() { + VELOX_USER_CHECK( + FLAGS_steps > 0 || FLAGS_duration_sec > 0, + "Either --steps or --duration_sec needs to be greater than zero.") + VELOX_USER_CHECK_GE(FLAGS_batch_size, 10, "Batch size must be at least 10."); + + const auto startTime = std::chrono::system_clock::now(); + size_t iteration = 0; + + while (!isDone(iteration, startTime)) { + LOG(INFO) << "==============================> Started iteration " + << iteration << " (seed: " << currentSeed_ << ")"; + verify(); + LOG(INFO) << "==============================> Done with iteration " + << iteration; + + reSeed(); + ++iteration; + } +} +} // namespace + +void rowNumberFuzzer( + size_t seed, + std::unique_ptr referenceQueryRunner) { + RowNumberFuzzer(seed, std::move(referenceQueryRunner)).go(); +} +} // namespace facebook::velox::exec::test diff --git a/velox/exec/fuzzer/RowNumberFuzzer.h b/velox/exec/fuzzer/RowNumberFuzzer.h new file mode 100644 index 0000000000000..30cd960e327f4 --- /dev/null +++ b/velox/exec/fuzzer/RowNumberFuzzer.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/velox/exec/fuzzer/RowNumberFuzzer.h b/velox/exec/fuzzer/RowNumberFuzzer.h
new file mode 100644
index 0000000000000..30cd960e327f4
--- /dev/null
+++ b/velox/exec/fuzzer/RowNumberFuzzer.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <memory>
+#include "velox/exec/fuzzer/ReferenceQueryRunner.h"
+
+namespace facebook::velox::exec::test {
+void rowNumberFuzzer(
+    size_t seed,
+    std::unique_ptr<ReferenceQueryRunner> referenceQueryRunner);
+}
diff --git a/velox/exec/fuzzer/RowNumberFuzzerRunner.h b/velox/exec/fuzzer/RowNumberFuzzerRunner.h
new file mode 100644
index 0000000000000..2d018f81d3068
--- /dev/null
+++ b/velox/exec/fuzzer/RowNumberFuzzerRunner.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <gtest/gtest.h>
+
+#include "velox/common/file/FileSystems.h"
+
+#include "velox/exec/fuzzer/ReferenceQueryRunner.h"
+#include "velox/exec/fuzzer/RowNumberFuzzer.h"
+#include "velox/serializers/PrestoSerializer.h"
+
+/// RowNumber FuzzerRunner leverages RowNumberFuzzer and VectorFuzzer to
+/// automatically generate and execute tests. It works as follows:
+///
+///  1. Plan Generation: generate two equivalent query plans, one with
+///     row-number over a ValuesNode and the other over a TableScanNode.
+///  2. Execution: run the logically equivalent query plans and check that
+///     their results are the same.
+///  3. Rinse and repeat.
+///
+/// It is used as follows:
+///
+///  $ ./velox_row_number_fuzzer_test --duration_sec 600
+///
+/// The flags that configure RowNumberFuzzer's behavior are:
+///
+///  --steps: how many iterations to run.
+///  --duration_sec: alternatively, for how many seconds it should run (takes
+///      precedence over --steps).
+///  --seed: pass a deterministic seed to reproduce the behavior (each
+///      iteration will print a seed as part of the logs).
+///  --v=1: verbose logging; print a lot more details about the execution.
+///  --batch_size: size of input vector batches generated.
+///  --num_batches: number of input vector batches to generate.
+///  --enable_spill: test plans with spilling enabled.
+///  --enable_oom_injection: randomly trigger OOM while executing query plans.
+/// e.g: +/// +/// $ ./velox_row_number_fuzzer_test \ +/// --seed 123 \ +/// --duration_sec 600 \ +/// --v=1 + +namespace facebook::velox::exec::test { + +class RowNumberFuzzerRunner { + public: + static int run( + size_t seed, + std::unique_ptr referenceQueryRunner) { + serializer::presto::PrestoVectorSerde::registerVectorSerde(); + filesystems::registerLocalFileSystem(); + rowNumberFuzzer(seed, std::move(referenceQueryRunner)); + return RUN_ALL_TESTS(); + } +}; + +} // namespace facebook::velox::exec::test diff --git a/velox/exec/fuzzer/WindowFuzzer.cpp b/velox/exec/fuzzer/WindowFuzzer.cpp index 73f3c4e2494e7..5ce2a7730f751 100644 --- a/velox/exec/fuzzer/WindowFuzzer.cpp +++ b/velox/exec/fuzzer/WindowFuzzer.cpp @@ -166,8 +166,7 @@ std::string WindowFuzzer::generateOrderByClause( frame << ", "; } frame << sortingKeysAndOrders[i].key_ << " " - << sortingKeysAndOrders[i].order_ << " " - << sortingKeysAndOrders[i].nullsOrder_; + << sortingKeysAndOrders[i].sortOrder_.toString(); } return frame.str(); } @@ -193,11 +192,10 @@ WindowFuzzer::generateSortingKeysAndOrders( std::vector& types) { auto keys = generateSortingKeys(prefix, names, types); std::vector results; - // TODO: allow randomly generating orders. for (auto i = 0; i < keys.size(); ++i) { - std::string order = "asc"; - std::string nullsOrder = "nulls last"; - results.push_back(SortingKeyAndOrder(keys[i], order, nullsOrder)); + auto asc = vectorFuzzer_.coinToss(0.5); + auto nullsFirst = vectorFuzzer_.coinToss(0.5); + results.emplace_back(keys[i], core::SortOrder(asc, nullsFirst)); } return results; } @@ -250,8 +248,7 @@ void WindowFuzzer::go() { // If the function is order-dependent or uses "rows" frame, sort all input // rows by row_number additionally. if (requireSortedInput || isRowsFrame) { - sortingKeysAndOrders.push_back( - SortingKeyAndOrder("row_number", "asc", "nulls last")); + sortingKeysAndOrders.emplace_back("row_number", core::kAscNullsLast); ++stats_.numSortedInputs; } @@ -301,20 +298,16 @@ void WindowFuzzer::testAlternativePlans( const std::vector& input, bool customVerification, const std::shared_ptr& customVerifier, - const velox::test::ResultOrError& expected) { + const velox::fuzzer::ResultOrError& expected) { std::vector plans; std::vector allKeys; for (const auto& key : partitionKeys) { - allKeys.push_back(key + " NULLS FIRST"); + allKeys.emplace_back(key + " NULLS FIRST"); } for (const auto& keyAndOrder : sortingKeysAndOrders) { - allKeys.push_back(folly::to( - keyAndOrder.key_, - " ", - keyAndOrder.order_, - " ", - keyAndOrder.nullsOrder_)); + allKeys.emplace_back(fmt::format( + "{} {}", keyAndOrder.key_, keyAndOrder.sortOrder_.toString())); } // Streaming window from values. 
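The WindowFuzzer change above routes all ordering text through `core::SortOrder::toString()` instead of hand-assembled `order_`/`nullsOrder_` strings. As a quick illustration of the mapping (the constructor arguments and the exact `toString()` casing are assumptions based on how this diff uses `core::SortOrder`):

```cpp
#include <fmt/format.h>
#include <string>

// Builds the ORDER BY fragment the fuzzer emits for one sorting key.
std::string orderByText(const std::string& key, const core::SortOrder& order) {
  return fmt::format("{} {}", key, order.toString());
}

// orderByText("row_number", core::kAscNullsLast)
//   -> "row_number ASC NULLS LAST"
// where kAscNullsLast corresponds to SortOrder(/*ascending=*/true,
// /*nullsFirst=*/false), the same combination generateSortingKeysAndOrders
// now draws at random via coinToss().
```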
@@ -400,7 +393,7 @@ bool WindowFuzzer::verifyWindow( persistReproInfo({{plan, {}}}, reproPersistPath_); } - velox::test::ResultOrError resultOrError; + velox::fuzzer::ResultOrError resultOrError; try { resultOrError = execute(plan); if (resultOrError.exceptionPtr) { diff --git a/velox/exec/fuzzer/WindowFuzzer.h b/velox/exec/fuzzer/WindowFuzzer.h index a70b2c67deb85..bf36de8a9e3de 100644 --- a/velox/exec/fuzzer/WindowFuzzer.h +++ b/velox/exec/fuzzer/WindowFuzzer.h @@ -77,20 +77,11 @@ class WindowFuzzer : public AggregationFuzzerBase { private: struct SortingKeyAndOrder { - std::string key_; - std::string order_; - std::string nullsOrder_; - - SortingKeyAndOrder() = delete; - - SortingKeyAndOrder( - const std::string& key, - const std::string& order, - const std::string& nullsOrder) { - key_ = key; - order_ = order; - nullsOrder_ = nullsOrder; - } + const std::string key_; + const core::SortOrder sortOrder_; + + SortingKeyAndOrder(std::string key, core::SortOrder sortOrder) + : key_(std::move(key)), sortOrder_(std::move(sortOrder)) {} }; void addWindowFunctionSignatures(const WindowFunctionMap& signatureMap); @@ -131,7 +122,7 @@ class WindowFuzzer : public AggregationFuzzerBase { const std::vector& input, bool customVerification, const std::shared_ptr& customVerifier, - const velox::test::ResultOrError& expected); + const velox::fuzzer::ResultOrError& expected); const std::unordered_set orderDependentFunctions_; diff --git a/velox/exec/fuzzer/WindowFuzzerRunner.h b/velox/exec/fuzzer/WindowFuzzerRunner.h index 16c512b9a51ca..147ea5471a222 100644 --- a/velox/exec/fuzzer/WindowFuzzerRunner.h +++ b/velox/exec/fuzzer/WindowFuzzerRunner.h @@ -26,7 +26,7 @@ #include "velox/exec/Aggregate.h" #include "velox/exec/fuzzer/AggregationFuzzerOptions.h" #include "velox/exec/fuzzer/WindowFuzzer.h" -#include "velox/expression/tests/utils/FuzzerToolkit.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/parse/TypeResolver.h" #include "velox/serializers/PrestoSerializer.h" #include "velox/vector/fuzzer/VectorFuzzer.h" @@ -63,9 +63,9 @@ class WindowFuzzerRunner { exit(1); } - auto filteredAggregationSignatures = velox::test::filterSignatures( + auto filteredAggregationSignatures = velox::fuzzer::filterSignatures( aggregationSignatures, options.onlyFunctions, options.skipFunctions); - auto filteredWindowSignatures = velox::test::filterSignatures( + auto filteredWindowSignatures = velox::fuzzer::filterSignatures( windowSignatures, options.onlyFunctions, options.skipFunctions); if (filteredAggregationSignatures.empty() && filteredWindowSignatures.empty()) { diff --git a/velox/exec/tests/CMakeLists.txt b/velox/exec/tests/CMakeLists.txt index ddfb25743d23a..6b1f0f86b7da8 100644 --- a/velox/exec/tests/CMakeLists.txt +++ b/velox/exec/tests/CMakeLists.txt @@ -207,6 +207,12 @@ add_library(velox_join_fuzzer JoinFuzzer.cpp) target_link_libraries(velox_join_fuzzer velox_type velox_vector_fuzzer velox_exec_test_lib velox_expression_test_utility) +# RowNumber Fuzzer. 
+add_executable(velox_row_number_fuzzer_test RowNumberFuzzerTest.cpp)
+
+target_link_libraries(velox_row_number_fuzzer_test velox_row_number_fuzzer
+                      gtest gtest_main)
+
 add_executable(velox_join_fuzzer_test JoinFuzzerTest.cpp)
 
 target_link_libraries(velox_join_fuzzer_test velox_join_fuzzer gtest
                       gtest_main)
diff --git a/velox/exec/tests/DriverTest.cpp b/velox/exec/tests/DriverTest.cpp
index 5601cbe2e3119..990f8a3c75b19 100644
--- a/velox/exec/tests/DriverTest.cpp
+++ b/velox/exec/tests/DriverTest.cpp
@@ -26,6 +26,7 @@
 #include "velox/exec/tests/utils/Cursor.h"
 #include "velox/exec/tests/utils/OperatorTestBase.h"
 #include "velox/exec/tests/utils/PlanBuilder.h"
+#include "velox/functions/Udf.h"
 
 using namespace facebook::velox;
 using namespace facebook::velox::exec;
@@ -1519,6 +1520,44 @@ DEBUG_ONLY_TEST_F(DriverTest, driverCpuTimeSlicingCheck) {
   }
 }
 
+namespace {
+
+template <typename T>
+struct ThrowRuntimeExceptionFunction {
+  template <typename TResult, typename TInput>
+  void call(TResult& out, const TInput& in) {
+    VELOX_CHECK(false, "Throwing exception");
+  }
+};
+} // namespace
+
+TEST_F(DriverTest, additionalContextInRuntimeException) {
+  // Ensures that exceptions thrown during the execution of an operator carry
+  // the expected context. This is done by executing a plan with a
+  // FilterProject whose expressions set up hierarchical contexts. Finally, we
+  // verify that all the essential context is present.
+  auto vector = makeRowVector({makeFlatVector<int64_t>({1, 2, 3, 4, 5, 6})});
+  registerFunction<ThrowRuntimeExceptionFunction, int64_t, int64_t>(
+      {"throwException"});
+  auto op = PlanBuilder()
+                .values({vector})
+                .project({"c0 + throwException(c0)"})
+                .planNode();
+  try {
+    assertQuery(op, vector);
+  } catch (VeloxException& e) {
+    ASSERT_EQ(e.context(), "throwexception(c0)");
+    auto additionalContext = e.additionalContext();
+    // Remove the string following `TaskId` from the additional context since
+    // it is nondeterministic.
+    additionalContext.resize(additionalContext.find(" TaskId:"));
+    ASSERT_EQ(
+        additionalContext,
+        "Top-level Expression: plus(c0, throwexception(c0)) Operator: "
+        "FilterProject(1) PlanNodeId: 1");
+  }
+}
+
 class OpCallStatusTest : public OperatorTestBase {};
 
 // Test that the opCallStatus is returned properly and formats the call as
diff --git a/velox/exec/tests/ExchangeClientTest.cpp b/velox/exec/tests/ExchangeClientTest.cpp
index 68021897b8ae6..4deae1b71fec6 100644
--- a/velox/exec/tests/ExchangeClientTest.cpp
+++ b/velox/exec/tests/ExchangeClientTest.cpp
@@ -423,5 +423,52 @@ TEST_F(ExchangeClientTest, sourceTimeout) {
   test::testingShutdownLocalExchangeSource();
 }
 
+TEST_F(ExchangeClientTest, callNextAfterClose) {
+  constexpr int32_t kNumSources = 3;
+  common::testutil::TestValue::enable();
+  auto client = std::make_shared<ExchangeClient>(
+      "test", 17, 1 << 20, pool(), executor());
+
+  bool atEnd;
+  ContinueFuture future;
+  auto pages = client->next(1, &atEnd, &future);
+  ASSERT_EQ(0, pages.size());
+  ASSERT_FALSE(atEnd);
+
+  for (auto i = 0; i < kNumSources; ++i) {
+    client->addRemoteTaskId(fmt::format("local://{}", i));
+  }
+  client->noMoreRemoteTasks();
+
+  // Fetch a page. No page is found. All sources are fetching.
+  pages = client->next(1, &atEnd, &future);
+  EXPECT_TRUE(pages.empty());
+
+  const auto& queue = client->queue();
+  for (auto i = 0; i < 10; ++i) {
+    enqueue(*queue, makePage(1'000 + i));
+  }
+
+  // Fetch multiple pages. Each page is slightly larger than 1K bytes, hence
+  // only 4 pages fit.
+  pages = client->next(5'000, &atEnd, &future);
+  EXPECT_EQ(4, pages.size());
+  EXPECT_FALSE(atEnd);
+
+  // Close the client and try calling next again.
+ client->close(); + + // Here we should have no pages returned, be at end (we are closed) and the + // future should be invalid (not based on a valid promise). + ContinueFuture futureFinal{ContinueFuture::makeEmpty()}; + pages = client->next(10'000, &atEnd, &futureFinal); + EXPECT_EQ(0, pages.size()); + EXPECT_TRUE(atEnd); + EXPECT_FALSE(futureFinal.valid()); + + client->close(); + test::testingShutdownLocalExchangeSource(); +} + } // namespace } // namespace facebook::velox::exec diff --git a/velox/exec/tests/HashJoinTest.cpp b/velox/exec/tests/HashJoinTest.cpp index 601a0d90a5e51..7bf3b87496ef7 100644 --- a/velox/exec/tests/HashJoinTest.cpp +++ b/velox/exec/tests/HashJoinTest.cpp @@ -203,7 +203,7 @@ std::pair numTaskSpillFiles(const exec::Task& task) { void abortPool(memory::MemoryPool* pool) { try { VELOX_FAIL("Manual MemoryPool Abortion"); - } catch (const VeloxException& error) { + } catch (const VeloxException&) { pool->abort(std::current_exception()); } } @@ -4041,6 +4041,71 @@ TEST_F(HashJoinTest, lazyVectors) { } } +TEST_F(HashJoinTest, lazyVectorNotLoadedInFilter) { + // Ensure that if lazy vectors are temporarily wrapped during a filter's + // execution and remain unloaded, the temporary wrap is promptly + // discarded. This precaution prevents the generation of the probe's output + // from wrapping an unloaded vector while the temporary wrap is + // still alive. + // This is done by generating a sufficiently small batch to allow the lazy + // vector to remain unloaded, as it doesn't need to be split between batches. + // Then we use a filter that skips the execution of the expression containing + // the lazy vector, thereby avoiding its loading. + const vector_size_t vectorSize = 1'000; + auto probeVectors = makeBatches(1, [&](int32_t /*unused*/) { + return makeRowVector( + {makeFlatVector(vectorSize, folly::identity), + makeFlatVector(vectorSize, [](auto row) { return row % 23; }), + makeFlatVector( + vectorSize, [](auto row) { return row % 31; })}); + }); + + std::vector buildVectors = + makeBatches(1, [&](int32_t /*unused*/) { + return makeRowVector({makeFlatVector( + vectorSize, [](auto row) { return row * 3; })}); + }); + + std::shared_ptr probeFile = TempFilePath::create(); + writeToFile(probeFile->getPath(), probeVectors); + + std::shared_ptr buildFile = TempFilePath::create(); + writeToFile(buildFile->getPath(), buildVectors); + + createDuckDbTable("t", probeVectors); + createDuckDbTable("u", buildVectors); + + // Lazy vector is part of the filter but never gets loaded. 
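A note on why the filter in this test can leave the lazy column unloaded: the probe data makes the first disjunct always true, so conjunct evaluation never needs `c2`. Reduced to plain C++ (values per the generators above; names illustrative):

```cpp
#include <cstdint>

// c1 = row % 23 is always >= 0, so "c1 >= 0 OR c2 > 0" short-circuits and
// the c2 column behind the LazyVector is never touched.
bool filterPasses(int64_t c1 /*, c2 is never evaluated */) {
  return c1 >= 0;
}
```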
+ auto planNodeIdGenerator = std::make_shared(); + core::PlanNodeId probeScanId; + core::PlanNodeId buildScanId; + auto op = PlanBuilder(planNodeIdGenerator) + .tableScan(asRowType(probeVectors[0]->type())) + .capturePlanNodeId(probeScanId) + .hashJoin( + {"c0"}, + {"c0"}, + PlanBuilder(planNodeIdGenerator) + .tableScan(asRowType(buildVectors[0]->type())) + .capturePlanNodeId(buildScanId) + .planNode(), + "c1 >= 0 OR c2 > 0", + {"c1", "c2"}) + .planNode(); + SplitInput splitInput = { + {probeScanId, + {exec::Split(makeHiveConnectorSplit(probeFile->getPath()))}}, + {buildScanId, + {exec::Split(makeHiveConnectorSplit(buildFile->getPath()))}}, + }; + HashJoinBuilder(*pool_, duckDbQueryRunner_, driverExecutor_.get()) + .planNode(std::move(op)) + .inputSplits(splitInput) + .checkSpillStats(false) + .referenceQuery("SELECT t.c1, t.c2 FROM t, u WHERE t.c0 = u.c0") + .run(); +} + TEST_F(HashJoinTest, dynamicFilters) { const int32_t numSplits = 10; const int32_t numRowsProbe = 333; @@ -4111,6 +4176,7 @@ TEST_F(HashJoinTest, dynamicFilters) { { // Inner join. core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType) .capturePlanNodeId(probeScanId) @@ -4121,6 +4187,7 @@ TEST_F(HashJoinTest, dynamicFilters) { "", {"c0", "c1", "u_c1"}, core::JoinType::kInner) + .capturePlanNodeId(joinId) .project({"c0", "c1 + 1", "c1 + u_c1"}) .planNode(); { @@ -4131,16 +4198,21 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c0, t.c1 + 1, t.c1 + u.c1 FROM t, u WHERE t.c0 = u.c0") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4157,6 +4229,7 @@ TEST_F(HashJoinTest, dynamicFilters) { "", {"c0", "c1"}, core::JoinType::kLeftSemiFilter) + .capturePlanNodeId(joinId) .project({"c0", "c1 + 1"}) .planNode(); @@ -4168,17 +4241,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c0, t.c1 + 1 FROM t WHERE t.c0 IN (SELECT c0 FROM u)") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. 
ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_GT(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4195,6 +4273,7 @@ TEST_F(HashJoinTest, dynamicFilters) { "", {"u_c0", "u_c1"}, core::JoinType::kRightSemiFilter) + .capturePlanNodeId(joinId) .project({"u_c0", "u_c1 + 1"}) .planNode(); @@ -4206,17 +4285,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT u.c0, u.c1 + 1 FROM u WHERE u.c0 IN (SELECT c0 FROM t)") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4232,6 +4316,7 @@ TEST_F(HashJoinTest, dynamicFilters) { assignments["b"] = regularColumn("c1", BIGINT()); core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .startTableScan() .outputType(scanOutputType) @@ -4239,6 +4324,7 @@ TEST_F(HashJoinTest, dynamicFilters) { .endTableScan() .capturePlanNodeId(probeScanId) .hashJoin({"a"}, {"u_c0"}, buildSide, "", {"a", "b", "u_c1"}) + .capturePlanNodeId(joinId) .project({"a", "b + 1", "b + u_c1"}) .planNode(); @@ -4249,17 +4335,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c0, t.c1 + 1, t.c1 + u.c1 FROM t, u WHERE t.c0 = u.c0") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4268,10 +4359,12 @@ TEST_F(HashJoinTest, dynamicFilters) { // Push-down that requires merging filters. 
{ core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType, {"c0 < 500::INTEGER"}) .capturePlanNodeId(probeScanId) .hashJoin({"c0"}, {"u_c0"}, buildSide, "", {"c1", "u_c1"}) + .capturePlanNodeId(joinId) .project({"c1 + u_c1"}) .planNode(); @@ -4282,17 +4375,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c1 + u.c1 FROM t, u WHERE t.c0 = u.c0 AND t.c0 < 500") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4301,11 +4399,13 @@ TEST_F(HashJoinTest, dynamicFilters) { // Push-down that turns join into a no-op. { core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType) .capturePlanNodeId(probeScanId) .hashJoin({"c0"}, {"u_c0"}, keyOnlyBuildSide, "", {"c0", "c1"}) + .capturePlanNodeId(joinId) .project({"c0", "c1 + 1"}) .planNode(); @@ -4315,12 +4415,14 @@ TEST_F(HashJoinTest, dynamicFilters) { .referenceQuery("SELECT t.c0, t.c1 + 1 FROM t, u WHERE t.c0 = u.c0") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); @@ -4328,6 +4430,9 @@ TEST_F(HashJoinTest, dynamicFilters) { getReplacedWithFilterRows(task, 1).sum, numRowsBuild * numSplits); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4337,10 +4442,12 @@ TEST_F(HashJoinTest, dynamicFilters) { // number of columns than the input. 
{ core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType) .capturePlanNodeId(probeScanId) .hashJoin({"c0"}, {"u_c0"}, keyOnlyBuildSide, "", {"c0"}) + .capturePlanNodeId(joinId) .planNode(); HashJoinBuilder(*pool_, duckDbQueryRunner_, driverExecutor_.get()) @@ -4349,12 +4456,14 @@ TEST_F(HashJoinTest, dynamicFilters) { .referenceQuery("SELECT t.c0 FROM t JOIN u ON (t.c0 = u.c0)") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(0, getReplacedWithFilterRows(task, 1).sum); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); @@ -4362,6 +4471,9 @@ TEST_F(HashJoinTest, dynamicFilters) { getReplacedWithFilterRows(task, 1).sum, numRowsBuild * numSplits); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4370,10 +4482,12 @@ TEST_F(HashJoinTest, dynamicFilters) { // Push-down that requires merging filters and turns join into a no-op. { core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType, {"c0 < 500::INTEGER"}) .capturePlanNodeId(probeScanId) .hashJoin({"c0"}, {"u_c0"}, keyOnlyBuildSide, "", {"c1"}) + .capturePlanNodeId(joinId) .project({"c1 + 1"}) .planNode(); @@ -4384,17 +4498,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c1 + 1 FROM t, u WHERE t.c0 = u.c0 AND t.c0 < 500") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_EQ(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_GT(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4404,12 +4523,14 @@ TEST_F(HashJoinTest, dynamicFilters) { { // Inner join. 
core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType, {"c0 < 200::INTEGER"}) .capturePlanNodeId(probeScanId) .hashJoin( {"c0"}, {"u_c0"}, buildSide, "", {"c1"}, core::JoinType::kInner) + .capturePlanNodeId(joinId) .project({"c1 + 1"}) .planNode(); @@ -4421,17 +4542,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c1 + 1 FROM t, u WHERE t.c0 = u.c0 AND t.c0 < 200") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_GT(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4448,6 +4574,7 @@ TEST_F(HashJoinTest, dynamicFilters) { "", {"c1"}, core::JoinType::kLeftSemiFilter) + .capturePlanNodeId(joinId) .project({"c1 + 1"}) .planNode(); @@ -4459,17 +4586,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT t.c1 + 1 FROM t WHERE t.c0 IN (SELECT c0 FROM u) AND t.c0 < 200") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_GT(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4486,6 +4618,7 @@ TEST_F(HashJoinTest, dynamicFilters) { "", {"u_c1"}, core::JoinType::kRightSemiFilter) + .capturePlanNodeId(joinId) .project({"u_c1 + 1"}) .planNode(); @@ -4497,17 +4630,22 @@ TEST_F(HashJoinTest, dynamicFilters) { "SELECT u.c1 + 1 FROM u WHERE u.c0 IN (SELECT c0 FROM t) AND u.c0 < 200") .verifier([&](const std::shared_ptr& task, bool hasSpill) { SCOPED_TRACE(fmt::format("hasSpill:{}", hasSpill)); + auto planStats = toPlanStats(task->taskStats()); if (hasSpill) { // Dynamic filtering should be disabled with spilling triggered. 
ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); } else { ASSERT_EQ(1, getFiltersProduced(task, 1).sum); ASSERT_EQ(1, getFiltersAccepted(task, 0).sum); ASSERT_EQ(getReplacedWithFilterRows(task, 1).sum, 0); ASSERT_LT(getInputPositions(task, 1), numRowsProbe * numSplits); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId})); } }) .run(); @@ -4516,9 +4654,11 @@ TEST_F(HashJoinTest, dynamicFilters) { // Disable filter push-down by using values in place of scan. { + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .values(probeVectors) .hashJoin({"c0"}, {"u_c0"}, buildSide, "", {"c1"}) + .capturePlanNodeId(joinId) .project({"c1 + 1"}) .planNode(); @@ -4526,6 +4666,7 @@ TEST_F(HashJoinTest, dynamicFilters) { .planNode(std::move(op)) .referenceQuery("SELECT t.c1 + 1 FROM t, u WHERE t.c0 = u.c0") .verifier([&](const std::shared_ptr& task, bool hasSpill) { + auto planStats = toPlanStats(task->taskStats()); ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(numRowsProbe * numSplits, getInputPositions(task, 1)); @@ -4537,11 +4678,13 @@ TEST_F(HashJoinTest, dynamicFilters) { // probe side. { core::PlanNodeId probeScanId; + core::PlanNodeId joinId; auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) .tableScan(probeType) .capturePlanNodeId(probeScanId) .project({"cast(c0 + 1 as integer) AS t_key", "c1"}) .hashJoin({"t_key"}, {"u_c0"}, buildSide, "", {"c1"}) + .capturePlanNodeId(joinId) .project({"c1 + 1"}) .planNode(); @@ -4550,14 +4693,113 @@ TEST_F(HashJoinTest, dynamicFilters) { .makeInputSplits(makeInputSplits(probeScanId)) .referenceQuery("SELECT t.c1 + 1 FROM t, u WHERE (t.c0 + 1) = u.c0") .verifier([&](const std::shared_ptr& task, bool hasSpill) { + auto planStats = toPlanStats(task->taskStats()); ASSERT_EQ(0, getFiltersProduced(task, 1).sum); ASSERT_EQ(0, getFiltersAccepted(task, 0).sum); ASSERT_EQ(numRowsProbe * numSplits, getInputPositions(task, 1)); + ASSERT_TRUE(planStats.at(probeScanId).dynamicFilterStats.empty()); }) .run(); } } +TEST_F(HashJoinTest, dynamicFiltersStatsWithChainedJoins) { + const int32_t numSplits = 10; + const int32_t numProbeRows = 333; + const int32_t numBuildRows = 100; + + std::vector probeVectors; + probeVectors.reserve(numSplits); + std::vector> tempFiles; + for (int32_t i = 0; i < numSplits; ++i) { + auto rowVector = makeRowVector({ + makeFlatVector( + numProbeRows, [&](auto row) { return row - i * 10; }), + makeFlatVector(numProbeRows, [](auto row) { return row; }), + }); + probeVectors.push_back(rowVector); + tempFiles.push_back(TempFilePath::create()); + writeToFile(tempFiles.back()->getPath(), rowVector); + } + auto makeInputSplits = [&](const core::PlanNodeId& nodeId) { + return [&] { + std::vector probeSplits; + for (auto& file : tempFiles) { + probeSplits.push_back( + exec::Split(makeHiveConnectorSplit(file->getPath()))); + } + SplitInput splits; + splits.emplace(nodeId, probeSplits); + return splits; + }; + }; + + // 100 key values in [35, 233] range. 
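These verifier lambdas and the OperatorUtilsTest additions later in this diff pin down a small surface for the new stats holder. A minimal sketch consistent with that usage (`empty()`, `add()`, `clear()`, and a set of producer node ids); the real Velox definition may differ in details such as the id type. The chained-joins test setup that the preceding comment describes continues right after this sketch.

```cpp
#include <string>
#include <unordered_set>

// Minimal sketch of DynamicFilterStats, inferred from how the tests in this
// diff use it; not the authoritative definition.
struct DynamicFilterStats {
  // Plan node ids of the joins that pushed a dynamic filter into this scan.
  std::unordered_set<std::string> producerNodeIds;

  bool empty() const { return producerNodeIds.empty(); }

  // Merging keeps the union of producers, so repeated adds are idempotent.
  void add(const DynamicFilterStats& other) {
    producerNodeIds.insert(
        other.producerNodeIds.begin(), other.producerNodeIds.end());
  }

  void clear() { producerNodeIds.clear(); }
};
```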
+ std::vector buildVectors; + for (int i = 0; i < 5; ++i) { + buildVectors.push_back(makeRowVector({ + makeFlatVector( + numBuildRows / 5, + [i](auto row) { return 35 + 2 * (row + i * numBuildRows / 5); }), + makeFlatVector(numBuildRows / 5, [](auto row) { return row; }), + })); + } + + createDuckDbTable("t", probeVectors); + createDuckDbTable("u", buildVectors); + + auto probeType = ROW({"c0", "c1"}, {INTEGER(), BIGINT()}); + + auto planNodeIdGenerator = std::make_shared(); + + auto buildSide1 = PlanBuilder(planNodeIdGenerator, pool_.get()) + .values(buildVectors) + .project({"c0 AS u_c0", "c1 AS u_c1"}) + .planNode(); + auto buildSide2 = PlanBuilder(planNodeIdGenerator, pool_.get()) + .values(buildVectors) + .project({"c0 AS u_c0", "c1 AS u_c1"}) + .planNode(); + // Inner join pushdown. + core::PlanNodeId probeScanId; + core::PlanNodeId joinId1; + core::PlanNodeId joinId2; + auto op = PlanBuilder(planNodeIdGenerator, pool_.get()) + .tableScan(probeType) + .capturePlanNodeId(probeScanId) + .hashJoin( + {"c0"}, + {"u_c0"}, + buildSide1, + "", + {"c0", "c1"}, + core::JoinType::kInner) + .capturePlanNodeId(joinId1) + .hashJoin( + {"c0"}, + {"u_c0"}, + buildSide2, + "", + {"c0", "c1", "u_c1"}, + core::JoinType::kInner) + .capturePlanNodeId(joinId2) + .project({"c0", "c1 + 1", "c1 + u_c1"}) + .planNode(); + HashJoinBuilder(*pool_, duckDbQueryRunner_, driverExecutor_.get()) + .planNode(std::move(op)) + .makeInputSplits(makeInputSplits(probeScanId)) + .injectSpill(false) + .referenceQuery( + "SELECT t.c0, t.c1 + 1, t.c1 + u.c1 FROM t, u WHERE t.c0 = u.c0") + .verifier([&](const std::shared_ptr& task, bool /*unused*/) { + auto planStats = toPlanStats(task->taskStats()); + ASSERT_EQ( + planStats.at(probeScanId).dynamicFilterStats.producerNodeIds, + std::unordered_set({joinId1, joinId2})); + }) + .run(); +} + TEST_F(HashJoinTest, dynamicFiltersWithSkippedSplits) { const int32_t numSplits = 20; const int32_t numNonSkippedSplits = 10; diff --git a/velox/exec/tests/JoinFuzzer.cpp b/velox/exec/tests/JoinFuzzer.cpp index 5db955780c3ac..f1c534339824b 100644 --- a/velox/exec/tests/JoinFuzzer.cpp +++ b/velox/exec/tests/JoinFuzzer.cpp @@ -1020,7 +1020,7 @@ void JoinFuzzer::verify(core::JoinType joinType) { VELOX_CHECK( assertEqualResults({expected}, {actual}), "Logically equivalent plans produced different results"); - } catch (const VeloxException& e) { + } catch (const VeloxException&) { LOG(ERROR) << "Expected\n" << expected->toString(0, expected->size()) << "\nActual\n" << actual->toString(0, actual->size()); diff --git a/velox/exec/tests/OperatorUtilsTest.cpp b/velox/exec/tests/OperatorUtilsTest.cpp index cc192928e1e46..543b619dd90ea 100644 --- a/velox/exec/tests/OperatorUtilsTest.cpp +++ b/velox/exec/tests/OperatorUtilsTest.cpp @@ -464,3 +464,30 @@ TEST_F(OperatorUtilsTest, memStatsFromPool) { ASSERT_EQ(stats.peakSystemMemoryReservation, 0); ASSERT_EQ(stats.numMemoryAllocations, 1); } + +TEST_F(OperatorUtilsTest, dynamicFilterStats) { + DynamicFilterStats dynamicFilterStats; + ASSERT_TRUE(dynamicFilterStats.empty()); + const std::string nodeId1{"node1"}; + const std::string nodeId2{"node2"}; + dynamicFilterStats.producerNodeIds.emplace(nodeId1); + ASSERT_FALSE(dynamicFilterStats.empty()); + DynamicFilterStats dynamicFilterStatsToMerge; + dynamicFilterStatsToMerge.producerNodeIds.emplace(nodeId1); + ASSERT_FALSE(dynamicFilterStatsToMerge.empty()); + dynamicFilterStats.add(dynamicFilterStatsToMerge); + ASSERT_EQ(dynamicFilterStats.producerNodeIds.size(), 1); + ASSERT_EQ( + 
dynamicFilterStats.producerNodeIds, + std::unordered_set({nodeId1})); + + dynamicFilterStatsToMerge.producerNodeIds.emplace(nodeId2); + dynamicFilterStats.add(dynamicFilterStatsToMerge); + ASSERT_EQ(dynamicFilterStats.producerNodeIds.size(), 2); + ASSERT_EQ( + dynamicFilterStats.producerNodeIds, + std::unordered_set({nodeId1, nodeId2})); + + dynamicFilterStats.clear(); + ASSERT_TRUE(dynamicFilterStats.empty()); +} diff --git a/velox/exec/tests/PlanBuilderTest.cpp b/velox/exec/tests/PlanBuilderTest.cpp index 8d9ec1fa48935..31abd5fd3a611 100644 --- a/velox/exec/tests/PlanBuilderTest.cpp +++ b/velox/exec/tests/PlanBuilderTest.cpp @@ -239,4 +239,11 @@ TEST_F(PlanBuilderTest, windowFrame) { .planNode(), "Window frame of type RANGE PRECEDING or FOLLOWING requires single sorting key in ORDER BY"); } + +TEST_F(PlanBuilderTest, missingOutputType) { + VELOX_ASSERT_THROW( + PlanBuilder().startTableScan().endTableScan(), + "outputType must be specified"); +} + } // namespace facebook::velox::exec::test diff --git a/velox/exec/tests/PrintPlanWithStatsTest.cpp b/velox/exec/tests/PrintPlanWithStatsTest.cpp index 93f7c6aa2ecc9..8c8fef0b7b460 100644 --- a/velox/exec/tests/PrintPlanWithStatsTest.cpp +++ b/velox/exec/tests/PrintPlanWithStatsTest.cpp @@ -138,7 +138,7 @@ TEST_F(PrintPlanWithStatsTest, innerJoinWithTableScan) { {" HashBuild: Input: 100 rows \\(.+\\), Output: 0 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: .+ Memory allocations: .+, Threads: 1"}, {" HashProbe: Input: 2000 rows \\(.+\\), Output: 2000 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: .+, Memory allocations: .+, Threads: 1"}, {" -- TableScan\\[table: hive_table\\] -> c0:INTEGER, c1:BIGINT"}, - {" Input: 2000 rows \\(.+\\), Raw Input: 20480 rows \\(.+\\), Output: 2000 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: .+, Memory allocations: .+, Threads: 1, Splits: 20"}, + {" Input: 2000 rows \\(.+\\), Raw Input: 20480 rows \\(.+\\), Output: 2000 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: .+, Memory allocations: .+, Threads: 1, Splits: 20, DynamicFilter producer plan nodes: 3"}, {" -- Project\\[expressions: \\(u_c0:INTEGER, ROW\\[\"c0\"\\]\\), \\(u_c1:BIGINT, ROW\\[\"c1\"\\]\\)\\] -> u_c0:INTEGER, u_c1:BIGINT"}, {" Output: 100 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: 0B, Memory allocations: .+, Threads: 1"}, {" -- Values\\[100 rows in 1 vectors\\] -> c0:INTEGER, c1:BIGINT"}, @@ -184,7 +184,7 @@ TEST_F(PrintPlanWithStatsTest, innerJoinWithTableScan) { {" runningFinishWallNanos\\s+sum: .+, count: 1, min: .+, max: .+"}, {" runningGetOutputWallNanos\\s+sum: .+, count: 1, min: .+, max: .+"}, {" -- TableScan\\[table: hive_table\\] -> c0:INTEGER, c1:BIGINT"}, - {" Input: 2000 rows \\(.+\\), Raw Input: 20480 rows \\(.+\\), Output: 2000 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: .+, Memory allocations: .+, Threads: 1, Splits: 20"}, + {" Input: 2000 rows \\(.+\\), Raw Input: 20480 rows \\(.+\\), Output: 2000 rows \\(.+\\), Cpu time: .+, Blocked wall time: .+, Peak memory: .+, Memory allocations: .+, Threads: 1, Splits: 20, DynamicFilter producer plan nodes: 3"}, {" dataSourceAddSplitWallNanos[ ]* sum: .+, count: 1, min: .+, max: .+"}, {" dataSourceReadWallNanos[ ]* sum: .+, count: 1, min: .+, max: .+"}, {" dynamicFiltersAccepted[ ]* sum: 1, count: 1, min: 1, max: 1"}, diff --git a/velox/exec/tests/RowNumberFuzzerTest.cpp b/velox/exec/tests/RowNumberFuzzerTest.cpp new file mode 100644 index 0000000000000..3abdc9fd3e767 --- 
/dev/null
+++ b/velox/exec/tests/RowNumberFuzzerTest.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/init/Init.h>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "velox/common/memory/SharedArbitrator.h"
+#include "velox/connectors/hive/HiveConnector.h"
+#include "velox/exec/MemoryReclaimer.h"
+#include "velox/exec/fuzzer/DuckQueryRunner.h"
+#include "velox/exec/fuzzer/PrestoQueryRunner.h"
+#include "velox/exec/fuzzer/ReferenceQueryRunner.h"
+#include "velox/exec/fuzzer/RowNumberFuzzerRunner.h"
+
+DEFINE_int64(
+    seed,
+    0,
+    "Initial seed for random number generator used to reproduce previous "
+    "results (0 means start with random seed).");
+
+DEFINE_string(
+    presto_url,
+    "",
+    "Presto coordinator URI along with port. If set, we use Presto "
+    "source of truth. Otherwise, use DuckDB. Example: "
+    "--presto_url=http://127.0.0.1:8080");
+
+DEFINE_uint32(
+    req_timeout_ms,
+    1000,
+    "Timeout in milliseconds for HTTP requests made to reference DB, "
+    "such as Presto. Example: --req_timeout_ms=2000");
+
+using namespace facebook::velox::exec;
+
+namespace {
+std::unique_ptr<test::ReferenceQueryRunner> setupReferenceQueryRunner(
+    const std::string& prestoUrl,
+    const std::string& runnerName,
+    const uint32_t& reqTimeoutMs) {
+  if (prestoUrl.empty()) {
+    auto duckQueryRunner = std::make_unique<test::DuckQueryRunner>();
+    LOG(INFO) << "Using DuckDB as the reference DB.";
+    return duckQueryRunner;
+  }
+
+  LOG(INFO) << "Using Presto as the reference DB.";
+  return std::make_unique<test::PrestoQueryRunner>(
+      prestoUrl,
+      runnerName,
+      static_cast<std::chrono::milliseconds>(reqTimeoutMs));
+}
+
+// Invoked to set up the memory system with arbitration.
+void setupMemory() {
+  FLAGS_velox_enable_memory_usage_track_in_default_memory_pool = true;
+  FLAGS_velox_memory_leak_check_enabled = true;
+  facebook::velox::memory::SharedArbitrator::registerFactory();
+  facebook::velox::memory::MemoryManagerOptions options;
+  options.allocatorCapacity = 8L << 30;
+  options.arbitratorCapacity = 6L << 30;
+  options.arbitratorKind = "SHARED";
+  options.checkUsageLeak = true;
+  options.arbitrationStateCheckCb = memoryArbitrationStateCheck;
+  facebook::velox::memory::MemoryManager::initialize(options);
+}
+} // namespace
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  // Calls common init functions in the necessary order: initializing
+  // singletons, installing proper signal handlers for a better debugging
+  // experience, and initializing glog and gflags.
+  folly::Init init(&argc, &argv);
+  setupMemory();
+  auto referenceQueryRunner = setupReferenceQueryRunner(
+      FLAGS_presto_url, "row_number_fuzzer", FLAGS_req_timeout_ms);
+  const size_t initialSeed = FLAGS_seed == 0 ?
std::time(nullptr) : FLAGS_seed; + return test::RowNumberFuzzerRunner::run( + initialSeed, std::move(referenceQueryRunner)); +} diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index 400e55c767581..954d3c9804821 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -267,6 +267,8 @@ TEST_F(TableScanTest, allColumns) { ASSERT_TRUE(it != planStats.end()); ASSERT_TRUE(it->second.peakMemoryBytes > 0); ASSERT_LT(0, it->second.customStats.at("ioWaitNanos").sum); + // Verifies there is no dynamic filter stats. + ASSERT_TRUE(it->second.dynamicFilterStats.empty()); } TEST_F(TableScanTest, connectorStats) { diff --git a/velox/exec/tests/utils/PlanBuilder.cpp b/velox/exec/tests/utils/PlanBuilder.cpp index 3edd7c0996f9a..73d7828c71942 100644 --- a/velox/exec/tests/utils/PlanBuilder.cpp +++ b/velox/exec/tests/utils/PlanBuilder.cpp @@ -135,6 +135,7 @@ PlanBuilder& PlanBuilder::tpchTableScan( } core::PlanNodePtr PlanBuilder::TableScanBuilder::build(core::PlanNodeId id) { + VELOX_CHECK_NOT_NULL(outputType_, "outputType must be specified"); std::unordered_map typedMapping; bool hasAssignments = !(assignments_.empty()); for (uint32_t i = 0; i < outputType_->size(); ++i) { diff --git a/velox/exec/tests/utils/PlanBuilder.h b/velox/exec/tests/utils/PlanBuilder.h index 5adf218555a0f..0c6928216a1f4 100644 --- a/velox/exec/tests/utils/PlanBuilder.h +++ b/velox/exec/tests/utils/PlanBuilder.h @@ -160,6 +160,10 @@ class PlanBuilder { /// Helper class to build a custom TableScanNode. /// Uses a planBuilder instance to get the next plan id, memory pool, and /// parse options. + /// + /// Uses the hive connector by default. Specify outputType, tableHandle, and + /// assignments for other connectors. If these three are specified, all other + /// builder arguments will be ignored. class TableScanBuilder { public: TableScanBuilder(PlanBuilder& builder) : planBuilder_(builder) {} @@ -177,6 +181,7 @@ class PlanBuilder { } /// @param outputType List of column names and types to read from the table. + /// This property is required. TableScanBuilder& outputType(RowTypePtr outputType) { outputType_ = std::move(outputType); return *this; diff --git a/velox/experimental/wave/common/Block.cuh b/velox/experimental/wave/common/Block.cuh index 1ab27fcc265b9..1b1281f276452 100644 --- a/velox/experimental/wave/common/Block.cuh +++ b/velox/experimental/wave/common/Block.cuh @@ -16,39 +16,98 @@ #pragma once +#include #include #include +#include +#include "velox/experimental/wave/common/CudaUtil.cuh" /// Utilities for booleans and indices and thread blocks. namespace facebook::velox::wave { +/// Converts an array of flags to an array of indices of set flags. The first +/// index is given by 'start'. The number of indices is returned in 'size', i.e. +/// this is 1 + the index of the last set flag. template < + typename T, int32_t blockSize, + cub::BlockScanAlgorithm Algorithm = cub::BLOCK_SCAN_RAKING> +inline int32_t __device__ __host__ boolToIndicesSharedSize() { + typedef cub::BlockScan BlockScanT; + + return sizeof(typename BlockScanT::TempStorage); +} + +/// Converts an array of flags to an array of indices of set flags. The first +/// index is given by 'start'. The number of indices is returned in 'size', i.e. +/// this is 1 + the index of the last set flag. 
+template < + int32_t blockSize, + typename T, cub::BlockScanAlgorithm Algorithm = cub::BLOCK_SCAN_RAKING, typename Getter> -__device__ inline void boolBlockToIndices( - Getter getter, - int32_t start, - int32_t* indices, - void* shmem, - int32_t& size) { - typedef cub::BlockScan BlockScanT; +__device__ inline void +boolBlockToIndices(Getter getter, T start, T* indices, void* shmem, T& size) { + typedef cub::BlockScan BlockScanT; auto* temp = reinterpret_cast(shmem); - int data[1]; + T data[1]; uint8_t flag = getter(); data[0] = flag; __syncthreads(); - int aggregate; + T aggregate; BlockScanT(*temp).ExclusiveSum(data, data, aggregate); - __syncthreads(); if (flag) { indices[data[0]] = threadIdx.x + start; } if (threadIdx.x == 0) { size = aggregate; } + __syncthreads(); +} + +inline int32_t __device__ __host__ bool256ToIndicesSize() { + return sizeof(typename cub::WarpScan::TempStorage) + + 33 * sizeof(uint16_t); +} + +/// Returns indices of set bits for 256 one byte flags. 'getter8' is +/// invoked for 8 flags at a time, with the ordinal of the 8 byte +/// flags word as argument, so that an index of 1 means flags +/// 8..15. The indices start at 'start' and last index + 1 is +/// returned in 'size'. +template +__device__ inline void +bool256ToIndices(Getter8 getter8, T start, T* indices, T& size, char* smem) { + using Scan = cub::WarpScan; + auto* smem16 = reinterpret_cast(smem); + int32_t group = threadIdx.x / 8; + uint64_t bits = getter8(group) & 0x0101010101010101; + if ((threadIdx.x & 7) == 0) { + smem16[group] = __popcll(bits); + if (threadIdx.x == blockDim.x - 8) { + smem16[32] = smem16[31]; + } + } + __syncthreads(); + if (threadIdx.x < 32) { + auto* temp = reinterpret_cast((smem + 72)); + uint16_t data = smem16[threadIdx.x]; + Scan(*temp).ExclusiveSum(data, data); + smem16[threadIdx.x] = data; + } + __syncthreads(); + int32_t tidInGroup = threadIdx.x & 7; + if (bits & (1UL << (tidInGroup * 8))) { + int32_t base = + smem16[group] + __popcll(bits & lowMask(tidInGroup * 8)); + indices[base] = threadIdx.x + start; + } + if (threadIdx.x == 0) { + size = smem16[31] + smem16[32]; + } + __syncthreads(); } template @@ -65,4 +124,164 @@ __device__ inline void blockSum(Getter getter, void* shmem, T* result) { } } +template < + int32_t kBlockSize, + int32_t kItemsPerThread, + typename Key, + typename Value> +using RadixSort = + typename cub::BlockRadixSort; + +template < + int32_t kBlockSize, + int32_t kItemsPerThread, + typename Key, + typename Value> +inline int32_t __host__ __device__ blockSortSharedSize() { + return sizeof( + typename RadixSort::TempStorage); +} + +template < + int32_t kBlockSize, + int32_t kItemsPerThread, + typename Key, + typename Value, + typename KeyGetter, + typename ValueGetter> +void __device__ blockSort( + KeyGetter keyGetter, + ValueGetter valueGetter, + Key* keyOut, + Value* valueOut, + char* smem) { + using Sort = cub::BlockRadixSort; + + // Per-thread tile items + Key keys[kItemsPerThread]; + Value values[kItemsPerThread]; + + // Our current block's offset + int blockOffset = 0; + + // Load items into a blocked arrangement + for (auto i = 0; i < kItemsPerThread; ++i) { + int32_t idx = blockOffset + i * kBlockSize + threadIdx.x; + values[i] = valueGetter(idx); + keys[i] = keyGetter(idx); + } + + __syncthreads(); + auto* temp_storage = reinterpret_cast(smem); + + Sort(*temp_storage).SortBlockedToStriped(keys, values); + + // Store output in striped fashion + cub::StoreDirectStriped( + threadIdx.x, valueOut + blockOffset, values); + 
cub::StoreDirectStriped(threadIdx.x, keyOut + blockOffset, keys); + __syncthreads(); +} + +template +int32_t partitionRowsSharedSize(int32_t numPartitions) { + using Scan = cub::BlockScan; + auto scanSize = sizeof(typename Scan::TempStorage) + sizeof(int32_t); + int32_t counterSize = sizeof(int32_t) * numPartitions; + if (counterSize <= scanSize) { + return scanSize; + } + static_assert( + sizeof(typename Scan::TempStorage) >= sizeof(int32_t) * kBlockSize); + return scanSize + counterSize; // - kBlockSize * sizeof(int32_t); +} + +/// Partitions a sequence of indices into runs where the indices +/// belonging to the same partition are contiguous. Indices from 0 to +/// 'numKeys-1' are partitioned into 'partitionedRows', which must +/// have space for 'numKeys' row numbers. The 0-based partition number +/// for row 'i' is given by 'getter(i)'. The row numbers for +/// partition 0 start at 0. The row numbers for partition i start at +/// 'partitionStarts[i-1]'. There must be at least the amount of +/// shared memory given by partitionSharedSize(numPartitions). +/// 'ranks' is a temporary array of 'numKeys' elements. +template +void __device__ partitionRows( + Getter getter, + uint32_t numKeys, + uint32_t numPartitions, + RowNumber* ranks, + RowNumber* partitionStarts, + RowNumber* partitionedRows) { + using Scan = cub::BlockScan; + constexpr int32_t kWarpThreads = 1 << CUB_LOG_WARP_THREADS(0); + auto warp = threadIdx.x / kWarpThreads; + auto lane = cub::LaneId(); + extern __shared__ __align__(16) char smem[]; + auto* counters = reinterpret_cast( + numPartitions <= kBlockSize ? smem + : smem + + sizeof(typename Scan:: + TempStorage) /*- kBlockSize * sizeof(uint32_t)*/); + for (auto i = threadIdx.x; i < numPartitions; i += kBlockSize) { + counters[i] = 0; + } + __syncthreads(); + for (auto start = 0; start < numKeys; start += kBlockSize) { + int32_t warpStart = start + warp * kWarpThreads; + if (start >= numKeys) { + break; + } + uint32_t laneMask = warpStart + kWarpThreads <= numKeys + ? 0xffffffff + : lowMask(numKeys - warpStart); + if (warpStart + lane < numKeys) { + int32_t key = getter(warpStart + lane); + uint32_t mask = __match_any_sync(laneMask, key); + int32_t leader = (kWarpThreads - 1) - __clz(mask); + uint32_t cnt = __popc(mask & lowMask(lane + 1)); + uint32_t base; + if (lane == leader) { + base = atomicAdd(&counters[key], cnt); + } + base = __shfl_sync(laneMask, base, leader); + ranks[warpStart + lane] = base + cnt - 1; + } + } + // Prefix sum the counts. All counters must have their final value. + __syncthreads(); + auto* temp = reinterpret_cast(smem); + int32_t* aggregate = reinterpret_cast(smem); + for (auto start = 0; start < numPartitions; start += kBlockSize) { + int32_t localCount[1]; + localCount[0] = + threadIdx.x + start < numPartitions ? counters[start + threadIdx.x] : 0; + if (threadIdx.x == 0 && start > 0) { + // The sum of the previous round is carried over as start of this. + localCount[0] += *aggregate; + } + Scan(*temp).InclusiveSum(localCount, localCount); + if (start + threadIdx.x < numPartitions) { + partitionStarts[start + threadIdx.x] = localCount[0]; + } + if (threadIdx.x == kBlockSize - 1 && start + kBlockSize < numPartitions) { + *aggregate = localCount[0]; + } + __syncthreads(); + } + if (threadIdx.x == 0) { + if (partitionStarts[numPartitions - 1] != numKeys) { + *(long*)0 = 0; + } + } + // Write the row numbers of the inputs into the rankth position in each + // partition. 
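To make the scatter step of `partitionRows` concrete before the final loop below, here is a small worked example; the values are derived by tracing the code above and are illustrative only:

```cpp
// Worked example for partitionRows: numKeys = 4, numPartitions = 2,
// getter(i) yields keys {1, 0, 1, 0}.
//   counters        -> {2, 2}        (two rows land in each partition)
//   partitionStarts -> {2, 4}        (inclusive prefix sums of counters)
//   ranks           -> {0, 0, 1, 1}  (arrival order within each partition)
// The scatter below writes partitionedRows[keyStart + ranks[i]] = i, giving
// partitionedRows = {1, 3, 0, 2}: rows 1 and 3 (key 0) occupy positions
// 0..1, rows 0 and 2 (key 1) occupy positions 2..3.
```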
+  for (auto i = threadIdx.x; i < numKeys; i += kBlockSize) {
+    auto key = getter(i);
+    auto keyStart = key == 0 ? 0 : partitionStarts[key - 1];
+    partitionedRows[keyStart + ranks[i]] = i;
+  }
+  __syncthreads();
+}
+
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/Buffer.h b/velox/experimental/wave/common/Buffer.h
index 57451596a94b7..a205e173ffccf 100644
--- a/velox/experimental/wave/common/Buffer.h
+++ b/velox/experimental/wave/common/Buffer.h
@@ -32,6 +32,8 @@ class GpuArena;
 /// Buffer free list.
 class Buffer {
  public:
+  virtual ~Buffer() = default;
+
   template <typename T>
   T* as() {
     return reinterpret_cast<T*>(ptr_);
@@ -71,9 +73,9 @@
     return referenceCount_;
   }
 
-  void release();
+  virtual void release();
 
- private:
+ protected:
   // Number of WaveBufferPtrs referencing 'this'.
   std::atomic<int32_t> referenceCount_{0};
@@ -108,4 +110,34 @@ static inline void intrusive_ptr_release(Buffer* buffer) {
   buffer->release();
 }
 
+template <typename Releaser>
+class WaveBufferView : public Buffer {
+ public:
+  static WaveBufferPtr create(uint8_t* data, size_t size, Releaser releaser) {
+    WaveBufferView* view = new WaveBufferView(data, size, releaser);
+    WaveBufferPtr result(view);
+    return result;
+  }
+
+  ~WaveBufferView() override = default;
+
+  void release() override {
+    if (referenceCount_.fetch_sub(1) == 1) {
+      // Destructs the releaser, which should release the hold on the
+      // underlying buffer.
+      delete this;
+    }
+  }
+
+ private:
+  WaveBufferView(uint8_t* data, size_t size, Releaser releaser)
+      : Buffer(), releaser_(releaser) {
+    ptr_ = data;
+    size_ = size;
+    capacity_ = size;
+  }
+
+  Releaser const releaser_;
+};
+
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/Cuda.cu b/velox/experimental/wave/common/Cuda.cu
index 10e716065a244..dd63f28b82439 100644
--- a/velox/experimental/wave/common/Cuda.cu
+++ b/velox/experimental/wave/common/Cuda.cu
@@ -16,10 +16,13 @@
 
 #include <cuda_runtime.h>
 #include <fmt/format.h>
+#include <sstream>
 #include "velox/experimental/wave/common/Cuda.h"
 #include "velox/experimental/wave/common/CudaUtil.cuh"
 #include "velox/experimental/wave/common/Exception.h"
 
+#include <iostream>
+
 namespace facebook::velox::wave {
 
 void cudaCheck(cudaError_t err, const char* file, int line) {
@@ -30,6 +33,16 @@ void cudaCheck(cudaError_t err, const char* file, int line) {
       fmt::format("Cuda error: {}:{} {}", file, line, cudaGetErrorString(err)));
 }
 
+void cudaCheckFatal(cudaError_t err, const char* file, int line) {
+  if (err == cudaSuccess) {
+    return;
+  }
+  auto error =
+      fmt::format("Cuda error: {}:{} {}", file, line, cudaGetErrorString(err));
+  std::cerr << error << std::endl;
+  exit(1);
+}
+
 namespace {
 class CudaManagedAllocator : public GpuAllocator {
  public:
@@ -208,5 +221,67 @@ float Event::elapsedTime(const Event& start) const {
   CUDA_CHECK(cudaEventElapsedTime(&ms, start.event_->event, event_->event));
   return ms;
 }
+namespace {
+struct KernelEntry {
+  const char* name;
+  const void* func;
+};
+
+int32_t numKernelEntries = 0;
+KernelEntry kernelEntries[200];
+} // namespace
+
+bool registerKernel(const char* name, const void* func) {
+  kernelEntries[numKernelEntries].name = name;
+  kernelEntries[numKernelEntries].func = func;
+  ++numKernelEntries;
+  if (numKernelEntries >= sizeof(kernelEntries) / sizeof(kernelEntries[0])) {
+    LOG(ERROR) << "Reserve more space in kernelEntries";
+    exit(1);
+  }
+  return true;
+}
+
+KernelInfo kernelInfo(const void* func) {
+  cudaFuncAttributes attrs;
+  CUDA_CHECK_FATAL(cudaFuncGetAttributes(&attrs, func));
+  KernelInfo info;
+  info.numRegs = attrs.numRegs;
+  info.maxThreadsPerBlock = attrs.maxThreadsPerBlock;
+  info.sharedMemory = attrs.sharedSizeBytes;
+  int max;
+  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max, func, 256, 0);
+  info.maxOccupancy0 = max;
+  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max, func, 256, 16);
+  info.maxOccupancy16 = max;
+
+  return info;
+}
+
+std::string KernelInfo::toString() const {
+  std::stringstream out;
+  out << "NumRegs=" << numRegs << " maxThreadsPerBlock=" << maxThreadsPerBlock
+      << " sharedMemory=" << sharedMemory
+      << " occupancy 256,0=" << maxOccupancy0
+      << " occupancy 256,16=" << maxOccupancy16;
+  return out.str();
+}
+
+KernelInfo getRegisteredKernelInfo(const char* name) {
+  for (auto i = 0; i < numKernelEntries; ++i) {
+    if (strcmp(name, kernelEntries[i].name) == 0) {
+      return kernelInfo(kernelEntries[i].func);
+    }
+  }
+  return KernelInfo();
+}
+
+void printKernels() {
+  for (auto i = 0; i < numKernelEntries; ++i) {
+    std::cout << kernelEntries[i].name << " - "
+              << getRegisteredKernelInfo(kernelEntries[i].name).toString()
+              << std::endl;
+  }
+}
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/Cuda.h b/velox/experimental/wave/common/Cuda.h
index b8e11806013d1..cf7900ec2f50a 100644
--- a/velox/experimental/wave/common/Cuda.h
+++ b/velox/experimental/wave/common/Cuda.h
@@ -18,6 +18,8 @@

 #include <cstdint>
 #include <memory>
+#include <string>
+#include <unordered_map>
 /// Contains wrappers for common Cuda objects. Wave does not directly
 /// include Cuda headers because of interference with BitUtils.h and
 /// SimdUtils.h.
@@ -183,4 +185,23 @@ GpuAllocator::UniquePtr GpuAllocator::allocate(size_t n) {
   return UniquePtr(ptr, Deleter(this, bytes));
 }

+/// Info on kernel occupancy limits.
+struct KernelInfo {
+  int32_t numRegs{0};
+  int32_t maxThreadsPerBlock{0};
+  int32_t sharedMemory{0};
+  int32_t maxOccupancy0{0};
+  int32_t maxOccupancy16{0};
+
+  std::string toString() const;
+};
+
+KernelInfo getRegisteredKernelInfo(const char* name);
+
+KernelInfo kernelInfo(const void* func);
+
+std::unordered_map<std::string, const void*>& kernelRegistry();
+
+/// Prints summary of registered kernels.
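+/// For instance, after REGISTER_KERNEL("sum64", sum64) one can write (an
+/// illustrative sketch):
+///
+///   KernelInfo info = getRegisteredKernelInfo("sum64");
+///   std::cout << info.toString() << std::endl; // regs, smem, occupancy
+///   printKernels(); // or dump every registered kernel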
+void printKernels();
+
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/CudaUtil.cuh b/velox/experimental/wave/common/CudaUtil.cuh
index 07549cfc3f32f..120ce1ec90371 100644
--- a/velox/experimental/wave/common/CudaUtil.cuh
+++ b/velox/experimental/wave/common/CudaUtil.cuh
@@ -25,13 +25,25 @@ namespace facebook::velox::wave {

 void cudaCheck(cudaError_t err, const char* file, int line);

+void cudaCheckFatal(cudaError_t err, const char* file, int line);
+
 #define CUDA_CHECK(e) ::facebook::velox::wave::cudaCheck(e, __FILE__, __LINE__)

+#ifndef CUDA_CHECK_FATAL
+#define CUDA_CHECK_FATAL(e) \
+  ::facebook::velox::wave::cudaCheckFatal(e, __FILE__, __LINE__)
+#endif
+
 template <typename T, typename U>
 __host__ __device__ constexpr inline T roundUp(T value, U factor) {
   return (value + (factor - 1)) / factor * factor;
 }

+template <typename T>
+T __device__ __host__ lowMask(int32_t bits) {
+  return (static_cast<T>(1) << bits) - 1;
+}
+
 __device__ __host__ inline int
 memcmp(const void* lhs, const void* rhs, size_t n) {
   auto* a = reinterpret_cast<const uint8_t*>(lhs);
@@ -44,7 +56,20 @@ memcmp(const void* lhs, const void* rhs, size_t n) {
   return 0;
 }

+inline uint32_t __device__ deviceScale32(uint32_t n, uint32_t scale) {
+  return (static_cast<uint64_t>(n) * scale) >> 32;
+}
+
 struct StreamImpl {
   cudaStream_t stream;
 };
+
+bool registerKernel(const char* name, const void* func);
+
+#define REGISTER_KERNEL(name, func)                               \
+  namespace {                                                     \
+  static bool func##_reg =                                        \
+      registerKernel(name, reinterpret_cast<const void*>(func)); \
+  }
+
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/FreeSet.cuh b/velox/experimental/wave/common/FreeSet.cuh
new file mode 100644
index 0000000000000..c6e7f2bfc3cf1
--- /dev/null
+++ b/velox/experimental/wave/common/FreeSet.cuh
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace facebook::velox::wave {
+
+/// A fixed-capacity, lock-free set of items, used e.g. to recycle freed rows.
+template <typename T, int32_t kSize>
+class FreeSet {
+ public:
+  static constexpr uint32_t kEmpty = ~0;
+  static constexpr int32_t kBitSizeMask = (kSize / 64) - 1;
+  static constexpr int32_t kSizeMask = kSize - 1;
+
+  void __device__ clear() {
+    for (auto i = threadIdx.x; i < kSize; i += blockDim.x) {
+      if (i < sizeof(bits_) / sizeof(bits_[0])) {
+        bits_[i] = 0;
+      }
+      items_[i] = kEmpty;
+    }
+  }
+
+  // Adds an item. Returns true if it succeeds.
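+  // A sketch of the scheme: each thread starts at a bitmap word picked from
+  // its thread id, claims a free items_ slot with atomicCAS and only then
+  // publishes the slot by setting its bit, wrapping around until all
+  // kSize / 64 words have been tried.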
+ bool __device__ put(T item) { + if (full_) { + return false; + } + auto tid = threadIdx.x + blockDim.x * blockIdx.x; + auto bitIdx = tid & kBitSizeMask; + for (auto count = 0; count <= kBitSizeMask; ++count) { + auto word = ~bits_[bitIdx]; + while (word) { + auto bit = __ffsll(word); + --bit; + if (kEmpty == atomicCAS(&items_[bitIdx * 64 + bit], kEmpty, item)) { + atomicOr(&bits_[bitIdx], 1UL << bit); + if (empty_) { + atomicExch(&empty_, 0); + } + return true; + } + word &= word - 1; + } + bitIdx = bitIdx + 1 & kBitSizeMask; + } + atomicExch(&full_, 1); + return false; + } + + T __device__ get() { + if (empty_) { + return kEmpty; + } + + auto tid = threadIdx.x + blockDim.x * blockIdx.x; + auto bitIdx = tid & kBitSizeMask; + for (auto count = 0; count <= kBitSizeMask; ++count) { + auto word = bits_[bitIdx]; + while (word) { + auto bit = __ffsll(word); + --bit; + T item = atomicExch(&items_[bitIdx * 64 + bit], kEmpty); + if (item != kEmpty) { + atomicAnd(&bits_[bitIdx], ~(1UL << bit)); + if (full_) { + atomicExch(&full_, 0); + } + return item; + } + word &= word - 1; + } + bitIdx = bitIdx + 1 & kBitSizeMask; + } + atomicExch(&empty_, true); + return kEmpty; + } + + int32_t full_{0}; + int32_t empty_{1}; + unsigned long long bits_[kBitSizeMask + 1]; + T items_[kSize]; +}; +} // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/GpuArena.cpp b/velox/experimental/wave/common/GpuArena.cpp index 2a41f8859e0d6..2d2f4610036ce 100644 --- a/velox/experimental/wave/common/GpuArena.cpp +++ b/velox/experimental/wave/common/GpuArena.cpp @@ -272,7 +272,9 @@ std::string GpuSlab::toString() const { } GpuArena::Buffers::Buffers() { - memset(&buffers[0], 0, sizeof(buffers)); + for (auto i = 0; i < sizeof(buffers) / sizeof(buffers[0]); ++i) { + new (&buffers[i]) Buffer(); + } } GpuArena::GpuArena(uint64_t singleArenaCapacity, GpuAllocator* allocator) @@ -297,6 +299,7 @@ WaveBufferPtr GpuArena::getBuffer(void* ptr, size_t size) { result = firstFreeBuffer_; } firstFreeBuffer_ = reinterpret_cast(result->ptr_); + new (result) Buffer(); result->arena_ = this; result->ptr_ = ptr; result->size_ = size; @@ -325,10 +328,9 @@ WaveBufferPtr GpuArena::allocateBytes(uint64_t bytes) { // If first allocation fails we create a new GpuSlab for another attempt. If // it ever fails again then it means requested bytes is larger than a single // GpuSlab's capacity. No further attempts will happen. 
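+  // For example, with a 256MB 'singleArenaCapacity_', a 400MB request gets a
+  // dedicated 400MB slab via max(singleArenaCapacity_, bytes) instead of
+  // failing the retry.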
+ auto arenaBytes = std::max(singleArenaCapacity_, bytes); auto newArena = std::make_shared( - allocator_->allocate(singleArenaCapacity_), - singleArenaCapacity_, - allocator_); + allocator_->allocate(arenaBytes), arenaBytes, allocator_); arenas_.emplace(reinterpret_cast(newArena->address()), newArena); currentArena_ = newArena; result = currentArena_->allocate(bytes); diff --git a/velox/experimental/wave/common/GpuArena.h b/velox/experimental/wave/common/GpuArena.h index 8cb39948139aa..393899d9338cd 100644 --- a/velox/experimental/wave/common/GpuArena.h +++ b/velox/experimental/wave/common/GpuArena.h @@ -124,7 +124,7 @@ class GpuArena { WaveBufferPtr allocateBytes(uint64_t bytes); template - WaveBufferPtr allocate(int32_t items) { + WaveBufferPtr allocate(uint64_t items) { static_assert(std::is_trivially_destructible_v); return allocateBytes(sizeof(T) * items); } diff --git a/velox/experimental/wave/common/Hash.h b/velox/experimental/wave/common/Hash.h index ca3e51e95e265..c91f25407bdca 100644 --- a/velox/experimental/wave/common/Hash.h +++ b/velox/experimental/wave/common/Hash.h @@ -93,6 +93,17 @@ __device__ __host__ inline uint32_t twang32From64(uint64_t key) { return static_cast(key); } +__device__ inline uint64_t hashMix(const uint64_t upper, const uint64_t lower) { + // Murmur-inspired hashing. + const uint64_t kMul = 0x9ddfea08eb382d69ULL; + uint64_t a = (lower ^ upper) * kMul; + a ^= (a >> 47); + uint64_t b = (upper ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return b; +} + template struct IntHasher32 { __device__ __host__ uint32_t operator()(T val) const { diff --git a/velox/experimental/wave/common/HashTable.cuh b/velox/experimental/wave/common/HashTable.cuh new file mode 100644 index 0000000000000..9ece4aa0cfbce --- /dev/null +++ b/velox/experimental/wave/common/HashTable.cuh @@ -0,0 +1,368 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include "velox/experimental/wave/common/CudaUtil.cuh" +#include "velox/experimental/wave/common/FreeSet.cuh" +#include "velox/experimental/wave/common/Hash.h" +#include "velox/experimental/wave/common/HashTable.h" + +namespace facebook::velox::wave { + +#define GPF() *(long*)0 = 0 + +template +inline __device__ cuda::atomic* asDeviceAtomic( + U* ptr) { + return reinterpret_cast*>(ptr); +} + +template +inline bool __device__ atomicTryLock(T* lock) { + return 0 == + asDeviceAtomic(lock)->exchange(1, cuda::memory_order_consume); +} + +template +inline void __device__ atomicUnlock(T* lock) { + asDeviceAtomic(lock)->store(0, cuda::memory_order_release); +} + +/// Allocator subclass that defines device member functions. 
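+/// Rows are carved from the low end of the partition by atomicAdd on
+/// 'rowOffset'; string space is carved from the high end by atomicSub on
+/// 'stringOffset'. The partition is out of space when the two offsets would
+/// cross. Freed rows are recycled through 'freeSet' before new space is used.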
+struct RowAllocator : public HashPartitionAllocator {
+  template <typename T>
+  T* __device__ allocateRow() {
+    auto fromFree = getFromFree();
+    if (fromFree != kEmpty) {
+      return reinterpret_cast<T*>(base + fromFree);
+    }
+    auto offset = atomicAdd(&rowOffset, rowSize);
+
+    if (offset + rowSize < cub::ThreadLoad<cub::LOAD_CV>(&stringOffset)) {
+      if (!inRange(base + offset)) {
+        GPF();
+      }
+      return reinterpret_cast<T*>(base + offset);
+    }
+    return nullptr;
+  }
+
+  uint32_t __device__ getFromFree() {
+    uint32_t item = reinterpret_cast<FreeSet<uint32_t, 1024>*>(freeSet)->get();
+    if (item != kEmpty) {
+      ++numFromFree;
+    }
+    return item;
+  }
+
+  void __device__ freeRow(void* row) {
+    if (!inRange(row)) {
+      GPF();
+    }
+    uint32_t offset = reinterpret_cast<uint64_t>(row) - base;
+    numFull += reinterpret_cast<FreeSet<uint32_t, 1024>*>(freeSet)->put(
+                   offset) == false;
+  }
+
+  template <typename T>
+  T* __device__ allocate(int32_t cnt) {
+    uint32_t size = sizeof(T) * cnt;
+    auto offset = atomicSub(&stringOffset, size);
+    if (offset - size > cub::ThreadLoad<cub::LOAD_CV>(&rowOffset)) {
+      if (!inRange(base + offset - size)) {
+        GPF();
+      }
+      return reinterpret_cast<T*>(base + offset - size);
+    }
+    return nullptr;
+  }
+
+  template <typename T>
+  bool __device__ inRange(T ptr) {
+    return reinterpret_cast<uint64_t>(ptr) >= base &&
+        reinterpret_cast<uint64_t>(ptr) < base + capacity;
+  }
+};
+
+inline uint8_t __device__ hashTag(uint64_t h) {
+  return 0x80 | (h >> 32);
+}
+
+struct GpuBucket : public GpuBucketMembers {
+  template <typename RowType>
+  inline RowType* __device__ load(int32_t idx) const {
+    uint64_t uptr = reinterpret_cast<const uint32_t*>(&data)[idx];
+    if (uptr == 0) {
+      return nullptr;
+    }
+    uptr |= static_cast<uint64_t>(data[idx + 8]) << 32;
+    return reinterpret_cast<RowType*>(uptr);
+  }
+
+  template <typename RowType>
+  inline RowType* __device__ loadConsume(int32_t idx) {
+    uint64_t uptr =
+        asDeviceAtomic<uint32_t>(&data)[idx].load(cuda::memory_order_consume);
+    if (uptr == 0) {
+      return nullptr;
+    }
+    uptr |= static_cast<uint64_t>(data[idx + 8]) << 32;
+    return reinterpret_cast<RowType*>(uptr);
+  }
+
+  template <typename RowType>
+  inline RowType* __device__ loadWithWait(int32_t idx) {
+    RowType* hit;
+    do {
+      // It could be somebody inserted the tag but did not fill in the
+      // pointer. The pointer is coming in a few clocks.
+      hit = loadConsume<RowType>(idx);
+    } while (!hit);
+    return hit;
+  }
+
+  inline void __device__ store(int32_t idx, void* ptr) {
+    auto uptr = reinterpret_cast<uint64_t>(ptr);
+    data[8 + idx] = uptr >> 32;
+    // The high part must be seen if the low part is seen.
+    asDeviceAtomic<uint32_t>(&data)[idx].store(
+        uptr, cuda::memory_order_release);
+  }
+
+  bool __device__ addNewTag(uint8_t tag, uint32_t oldTags, uint8_t tagShift) {
+    uint32_t newTags = oldTags | (static_cast<uint32_t>(tag) << tagShift);
+    return (oldTags == atomicCAS(&tags, oldTags, newTags));
+  }
+};
+
+/// Shared memory state for an updating probe.
+struct ProbeShared {
+  int32_t* inputRetries;
+  int32_t* outputRetries;
+  uint32_t numKernelRetries;
+  uint32_t numHostRetries;
+  int32_t blockBase;
+  int32_t blockEnd;
+  int32_t numRounds;
+  int32_t toDo;
+  int32_t done;
+  int32_t numUpdated;
+  int32_t numTried;
+
+  /// Initializes a probe. Sets outputRetries and clears inputRetries and other
+  /// state.
+  void __device__ init(HashProbe* probe, int32_t base) {
+    inputRetries = nullptr;
+    outputRetries = probe->kernelRetries1;
+    numKernelRetries = 0;
+    numHostRetries = 0;
+    blockBase = base;
+    toDo = 0;
+    done = 0;
+    numRounds = 0;
+  }
+
+  // Resets the retry count and swaps input and output retries.
+  void __device__ nextRound(HashProbe* probe) {
+    numKernelRetries = 0;
+    if (!inputRetries) {
+      // This is after the initial round where there are no input retries.
+ inputRetries = outputRetries; + outputRetries = probe->kernelRetries2; + } else { + // swap input and output retries. + auto temp = outputRetries; + outputRetries = inputRetries; + inputRetries = temp; + } + } +}; + +class GpuHashTable : public GpuHashTableBase { + public: + static constexpr int32_t kExclusive = 1; + + static int32_t updatingProbeSharedSize() { + return sizeof(ProbeShared); + } + + template + void __device__ readOnlyProbe(HashProbe* probe, Ops ops) { + int32_t blockBase = ops.blockBase(probe); + int32_t end = ops.numRowsInBlock(probe) + blockBase; + for (auto i = blockBase + threadIdx.x; i < end; i += blockDim.x) { + auto h = ops.hash(i, probe); + uint32_t tagWord = hashTag(h); + tagWord |= tagWord << 8; + tagWord = tagWord | tagWord << 16; + auto bucketIdx = h & sizeMask; + for (;;) { + GpuBucket* bucket = buckets + bucketIdx; + auto tags = bucket->tags; + auto hits = __vcmpeq4(tags, tagWord) & 0x01010101; + while (hits) { + auto hitIdx = (__ffs(hits) - 1) / 8; + auto* hit = bucket->load(hitIdx); + if (ops.compare(this, hit, i, probe)) { + ops.hit(i, probe, hit); + goto done; + } + hits = hits & (hits - 1); + } + if (__vcmpeq4(tags, 0)) { + ops.miss(i, probe); + break; + } + bucketIdx = (bucketIdx + 1) & sizeMask; + } + done:; + } + } + + template + void __device__ updatingProbe(HashProbe* probe, Ops ops) { + extern __shared__ __align__(16) char smem[]; + auto* sharedState = reinterpret_cast(smem); + if (threadIdx.x == 0) { + sharedState->init(probe, ops.blockBase(probe)); + } + __syncthreads(); + auto lane = cub::LaneId(); + constexpr int32_t kWarpThreads = 1 << CUB_LOG_WARP_THREADS(0); + auto warp = threadIdx.x / kWarpThreads; + int32_t end = ops.numRowsInBlock(probe) + sharedState->blockBase; + for (auto i = threadIdx.x + sharedState->blockBase; i < end; + i += blockDim.x) { + auto start = i & ~(kWarpThreads - 1); + uint32_t laneMask = + start + kWarpThreads <= end ? ~0 : lowMask(end - start); + auto h = ops.hash(i, probe); + uint32_t tagWord = hashTag(h); + tagWord |= tagWord << 8; + tagWord = tagWord | tagWord << 16; + auto bucketIdx = h & sizeMask; + uint32_t misses = 0; + RowType* hit = nullptr; + RowType* toInsert = nullptr; + int32_t hitIdx; + GpuBucket* bucket; + uint32_t tags; + for (;;) { + bucket = buckets + bucketIdx; + reprobe: + tags = asDeviceAtomic(&bucket->tags) + ->load(cuda::memory_order_consume); + auto hits = __vcmpeq4(tags, tagWord) & 0x01010101; + while (hits) { + hitIdx = (__ffs(hits) - 1) / 8; + auto candidate = bucket->loadWithWait(hitIdx); + if (ops.compare(this, candidate, i, probe)) { + if (toInsert) { + freeInsertable(toInsert, h); + } + hit = candidate; + break; + } + hits = hits & (hits - 1); + } + if (hit) { + break; + } + misses = __vcmpeq4(tags, 0); + if (misses) { + auto success = ops.insert( + this, + partitionIdx(h), + bucket, + misses, + tags, + tagWord, + i, + probe, + toInsert); + if (success == ProbeState::kRetry) { + goto reprobe; + } + if (success == ProbeState::kNeedSpace) { + addHostRetry(sharedState, i, probe); + } + hit = toInsert; + break; + } + bucketIdx = (bucketIdx + 1) & sizeMask; + } + // Every lane has a hit, or a nullptr if out of space. 
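+      // The lanes that found the same row now elect a leader; the leader
+      // alone acquires the row and applies its peers' updates one by one, so
+      // a hot group is updated once per row rather than once per lane.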
+      uint32_t peers =
+          __match_any_sync(laneMask, reinterpret_cast<uint64_t>(hit));
+      if (hit) {
+        int32_t leader = (kWarpThreads - 1) - __clz(peers);
+        RowType* writable = nullptr;
+        if (lane == leader) {
+          writable = ops.getExclusive(this, bucket, hit, hitIdx, warp);
+        }
+        auto toUpdate = peers;
+        ProbeState success = ProbeState::kDone;
+        while (toUpdate) {
+          auto peer = __ffs(toUpdate) - 1;
+          auto idxToUpdate = __shfl_sync(peers, i, peer);
+          if (lane == leader) {
+            if (success == ProbeState::kDone) {
+              success = ops.update(this, bucket, writable, idxToUpdate, probe);
+            }
+            if (success == ProbeState::kNeedSpace) {
+              addHostRetry(sharedState, idxToUpdate, probe);
+            }
+            if (success != ProbeState::kDone) {
+              printf(""); // Debug hook: unexpected update state.
+            }
+          }
+          toUpdate &= toUpdate - 1;
+        }
+        if (lane == leader) {
+          ops.writeDone(writable);
+        }
+      } else {
+        printf(""); // Debug hook: out of space, the row was not inserted.
+      }
+    }
+  }
+
+  template <typename RowType>
+  void __device__ freeInsertable(RowType*& row, uint64_t h) {
+    allocators[partitionIdx(h)].freeRow(row);
+    row = nullptr;
+  }
+
+  int32_t __device__ partitionIdx(uint64_t h) const {
+    return (h & partitionMask) >> partitionShift;
+  }
+
+ private:
+  static void __device__
+  addHostRetry(ProbeShared* shared, int32_t i, HashProbe* probe) {
+    probe->hostRetries
+        [shared->blockBase + atomicAdd(&shared->numHostRetries, 1)] = i;
+  }
+};
+} // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/HashTable.h b/velox/experimental/wave/common/HashTable.h
new file mode 100644
index 0000000000000..54dec795bb3d2
--- /dev/null
+++ b/velox/experimental/wave/common/HashTable.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+/// Structs for tagged GPU hash table. Can be included in both Velox .cpp and
+/// .cu.
+namespace facebook::velox::wave {
+
+/// A 32 byte tagged bucket with 4 tags, 4 flag bytes and 4 6-byte
+/// pointers. Fits in one 32 byte GPU cache sector.
+struct GpuBucketMembers {
+  uint32_t tags;
+  uint32_t flags;
+  uint16_t data[12];
+
+  template <typename T>
+  T* testingLoad(int32_t idx) {
+    auto uptr = static_cast<uint64_t>(data[8 + idx]) << 32;
+    uptr |= reinterpret_cast<uint32_t*>(data)[idx];
+    return reinterpret_cast<T*>(uptr);
+  }
+};
+
+template <typename T, int32_t kSize>
+class FreeSetBase {
+  int32_t full_{0};
+  int32_t empty_{1};
+  unsigned long long bits_[kSize / 64] = {};
+  T items_[kSize] = {};
+};
+
+/// A device arena for device side allocation.
+struct HashPartitionAllocator {
+  static constexpr uint32_t kEmpty = ~0;
+
+  HashPartitionAllocator(
+      char* data,
+      uint32_t size,
+      uint32_t rowSize,
+      void* freeSet)
+      : rowSize(rowSize),
+        base(reinterpret_cast<uint64_t>(data)),
+        capacity(size),
+        stringOffset(capacity),
+        freeSet(freeSet) {}
+
+  const int32_t rowSize{0};
+  const uint64_t base{0};
+  uint32_t rowOffset{0};
+  const uint32_t capacity{0};
+  uint32_t stringOffset{0};
+  void* freeSet{nullptr};
+  int32_t numFromFree{0};
+  int32_t numFull{0};
+};
+
+/// Implementation of HashPartitionAllocator, defined in .cuh.
+struct RowAllocator; + +enum class ProbeState : uint8_t { kDone, kMoreValues, kNeedSpace, kRetry }; + +/// Operands for one TB of hash probe. +struct HashProbe { + /// The number of input rows processed by each thread of a TB. The base index + /// for a block in the arrays in 'this' is 'numRowsPerThread * blockDim.x * + /// blockIdx.x' + int32_t numRowsPerThread{1}; + + /// Count of probe keys for each TB. Subscript is blockIdx.x. + int32_t* numRows; + + /// Data for probe keys. To be interpreted by Ops of the probe, no + /// fixed format. + void* keys; + + /// Hash numbers for probe keys. + uint64_t* hashes; + + /// List of input rows to retry in kernel. Sized to one per row of + /// input. Used inside kernel, not meaningful after return. Sample + /// use case is another warp updating the same row. + int32_t* kernelRetries1; + int32_t* kernelRetries2; + + /// List of input rows to retry after host updated state. Sized to + /// one per row of input. The reason for a host side retry is + /// needing more space. The host will decide to allocate/spill/error + /// out. + int32_t* hostRetries; + + /// Count of valid items in 'hostRetries'. The subscript is blockIdx.x. + int32_t* numHostRetries; + + /// Space in 'hits' and 'hitRows'. Should be a multiple of probe block width. + int32_t maxHits{0}; + + /// Row numbers for hits. Indices into 'hashes'. + int32_t* hitRows{nullptr}; + + // Optional payload rows hitting from a probe. + void** hits{nullptr}; +}; + +struct GpuBucket; + +struct GpuHashTableBase { + /// Bucket array. Size is 'sizeMask + 1'. + GpuBucket* buckets{nullptr}; + + // Mask to extract index into 'buckets' from a hash number. a + // sizemask of 63 means 64 buckets, which is up to 256 entries. + uint32_t sizeMask; + + // Translates a hash number to a partition number '(hash & + // partitionMask) >> partitionShift' is a partition number used as + // a physical partition of the table. Used as index into 'allocators'. + uint32_t partitionMask{0}; + uint8_t partitionShift{0}; + + /// A RowAllocator for each partition. + RowAllocator* allocators; +}; + +} // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/tests/BlockTest.cpp b/velox/experimental/wave/common/tests/BlockTest.cpp index 012010233e71a..b5b543c450330 100644 --- a/velox/experimental/wave/common/tests/BlockTest.cpp +++ b/velox/experimental/wave/common/tests/BlockTest.cpp @@ -27,6 +27,15 @@ using namespace facebook::velox; using namespace facebook::velox::wave; +constexpr int32_t kNumPartitionBlocks = 100; +struct PartitionRun { + uint16_t* keys[kNumPartitionBlocks]; + int32_t numRows[kNumPartitionBlocks]; + int32_t* ranks[kNumPartitionBlocks]; + int32_t* partitionStarts[kNumPartitionBlocks]; + int32_t* partitionedRows[kNumPartitionBlocks]; +}; + class BlockTest : public testing::Test { protected: void SetUp() override { @@ -39,77 +48,301 @@ class BlockTest : public testing::Test { void prefetch(Stream& stream, WaveBufferPtr buffer) { stream.prefetch(device_, buffer->as(), buffer->capacity()); } + void testBoolToIndices(bool use256) { + /// We make a set of 256 flags and corresponding 256 indices of true flags. 
+ constexpr int32_t kNumBlocks = 20480; + constexpr int32_t kBlockSize = 256; + constexpr int32_t kNumFlags = kBlockSize * kNumBlocks; + auto flagsBuffer = arena_->allocate(kNumFlags); + auto indicesBuffer = arena_->allocate(kNumFlags); + auto sizesBuffer = arena_->allocate(kNumBlocks); + BlockTestStream stream; + + std::vector referenceIndices(kNumFlags); + std::vector referenceSizes(kNumBlocks); + uint8_t* flags = flagsBuffer->as(); + for (auto i = 0ul; i < kNumFlags; ++i) { + if ((i >> 8) % 17 == 0) { + flags[i] = 0; + } else if ((i >> 8) % 23 == 0) { + flags[i] = 1; + } else { + flags[i] = (i * 1121) % 73 > 50; + } + } + for (auto b = 0; b < kNumBlocks; ++b) { + auto start = b * kBlockSize; + int32_t counter = start; + for (auto i = 0; i < kBlockSize; ++i) { + if (flags[start + i]) { + referenceIndices[counter++] = start + i; + } + } + referenceSizes[b] = counter - start; + } + + prefetch(stream, flagsBuffer); + prefetch(stream, indicesBuffer); + prefetch(stream, sizesBuffer); + stream.wait(); + auto indicesPointers = arena_->allocate(kNumBlocks); + auto flagsPointers = arena_->allocate(kNumBlocks); + for (auto i = 0; i < kNumBlocks; ++i) { + flagsPointers->as()[i] = flags + (i * kBlockSize); + indicesPointers->as()[i] = + indicesBuffer->as() + (i * kBlockSize); + } + + prefetch(stream, flagsBuffer); + prefetch(stream, indicesBuffer); + prefetch(stream, sizesBuffer); + stream.wait(); + auto startMicros = getCurrentTimeMicro(); + if (use256) { + stream.testBool256ToIndices( + kNumBlocks, + flagsPointers->as(), + indicesPointers->as(), + sizesBuffer->as()); + + } else { + stream.testBoolToIndices( + kNumBlocks, + flagsPointers->as(), + indicesPointers->as(), + sizesBuffer->as()); + } + stream.wait(); + auto elapsed = getCurrentTimeMicro() - startMicros; + for (auto b = 0; b < kNumBlocks; ++b) { + auto* reference = referenceIndices.data() + b * kBlockSize; + auto* actual = indicesBuffer->as() + b * kBlockSize; + auto* referenceSizesData = referenceSizes.data(); + auto* actualSizes = sizesBuffer->as(); + ASSERT_EQ( + 0, ::memcmp(reference, actual, referenceSizes[b] * sizeof(int32_t))); + ASSERT_EQ(referenceSizesData[b], actualSizes[b]); + } + std::cout << "Flags " << (use256 ? "256" : "") << " to indices: " << elapsed + << "us, " << kNumFlags / static_cast(elapsed) << " Mrows/s" + << std::endl; + + auto temp = arena_->allocate( + BlockTestStream::boolToIndicesSize() * kNumBlocks); + prefetch(stream, temp); + prefetch(stream, flagsBuffer); + prefetch(stream, indicesBuffer); + prefetch(stream, sizesBuffer); + stream.wait(); + + startMicros = getCurrentTimeMicro(); + if (use256) { + stream.testBool256ToIndicesNoShared( + kNumBlocks, + flagsPointers->as(), + indicesPointers->as(), + sizesBuffer->as(), + temp->as()); + } else { + stream.testBoolToIndicesNoShared( + kNumBlocks, + flagsPointers->as(), + indicesPointers->as(), + sizesBuffer->as(), + temp->as()); + } + stream.wait(); + elapsed = getCurrentTimeMicro() - startMicros; + std::cout << "Flags " << (use256 ? 
"256" : "") + << " to indices: " << " to indices no smem: " << elapsed << "us, " + << kNumFlags / static_cast(elapsed) << " Mrows/s" + << std::endl; + } + + void makePartitionRun( + int32_t numRows, + int32_t numPartitions, + PartitionRun*& run, + WaveBufferPtr& buffer) { + auto rowsRounded = bits::roundUp(numRows, 8); + auto partitionsRounded = bits::roundUp(numPartitions, 8); + int64_t bytes = sizeof(PartitionRun) + + kNumPartitionBlocks * + (rowsRounded * sizeof(int32_t) * 4 + + partitionsRounded * sizeof(int32_t)); + if (!buffer || buffer->capacity() < bytes) { + buffer = arena_->allocate(bytes); + } + run = buffer->as(); + auto chars = buffer->as() + sizeof(PartitionRun); + for (auto block = 0; block < kNumPartitionBlocks; ++block) { + run->keys[block] = reinterpret_cast(chars); + run->numRows[block] = numRows; + chars += rowsRounded * sizeof(uint16_t); + run->partitionStarts[block] = reinterpret_cast(chars); + chars += numPartitions * sizeof(int32_t); + run->ranks[block] = reinterpret_cast(chars); + chars += sizeof(int32_t) * numRows; + run->partitionedRows[block] = reinterpret_cast(chars); + chars += sizeof(int32_t) * numRows; + for (auto i = 0; i < numRows; ++i) { + run->keys[block][i] = (block + i * 2017) % numPartitions; + } + } + VELOX_CHECK_LE(chars - buffer->as(), bytes); + } + void checkPartitionRun(const PartitionRun& run, int32_t numPartitions) { + // Check that every row is once in its proper partition. + for (auto block = 0; block < kNumPartitionBlocks; ++block) { + std::vector flags(run.numRows[block], false); + for (auto part = 0; part < numPartitions; ++part) { + for (auto i = (part == 0 ? 0 : run.partitionStarts[block][part - 1]); + i < run.partitionStarts[block][part]; + ++i) { + auto row = run.partitionedRows[block][i]; + EXPECT_LT(row, run.numRows[block]); + EXPECT_FALSE(flags[row]); + EXPECT_EQ(part, run.keys[block][row]); + flags[row] = true; + } + } + // Expect that all flags are set. + for (auto i = 0; i < run.numRows[block]; ++i) { + EXPECT_TRUE(flags[i]); + } + } + } Device* device_; GpuAllocator* allocator_; std::unique_ptr arena_; }; TEST_F(BlockTest, boolToIndices) { - /// We make a set of 256 flags and corresponding 256 indices of true flags. - constexpr int32_t kNumBlocks = 20480; - constexpr int32_t kBlockSize = 256; - constexpr int32_t kNumFlags = kBlockSize * kNumBlocks; - auto flagsBuffer = arena_->allocate(kNumFlags); - auto indicesBuffer = arena_->allocate(kNumFlags); - auto sizesBuffer = arena_->allocate(kNumBlocks); - auto timesBuffer = arena_->allocate(kNumBlocks); + testBoolToIndices(false); + testBoolToIndices(true); +} + +TEST_F(BlockTest, shortRadixSort) { + // We make a set of 8K uint16_t keys and uint16_t values. 
+ constexpr int32_t kNumBlocks = 1024; + constexpr int32_t kBlockSize = 1024; + constexpr int32_t kValuesPerThread = 8; + constexpr int32_t kValuesPerBlock = kBlockSize * kValuesPerThread; + constexpr int32_t kNumValues = kBlockSize * kNumBlocks * kValuesPerThread; + auto keysBuffer = arena_->allocate(kNumValues); + auto valuesBuffer = arena_->allocate(kNumValues); BlockTestStream stream; - std::vector referenceIndices(kNumFlags); - std::vector referenceSizes(kNumBlocks); - uint8_t* flags = flagsBuffer->as(); - for (auto i = 0ul; i < kNumFlags; ++i) { - if ((i >> 8) % 17 == 0) { - flags[i] = 0; - } else if ((i >> 8) % 23 == 0) { - flags[i] = 1; - } else { - flags[i] = (i * 1121) % 73 > 50; - } + std::vector referenceKeys(kNumValues); + std::vector referenceValues(kNumValues); + uint16_t* keys = keysBuffer->as(); + uint16_t* values = valuesBuffer->as(); + for (auto i = 0; i < kNumValues; ++i) { + keys[i] = i * 2017; + values[i] = i; } + for (auto b = 0; b < kNumBlocks; ++b) { - auto start = b * kBlockSize; - int32_t counter = start; - for (auto i = 0; i < kBlockSize; ++i) { - if (flags[start + i]) { - referenceIndices[counter++] = start + i; - } + auto start = b * kValuesPerBlock; + std::vector indices(kValuesPerBlock); + std::iota(indices.begin(), indices.end(), 0); + std::sort(indices.begin(), indices.end(), [&](auto left, auto right) { + return keys[start + left] < keys[start + right]; + }); + for (auto i = 0; i < kValuesPerBlock; ++i) { + referenceValues[start + i] = values[start + indices[i]]; } - referenceSizes[b] = counter - start; } - prefetch(stream, flagsBuffer); - prefetch(stream, indicesBuffer); - prefetch(stream, sizesBuffer); + prefetch(stream, valuesBuffer); + prefetch(stream, keysBuffer); - auto indicesPointers = arena_->allocate(kNumBlocks); - auto flagsPointers = arena_->allocate(kNumBlocks); + auto keysPointers = arena_->allocate(kNumBlocks); + auto valuesPointers = arena_->allocate(kNumBlocks); for (auto i = 0; i < kNumBlocks; ++i) { - flagsPointers->as()[i] = flags + (i * kBlockSize); - indicesPointers->as()[i] = - indicesBuffer->as() + (i * kBlockSize); + keysPointers->as()[i] = keys + (i * kValuesPerBlock); + valuesPointers->as()[i] = + valuesBuffer->as() + (i * kValuesPerBlock); } - + auto keySegments = keysPointers->as(); + auto valueSegments = valuesPointers->as(); + prefetch(stream, keysPointers); + prefetch(stream, valuesPointers); + stream.wait(); auto startMicros = getCurrentTimeMicro(); - stream.testBoolToIndices( - kNumBlocks, - flagsPointers->as(), - indicesPointers->as(), - sizesBuffer->as(), - timesBuffer->as()); + stream.testSort16(kNumBlocks, keySegments, valueSegments); stream.wait(); auto elapsed = getCurrentTimeMicro() - startMicros; for (auto b = 0; b < kNumBlocks; ++b) { ASSERT_EQ( 0, ::memcmp( - referenceIndices.data() + b * kBlockSize, - indicesBuffer->as() + b * kBlockSize, - referenceSizes[b] * sizeof(int32_t))); - ASSERT_EQ(referenceSizes[b], sizesBuffer->as()[b]); + referenceValues.data() + b * kValuesPerBlock, + valueSegments[b], + kValuesPerBlock * sizeof(uint16_t))); + } + std::cout << "sort16: " << elapsed << "us, " + << kNumValues / static_cast(elapsed) << " Mrows/s" + << std::endl; + + // Reset the test values for second test. 
+  for (auto i = 0; i < kNumValues; ++i) {
+    keys[i] = i * 2017;
+    values[i] = i;
+  }
+  auto temp =
+      arena_->allocate<char>(kNumBlocks * BlockTestStream::sort16SharedSize());
+  prefetch(stream, temp);
+  prefetch(stream, valuesBuffer);
+  prefetch(stream, keysBuffer);
+  prefetch(stream, keysPointers);
+  prefetch(stream, valuesPointers);
+  stream.wait();
+  startMicros = getCurrentTimeMicro();
+  stream.testSort16NoShared(
+      kNumBlocks, keySegments, valueSegments, temp->as<char>());
+  stream.wait();
+  elapsed = getCurrentTimeMicro() - startMicros;
+  std::cout << "sort16 no shared: " << elapsed << "us, "
+            << kNumValues / static_cast<float>(elapsed) << " Mrows/s"
+            << std::endl;
+}
+
+TEST_F(BlockTest, partition) {
+  // We make several sets of keys and temp and result buffers. These
+  // are in unified memory. We run the partition for all and check the
+  // outcome on the host. We run at several different partition counts
+  // and batch sizes. All experiments are submitted as kNumPartitionBlocks
+  // concurrent thread blocks of 256 threads.
+  BlockTestStream stream;
+  std::vector<int32_t> partitionCounts = {1, 2, 32, 333, 1000, 8000};
+  std::vector<int32_t> runSizes = {100, 1000, 10000, 30000};
+  WaveBufferPtr buffer;
+  PartitionRun* run;
+  for (auto parts : partitionCounts) {
+    for (auto rows : runSizes) {
+      makePartitionRun(rows, parts, run, buffer);
+      prefetch(stream, buffer);
+      auto startMicros = getCurrentTimeMicro();
+      stream.partitionShorts(
+          kNumPartitionBlocks,
+          run->keys,
+          run->numRows,
+          parts,
+          run->ranks,
+          run->partitionStarts,
+          run->partitionedRows);
+      stream.wait();
+      auto time = getCurrentTimeMicro() - startMicros;
+      std::cout << fmt::format(
+                       "Partition {} batch={} fanout={} rate={} Mrows/s",
+                       kNumPartitionBlocks,
+                       rows,
+                       parts,
+                       kNumPartitionBlocks * static_cast<float>(rows) / time)
+                << std::endl;
+      checkPartitionRun(*run, parts);
+    }
+  }
+}
diff --git a/velox/experimental/wave/common/tests/BlockTest.cu b/velox/experimental/wave/common/tests/BlockTest.cu
index 6b32b9880f125..695b075b9a2bc 100644
--- a/velox/experimental/wave/common/tests/BlockTest.cu
+++ b/velox/experimental/wave/common/tests/BlockTest.cu
@@ -1,20 +1,35 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + #include "velox/experimental/wave/common/Block.cuh" #include "velox/experimental/wave/common/CudaUtil.cuh" +#include "velox/experimental/wave/common/HashTable.cuh" #include "velox/experimental/wave/common/tests/BlockTest.h" +#include "velox/experimental/wave/common/tests/HashTestUtil.h" +#include "velox/experimental/wave/common/tests/Updates.cuh" namespace facebook::velox::wave { using ScanAlgorithm = cub::BlockScan; -__global__ void boolToIndices( - uint8_t** bools, - int32_t** indices, - int32_t* sizes, - int64_t* times) { +__global__ void +boolToIndicesKernel(uint8_t** bools, int32_t** indices, int32_t* sizes) { extern __shared__ char smem[]; int32_t idx = blockIdx.x; // Start cycle timer - clock_t start = clock(); uint8_t* blockBools = bools[idx]; boolBlockToIndices<256>( [&]() { return blockBools[threadIdx.x]; }, @@ -22,25 +37,111 @@ __global__ void boolToIndices( indices[idx], smem, sizes[idx]); - clock_t stop = clock(); - if (threadIdx.x == 0) { - times[idx] = (start > stop) ? start - stop : stop - start; - } } void BlockTestStream::testBoolToIndices( int32_t numBlocks, uint8_t** flags, int32_t** indices, - int32_t* sizes, - int64_t* times) { + int32_t* sizes) { CUDA_CHECK(cudaGetLastError()); auto tempBytes = sizeof(typename ScanAlgorithm::TempStorage); - boolToIndices<<stream>>>( - flags, indices, sizes, times); + boolToIndicesKernel<<stream>>>( + flags, indices, sizes); + CUDA_CHECK(cudaGetLastError()); +} + +__global__ void boolToIndicesNoSharedKernel( + uint8_t** bools, + int32_t** indices, + int32_t* sizes, + void* temp) { + int32_t idx = blockIdx.x; + + uint8_t* blockBools = bools[idx]; + char* smem = reinterpret_cast(temp) + + blockIdx.x * sizeof(typename ScanAlgorithm::TempStorage); + boolBlockToIndices<256>( + [&]() { return blockBools[threadIdx.x]; }, + idx * 256, + indices[idx], + smem, + sizes[idx]); +} + +void BlockTestStream::testBoolToIndicesNoShared( + int32_t numBlocks, + uint8_t** flags, + int32_t** indices, + int32_t* sizes, + void* temp) { + CUDA_CHECK(cudaGetLastError()); + boolToIndicesNoSharedKernel<<stream>>>( + flags, indices, sizes, temp); CUDA_CHECK(cudaGetLastError()); } +int32_t BlockTestStream::boolToIndicesSize() { + return sizeof(typename ScanAlgorithm::TempStorage); +} + +__global__ void +bool256ToIndicesKernel(uint8_t** bools, int32_t** indices, int32_t* sizes) { + extern __shared__ char smem[]; + int32_t idx = blockIdx.x; + auto* bool64 = reinterpret_cast(bools[idx]); + bool256ToIndices( + [&](int32_t index8) { return bool64[index8]; }, + idx * 256, + indices[idx], + sizes[idx], + smem); +} + +void BlockTestStream::testBool256ToIndices( + int32_t numBlocks, + uint8_t** flags, + int32_t** indices, + int32_t* sizes) { + CUDA_CHECK(cudaGetLastError()); + auto tempBytes = bool256ToIndicesSize(); + bool256ToIndicesKernel<<stream>>>( + flags, indices, sizes); + CUDA_CHECK(cudaGetLastError()); +} + +__global__ void bool256ToIndicesNoSharedKernel( + uint8_t** bools, + int32_t** indices, + int32_t* sizes, + void* temp) { + int32_t idx = blockIdx.x; + auto* bool64 = reinterpret_cast(bools[idx]); + char* smem = reinterpret_cast(temp) + blockIdx.x * 80; + bool256ToIndices( + [&](int32_t index8) { return bool64[index8]; }, + idx * 256, + indices[idx], + sizes[idx], + smem); +} + +void BlockTestStream::testBool256ToIndicesNoShared( + int32_t numBlocks, + uint8_t** flags, + int32_t** indices, + int32_t* sizes, + void* temp) { + CUDA_CHECK(cudaGetLastError()); + bool256ToIndicesNoSharedKernel<<stream>>>( + flags, indices, sizes, temp); + 
+  CUDA_CHECK(cudaGetLastError());
+}
+
+int32_t BlockTestStream::bool256ToIndicesSize() {
+  return 80;
+}
+
 __global__ void sum64(int64_t* numbers, int64_t* results) {
   extern __shared__ char smem[];
   int32_t idx = blockIdx.x;
@@ -57,4 +158,417 @@ void BlockTestStream::testSum64(
   CUDA_CHECK(cudaGetLastError());
 }

+/// Keys and values are n sections of 8K items. The items in each section get
+/// sorted on the key.
+void __global__ __launch_bounds__(1024)
+    testSort(uint16_t** keys, uint16_t** values) {
+  extern __shared__ __align__(16) char smem[];
+  auto keyBase = keys[blockIdx.x];
+  auto valueBase = values[blockIdx.x];
+  blockSort<256, 32>(
+      [&](auto i) { return keyBase[i]; },
+      [&](auto i) { return valueBase[i]; },
+      keys[blockIdx.x],
+      values[blockIdx.x],
+      smem);
+}
+
+void __global__ __launch_bounds__(1024)
+    testSortNoShared(uint16_t** keys, uint16_t** values, char* smem) {
+  auto keyBase = keys[blockIdx.x];
+  auto valueBase = values[blockIdx.x];
+  char* tbTemp = smem +
+      blockIdx.x *
+          sizeof(typename cub::BlockRadixSort<uint16_t, 256, 32, uint16_t>::
+                     TempStorage);
+
+  blockSort<256, 32>(
+      [&](auto i) { return keyBase[i]; },
+      [&](auto i) { return valueBase[i]; },
+      keys[blockIdx.x],
+      values[blockIdx.x],
+      tbTemp);
+}
+
+int32_t BlockTestStream::sort16SharedSize() {
+  return sizeof(
+      typename cub::BlockRadixSort<uint16_t, 256, 32, uint16_t>::TempStorage);
+}
+
+void BlockTestStream::testSort16(
+    int32_t numBlocks,
+    uint16_t** keys,
+    uint16_t** values) {
+  auto tempBytes = sizeof(
+      typename cub::BlockRadixSort<uint16_t, 256, 32, uint16_t>::TempStorage);
+
+  testSort<<<numBlocks, 256, tempBytes, stream_->stream>>>(keys, values);
+}
+
+void BlockTestStream::testSort16NoShared(
+    int32_t numBlocks,
+    uint16_t** keys,
+    uint16_t** values,
+    char* temp) {
+  testSortNoShared<<<numBlocks, 256, 0, stream_->stream>>>(keys, values, temp);
+}
+
+/// Calls partitionRows on each thread block of 256 threads. The parameters
+/// correspond to 'partitionRows'. Each is an array subscripted by blockIdx.x.
+void __global__ partitionShortsKernel(
+    uint16_t** keys,
+    int32_t* numKeys,
+    int32_t numPartitions,
+    int32_t** ranks,
+    int32_t** partitionStarts,
+    int32_t** partitionedRows) {
+  partitionRows<256>(
+      [&](auto i) { return keys[blockIdx.x][i]; },
+      numKeys[blockIdx.x],
+      numPartitions,
+      ranks[blockIdx.x],
+      partitionStarts[blockIdx.x],
+      partitionedRows[blockIdx.x]);
+}
+
+void BlockTestStream::partitionShorts(
+    int32_t numBlocks,
+    uint16_t** keys,
+    int32_t* numKeys,
+    int32_t numPartitions,
+    int32_t** ranks,
+    int32_t** partitionStarts,
+    int32_t** partitionedRows) {
+  constexpr int32_t kBlockSize = 256;
+  auto shared = partitionRowsSharedSize<kBlockSize>(numPartitions);
+  partitionShortsKernel<<<numBlocks, kBlockSize, shared, stream_->stream>>>(
+      keys, numKeys, numPartitions, ranks, partitionStarts, partitionedRows);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+/// A mock complex accumulator update function.
+ProbeState __device__ arrayAgg64Append(
+    ArrayAgg64* accumulator,
+    int64_t arg,
+    RowAllocator* allocator) {
+  auto* last = accumulator->last;
+  if (!last || accumulator->numInLast >= sizeof(last->data) / sizeof(int64_t)) {
+    auto* next = allocator->allocate<ArrayAgg64::Run>(1);
+    if (!next) {
+      return ProbeState::kNeedSpace;
+    }
+    next->next = nullptr;
+    if (accumulator->last) {
+      accumulator->last->next = next;
+      accumulator->last = next;
+    } else {
+      accumulator->first = accumulator->last = next;
+    }
+    // Start filling the newly added run.
+    accumulator->numInLast = 0;
+  }
+  accumulator->last->data[accumulator->numInLast++] = arg;
+  return ProbeState::kDone;
+}
+
+/// A mock Ops parameter class to do group by.
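+/// The Ops contract exercised by GpuHashTable::updatingProbe: blockBase() and
+/// numRowsInBlock() locate this TB's slice of input, hash() and compare()
+/// identify the group, insert() may answer kRetry (lost a tag race) or
+/// kNeedSpace (row allocation failed, retried on the host), and
+/// getExclusive() / update() / writeDone() bracket the accumulator update.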
+class MockGroupByOps { + public: + int32_t __device__ blockBase(HashProbe* probe) { + return probe->numRowsPerThread * blockDim.x * blockIdx.x; + } + + int32_t __device__ numRowsInBlock(HashProbe* probe) { + return probe->numRows[blockIdx.x]; + } + + uint64_t __device__ hash(int32_t i, HashProbe* probe) { + auto key = reinterpret_cast(probe->keys)[0]; + return hashMix(1, key[i]); + } + + bool __device__ + compare(GpuHashTable* table, TestingRow* row, int32_t i, HashProbe* probe) { + return row->key == reinterpret_cast(probe->keys)[0][i]; + } + + TestingRow* __device__ + newRow(GpuHashTable* table, int32_t partition, int32_t i, HashProbe* probe) { + auto* allocator = &table->allocators[partition]; + auto row = allocator->allocateRow(); + if (row) { + row->key = reinterpret_cast(probe->keys)[0][i]; + row->flags = 0; + row->count = 0; + new (&row->concatenation) ArrayAgg64(); + } + return row; + } + + ProbeState __device__ insert( + GpuHashTable* table, + int32_t partition, + GpuBucket* bucket, + uint32_t misses, + uint32_t oldTags, + uint32_t tagWord, + int32_t i, + HashProbe* probe, + TestingRow*& row) { + if (!row) { + row = newRow(table, partition, i, probe); + if (!row) { + return ProbeState::kNeedSpace; + } + } + auto missShift = __ffs(misses) - 1; + if (!bucket->addNewTag(tagWord, oldTags, missShift)) { + return ProbeState::kRetry; + } + bucket->store(missShift / 8, row); + return ProbeState::kDone; + } + + TestingRow* __device__ getExclusive( + GpuHashTable* table, + GpuBucket* bucket, + TestingRow* row, + int32_t hitIdx, + int32_t warp) { + return row; + int32_t nanos = 1; + for (;;) { + if (atomicTryLock(&row->flags)) { + return row; + } + __nanosleep((nanos + threadIdx.x) & 31); + nanos += 3; + } + } + + void __device__ writeDone(TestingRow* row) { + // atomicUnlock(&row->flags); + } + + ProbeState __device__ update( + GpuHashTable* table, + GpuBucket* bucket, + TestingRow* row, + int32_t i, + HashProbe* probe) { + auto* keys = reinterpret_cast(probe->keys); + atomicAdd((unsigned long long*)&row->count, (unsigned long long)keys[1][i]); + return ProbeState::kDone; + int64_t arg = keys[1][i]; + int32_t part = table->partitionIdx(bucket - table->buckets); + auto* allocator = &table->allocators[part]; + auto state = arrayAgg64Append(&row->concatenation, arg, allocator); + row->flags = 0; + __threadfence(); + return state; + } +}; + +void __global__ __launch_bounds__(1024) hashTestKernel( + GpuHashTable* table, + HashProbe* probe, + BlockTestStream::HashCase mode) { + switch (mode) { + case BlockTestStream::HashCase::kGroup: { + table->updatingProbe(probe, MockGroupByOps()); + break; + } + case BlockTestStream::HashCase::kBuild: + case BlockTestStream::HashCase::kProbe: + *(long*)0 = 0; // Unimplemented. 
+  }
+}
+
+void BlockTestStream::hashTest(
+    GpuHashTableBase* table,
+    HashRun& run,
+    HashCase mode) {
+  constexpr int32_t kBlockSize = 256;
+  int32_t shared = 0;
+  if (mode == HashCase::kGroup) {
+    shared = GpuHashTable::updatingProbeSharedSize();
+  }
+  hashTestKernel<<<run.numBlocks, kBlockSize, shared, stream_->stream>>>(
+      reinterpret_cast<GpuHashTable*>(table), run.probe, mode);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+void __global__ allocatorTestKernel(
+    int32_t numAlloc,
+    int32_t numFree,
+    int32_t numStr,
+    AllocatorTestResult* allResults) {
+  auto* result = allResults + threadIdx.x + blockIdx.x * blockDim.x;
+  for (;;) {
+    int32_t maxRows = sizeof(result->rows) / sizeof(result->rows[0]);
+    int32_t maxStrings = sizeof(result->strings) / sizeof(result->strings[0]);
+    for (auto count = 0; count < numAlloc; ++count) {
+      if (result->numRows >= maxRows) {
+        return;
+      }
+      auto newRow = result->allocator->allocateRow<int64_t>();
+      if (newRow == nullptr) {
+        return;
+      }
+      if (reinterpret_cast<uint64_t>(newRow) == result->allocator->base) {
+        printf(""); // Debug hook: row at the base of the partition.
+      }
+
+      result->rows[result->numRows++] = newRow;
+    }
+    for (auto count = 0; count < numFree; ++count) {
+      if (result->numRows == 0) {
+        return;
+      }
+      auto* toFree = result->rows[--result->numRows];
+      if (reinterpret_cast<uint64_t>(toFree) == result->allocator->base) {
+        printf(""); // GPF();
+      }
+      if (!result->allocator->inRange(toFree)) {
+        GPF();
+      }
+      result->allocator->freeRow(toFree);
+    }
+    for (auto count = 0; count < numStr; ++count) {
+      if (result->numStrings >= maxStrings) {
+        return;
+      }
+      auto str = result->allocator->allocate<char>(11);
+      if (!str) {
+        return;
+      }
+      result->strings[result->numStrings++] = reinterpret_cast<int64_t*>(str);
+    }
+  }
+}
+
+void __global__ initAllocatorKernel(RowAllocator* allocator) {
+  if (threadIdx.x == 0) {
+    if (allocator->freeSet) {
+      reinterpret_cast<FreeSet<uint32_t, 1024>*>(allocator->freeSet)->clear();
+    }
+  }
+}
+
+// static
+int32_t BlockTestStream::freeSetSize() {
+  return sizeof(FreeSet<uint32_t, 1024>);
+}
+
+void BlockTestStream::initAllocator(HashPartitionAllocator* allocator) {
+  initAllocatorKernel<<<1, 1, 0, stream_->stream>>>(
+      reinterpret_cast<RowAllocator*>(allocator));
+  CUDA_CHECK(cudaGetLastError());
+}
+
+void BlockTestStream::rowAllocatorTest(
+    int32_t numBlocks,
+    int32_t numAlloc,
+    int32_t numFree,
+    int32_t numStr,
+    AllocatorTestResult* results) {
+  allocatorTestKernel<<<numBlocks, 256, 0, stream_->stream>>>(
+      numAlloc, numFree, numStr, results);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+#define UPDATE_CASE(name, func, smem)                                \
+  void __global__ name##Kernel(TestingRow* rows, HashProbe* probe) { \
+    func(rows, probe);                                               \
+  }                                                                  \
+                                                                     \
+  void BlockTestStream::name(TestingRow* rows, HashRun& run) {       \
+    name##Kernel<<<run.numBlocks, 256, smem, stream_->stream>>>(     \
+        rows, run.probe);                                            \
+    CUDA_CHECK(cudaGetLastError());                                  \
+  }

+UPDATE_CASE(updateSum1NoSync, testSumNoSync, 0);
+UPDATE_CASE(updateSum1Mtx, testSumMtx, 0);
+UPDATE_CASE(updateSum1MtxCoalesce, testSumMtxCoalesce, 0);
+UPDATE_CASE(updateSum1Atomic, testSumAtomic, 0);
+UPDATE_CASE(updateSum1AtomicCoalesce, testSumAtomicCoalesce, 0);
+UPDATE_CASE(updateSum1Exch, testSumExch, sizeof(ProbeShared));
+UPDATE_CASE(updateSum1Order, testSumOrder, 0);
+
+void __global__ __launch_bounds__(1024) update1PartitionKernel(
+    int32_t numRows,
+    int32_t numDistinct,
+    int32_t numParts,
+    int32_t blockStride,
+    HashProbe* probe,
+    int32_t* temp) {
+  auto blockStart = blockStride * blockIdx.x;
+  auto keys = reinterpret_cast<int64_t**>(probe->keys);
+  auto indices = keys[0];
+  partitionRows<256, int32_t>(
+      [&](auto i) -> int32_t { return indices[i + blockStart] % numParts; },
+      // The last thread block takes the remainder of the rows.
+      blockIdx.x == gridDim.x - 1 ?
+          numRows - blockStart : blockStride,
+      numParts,
+      temp + blockIdx.x * blockStride,
+      probe->hostRetries + blockStride * blockIdx.x,
+      probe->kernelRetries1 + blockStride * blockIdx.x);
+}
+
+void __global__ updateSum1PartKernel(
+    TestingRow* rows,
+    int32_t numParts,
+    HashProbe* probe,
+    int32_t numGroups,
+    int32_t groupStride) {
+  testSumPart(
+      rows,
+      numParts,
+      probe,
+      probe->kernelRetries1,
+      probe->hostRetries,
+      numGroups,
+      groupStride);
+}
+
+void BlockTestStream::updateSum1Part(TestingRow* rows, HashRun& run) {
+  auto numParts = std::min(run.numDistinct, 8192);
+  auto groupStride = run.numRows / 32;
+  auto numGroups = run.numRows / groupStride;
+  auto partSmem = partitionRowsSharedSize<256>(numParts);
+  // We use probe->kernelRetries1 as the partitioned row numbers array and
+  // probe->hostRetries as the array of partition starts. So, with 10
+  // partitions, kernelRetries1[x..y-1] are the input rows of partition 1,
+  // where x is partition start 0 and y is partition start 1.
+  update1PartitionKernel<<<numGroups, 256, partSmem, stream_->stream>>>(
+      run.numRows,
+      run.numDistinct,
+      numParts,
+      groupStride,
+      run.probe,
+      run.partitionTemp);
+  CUDA_CHECK(cudaGetLastError());
+
+  int32_t blockSize = roundUp(std::min(256, numParts), 32);
+  int32_t numBlocks = numParts / blockSize;
+  // There will be one lane per partition. The last blocks may have empty
+  // lanes.
+  if (numBlocks * blockSize < numParts) {
+    ++numBlocks;
+  }
+  updateSum1PartKernel<<<numBlocks, blockSize, 0, stream_->stream>>>(
+      rows, numParts, run.probe, numGroups, groupStride);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+REGISTER_KERNEL("testSort", testSort);
+REGISTER_KERNEL("boolToIndices", boolToIndicesKernel);
+REGISTER_KERNEL("bool256ToIndices", bool256ToIndicesKernel);
+REGISTER_KERNEL("sum64", sum64);
+REGISTER_KERNEL("partitionShorts", partitionShortsKernel);
+REGISTER_KERNEL("hashTest", hashTestKernel);
+REGISTER_KERNEL("allocatorTest", allocatorTestKernel);
+REGISTER_KERNEL("sum1atm", updateSum1AtomicKernel);
+REGISTER_KERNEL("sum1atmCoa", updateSum1AtomicCoalesceKernel);
+REGISTER_KERNEL("sum1Exch", updateSum1ExchKernel);
+REGISTER_KERNEL("sum1Part", updateSum1PartKernel);
+REGISTER_KERNEL("partSum", update1PartitionKernel);
+
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/tests/BlockTest.h b/velox/experimental/wave/common/tests/BlockTest.h
index bed782ff14ade..63d16b9f66a07 100644
--- a/velox/experimental/wave/common/tests/BlockTest.h
+++ b/velox/experimental/wave/common/tests/BlockTest.h
@@ -17,31 +17,142 @@
 #pragma once

 #include "velox/experimental/wave/common/Cuda.h"
+#include "velox/experimental/wave/common/HashTable.h"
+#include "velox/experimental/wave/common/tests/HashTestUtil.h"

-/// Sample header for testing Block.cuh
+/// Sample header for testing Wave utilities.
 namespace facebook::velox::wave {

+constexpr uint32_t kPrime32 = 1815531889;
+
+/// A mock aggregate that concatenates numbers, like array_agg of bigint.
+struct ArrayAgg64 {
+  struct Run {
+    Run* next;
+    int64_t data[16];
+  };
+
+  Run* first{nullptr};
+  Run* last{nullptr};
+  // Fill of 'last->data', all other runs are full.
+  int8_t numInLast{0};
+};
+
+/// A mock hash table content row to test HashTable.
+struct TestingRow {
+  // Single key part.
+  int64_t key;
+
+  // Count of updates. Sample aggregate.
+  int64_t count{0};
+
+  // A mock concatenating aggregate. Used for testing control flow when
+  // running out of space while updating a group.
+  ArrayAgg64 concatenation;
+
+  // Next pointer in the case simulating a non-unique join table.
+  TestingRow* next{nullptr};
+
+  // Flags for updating the row, e.g. probed flag, marker for exclusive write.
+  int32_t flags{0};
+};
+
+/// Result of allocator test kernel.
+struct AllocatorTestResult {
+  RowAllocator* allocator;
+  int32_t numRows;
+  int32_t numStrings;
+  int64_t* rows[200000];
+  int64_t* strings[200000];
+};
+
 class BlockTestStream : public Stream {
  public:
   /// In each block of 256 bools in bools[i], counts the number of
   /// true and writes the indices of true lanes into the corresponding
   /// indices[i]. Stores the number of true values in sizes[i].
   void testBoolToIndices(
+      int32_t numBlocks,
+      uint8_t** flags,
+      int32_t** indices,
+      int32_t* sizes);
+
+  void testBoolToIndicesNoShared(
       int32_t numBlocks,
       uint8_t** flags,
       int32_t** indices,
       int32_t* sizes,
-      int64_t* times);
+      void*);
+
+  // Returns the smem size for block size 256 of boolToIndices().
+  static int32_t boolToIndicesSize();
+
+  void testBool256ToIndices(
+      int32_t numBlocks,
+      uint8_t** flags,
+      int32_t** indices,
+      int32_t* sizes);
+
+  void testBool256ToIndicesNoShared(
+      int32_t numBlocks,
+      uint8_t** flags,
+      int32_t** indices,
+      int32_t* sizes,
+      void*);
+
+  // Returns the smem size for bool256ToIndices().
+  static int32_t bool256ToIndicesSize();

   // Calculates the sum over blocks of 256 int64s and returns the result for
   // numbers[i * 256] ... numbers[(i + 1) * 256 - 1] inclusive in results[i].
   void testSum64(int32_t numBlocks, int64_t* numbers, int64_t* results);

-  /// Sorts 'rows'[i] using ids[i] as keys and stores the sorted order in
-  /// 'result[i]'.
-  // void dedup(int32_t numBlocks, uint16_t** ids, uint16_t** rows, uint16_t**
-  // resultRows);
+  static int32_t sort16SharedSize();
+
+  void testSort16(int32_t numBlocks, uint16_t** keys, uint16_t** values);
+  void testSort16NoShared(
+      int32_t numBlocks,
+      uint16_t** keys,
+      uint16_t** values,
+      char* temp);
+
+  void partitionShorts(
+      int32_t numBlocks,
+      uint16_t** keys,
+      int32_t* numKeys,
+      int32_t numPartitions,
+      int32_t** ranks,
+      int32_t** partitionStarts,
+      int32_t** partitionedRows);
+
+  // Operation for hash table tests.
+  enum class HashCase { kGroup, kBuild, kProbe };
+
+  /// Does probe/groupby/build on 'table'. 'probe' contains the parameters and
+  /// temp storage. 'table' and 'probe' are expected to be resident on device.
+  /// The number of TBs to run and the rows per TB are given in 'probe'.
+  void hashTest(GpuHashTableBase* table, HashRun& probe, HashCase mode);
+
+  static int32_t freeSetSize();
+
+  void initAllocator(HashPartitionAllocator* allocator);
+
+  /// Tests RowAllocator.
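+  /// Each thread loops: allocate 'numAlloc' rows, free 'numFree' of them,
+  /// then allocate 'numStr' short strings, until an allocation fails or a
+  /// per-thread result array fills up. Outcomes are recorded in 'results',
+  /// one entry per thread.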
+ void rowAllocatorTest( + int32_t numBlocks, + int32_t numAlloc, + int32_t numFree, + int32_t numStr, + AllocatorTestResult* results); + + void updateSum1Atomic(TestingRow* rows, HashRun& run); + void updateSum1Exch(TestingRow* rows, HashRun& run); + void updateSum1NoSync(TestingRow* rows, HashRun& run); + void updateSum1AtomicCoalesce(TestingRow* rows, HashRun& run); + void updateSum1Part(TestingRow* rows, HashRun& run); + void updateSum1Mtx(TestingRow* rows, HashRun& run); + void updateSum1MtxCoalesce(TestingRow* rows, HashRun& run); + void updateSum1Order(TestingRow* rows, HashRun& run); }; } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/tests/CMakeLists.txt b/velox/experimental/wave/common/tests/CMakeLists.txt index f9d2a3305eec9..8914f6a5357cd 100644 --- a/velox/experimental/wave/common/tests/CMakeLists.txt +++ b/velox/experimental/wave/common/tests/CMakeLists.txt @@ -12,8 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_executable(velox_wave_common_test GpuArenaTest.cpp CudaTest.cpp CudaTest.cu - BlockTest.cpp BlockTest.cu) +add_executable( + velox_wave_common_test + GpuArenaTest.cpp + CudaTest.cpp + CudaTest.cu + BlockTest.cpp + BlockTest.cu + HashTableTest.cpp + HashTestUtil.cpp) add_test(velox_wave_common_test velox_wave_common_test) diff --git a/velox/experimental/wave/common/tests/CpuTable.h b/velox/experimental/wave/common/tests/CpuTable.h new file mode 100644 index 0000000000000..d960e11b61bdb --- /dev/null +++ b/velox/experimental/wave/common/tests/CpuTable.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include <cassert>
+#include <string>
+
+#include "velox/common/base/SimdUtil.h"
+
+namespace facebook::velox::wave {
+
+class CpuBucket {
+ public:
+#if XSIMD_WITH_SSE2
+  using TagVector = xsimd::batch<uint8_t>;
+#elif XSIMD_WITH_NEON
+  using TagVector = xsimd::batch<uint8_t>;
+#endif
+
+  auto loadTags() {
+#if XSIMD_WITH_SSE2
+    return TagVector(_mm_loadu_si128(reinterpret_cast<__m128i const*>(tags_)));
+#elif XSIMD_WITH_NEON
+    return TagVector(vld1q_u8(tags_));
+#endif
+  }
+
+  void setTag(int32_t idx, uint8_t tag) {
+    tags_[idx] = tag;
+  }
+
+  static inline uint16_t matchTags(TagVector tags, uint8_t tag) {
+    auto flags = TagVector::broadcast(tag) == tags;
+    return simd::toBitMask(flags);
+  }
+
+  template <typename T>
+  T* load(int32_t idx) {
+    uint64_t data = *reinterpret_cast<uint64_t*>(&data_[idx * 6]);
+    return reinterpret_cast<T*>(data & 0xffffffffffff);
+  }
+
+  void store(uint32_t idx, void* row) {
+    auto uptr = reinterpret_cast<uint64_t>(row);
+    uint64_t data = *reinterpret_cast<uint64_t*>(&data_[idx * 6]);
+    *reinterpret_cast<uint64_t*>(&data_[idx * 6]) =
+        (data & 0xffff000000000000) | uptr;
+  }
+
+ private:
+  uint8_t tags_[16];
+  uint8_t data_[128 - 16];
+};
+
+struct CpuHashTable {
+  CpuHashTable() = default;
+
+  CpuHashTable(int32_t numSlots, int32_t rowBytes) {
+    auto numBuckets = bits::nextPowerOfTwo(numSlots) / 16;
+    assert(numBuckets > 0);
+    sizeMask = numBuckets - 1;
+    bucketSpace.resize(numBuckets * sizeof(CpuBucket) + 64);
+    buckets = reinterpret_cast<CpuBucket*>(
+        bits::roundUp(reinterpret_cast<uintptr_t>(bucketSpace.data()), 64));
+    rows.resize(rowBytes);
+  }
+
+  std::string bucketSpace;
+
+  CpuBucket* buckets;
+
+  int32_t sizeMask;
+
+  // Preallocated space for rows. Do not resize.
+  std::string rows;
+
+  // Number of used bytes in 'rows'.
+  int32_t spaceUsed{0};
+
+  // Number of entries.
+  int32_t size{0};
+
+  template <typename T>
+  T* newRow() {
+    auto size = sizeof(T);
+    if (spaceUsed + size > rows.size()) {
+      return nullptr;
+    }
+    auto row = reinterpret_cast<T*>(rows.data() + spaceUsed);
+    spaceUsed += size;
+    return row;
+  }
+
+  template <typename RowType, typename Ops>
+  RowType* find(int64_t key, uint64_t h, Ops ops) const {
+    uint8_t tag = 0x80 | (h >> 32);
+    int32_t bucketIdx = h & sizeMask;
+    for (;;) {
+      auto tags = buckets[bucketIdx].loadTags();
+      auto hits = CpuBucket::matchTags(tags, tag);
+      while (hits) {
+        auto idx = bits::getAndClearLastSetBit(hits);
+        auto row = buckets[bucketIdx].load<RowType>(idx);
+        if (ops.compare1(this, row, key)) {
+          return row;
+        }
+      }
+      auto misses = CpuBucket::matchTags(tags, 0);
+      if (misses) {
+        return nullptr;
+      }
+      bucketIdx = (1 + bucketIdx) & sizeMask;
+    }
+  }
+
+  template <typename RowType, typename Ops>
+  void updatingProbe(int32_t numRows, HashProbe* probe, Ops ops) {
+    for (auto i = 0; i < numRows; ++i) {
+      auto h = probe->hashes[i];
+      uint8_t tag = 0x80 | (h >> 32);
+      auto bucketIdx = h & sizeMask;
+      for (;;) {
+        auto tags = buckets[bucketIdx].loadTags();
+        auto hits = CpuBucket::matchTags(tags, tag);
+        while (hits) {
+          auto idx = bits::getAndClearLastSetBit(hits);
+          auto row = buckets[bucketIdx].load<RowType>(idx);
+          if (ops.compare(this, row, i, probe)) {
+            ops.update(this, row, i, probe);
+            goto done;
+          }
+        }
+        auto misses = CpuBucket::matchTags(tags, 0);
+        if (misses) {
+          int32_t idx = bits::getAndClearLastSetBit(misses);
+          buckets[bucketIdx].setTag(idx, tag);
+          auto* newRow = ops.newRow(this, i, probe);
+          buckets[bucketIdx].store(idx, newRow);
+          ++size;
+          ops.update(this, newRow, i, probe);
+          break;
+        }
+        bucketIdx = (bucketIdx + 1) & sizeMask;
+      }
+    done:;
+    }
+  }
+
+  void check() {
+    for (auto i = 0; i <= sizeMask; ++i) {
+      for (auto j = 0; j < 16; j++) {
+        auto row = buckets[i].load<char>(j);
+        if (!row ||
+            (row >= rows.data() && row < rows.data() + rows.size())) {
+          continue;
+        }
+        VELOX_FAIL();
+      }
+    }
+  }
+};
+
+} // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/tests/CudaTest.cpp b/velox/experimental/wave/common/tests/CudaTest.cpp
index c40351d287c04..167b83c9534af 100644
--- a/velox/experimental/wave/common/tests/CudaTest.cpp
+++ b/velox/experimental/wave/common/tests/CudaTest.cpp
@@ -32,9 +32,14 @@
 #include "velox/common/time/Timer.h"
 #include "velox/experimental/wave/common/GpuArena.h"
 #include "velox/experimental/wave/common/tests/BlockTest.h"
+#include "velox/experimental/wave/common/tests/CpuTable.h"
+#include "velox/experimental/wave/common/tests/HashTestUtil.h"
+#include "velox/experimental/wave/common/tests/Util.h"
 #include <iostream>
 
+DEFINE_bool(list_kernels, false, "Lists kernel occupancy and registers");
+
 DEFINE_int32(num_streams, 0, "Number of parallel streams");
 DEFINE_int32(op_size, 0, "Size of invoke kernel (ints read and written)");
 DEFINE_int32(
@@ -480,19 +485,30 @@ struct RoundtripStats {
   std::string toString() const {
     return fmt::format(
-        "{}: rps={} gips={} mode={} threads={} micros={} avgus={} toDev={} GB/s toHost={} GB/s",
+        "{}: rps={:.2f} gips={:.4f} mode={} threads={} micros={} avgus={:.2f} toDev={:.2f} GB/s toHost={:.2f} GB/s",
         id,
-        (numThreads * numOps) / (micros / 1000000),
-        numAdds / (micros * 1000),
+        (numThreads * numOps) / (micros / 1000000.0),
+        numAdds / (micros * 1000.0),
         mode,
         numThreads,
         micros,
         micros / numOps,
-        toDeviceBytes / (micros * 1000),
-        toHostBytes / (micros * 1000));
+        toDeviceBytes / (micros * 1000.0),
+        toHostBytes / (micros * 1000.0));
   }
 };
 
+// Checks a number for primality. Returns 0 for a prime and a proper factor
+// otherwise.
+int64_t factor(int64_t n) {
+  if (n > 2 && n % 2 == 0) {
+    return 2;
+  }
+  int64_t end = sqrt(n);
+  for (int64_t f = 3; f <= end; f += 2) {
+    if (n % f == 0) {
+      return f;
+    }
+  }
+  return 0;
+}
+
 /// Describes one thread of execution in round trip measurement. Each thread
 /// does a sequence of data transfers, kernel calls and synchronizations. The
 /// operations are described in a string of the form:
@@ -510,12 +526,13 @@ struct RoundtripStats {
 /// stream with record event + wait event.
 class RoundtripThread {
  public:
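The random-access operations below index their lookup table with scale32, the multiply-and-shift range mapping added in Util.h later in this patch: the 64-bit product n * scale lies in [0, 2^32 * scale), so its high 32 bits fall in [0, scale) without a division. A small self-checking sketch of that identity (the main() harness here is illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Maps 'n' into [0, scale); matches the scale32 definition in this patch.
inline uint32_t scale32(uint32_t n, uint32_t scale) {
  return (static_cast<uint64_t>(n) * scale) >> 32;
}

int main() {
  assert(scale32(0, 1000) == 0);
  assert(scale32(0xffffffffu, 1000) == 999); // top of range maps to scale - 1
  return 0;
}
```

-  // Up to 32 MB of ints.
+  // Up to 64 MB of ints.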
+  static constexpr int32_t kNumKB = 64 << 10;
   static constexpr int32_t kNumInts = kNumKB * 256;
 
   RoundtripThread(int32_t device, ArenaSet* arenas) : arenas_(arenas) {
-    setDevice(getDevice(device));
+    device_ = getDevice(device);
+    setDevice(device_);
     hostBuffer_ = arenas_->host->allocate<int32_t>(kNumInts);
     deviceBuffer_ = arenas_->device->allocate<int32_t>(kNumInts);
     lookupBuffer_ = arenas_->device->allocate<int32_t>(kNumInts);
@@ -536,13 +553,26 @@
         hostLookup_.get(),
         hostBuffer_->as<int32_t>(),
         kNumInts * sizeof(int32_t));
+    serial_ = ++serialCounter_;
+  }
+
+  ~RoundtripThread() {
+    try {
+      stream_->wait();
+    } catch (const std::exception& e) {
+      LOG(ERROR) << "Error in sync on ~RoundtripThread(): " << e.what();
+    }
   }
 
   enum class OpCode {
     kToDevice,
     kToHost,
     kAdd,
+    kAddShared,
+    kAddReg,
     kAddRandom,
+    kAddRandomEmptyWarps,
+    kAddRandomEmptyThreads,
     kWideAdd,
     kEnd,
     kSync,
@@ -553,6 +583,9 @@
     OpCode opCode;
     int32_t param1{1};
     int32_t param2{0};
+    int32_t param3{0};
+    int32_t param4{0};
+    int32_t param5{0};
   };
 
   void run(RoundtripStats& stats) {
@@ -606,6 +639,27 @@
         }
         stats.numAdds += op.param1 * op.param2 * 256;
         break;
+      case OpCode::kAddShared:
+        VELOX_CHECK_LE(op.param1, kNumKB);
+        if (stats.isCpu) {
+          addOneCpu(op.param1 * 256, op.param2);
+        } else {
+          stream_->addOneShared(
+              deviceBuffer_->as<int32_t>(), op.param1 * 256, op.param2);
+        }
+        stats.numAdds += op.param1 * op.param2 * 256;
+        break;
+      case OpCode::kAddReg:
+        VELOX_CHECK_LE(op.param1, kNumKB);
+        if (stats.isCpu) {
+          addOneCpu(op.param1 * 256, op.param2);
+        } else {
+          stream_->addOneReg(
+              deviceBuffer_->as<int32_t>(), op.param1 * 256, op.param2);
+        }
+        stats.numAdds += op.param1 * op.param2 * 256;
+        break;
+
       case OpCode::kWideAdd:
         VELOX_CHECK_LE(op.param1, kNumKB);
         if (stats.isCpu) {
@@ -618,15 +672,22 @@
         break;
 
       case OpCode::kAddRandom:
+      case OpCode::kAddRandomEmptyWarps:
+      case OpCode::kAddRandomEmptyThreads:
         VELOX_CHECK_LE(op.param1, kNumKB);
         if (stats.isCpu) {
-          addOneRandomCpu(op.param1 * 256, op.param2);
+          addOneRandomCpu(op.param1 * 256, op.param2, op.param4, op.param5);
         } else {
           stream_->addOneRandom(
               deviceBuffer_->as<int32_t>(),
               lookupBuffer_->as<int32_t>(),
               op.param1 * 256,
-              op.param2);
+              op.param2,
+              op.param3,
+              op.param4,
+              op.param5,
+              op.opCode == OpCode::kAddRandomEmptyWarps,
+              op.opCode == OpCode::kAddRandomEmptyThreads);
         }
         stats.numAdds += op.param1 * op.param2 * 256;
         break;
@@ -653,7 +714,7 @@
     stats.endMicros = getCurrentTimeMicro();
   }
 
-  void addOneCpu(int32_t size, int32_t repeat) {
+  FOLLY_NOINLINE void addOneCpu(int32_t size, int32_t repeat) {
     int32_t* ints = hostInts_.get();
     for (auto counter = 0; counter < repeat; ++counter) {
       for (auto i = 0; i < size; ++i) {
@@ -661,16 +722,23 @@
       }
     }
  }
-  void addOneRandomCpu(uint32_t size, int32_t repeat) {
+  FOLLY_NOINLINE void addOneRandomCpu(
+      uint32_t size,
+      int32_t repeat,
+      int32_t numLocal,
+      int32_t localStride) {
     int32_t* ints = hostInts_.get();
     int32_t* lookup = hostLookup_.get();
     for (uint32_t counter = 0; counter < repeat; ++counter) {
       for (auto i = 0; i < size; ++i) {
-        auto rnd = (static_cast<uint64_t>(
-                        static_cast<uint32_t>(i * (counter + 1) * 1367836089)) *
-                    size) >>
-            32;
-        ints[i] += lookup[rnd];
+        auto rnd = scale32(i * (counter + 1) * kPrime32, size);
+        auto sum = lookup[rnd];
+        auto limit =
+            std::min<int64_t>(rnd + localStride * (1 + numLocal), size);
+        for (auto j = rnd + localStride; j < limit; j += localStride) {
+          sum += lookup[j];
+        }
+        ints[i] += sum;
       }
     }
   }
@@ -699,6 +767,13 @@ class
RoundtripThread { case 'a': op.opCode = OpCode::kAdd; ++position; + if (str[position] == 's') { + op.opCode = OpCode::kAddShared; + ++position; + } else if (str[position] == 'r') { + op.opCode = OpCode::kAddReg; + ++position; + } op.param1 = parseInt(str, position, 1); op.param2 = parseInt(str, position, 1); return op; @@ -710,10 +785,26 @@ class RoundtripThread { return op; case 'r': - op.opCode = OpCode::kAddRandom; ++position; + if (str[position] == 'w') { + op.opCode = OpCode::kAddRandomEmptyWarps; + ++position; + } else if (str[position] == 't') { + op.opCode = OpCode::kAddRandomEmptyThreads; + ++position; + } else { + op.opCode = OpCode::kAddRandom; + } + // Size of data to update and lookup array (KB). op.param1 = parseInt(str, position, 1); + // Number of repeats. op.param2 = parseInt(str, position, 1); + // target number of threads in kernel. + op.param3 = parseInt(str, position, 10240); + // Number of nearby memory accesses + op.param4 = parseInt(str, position, 0); + // Stride of nearby memory accesses + op.param5 = parseInt(str, position, 0); return op; case 's': @@ -750,6 +841,7 @@ class RoundtripThread { } ArenaSet* const arenas_; + Device* device_{nullptr}; WaveBufferPtr deviceBuffer_; WaveBufferPtr hostBuffer_; WaveBufferPtr lookupBuffer_; @@ -757,6 +849,8 @@ class RoundtripThread { std::unique_ptr hostInts_; std::unique_ptr stream_; std::unique_ptr event_; + int32_t serial_{0}; + static inline std::atomic serialCounter_{0}; }; class CudaTest : public testing::Test { @@ -849,7 +943,7 @@ class CudaTest : public testing::Test { waitEach(streams, events); } for (auto i = 0; i < numStreams; ++i) { - streams[i]->addOne(ints[i], opSize); + streams[i]->incOne(ints[i], opSize); if (counter == 0 || counter >= firstNotify) { streams[i]->addCallback([&]() { auto d = getCurrentTimeMicro() - start; @@ -1072,7 +1166,7 @@ class CudaTest : public testing::Test { int numOps = 10000) { auto arenas = getArenas(); std::vector allStats; - std::vector numThreadsValues = {2, 4, 8, 16, 32}; + std::vector numThreadsValues = {1, 2, 4, 8, 16, 32}; int32_t ordinal = 0; for (auto numThreads : numThreadsValues) { std::vector runStats; @@ -1169,7 +1263,7 @@ TEST_F(CudaTest, stream) { stream.prefetch(nullptr, ints, opSize * sizeof(int32_t)); stream.wait(); for (auto i = 0; i < opSize; ++i) { - ASSERT_EQ(ints[i], i + 1); + ASSERT_EQ(ints[i], i + (i & 31)); } allocator_->free(ints, sizeof(int32_t) * opSize); } @@ -1284,9 +1378,8 @@ TEST_F(CudaTest, roundtripMatrix) { if (!FLAGS_roundtrip_ops.empty()) { std::vector modes = {FLAGS_roundtrip_ops}; roundtripTest( - fmt::format("{} GPU, 1000 repeats", modes[0]), modes, false, 1000); - roundtripTest( - fmt::format("{} CPU, 100 repeats", modes[0]), modes, true, 100); + fmt::format("{} GPU, 64 repeats", modes[0]), modes, false, 64); + roundtripTest(fmt::format("{} CPU, 32 repeats", modes[0]), modes, true, 32); return; } if (!FLAGS_enable_bm) { @@ -1313,8 +1406,8 @@ TEST_F(CudaTest, roundtripMatrix) { "d1000a1000,30h1sd1a1000,30h1s", "d1000a1000,150h1sd1a1000,150h1s", }; - roundtripTest("Seq GPU", seqModeValues, false, 1024); - roundtripTest("Seq CPU", seqModeValues, true, 64); + roundtripTest("Seq GPU", seqModeValues, false, 32); + roundtripTest("Seq CPU", seqModeValues, true, 16); std::vector randomModeValues = { "d100r100,10h1s", @@ -1325,8 +1418,85 @@ TEST_F(CudaTest, roundtripMatrix) { "d1000r1000,100h1s", "d10000r10000,10h1s", "d30000r30000,50h1s"}; - roundtripTest("Random GPU", randomModeValues, false, 512); - roundtripTest("Random CPU", randomModeValues, 
true, 16); + roundtripTest("Random GPU", randomModeValues, false, 16); + roundtripTest("Random CPU", randomModeValues, true, 8); + + std::vector widthModeValues = { + "d100r100,10,256h1s", + "d100r100,10,1024", + "d100r100,10,8192", + "d30000r30000,5,256h1s", + "d30000r30000,5,256h1s", + "d30000r30000,5,512h1s", + "d30000r30000,5,2048h1s", + "d30000r30000,5,10240h1s", + "d30000rw30000,5,10240h1s", + "d30000rt30000,5,10240h1s"}; + roundtripTest("Random GPU, width and conditional", widthModeValues, false, 8); +} + +TEST_F(CudaTest, addRandom) { + constexpr int32_t kNumInts = 16 << 20; + auto arenas = getArenas(); + auto stream = std::make_unique(); + auto indices = arenas->unified->allocate(kNumInts); + auto sourceBuffer = arenas->unified->allocate(kNumInts); + auto rawIndices = indices->as(); + for (auto i = 0; i < kNumInts; ++i) { + rawIndices[i] = i + 1; + } + stream->prefetch(getDevice(), rawIndices, indices->capacity()); + auto ints1 = arenas->unified->allocate(kNumInts); + auto rawInts1 = ints1->as(); + auto ints2 = arenas->unified->allocate(kNumInts); + auto rawInts2 = ints2->as(); + auto ints3 = arenas->unified->allocate(kNumInts); + auto rawInts3 = ints3->as(); + memset(rawInts1, 0, kNumInts * sizeof(int32_t)); + memset(rawInts2, 0, kNumInts * sizeof(int32_t)); + memset(rawInts3, 0, kNumInts * sizeof(int32_t)); + stream->prefetch(getDevice(), rawInts1, ints1->capacity()); + stream->prefetch(getDevice(), rawInts2, ints2->capacity()); + stream->prefetch(getDevice(), rawInts3, ints3->capacity()); + // Let prefetch finish. + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + // warm up. + stream->addOneRandom(rawInts1, rawIndices, kNumInts, 20, 10240); + stream->addOneRandom(rawInts2, rawIndices, kNumInts, 20, 10240, true); + stream->addOneRandom(rawInts3, rawIndices, kNumInts, 20, 10240, false, true); + stream->wait(); + + uint64_t time1 = 0; + uint64_t time2 = 0; + uint64_t time3 = 0; + for (auto count = 0; count < 20; ++count) { + { + MicrosecondTimer t(&time1); + stream->addOneRandom(rawInts1, rawIndices, kNumInts, 20, 10240); + stream->wait(); + } + { + MicrosecondTimer t(&time2); + stream->addOneRandom(rawInts2, rawIndices, kNumInts, 20, 10240, true); + stream->wait(); + } + { + MicrosecondTimer t(&time3); + stream->addOneRandom( + rawInts3, rawIndices, kNumInts, 20, 10240, false, true); + stream->wait(); + } + } + std::cout << fmt::format( + "All {}, half warps {} half threads {}", time1, time2, time3) + << std::endl; + + stream->prefetch(nullptr, rawInts1, ints1->capacity()); + stream->prefetch(nullptr, rawInts2, ints2->capacity()); + stream->prefetch(nullptr, rawInts3, ints3->capacity()); + + EXPECT_EQ(0, memcmp(rawInts1, rawInts2, kNumInts * sizeof(int32_t))); + EXPECT_EQ(0, memcmp(rawInts1, rawInts3, kNumInts * sizeof(int32_t))); } int main(int argc, char** argv) { @@ -1336,5 +1506,8 @@ int main(int argc, char** argv) { LOG(WARNING) << "No CUDA detected, skipping all tests"; return 0; } + if (FLAGS_list_kernels) { + printKernels(); + } return RUN_ALL_TESTS(); } diff --git a/velox/experimental/wave/common/tests/CudaTest.cu b/velox/experimental/wave/common/tests/CudaTest.cu index da529db26144a..14f97f577a3c3 100644 --- a/velox/experimental/wave/common/tests/CudaTest.cu +++ b/velox/experimental/wave/common/tests/CudaTest.cu @@ -14,57 +14,164 @@ * limitations under the License. 
 */
 
+#include "velox/experimental/wave/common/Block.cuh"
 #include "velox/experimental/wave/common/CudaUtil.cuh"
 #include "velox/experimental/wave/common/tests/CudaTest.h"
 
 namespace facebook::velox::wave {
+constexpr uint32_t kPrime32 = 1815531889;
 
 __global__ void
-addOneKernel(int32_t* numbers, int32_t size, int32_t stride, int32_t repeats) {
-  auto index = blockDim.x * blockIdx.x + threadIdx.x;
+incOneKernel(int32_t* numbers, int32_t size, int32_t stride, int32_t repeats) {
   for (auto counter = 0; counter < repeats; ++counter) {
-    for (; index < size; index += stride) {
+    for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size;
+         index += stride) {
       ++numbers[index];
     }
     __syncthreads();
   }
 }
 
-void TestStream::addOne(int32_t* numbers, int32_t size, int32_t repeats) {
-  constexpr int32_t kWidth = 10240;
+__global__ void
+addOneKernel(int32_t* numbers, int32_t size, int32_t stride, int32_t repeats) {
+  for (auto counter = 0; counter < repeats; ++counter) {
+    for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size;
+         index += stride) {
+      numbers[index] += index & 31;
+    }
+    __syncthreads();
+  }
+}
+
+__global__ void addOneSharedKernel(
+    int32_t* numbers,
+    int32_t size,
+    int32_t stride,
+    int32_t repeats) {
+  extern __shared__ __align__(16) char smem[];
+  int32_t* temp = reinterpret_cast<int32_t*>(smem);
+  for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size;
+       index += stride) {
+    temp[threadIdx.x] = numbers[index];
+    for (auto counter = 0; counter < repeats; ++counter) {
+      temp[threadIdx.x] += (index + counter) & 31;
+    }
+    __syncthreads();
+    numbers[index] = temp[threadIdx.x];
+  }
+}
+
+__global__ void addOneRegKernel(
+    int32_t* numbers,
+    int32_t size,
+    int32_t stride,
+    int32_t repeats) {
+  for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size;
+       index += stride) {
+    auto temp = numbers[index];
+    for (auto counter = 0; counter < repeats; ++counter) {
+      temp += (index + counter) & 31;
+    }
+    __syncthreads();
+    numbers[index] = temp;
+  }
+}
+
+void TestStream::incOne(
+    int32_t* numbers,
+    int32_t size,
+    int32_t repeats,
+    int32_t width) {
   constexpr int32_t kBlockSize = 256;
   auto numBlocks = roundUp(size, kBlockSize) / kBlockSize;
   int32_t stride = size;
-  if (numBlocks > kWidth / kBlockSize) {
-    stride = kWidth;
-    numBlocks = kWidth / kBlockSize;
+  if (numBlocks > width / kBlockSize) {
+    stride = width;
+    numBlocks = width / kBlockSize;
+  }
+  incOneKernel<<<numBlocks, kBlockSize, 0, stream_->stream>>>(
+      numbers, size, stride, repeats);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+void TestStream::addOne(
+    int32_t* numbers,
+    int32_t size,
+    int32_t repeats,
+    int32_t width) {
+  constexpr int32_t kBlockSize = 256;
+  auto numBlocks = roundUp(size, kBlockSize) / kBlockSize;
+  int32_t stride = size;
+  if (numBlocks > width / kBlockSize) {
+    stride = width;
+    numBlocks = width / kBlockSize;
   }
   addOneKernel<<<numBlocks, kBlockSize, 0, stream_->stream>>>(
       numbers, size, stride, repeats);
   CUDA_CHECK(cudaGetLastError());
 }
 
+void TestStream::addOneShared(
+    int32_t* numbers,
+    int32_t size,
+    int32_t repeats,
+    int32_t width) {
+  constexpr int32_t kBlockSize = 256;
+  auto numBlocks = roundUp(size, kBlockSize) / kBlockSize;
+  int32_t stride = size;
+  if (numBlocks > width / kBlockSize) {
+    stride = width;
+    numBlocks = width / kBlockSize;
+  }
+  addOneSharedKernel<<<
+      numBlocks,
+      kBlockSize,
+      kBlockSize * sizeof(int32_t),
+      stream_->stream>>>(numbers, size, stride, repeats);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+void TestStream::addOneReg(
+    int32_t* numbers,
+    int32_t size,
+    int32_t repeats,
+    int32_t width) {
+  constexpr
int32_t kBlockSize = 256; + auto numBlocks = roundUp(size, kBlockSize) / kBlockSize; + int32_t stride = size; + if (numBlocks > width / kBlockSize) { + stride = width; + numBlocks = width / kBlockSize; + } + addOneRegKernel<<stream>>>( + numbers, size, stride, repeats); + CUDA_CHECK(cudaGetLastError()); +} + __global__ void addOneWideKernel(WideParams params) { - auto index = blockDim.x * blockIdx.x + threadIdx.x; auto numbers = params.numbers; auto size = params.size; auto repeat = params.repeat; auto stride = params.stride; for (auto counter = 0; counter < repeat; ++counter) { - for (; index < size; index += stride) { + for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size; + index += stride) { ++numbers[index]; } } } -void TestStream::addOneWide(int32_t* numbers, int32_t size, int32_t repeat) { - constexpr int32_t kWidth = 10240; +void TestStream::addOneWide( + int32_t* numbers, + int32_t size, + int32_t repeat, + int32_t width) { constexpr int32_t kBlockSize = 256; auto numBlocks = roundUp(size, kBlockSize) / kBlockSize; int32_t stride = size; - if (numBlocks > kWidth / kBlockSize) { - stride = kWidth; - numBlocks = kWidth / kBlockSize; + if (numBlocks > width / kBlockSize) { + stride = width; + numBlocks = width / kBlockSize; } WideParams params; params.numbers = numbers; @@ -75,41 +182,108 @@ void TestStream::addOneWide(int32_t* numbers, int32_t size, int32_t repeat) { CUDA_CHECK(cudaGetLastError()); } -__global__ void addOneRandomKernel( +__global__ void __launch_bounds__(1024) addOneRandomKernel( int32_t* numbers, const int32_t* lookup, uint32_t size, int32_t stride, - int32_t repeats) { - auto index = blockDim.x * blockIdx.x + threadIdx.x; + int32_t repeats, + int32_t numLocal, + int32_t localStride, + bool emptyWarps, + bool emptyThreads) { for (uint32_t counter = 0; counter < repeats; ++counter) { - for (; index < size; index += stride) { - auto rnd = (static_cast(static_cast( - index * (counter + 1) * 1367836089)) * - size) >> - 32; - numbers[index] += lookup[rnd]; + if (emptyWarps) { + if (((threadIdx.x / 32) & 1) == 0) { + for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size; + index += stride) { + auto rnd = deviceScale32(index * (counter + 1) * kPrime32, size); + auto sum = lookup[rnd]; + auto limit = min(rnd + localStride * (1 + numLocal), size); + for (auto j = rnd + localStride; j < limit; j += localStride) { + sum += lookup[j]; + } + numbers[index] += sum; + + rnd = deviceScale32((index + 32) * (counter + 1) * kPrime32, size); + sum = lookup[rnd]; + limit = min(rnd + localStride * (1 + numLocal), size); + for (auto j = rnd + localStride; j < limit; j += localStride) { + sum += lookup[j]; + } + numbers[index + 32] += sum; + } + } + } else if (emptyThreads) { + if ((threadIdx.x & 1) == 0) { + for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size; + index += stride) { + auto rnd = deviceScale32(index * (counter + 1) * kPrime32, size); + auto sum = lookup[rnd]; + auto limit = min(rnd + localStride * (1 + numLocal), size); + for (auto j = rnd + localStride; j < limit; j += localStride) { + sum += lookup[j]; + } + numbers[index] += sum; + + rnd = deviceScale32((index + 1) * (counter + 1) * kPrime32, size); + sum = lookup[rnd]; + limit = min(rnd + localStride * (1 + numLocal), size); + for (auto j = rnd + localStride; j < limit; j += localStride) { + sum += lookup[j]; + } + numbers[index + 1] += sum; + } + } + } else { + for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < size; + index += stride) { + auto rnd = 
deviceScale32(index * (counter + 1) * kPrime32, size); + auto sum = lookup[rnd]; + auto limit = min(rnd + localStride * (1 + numLocal), size); + for (auto j = rnd + localStride; j < limit; j += localStride) { + sum += lookup[j]; + } + numbers[index] += sum; + } } __syncthreads(); } + __syncthreads(); } void TestStream::addOneRandom( int32_t* numbers, const int32_t* lookup, int32_t size, - int32_t repeats) { - constexpr int32_t kWidth = 10240; + int32_t repeats, + int32_t width, + int32_t numLocal, + int32_t localStride, + bool emptyWarps, + bool emptyThreads) { constexpr int32_t kBlockSize = 256; auto numBlocks = roundUp(size, kBlockSize) / kBlockSize; int32_t stride = size; - if (numBlocks > kWidth / kBlockSize) { - stride = kWidth; - numBlocks = kWidth / kBlockSize; + if (numBlocks > width / kBlockSize) { + stride = width; + numBlocks = width / kBlockSize; } addOneRandomKernel<<stream>>>( - numbers, lookup, size, stride, repeats); + numbers, + lookup, + size, + stride, + repeats, + numLocal, + localStride, + emptyWarps, + emptyThreads); CUDA_CHECK(cudaGetLastError()); } +REGISTER_KERNEL("addOne", addOneKernel); +REGISTER_KERNEL("addOneWide", addOneWideKernel); +REGISTER_KERNEL("addOneRandom", addOneRandomKernel); + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/tests/CudaTest.h b/velox/experimental/wave/common/tests/CudaTest.h index 758ad39a260ed..7d5716f753eac 100644 --- a/velox/experimental/wave/common/tests/CudaTest.h +++ b/velox/experimental/wave/common/tests/CudaTest.h @@ -35,15 +35,66 @@ class TestStream : public Stream { public: // Queues a kernel to add 1 to numbers[0...size - 1]. The kernel repeats // 'repeat' times. - void addOne(int32_t* numbers, int size, int32_t repeat = 1); + void + incOne(int32_t* numbers, int size, int32_t repeat = 1, int32_t width = 10240); - void addOneWide(int32_t* numbers, int32_t size, int32_t repeat = 1); + /// Like incOne but adds idx & 31 to numbers[idx]. + void + addOne(int32_t* numbers, int size, int32_t repeat = 1, int32_t width = 10240); + void addOneWide( + int32_t* numbers, + int32_t size, + int32_t repeat = 1, + int32_t width = 10240); + + /// Like addOne but uses shared memory for intermediates, with global + /// ead/write at start/end. + void addOneShared( + int32_t* numbers, + int32_t size, + int32_t repeat = 1, + int32_t width = 10240); + + /// Like addOne but uses registers for intermediates. + void addOneReg( + int32_t* numbers, + int32_t size, + int32_t repeat = 1, + int32_t width = 10240); + + /// Increments each of 'numbers by a deterministic pseudorandom + /// increment from 'lookup'. If 'numLocal is non-0, also accesses + /// 'numLocal' adjacent positions in 'lookup' with a stride of + /// 'localStride'. If 'emptyWarps' is true, odd warps do no work + /// but still sync with the other ones with __syncthreads(). If + /// 'emptyThreads' is true, odd lanes do no work and even lanes do + /// their work instead. void addOneRandom( int32_t* numbers, const int32_t* lookup, int size, - int32_t repeat = 1); + int32_t repeat = 1, + int32_t width = 10240, + int32_t numLocal = 0, + int32_t localStride = 0, + bool emptyWarps = false, + bool emptyLanes = false); + + // Makes random lookup keys and increments, starting at 'startCount' + // columns[0] is keys. 'powerOfTwo' is the next power of two from + // 'keyRange'. If 'powerOfTwo' is 0 the key columns are set to + // zero. 
Otherwise the key column values are incremented by a a + // delta + index of column where delta for element 0 is startCount & + // (powerOfTwo - 1). + void makeInput( + int32_t numRows, + int32_t keyRange, + int32_t powerOfTwo, + int32_t startCount, + uint64_t* hash, + uint8_t numColumns, + int64_t** columns); }; } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/tests/GpuArenaTest.cpp b/velox/experimental/wave/common/tests/GpuArenaTest.cpp index f451980020619..a3bca9a2080b4 100644 --- a/velox/experimental/wave/common/tests/GpuArenaTest.cpp +++ b/velox/experimental/wave/common/tests/GpuArenaTest.cpp @@ -155,3 +155,23 @@ TEST_F(GpuArenaTest, buffers) { buffers.clear(); EXPECT_EQ(1, arena->slabs().size()); } + +TEST_F(GpuArenaTest, views) { + auto arena = std::make_unique(1 << 20, allocator_.get()); + WaveBufferPtr buffer = arena->allocate(1024); + EXPECT_EQ(1, buffer->refCount()); + WaveBufferPtr view = WaveBufferView::create( + buffer->as() + 10, 10, buffer); + EXPECT_EQ(2, buffer->refCount()); + EXPECT_EQ(1, view->refCount()); + auto view2 = view; + EXPECT_EQ(2, buffer->refCount()); + EXPECT_EQ(2, view->refCount()); + auto raw = buffer.get(); + buffer = nullptr; + EXPECT_EQ(1, raw->refCount()); + view = nullptr; + view2 = nullptr; + // This is reference to freed but the header is still in the arena. + EXPECT_EQ(0, raw->refCount()); +} diff --git a/velox/experimental/wave/common/tests/HashTableTest.cpp b/velox/experimental/wave/common/tests/HashTableTest.cpp new file mode 100644 index 0000000000000..0e5662911411c --- /dev/null +++ b/velox/experimental/wave/common/tests/HashTableTest.cpp @@ -0,0 +1,386 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "velox/common/time/Timer.h" +#include "velox/experimental/wave/common/Buffer.h" +#include "velox/experimental/wave/common/GpuArena.h" +#include "velox/experimental/wave/common/tests/BlockTest.h" +#include "velox/experimental/wave/common/tests/CpuTable.h" +#include "velox/experimental/wave/common/tests/HashTestUtil.h" + +#include + +namespace facebook::velox::wave { + +class CpuMockGroupByOps { + public: + bool + compare(CpuHashTable* table, TestingRow* row, int32_t i, HashProbe* probe) { + return row->key == reinterpret_cast(probe->keys)[0][i]; + } + + bool compare1(const CpuHashTable* table, TestingRow* row, int64_t key) { + return key == row->key; + } + + TestingRow* newRow(CpuHashTable* table, int32_t i, HashProbe* probe) { + auto row = table->newRow(); + row->key = reinterpret_cast(probe->keys)[0][i]; + row->flags = 0; + row->count = 0; + new (&row->concatenation) ArrayAgg64(); + return row; + } + + void + update(CpuHashTable* table, TestingRow* row, int32_t i, HashProbe* probe) { + auto* keys = reinterpret_cast(probe->keys); + row->count += keys[1][i]; + +#if 0 + int64_t arg = keys[1][i]; + int32_t part = table->partitionIdx(bucket - table->buckets); + auto* allocator = &table->allocators[part]; + auto state = arrayAgg64Append(&row->concatenation, arg, allocator); +#endif + } +}; + +class HashTableTest : public testing::Test { + protected: + void SetUp() override { + device_ = getDevice(); + setDevice(device_); + allocator_ = getAllocator(device_); + arena_ = std::make_unique(1 << 28, allocator_); + streams_.push_back(std::make_unique()); + } + + void prefetch(Stream& stream, WaveBufferPtr buffer) { + stream.prefetch(device_, buffer->as(), buffer->capacity()); + } + + // Tests different styles of updating a group by. Results are returned in + // 'run'. 
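One of the update styles compared below (sum1AtmCoa) coalesces updates within a warp before touching global memory: lanes that hit the same row elect a leader, combine their deltas with shuffles, and issue a single atomicAdd per distinct row. A hedged, stand-alone sketch of that pattern, assuming all 32 lanes of the warp are active and an sm_70+ device (__match_any_sync); the function name is illustrative:

```cuda
__device__ void coalescedAdd(
    unsigned long long* counts,
    int32_t index,
    unsigned long long delta) {
  // Mask of lanes in this warp whose 'index' equals ours.
  uint32_t peers = __match_any_sync(0xffffffff, index);
  int32_t leader = __ffs(peers) - 1; // lowest lane with this index
  unsigned long long total = 0;
  for (uint32_t rest = peers; rest != 0; rest &= rest - 1) {
    int32_t lane = __ffs(rest) - 1;
    total += __shfl_sync(peers, delta, lane); // gather each peer's delta
  }
  if (static_cast<int32_t>(threadIdx.x & 31) == leader) {
    atomicAdd(&counts[index], total); // one atomic per distinct index
  }
}
```

This is why the skewed-key cases favor the coalescing variants: with few hot keys, most lanes of a warp collide on the same row and the atomic traffic collapses to one operation per warp.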
+ void updateTestCase(int32_t numDistinct, int32_t numRows, HashRun& run) { + run.numRows = numRows; + run.numDistinct = numDistinct; + run.numColumns = 2; + run.numRowsPerThread = 32; + + initializeHashTestInput(run, arena_.get()); + fillHashTestInput( + run.numRows, + run.numDistinct, + bits::nextPowerOfTwo(run.numDistinct), + 1, + run.numColumns, + reinterpret_cast(run.probe->keys)); + std::vector reference(run.numDistinct); + for (auto i = 0; i < run.numDistinct; ++i) { + reference[i].key = i; + } + gpuRowsBuffer_ = arena_->allocate(run.numDistinct); + TestingRow* gpuRows = gpuRowsBuffer_->as(); + memcpy(gpuRows, reference.data(), sizeof(TestingRow) * run.numDistinct); + prefetch(*streams_[0], gpuRowsBuffer_); + prefetch(*streams_[0], run.gpuData); + streams_[0]->wait(); + updateCpu(reference.data(), run); + updateGpu(gpuRows, run, reference.data()); + std::cout << run.toString() << std::endl; + } + + void updateCpu(TestingRow* rows, HashRun& run) { + uint64_t micros = 0; + { + MicrosecondTimer t(µs); + switch (run.testCase) { + case HashTestCase::kUpdateSum1: { + int64_t** keys = reinterpret_cast(run.probe->keys); + int64_t* indices = keys[0]; + int64_t* data = keys[1]; + auto numRows = run.numRows; + for (auto i = 0; i < numRows; ++i) { + rows[indices[i]].count += data[i]; + } + break; + } + default: + VELOX_FAIL("Unsupported test case"); + } + } + run.addScore("cpu1t", micros); + } + +#define UPDATE_CASE(title, func, expectCorrect, nextFlags) \ + { \ + std::cout << title << std::endl; \ + MicrosecondTimer t(µs); \ + streams_[0]->func(rows, run); \ + streams_[0]->wait(); \ + } \ + run.addScore(title, micros); \ + micros = 0; \ + compareAndReset( \ + reference, rows, run.numDistinct, title, expectCorrect, nextFlags); + + void updateGpu(TestingRow* rows, HashRun& run, TestingRow* reference) { + uint64_t micros = 0; + switch (run.testCase) { + case HashTestCase::kUpdateSum1: + UPDATE_CASE("sum1Atm", updateSum1Atomic, true, 0); + UPDATE_CASE("sum1NoSync", updateSum1NoSync, false, 0); + UPDATE_CASE("sum1AtmCoa", updateSum1AtomicCoalesce, true, 1); + UPDATE_CASE("sum1Mtx", updateSum1Mtx, true, 1); + UPDATE_CASE("sum1MtxCoa", updateSum1MtxCoalesce, true, 0); + UPDATE_CASE("sum1Part", updateSum1Part, true, 0); + UPDATE_CASE("sum1Order", updateSum1Order, true, 0); + // UPDATE_CASE("sum1Exch", updateSum1Exch, false, 0); + + break; + default: + VELOX_FAIL("Unsupported test case"); + } + } + + void compareAndReset( + TestingRow* reference, + TestingRow* rows, + int32_t numRows, + const char* title, + bool expectCorrect, + int32_t initFlags = 0) { + int32_t numError = 0; + int64_t errorSigned = 0; + int64_t errorDelta = 0; + for (auto i = 0; i < numRows; ++i) { + if (rows[i].count == reference[i].count) { + continue; + } + if (numError == 0 && expectCorrect) { + std::cout << "In " << title << std::endl; + EXPECT_EQ(reference[i].count, rows[i].count) << " at " << i; + } + ++numError; + int64_t d = reference[i].count - rows[i].count; + errorSigned += d; + errorDelta += d < 0 ? 
-d : d; + } + if (numError) { + std::cout << fmt::format( + "{}: numError={} errorDelta={} errorSigned={}", + title, + numError, + errorDelta, + errorSigned) + << std::endl; + } + for (auto i = 0; i < numRows; ++i) { + new (rows + i) TestingRow(); + rows[i].key = i; + rows[i].flags = initFlags; + } + prefetch(*streams_[0], gpuRowsBuffer_); + streams_[0]->wait(); + } + + void groupTestCase(int32_t numDistinct, int32_t numRows, HashRun& run) { + run.numRows = numRows; + run.numDistinct = numDistinct; + if (!run.numSlots) { + run.numSlots = bits::nextPowerOfTwo(numDistinct); + } + run.numColumns = 2; + run.numRowsPerThread = 32; + + initializeHashTestInput(run, arena_.get()); + fillHashTestInput( + run.numRows, + run.numDistinct, + bits::nextPowerOfTwo(run.numDistinct), + 1, + run.numColumns, + reinterpret_cast(run.probe->keys)); + CpuHashTable cpuTable(run.numSlots, sizeof(TestingRow) * run.numDistinct); + cpuGroupBy(cpuTable, run); + gpuGroupBy(cpuTable, run); + std::cout << run.toString() << std::endl; + } + + void cpuGroupBy(CpuHashTable& table, HashRun& run) { + uint64_t time = 0; + { + MicrosecondTimer t(&time); + int64_t* key = reinterpret_cast(run.probe->keys)[0]; + auto* hashes = run.probe->hashes; + for (auto i = 0; i < run.numRows; ++i) { + hashes[i] = bits::hashMix(1, key[i]); + } + table.updatingProbe( + run.numRows, run.probe, CpuMockGroupByOps()); + } + run.addScore("cpu1T", time); + } + + void gpuGroupBy(const CpuHashTable& reference, HashRun& run) { + WaveBufferPtr gpuTableBuffer; + GpuHashTableBase* gpuTable; + setupGpuTable( + run.numSlots, + run.numRows, + sizeof(TestingRow), + arena_.get(), + gpuTable, + gpuTableBuffer); + prefetch(*streams_[0], run.gpuData); + prefetch(*streams_[0], gpuTableBuffer); + streams_[0]->wait(); + uint64_t micros = 0; + { + MicrosecondTimer t(µs); + streams_[0]->hashTest(gpuTable, run, BlockTestStream::HashCase::kGroup); + streams_[0]->wait(); + } + run.addScore("gpu", micros); + checkGroupBy(reference, gpuTable); + } + + void checkGroupBy(const CpuHashTable& reference, GpuHashTableBase* table) { + int32_t numChecked = 0; + for (auto i = 0; i <= table->sizeMask; ++i) { + for (auto j = 0; j < 4; ++j) { + auto* row = reinterpret_cast(table->buckets)[i] + .testingLoad(j); + if (row == nullptr) { + continue; + } + ++numChecked; + auto referenceRow = reference.find( + row->key, bits::hashMix(1, row->key), CpuMockGroupByOps()); + ASSERT_TRUE(referenceRow != nullptr); + EXPECT_EQ(referenceRow->count, row->count); + } + } + EXPECT_EQ(reference.size, numChecked); + } + + Device* device_; + GpuAllocator* allocator_; + std::unique_ptr arena_; + std::vector> streams_; + WaveBufferPtr gpuRowsBuffer_; +}; + +TEST_F(HashTableTest, allocator) { + constexpr int32_t kNumThreads = 256; + constexpr int32_t kTotal = 1 << 22; + WaveBufferPtr data = arena_->allocate(kTotal); + auto* allocator = data->as(); + auto freeSetSize = BlockTestStream::freeSetSize(); + new (allocator) HashPartitionAllocator( + data->as() + sizeof(HashPartitionAllocator) + freeSetSize, + kTotal - sizeof(HashPartitionAllocator) - freeSetSize, + 16, + allocator + 1); + memset(allocator->freeSet, 0, freeSetSize); + WaveBufferPtr allResults = arena_->allocate(kNumThreads); + auto results = allResults->as(); + for (auto i = 0; i < kNumThreads; ++i) { + results[i].allocator = reinterpret_cast(allocator); + results[i].numRows = 0; + results[i].numStrings = 0; + } + auto stream1 = std::make_unique(); + auto stream2 = std::make_unique(); + stream1->initAllocator(allocator); + stream1->wait(); + 
stream1->rowAllocatorTest(2, 4, 3, 2, results); + stream2->rowAllocatorTest(2, 4, 3, 2, results + 128); + + stream1->wait(); + stream2->wait(); + // Pointer to result idx, position in result; + std::unordered_map uniques; + for (auto resultIdx = 0; resultIdx < kNumThreads; ++resultIdx) { + auto* result = results + resultIdx; + for (auto i = 0; i < result->numRows; ++i) { + auto row = result->rows[i]; + EXPECT_GE(reinterpret_cast(row), allocator->base); + EXPECT_LT( + reinterpret_cast(row), + allocator->base + allocator->capacity); + auto it = uniques.find(row); + EXPECT_TRUE(it == uniques.end()) << fmt::format( + "row {} is also at {} {}", + reinterpret_cast(row), + it->second >> 24, + it->second & bits::lowMask(24)); + + uniques[row] = (resultIdx << 24) | i; + } + for (auto i = 0; i < result->numStrings; ++i) { + auto string = result->strings[i]; + EXPECT_GE(reinterpret_cast(string), allocator->base); + EXPECT_LT( + reinterpret_cast(string), + allocator->base + allocator->capacity); + auto it = uniques.find(string); + EXPECT_TRUE(it == uniques.end()) << fmt::format( + "String {} is also at {} {}", + reinterpret_cast(string), + it->second >> 24, + it->second & bits::lowMask(24)); + uniques[string] = (resultIdx << 24) | i; + } + } +} + +TEST_F(HashTableTest, update) { + { + HashRun run; + run.testCase = HashTestCase::kUpdateSum1; + updateTestCase(1000, 2000000, run); + } + { + HashRun run; + run.testCase = HashTestCase::kUpdateSum1; + updateTestCase(10000000, 2000000, run); + } + { + HashRun run; + run.testCase = HashTestCase::kUpdateSum1; + updateTestCase(10, 2000000, run); + } +} + +TEST_F(HashTableTest, groupBy) { + { + HashRun run; + run.testCase = HashTestCase::kGroupSum1; + run.numSlots = 2048; + groupTestCase(1000, 2000000, run); + } + { + HashRun run; + run.testCase = HashTestCase::kGroupSum1; + run.numSlots = 8 << 20; + groupTestCase(5000000, 50000000, run); + } +} + +} // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/tests/HashTestUtil.cpp b/velox/experimental/wave/common/tests/HashTestUtil.cpp new file mode 100644 index 0000000000000..60bf3a6a60b29 --- /dev/null +++ b/velox/experimental/wave/common/tests/HashTestUtil.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/experimental/wave/common/tests/HashTestUtil.h" +#include +#include "velox/common/base/BitUtil.h" +#include "velox/experimental/wave/common/Buffer.h" +#include "velox/experimental/wave/common/GpuArena.h" +#include "velox/experimental/wave/common/HashTable.h" + +namespace facebook::velox::wave { + +constexpr uint32_t kPrime32 = 1815531889; +inline uint32_t scale32(uint32_t n, uint32_t scale) { + return (static_cast(static_cast(n)) * scale) >> 32; +} + +// Returns the byte size for a GpuProbe with numRows as first, rounded row count +// as second. 
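initializeHashTestInput below lays out the HashProbe header, per-block row counts, hash array, retry lists, and key columns inside one contiguous allocation by walking a byte cursor forward. A hedged sketch of that carving pattern; the Carver type and carveExample are illustrative, not from the patch:

```cpp
#include <cstdint>

// Hands out typed sections of one backing allocation by advancing a byte
// cursor; mirrors the 'data += sizeof(...) * n' walk in initializeHashTestInput.
struct Carver {
  char* cursor;

  template <typename T>
  T* take(int64_t count) {
    T* section = reinterpret_cast<T*>(cursor);
    cursor += sizeof(T) * count;
    return section;
  }
};

// Example: carve the hash array and two retry lists for 'rows' entries.
void carveExample(char* data, int32_t rows) {
  Carver c{data};
  uint64_t* hashes = c.take<uint64_t>(rows);
  int32_t* retries1 = c.take<int32_t>(rows);
  int32_t* retries2 = c.take<int32_t>(rows);
  (void)hashes;
  (void)retries1;
  (void)retries2;
}
```

Keeping everything in one buffer means a single prefetch moves the whole probe description to the device.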
+std::pair probeSize(HashRun& run) { + int32_t roundedRows = + bits::roundUp(run.numRows, run.blockSize * run.numRowsPerThread); + return { + sizeof(HashProbe) + + // Column data and hash number array. + (1 + run.numColumns) * roundedRows * sizeof(int64_t) + // Pointers to column starts + + sizeof(int64_t*) * run.numColumns + // retry lists + + 3 * sizeof(int32_t) * roundedRows + + // numRows for each block. + sizeof(int32_t) * roundedRows / + (run.blockSize * run.numRowsPerThread) + + // Temp space for partitioning. + roundedRows * sizeof(int32_t) + + // alignment padding + 256, + roundedRows}; +} + +void fillHashTestInput( + int32_t numRows, + int32_t keyRange, + int32_t powerOfTwo, + int64_t counter, + uint8_t numColumns, + int64_t** columns, + int32_t numHot, + int32_t hotPct) { + int32_t delta = counter & (powerOfTwo - 1); + for (auto i = 0; i < numRows; ++i) { + auto previous = columns[0][i]; + auto seed = (previous + delta + i) * kPrime32; + if (hotPct && scale32(seed >> 32, 100) <= hotPct) { + int32_t nth = scale32(seed, numHot); + nth = std::min( + keyRange - 1, nth * (static_cast(keyRange) / nth)); + columns[0][i] = nth; + } else { + columns[0][i] = scale32(seed, keyRange); + } + } + counter += numRows; + for (auto c = 1; c < numColumns; ++c) { + for (auto r = 0; r < numRows; ++r) { + columns[c][r] = 1; // c + (r & 7); + } + } +} + +void initializeHashTestInput(HashRun& run, GpuArena* arena) { + auto [bytes, roundedRows] = probeSize(run); + if (!arena) { + run.isCpu = true; + run.cpuData = std::make_unique(bytes); + run.input = run.cpuData.get(); + } else { + run.isCpu = false; + run.gpuData = arena->allocate(bytes); + run.input = run.gpuData->as(); + } + auto data = run.input; + auto dataBegin = data; + HashProbe* probe = new (data) HashProbe(); + run.probe = probe; + data += sizeof(HashProbe); + probe->numRows = reinterpret_cast(data); + data += bits::roundUp( + sizeof(int32_t) * roundedRows / (run.numRowsPerThread * run.blockSize), + 8); + if (!arena) { + probe->numRows[0] = run.numRows; + } else { + run.numBlocks = roundedRows / (run.blockSize * run.numRowsPerThread); + for (auto i = 0; i < run.numBlocks; ++i) { + if (i == run.numBlocks - 1) { + probe->numRows[i] = + run.numRows - (i * run.blockSize * run.numRowsPerThread); + break; + } + probe->numRows[i] = run.blockSize * run.numRowsPerThread; + ; + } + } + probe->numRowsPerThread = run.numRowsPerThread; + probe->hashes = reinterpret_cast(data); + data += sizeof(uint64_t) * roundedRows; + probe->keys = data; + data += sizeof(void*) * run.numColumns; + probe->kernelRetries1 = reinterpret_cast(data); + data += sizeof(int32_t) * roundedRows; + probe->kernelRetries2 = reinterpret_cast(data); + data += sizeof(int32_t) * roundedRows; + probe->hostRetries = reinterpret_cast(data); + data += sizeof(int32_t) * roundedRows; + for (auto i = 0; i < run.numColumns; ++i) { + reinterpret_cast(probe->keys)[i] = + reinterpret_cast(data); + data += sizeof(int64_t) * roundedRows; + } + run.partitionTemp = reinterpret_cast(data); + data += bits::roundUp(sizeof(int32_t) * roundedRows, 8); + VELOX_CHECK_LE(data - dataBegin, bytes); +} + +void setupGpuTable( + int32_t numSlots, + int32_t maxRows, + int64_t rowSize, + GpuArena* arena, + GpuHashTableBase*& table, + WaveBufferPtr& buffer) { + using FreeSetType = FreeSetBase; + // GPU cache lines are 128 bytes divided in 4 separately loadable 32 byte + // sectors. 
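setupGpuTable sizes the table at four row slots per 128-byte bucket, so the bucket count is the next power of two of numSlots / 4 and probing wraps with a mask rather than a modulo. A small worked example of the arithmetic; the local nextPowerOfTwo is assumed to match the semantics of velox::bits::nextPowerOfTwo for these values:

```cpp
#include <cstdint>

// Round 'n' up to the next power of two (assumption: matches
// bits::nextPowerOfTwo for the inputs used in these tests).
inline uint32_t nextPowerOfTwo(uint32_t n) {
  uint32_t p = 1;
  while (p < n) {
    p <<= 1;
  }
  return p;
}

int main() {
  int32_t numSlots = 2048; // as in the groupBy test above
  uint32_t numBuckets = nextPowerOfTwo(numSlots / 4); // 512 buckets
  uint32_t sizeMask = numBuckets - 1; // bucket = hash & sizeMask
  return sizeMask == 511 ? 0 : 1;
}
```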
+ constexpr int32_t kAlignment = 128; + int32_t numBuckets = bits::nextPowerOfTwo(numSlots / 4); + int64_t bytes = sizeof(GpuHashTableBase) + sizeof(HashPartitionAllocator) + + sizeof(FreeSetType) + sizeof(GpuBucketMembers) * numBuckets + + maxRows * rowSize; + buffer = arena->allocate(bytes + kAlignment); + table = buffer->as(); + new (table) GpuHashTableBase(); + table->sizeMask = numBuckets - 1; + char* data = reinterpret_cast(table + 1); + table->allocators = reinterpret_cast(data); + auto allocatorBase = + reinterpret_cast(table->allocators); + data += sizeof(HashPartitionAllocator); + auto freeSet = reinterpret_cast(data); + new (freeSet) FreeSetType(); + data += sizeof(FreeSetType); + // The buckets start at aligned address. + data = reinterpret_cast( + bits::roundUp(reinterpret_cast(data), kAlignment)); + table->buckets = reinterpret_cast(data); + data += sizeof(GpuBucketMembers) * numBuckets; + auto allocator = reinterpret_cast(table->allocators); + new (allocator) + HashPartitionAllocator(data, maxRows * rowSize, rowSize, freeSet); + table->partitionMask = 0; + table->partitionShift = 0; + memset(table->buckets, 0, sizeof(GpuBucketMembers) * (table->sizeMask + 1)); +} + +std::string HashRun::toString() const { + std::stringstream out; + std::string opLabel = testCase == HashTestCase::kUpdateSum1 ? "update sum1" + : testCase == HashTestCase::kGroupSum1 ? "groupSum1" + : "update array_agg1"; + out << "===" << label << ":" << opLabel << " distinct=" << numDistinct + << " rows=" << numRows << " (" << numBlocks << "x" << blockSize << "x" + << numRowsPerThread << ") "; + if (hotPct) { + out << " skew " << hotPct << "% in " << numHot << " "; + } + auto sorted = scores; + std::sort(sorted.begin(), sorted.end(), [](auto& left, auto& right) { + return left.second < right.second; + }); + float gb = + numRows * sizeof(int64_t) * numColumns / static_cast(1 << 30); + for (auto& score : sorted) { + out << std::endl + << " * " + << fmt::format( + " {}={:.2f} rps {:.2f} GB/s {} us {:.2f}x", + score.first, + numRows / (score.second / 1e6), + gb / (score.second / 1e6), + score.second, + score.second / sorted[0].second); + } + return out.str(); +} + +void HashRun::addScore(const char* label, uint64_t micros) { + scores.push_back(std::make_pair(label, micros)); +} +} // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/tests/HashTestUtil.h b/velox/experimental/wave/common/tests/HashTestUtil.h new file mode 100644 index 0000000000000..43703f9771274 --- /dev/null +++ b/velox/experimental/wave/common/tests/HashTestUtil.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "velox/experimental/wave/common/Buffer.h" +#include "velox/experimental/wave/common/HashTable.h" + +namespace facebook::velox::wave { + +/// Identifies operation being tested. 
A collection of representative hash
+/// table ops like aggregates, probes, and builds with different functions and
+/// layouts.
+enum class HashTestCase {
+  // bigint sum. Update only, no hash table.
+  kUpdateSum1,
+  // Group by with bigint sum.
+  kGroupSum1,
+  // array_agg of bigint. Update only, no hash table.
+  kUpdateArrayAgg1
+};
+
+/// Describes a hash table benchmark case.
+struct HashRun {
+  // Label of the test case. Describes what is done. The labels for different
+  // implementations come from 'scores'.
+  std::string label;
+  // The operation being measured.
+  HashTestCase testCase;
+  // CPU/GPU measurement.
+  bool isCpu;
+
+  // Number of slots in table.
+  int32_t numSlots{0};
+
+  // Number of probe rows.
+  int32_t numRows;
+
+  // Number of distinct keys.
+  int32_t numDistinct;
+
+  // Number of distinct hot keys.
+  int32_t numHot{0};
+
+  // Percentage of hot keys over total keys, e.g. with 1000 distinct keys, 10
+  // hot keys and a hotPct of 50, every second key will be one of the 10 and
+  // the rest are evenly spread over the 1000.
+  int32_t hotPct{0};
+
+  // Number of keys processed by each thread of each block.
+  int32_t numRowsPerThread;
+
+  int32_t blockSize{256};
+
+  // Number of blocks of 'blockSize' threads.
+  int32_t numBlocks;
+
+  // Number of columns. Key is column 0.
+  uint8_t numColumns{1};
+
+  // Number of independent hash tables.
+  int32_t numTables{1};
+
+  // Result, labeled by implementation alternative.
+  std::vector<std::pair<const char*, uint64_t>> scores;
+
+  std::unique_ptr<char[]> cpuData;
+  WaveBufferPtr gpuData;
+
+  // Input data, either cpuData or gpuData.
+  char* input;
+
+  // Initialized probe params, contained in 'input'.
+  HashProbe* probe;
+
+  // One int per row, used for partitioning intermediates. Uninitialized.
+  int32_t* partitionTemp;
+
+  int32_t* partitionArgs;
+
+  std::string toString() const;
+  void addScore(const char* label, uint64_t micros);
+  void clearScore() {
+    scores.clear();
+  }
+};
+
+void fillHashTestInput(
+    int32_t numRows,
+    int32_t keyRange,
+    int32_t powerOfTwo,
+    int64_t counter,
+    uint8_t numColumns,
+    int64_t** columns,
+    int32_t numHot = 0,
+    int32_t hotPct = 0);
+
+void initializeHashTestInput(HashRun& run, GpuArena* arena);
+
+void setupGpuTable(
+    int32_t numSlots,
+    int32_t maxRows,
+    int64_t rowSize,
+    GpuArena* arena,
+    GpuHashTableBase*& table,
+    WaveBufferPtr& buffer);
+
+} // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/common/tests/Updates.cuh b/velox/experimental/wave/common/tests/Updates.cuh
new file mode 100644
index 0000000000000..e25b2c918be87
--- /dev/null
+++ b/velox/experimental/wave/common/tests/Updates.cuh
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include "velox/experimental/wave/common/HashTable.cuh" +#include "velox/experimental/wave/common/tests/BlockTest.h" + +namespace facebook::velox::wave { + +using Mutex = cuda::binary_semaphore; + +inline void __device__ testingLock(int32_t* mtx) { + reinterpret_cast(mtx)->acquire(); +} + +inline void __device__ testingUnlock(int32_t* mtx) { + reinterpret_cast(mtx)->release(); +} + +void __device__ testSumNoSync(TestingRow* rows, HashProbe* probe) { + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t end = base + probe->numRows[blockIdx.x]; + + for (auto i = base + threadIdx.x; i < end; i += blockDim.x) { + auto* row = &rows[indices[i]]; + row->count += deltas[i]; + } +} + +void __device__ testSumPart( + TestingRow* rows, + int32_t numParts, + HashProbe* probe, + int32_t* part, + int32_t* partEnd, + int32_t numGroups, + int32_t groupStride) { + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + for (auto groupIdx = 0; groupIdx < numGroups; ++groupIdx) { + auto groupStart = groupIdx * groupStride; + int32_t linear = threadIdx.x + blockIdx.x * blockDim.x; + if (linear > numParts) { + break; + } + int32_t begin = linear == 0 ? groupStart + : groupStart + partEnd[groupStart + linear - 1]; + int32_t end = groupStart + partEnd[groupStart + linear]; + + for (auto i = begin; i < end; ++i) { + auto index = groupStart + part[i]; + auto* row = &rows[indices[index]]; + row->count += deltas[index]; + } + } + __syncthreads(); +} + +void __device__ testSumMtx(TestingRow* rows, HashProbe* probe) { + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t end = base + probe->numRows[blockIdx.x]; + + for (auto i = base + threadIdx.x; i < end; i += blockDim.x) { + auto* row = &rows[indices[i]]; + testingLock(&row->flags); + row->count += deltas[i]; + testingUnlock(&row->flags); + } +} + +void __device__ testSumAtomic(TestingRow* rows, HashProbe* probe) { + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t end = base + probe->numRows[blockIdx.x]; + + for (auto i = base + threadIdx.x; i < end; i += blockDim.x) { + auto* row = &rows[indices[i]]; + atomicAdd((unsigned long long*)&row->count, (unsigned long long)deltas[i]); + } +} + +void __device__ testSumAtomicCoalesce(TestingRow* rows, HashProbe* probe) { + constexpr int32_t kWarpThreads = 32; + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t lane = cub::LaneId(); + int32_t end = base + probe->numRows[blockIdx.x]; + + for (auto count = base; count < end; count += blockDim.x) { + auto i = threadIdx.x + count; + + if (i < end) { + uint32_t laneMask = count + kWarpThreads <= end + ? 
0xffffffff + : lowMask(end - count); + auto index = indices[i]; + auto delta = deltas[i]; + uint32_t allPeers = __match_any_sync(laneMask, index); + int32_t leader = __ffs(allPeers) - 1; + auto peers = allPeers; + int64_t total = 0; + auto currentPeer = leader; + for (;;) { + total += __shfl_sync(allPeers, delta, currentPeer); + peers &= peers - 1; + if (peers == 0) { + break; + } + currentPeer = __ffs(peers) - 1; + } + if (lane == leader) { + auto* row = &rows[index]; + atomicAdd((unsigned long long*)&row->count, (unsigned long long)total); + } + } + } +} + +void __device__ testSumExch(TestingRow* rows, HashProbe* probe) { + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t end = base + probe->numRows[blockIdx.x]; + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + + extern __shared__ __align__(16) char smem[]; + ProbeShared* shared = reinterpret_cast(smem); + if (threadIdx.x == 0) { + shared->init(probe, base); + shared->blockEnd = end; + shared->toDo = probe->numRows[blockIdx.x]; + shared->numRounds = 0; + shared->numUpdated = 0; + shared->numTried = 0; + } + __syncthreads(); + for (;;) { + if (shared->blockEnd <= shared->blockBase) { + GPF(); + } + int32_t counter; + for (counter = base; counter < shared->blockEnd; counter += blockDim.x) { + auto i = counter + threadIdx.x; + if (i < shared->blockEnd) { + atomicAdd(&shared->numTried, 1); + if (shared->inputRetries) { + i = shared->inputRetries[i]; + } + auto* row = &rows[indices[i]]; + if (0 == + asDeviceAtomic(&row->flags) + ->exchange(1, cuda::memory_order_consume)) { + atomicAdd( + (unsigned long long*)&row->count, (unsigned long long)deltas[i]); + atomicAdd(&shared->numUpdated, 1); + asDeviceAtomic(&row->flags) + ->store(0, cuda::memory_order_release); + } else { + shared + ->outputRetries[base + atomicAdd(&shared->numKernelRetries, 1)] = + i; + } + } else { + atomicAdd(&shared->numTried, 1 << 16); + } + // __syncthreads(); + } + __syncthreads(); + if (shared->numKernelRetries == 0) { + if ((shared->numTried & 0xffff) != shared->blockEnd - shared->blockBase) { + GPF(); + } + if (shared->done + (shared->blockEnd - shared->blockBase) != + shared->toDo) { + GPF(); + } + // printf("%d %d //%d\n", base, end, counter); + return; + } + + if (threadIdx.x == 0) { + shared->done += + (shared->blockEnd - shared->blockBase) - shared->numKernelRetries; + ++shared->numRounds; + shared->numTried = 0; + shared->blockEnd = base + shared->numKernelRetries; + shared->nextRound(probe); + } + __syncthreads(); + } +} +void __device__ testSumMtxCoalesce(TestingRow* rows, HashProbe* probe) { + constexpr int32_t kWarpThreads = 32; + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t lane = cub::LaneId(); + int32_t end = base + probe->numRows[blockIdx.x]; + + for (auto count = base; count < end; count += blockDim.x) { + auto i = threadIdx.x + count; + + if (i < end) { + uint32_t laneMask = count + kWarpThreads <= end + ? 
0xffffffff + : lowMask(end - count); + auto index = indices[i]; + auto delta = deltas[i]; + uint32_t allPeers = __match_any_sync(laneMask, index); + int32_t leader = __ffs(allPeers) - 1; + auto peers = allPeers; + int64_t total = 0; + auto currentPeer = leader; + for (;;) { + total += __shfl_sync(allPeers, delta, currentPeer); + peers &= peers - 1; + if (peers == 0) { + break; + } + currentPeer = __ffs(peers) - 1; + } + if (lane == leader) { + auto* row = &rows[index]; + testingLock(&row->flags); + row->count += total; + testingUnlock(&row->flags); + } + } + } +} + +void __device__ testSumOrder(TestingRow* rows, HashProbe* probe) { + auto keys = reinterpret_cast(probe->keys); + auto indices = keys[0]; + auto deltas = keys[1]; + int32_t base = probe->numRowsPerThread * blockDim.x * blockIdx.x; + int32_t end = base + probe->numRows[blockIdx.x]; + + for (auto i = base + threadIdx.x; i < end; i += blockDim.x) { + auto* row = &rows[indices[i]]; + int32_t waitNano = 1; + auto d = deltas[i]; + for (;;) { + if (0 == + asDeviceAtomic(&row->flags) + ->exchange(1, cuda::memory_order_consume)) { + row->count += d; + asDeviceAtomic(&row->flags) + ->store(0, cuda::memory_order_release); + break; + } else { + __nanosleep(waitNano); + waitNano += threadIdx.x & 31; + } + } + } +} + +} // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/Filter.h b/velox/experimental/wave/common/tests/Util.h similarity index 72% rename from velox/experimental/wave/exec/Filter.h rename to velox/experimental/wave/common/tests/Util.h index a78502306f07e..5b39a26587378 100644 --- a/velox/experimental/wave/exec/Filter.h +++ b/velox/experimental/wave/common/tests/Util.h @@ -16,20 +16,12 @@ #pragma once -#include "velox/experimental/wave/exec/WaveOperator.h" +#include namespace facebook::velox::wave { -class Filter : public WaveOperator { - public: - Filter(RowTypePtr inputType, exec::ExprSet exprSet); - - bool isStreaming() const override { - return true; - } - - private: - std::vector input_; -}; +inline uint32_t scale32(uint32_t n, uint32_t scale) { + return (static_cast(static_cast(n)) * scale) >> 32; +} } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/dwio/ColumnReader.cpp b/velox/experimental/wave/dwio/ColumnReader.cpp index 7534b853cf586..24162236df549 100644 --- a/velox/experimental/wave/dwio/ColumnReader.cpp +++ b/velox/experimental/wave/dwio/ColumnReader.cpp @@ -28,7 +28,8 @@ void ColumnReader::makeOp( formatData_->newBatch(readOffset_ + offset); op.action = action; op.reader = this; - op.waveVector = readStream->operandVector(operand_, requestedType_); + readStream->setNullable(*operand_, formatData_->hasNulls()); + op.waveVector = readStream->operandVector(operand_->id, requestedType_); op.rows = rows; readOffset_ = offset + rows.back() + 1; }; diff --git a/velox/experimental/wave/dwio/ColumnReader.h b/velox/experimental/wave/dwio/ColumnReader.h index 1297e1aec9fe8..8aeaa9da6f29c 100644 --- a/velox/experimental/wave/dwio/ColumnReader.h +++ b/velox/experimental/wave/dwio/ColumnReader.h @@ -31,13 +31,16 @@ class ColumnReader { ColumnReader( const TypePtr& requestedType, std::shared_ptr fileType, - OperandId operand, + AbstractOperand* operand, FormatParams& params, velox::common::ScanSpec& scanSpec) : requestedType_(requestedType), fileType_(fileType), operand_(operand), - formatData_(params.toFormatData(fileType_, scanSpec, operand)), + formatData_(params.toFormatData( + fileType_, + scanSpec, + operand ? 
            operand->id : kNoOperand)),
        scanSpec_(&scanSpec) {}
 
   virtual ~ColumnReader() = default;
@@ -54,7 +57,7 @@ class ColumnReader {
     return formatData_->totalRows();
   }
 
-  OperandId operand() const {
+  AbstractOperand* operand() const {
     return operand_;
   }
 
@@ -72,7 +75,7 @@ class ColumnReader {
  protected:
   TypePtr requestedType_;
   std::shared_ptr<const dwio::common::TypeWithId> fileType_;
-  const OperandId operand_;
+  AbstractOperand* const operand_;
   std::unique_ptr<FormatData> formatData_;
   // Specification of filters, value extraction, pruning etc. The
   // spec is assigned at construction and the contents may change at
@@ -95,6 +98,10 @@ class ReadStream : public Executable {
       WaveStream& waveStream,
       const OperandSet* firstColumns = nullptr);
 
+  void setNullable(const AbstractOperand& op, bool nullable) {
+    waveStream->setNullable(op, nullable);
+  }
+
   /// Runs a sequence of kernel invocations until all eagerly produced columns
   /// have their last kernel in flight. Transfers ownership of 'readStream' to
   /// its WaveStream.
@@ -115,11 +122,19 @@ class ReadStream : public Executable {
  private:
   /// Makes column dependencies.
   void makeOps();
+  void makeControl();
 
   StructColumnReader* reader_;
+  std::vector<AbstractOperand*> abstractOperands_;
+
+  // Offset from end of previous read.
   int32_t offset_;
+
+  // Row numbers to read starting after skipping 'offset_'.
   RowSet rows_;
   std::vector ops_;
+  // Count of kBlockSize blocks in max top level rows.
+  int32_t numBlocks_{0};
   std::vector<std::unique_ptr<SplitStaging>> staging_;
   SplitStaging* currentStaging_;
@@ -129,6 +144,12 @@
   ResultStaging deviceStaging_;
   // Reusable control block for launching decode kernels.
   DecodePrograms programs_;
+  // If no filters, the starting RowSet directly initializes the BlockStatus'es
+  // at the end of the ReadStream.
+  bool hasFilters_{false};
+  // Sequence number of kernel launch.
+  int32_t nthWave_{0};
+  LaunchControl* control_{nullptr};
 };
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/dwio/FormatData.h b/velox/experimental/wave/dwio/FormatData.h
index 9eb40dc1fe4e4..e2479dec1e9ef 100644
--- a/velox/experimental/wave/dwio/FormatData.h
+++ b/velox/experimental/wave/dwio/FormatData.h
@@ -64,6 +64,9 @@ class SplitStaging {
         id, reinterpret_cast(reinterpret_cast(pointer)));
   }
 
+  int64_t bytesToDevice() const {
+    return fill_;
+  }
+
   // Starts the transfers registered with add() on 'stream'.
void transfer(WaveStream& waveStream, Stream& stream); diff --git a/velox/experimental/wave/dwio/ReadStream.cpp b/velox/experimental/wave/dwio/ReadStream.cpp index 891fb0a653d78..18789ff1f1d10 100644 --- a/velox/experimental/wave/dwio/ReadStream.cpp +++ b/velox/experimental/wave/dwio/ReadStream.cpp @@ -18,13 +18,21 @@ #include "velox/experimental/wave/dwio/StructColumnReader.h" namespace facebook::velox::wave { -void allOperands(const ColumnReader* reader, OperandSet& operands) { + +void allOperands( + const ColumnReader* reader, + OperandSet& operands, + std::vector* abstractOperands) { auto op = reader->operand(); - if (op != kNoOperand) { - operands.add(op); + if (op != nullptr) { + operands.add(op->id); + if (abstractOperands) { + abstractOperands->push_back(op); + } } + for (auto& child : reader->children()) { - allOperands(child, operands); + allOperands(child, operands, abstractOperands); } } @@ -36,7 +44,7 @@ ReadStream::ReadStream( const OperandSet* firstColumns) : Executable(), offset_(offset), rows_(rows) { waveStream = &_waveStream; - allOperands(columnReader, outputOperands); + allOperands(columnReader, outputOperands, &abstractOperands_); output.resize(outputOperands.size()); reader_ = columnReader; staging_.push_back(std::make_unique()); @@ -83,6 +91,16 @@ bool ReadStream::makePrograms(bool& needSync) { allDone = false; } } + if (!hasFilters_ && allDone) { + auto setCount = std::make_unique(); + setCount->step = DecodeStep::kRowCountNoFilter; + setCount->data.rowCountNoFilter.numRows = rows_.size(); + setCount->data.rowCountNoFilter.status = + control_->deviceData->as(); + programs_.programs.emplace_back(); + programs_.programs.back().push_back(std::move(setCount)); + } + ++nthWave_; resultStaging_.setReturnBuffer(waveStream->arena(), programs_); return allDone; } @@ -90,38 +108,70 @@ bool ReadStream::makePrograms(bool& needSync) { // static void ReadStream::launch(std::unique_ptr&& readStream) { using UniqueExe = std::unique_ptr; - readStream->waveStream->installExecutables( + // The function of control here is to have a status and row count for each + // kBlockSize top level rows of output and to have Operand structs for the + // produced column. 
+ readStream->makeControl(); + auto numRows = readStream->rows_.size(); + auto waveStream = readStream->waveStream; + WaveStats& stats = waveStream->stats(); + waveStream->installExecutables( folly::Range(reinterpret_cast(&readStream), 1), [&](Stream* stream, folly::Range exes) { auto* readStream = reinterpret_cast(exes[0]); bool needSync = false; for (;;) { bool done = readStream->makePrograms(needSync); - readStream->currentStaging_->transfer( - *readStream->waveStream, *stream); + stats.bytesToDevice += readStream->currentStaging_->bytesToDevice(); + ++stats.numKernels; + stats.numPrograms += readStream->programs_.programs.size(); + stats.numThreads += readStream->programs_.programs.size() * + std::min(readStream->rows_.size(), kBlockSize); + readStream->currentStaging_->transfer(*waveStream, *stream); if (done) { break; } WaveBufferPtr extra; launchDecode( - readStream->programs(), - &readStream->waveStream->arena(), - extra, - stream); + readStream->programs(), &waveStream->arena(), extra, stream); readStream->staging_.push_back(std::make_unique()); readStream->currentStaging_ = readStream->staging_.back().get(); if (needSync) { + waveStream->setState(WaveStream::State::kWait); stream->wait(); + readStream->waveStream->setState(WaveStream::State::kHost); + } else { + readStream->waveStream->setState(WaveStream::State::kParallel); } } + WaveBufferPtr extra; launchDecode( readStream->programs(), &readStream->waveStream->arena(), extra, stream); + readStream->waveStream->setState(WaveStream::State::kParallel); readStream->waveStream->markLaunch(*stream, *readStream); }); } +void ReadStream::makeControl() { + auto numRows = rows_.size(); + numBlocks_ = bits::roundUp(numRows, kBlockSize) / kBlockSize; + waveStream->setNumRows(numRows); + WaveStream::ExeLaunchInfo info; + waveStream->exeLaunchInfo(*this, numBlocks_, info); + auto statusBytes = sizeof(BlockStatus) * numBlocks_; + auto deviceBytes = statusBytes + info.totalBytes; + auto control = std::make_unique(0, numRows); + control->deviceData = waveStream->arena().allocate(deviceBytes); + control->status = control->deviceData->as(); + + operands = waveStream->fillOperands( + *this, control->deviceData->as() + statusBytes, info)[0]; + control_ = control.get(); + waveStream->addLaunchControl(0, std::move(control)); +} + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/dwio/StructColumnReader.h b/velox/experimental/wave/dwio/StructColumnReader.h index d047116ff0bed..15895ac8b3135 100644 --- a/velox/experimental/wave/dwio/StructColumnReader.h +++ b/velox/experimental/wave/dwio/StructColumnReader.h @@ -25,7 +25,7 @@ class StructColumnReader : public ColumnReader { StructColumnReader( const TypePtr& requestedType, std::shared_ptr fileType, - OperandId operand, + AbstractOperand* operand, FormatParams& params, velox::common::ScanSpec& scanSpec, bool isRoot) diff --git a/velox/experimental/wave/dwio/decode/DecodeStep.h b/velox/experimental/wave/dwio/decode/DecodeStep.h index cb6178fa8e486..9251446f838d1 100644 --- a/velox/experimental/wave/dwio/decode/DecodeStep.h +++ b/velox/experimental/wave/dwio/decode/DecodeStep.h @@ -54,6 +54,7 @@ enum class DecodeStep { kMap, kFlatMap, kFlatMapNode, + kRowCountNoFilter, kUnsupported, }; @@ -192,6 +193,11 @@ struct GpuDecode { int32_t* indicesCount; }; + struct RowCountNoFilter { + int32_t numRows; + BlockStatus* status; + }; + union { Trivial trivial; MainlyConstant mainlyConstant; @@ -201,6 +207,7 @@ struct GpuDecode { RleTotalLength rleTotalLength; Rle rle; MakeScatterIndices 
makeScatterIndices;
+    RowCountNoFilter rowCountNoFilter;
   } data;
 
   /// Returns the amount of shared memory for standard size thread block for
diff --git a/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh b/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh
index 62c23facac153..87f43b6f1e284 100644
--- a/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh
+++ b/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh
@@ -507,6 +507,24 @@ __device__ void makeScatterIndices(GpuDecode::MakeScatterIndices& op) {
     *op.indicesCount = indicesCount;
   }
 }
+
+template <int32_t kBlockSize>
+__device__ void setRowCountNoFilter(GpuDecode::RowCountNoFilter& op) {
+  auto numRows = op.numRows;
+  auto* status = op.status;
+  auto numCounts = roundUp(numRows, kBlockSize) / kBlockSize;
+  for (auto base = 0; base < numCounts; base += kBlockSize) {
+    auto idx = threadIdx.x + base;
+    if (idx < numCounts) {
+      // Every thread writes a row count and errors for kBlockSize rows. All
+      // errors are cleared and all row counts except the last are kBlockSize.
+      status[idx].numRows =
+          idx < numCounts - 1 ? kBlockSize : numRows - idx * kBlockSize;
+      memset(&status[idx].errors, 0, sizeof(status->errors));
+    }
+  }
+}
+
 template <int32_t kBlockSize>
 __device__ void decodeSwitch(GpuDecode& op) {
   switch (op.step) {
@@ -534,6 +552,9 @@ __device__ void decodeSwitch(GpuDecode& op) {
     case DecodeStep::kMakeScatterIndices:
       detail::makeScatterIndices<kBlockSize>(op.data.makeScatterIndices);
       break;
+    case DecodeStep::kRowCountNoFilter:
+      detail::setRowCountNoFilter<kBlockSize>(op.data.rowCountNoFilter);
+      break;
     default:
       if (threadIdx.x == 0) {
         printf("ERROR: Unsupported DecodeStep (with shared memory)\n");
@@ -554,6 +575,7 @@ int32_t sharedMemorySizeForDecode(DecodeStep step) {
     case DecodeStep::kTrivial:
     case DecodeStep::kDictionaryOnBitpack:
     case DecodeStep::kSparseBool:
+    case DecodeStep::kRowCountNoFilter:
       return 0;
       break;
diff --git a/velox/experimental/wave/exec/AggregateFunction.h b/velox/experimental/wave/exec/AggregateFunction.h
index 1d346a885f16a..e73e30be956f5 100644
--- a/velox/experimental/wave/exec/AggregateFunction.h
+++ b/velox/experimental/wave/exec/AggregateFunction.h
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include "velox/experimental/wave/exec/ErrorCode.h"
 #include "velox/experimental/wave/vector/Operand.h"
 
 namespace facebook::velox::wave::aggregation {
diff --git a/velox/experimental/wave/exec/Aggregation.cpp b/velox/experimental/wave/exec/Aggregation.cpp
index 27aae3b976e97..c26881505a12d 100644
--- a/velox/experimental/wave/exec/Aggregation.cpp
+++ b/velox/experimental/wave/exec/Aggregation.cpp
@@ -279,7 +279,7 @@ void Aggregation::flush(bool noMoreInput) {
   flushDone_.record(*flushStream_);
 }
 
-int32_t Aggregation::canAdvance() {
+int32_t Aggregation::canAdvance(WaveStream& stream) {
   if (!noMoreInput_ || finished_) {
     return 0;
   }
@@ -299,12 +299,21 @@ void Aggregation::schedule(WaveStream& waveStream, int32_t maxRows) {
       numColumns, exec->deviceData.emplace_back());
   auto* instructions = arena_->allocate(
       numColumns, exec->deviceData.emplace_back());
+  auto numBlocks = bits::roundUp(maxRows, kBlockSize) / kBlockSize;
+  auto* rowStatus =
+      arena_->allocate<BlockStatus>(numBlocks, exec->deviceData.emplace_back());
+  bzero(rowStatus, numBlocks * sizeof(BlockStatus));
+  for (auto i = 0; i < numBlocks; ++i) {
+    rowStatus[i].numRows =
+        i == numBlocks - 1 ?
maxRows - kBlockSize * i : kBlockSize; + } auto* status = arena_->allocate( numColumns, exec->deviceData.emplace_back()); bzero(status, numColumns * sizeof(BlockStatus)); exec->operands = arena_->allocate(numColumns, exec->deviceData.emplace_back()); exec->outputOperands = outputIds_; + exec->firstOutputOperandIdx = 0; for (int i = 0; i < numColumns; ++i) { auto column = WaveVector::create(outputType_->childAt(i), *arena_); column->resize(maxRows, false); @@ -333,6 +342,9 @@ void Aggregation::schedule(WaveStream& waveStream, int32_t maxRows) { int sharedSize = std::max( aggregation::ExtractKeys::sharedSize(), aggregation::ExtractValues::sharedSize()); + auto control = std::make_unique(id_, maxRows); + control->status = rowStatus; + waveStream.addLaunchControl(id_, std::move(control)); aggregation::call( *stream, numColumns, programs, nullptr, status, sharedSize); waveStream.markLaunch(*stream, *exes[0]); diff --git a/velox/experimental/wave/exec/Aggregation.h b/velox/experimental/wave/exec/Aggregation.h index bd208d7e745bc..41cd357532dc8 100644 --- a/velox/experimental/wave/exec/Aggregation.h +++ b/velox/experimental/wave/exec/Aggregation.h @@ -43,7 +43,7 @@ class Aggregation : public WaveOperator { void flush(bool noMoreInput) override; - int32_t canAdvance() override; + int32_t canAdvance(WaveStream& stream) override; void schedule(WaveStream& stream, int32_t maxRows) override; diff --git a/velox/experimental/wave/exec/AggregationInstructions.cu b/velox/experimental/wave/exec/AggregationInstructions.cu index ac2a341d8f8bb..1827dcb717e44 100644 --- a/velox/experimental/wave/exec/AggregationInstructions.cu +++ b/velox/experimental/wave/exec/AggregationInstructions.cu @@ -91,7 +91,7 @@ normalize(BlockInfo* block, void* idMap, Operand* key, int32_t& result) { auto* typedIdMap = reinterpret_cast*>(idMap); auto id = typedIdMap->makeId(value(key, block->base, block->shared)); if (id == -1) { - return ErrorCode::kInsuffcientMemory; + return ErrorCode::kInsufficientMemory; } assert(typedIdMap->cardinality() <= kNormalizationRadix); result = kNormalizationRadix * result + id - 1; diff --git a/velox/experimental/wave/exec/ErrorCode.h b/velox/experimental/wave/exec/ErrorCode.h index 3818c4d6fb925..d24d04713db62 100644 --- a/velox/experimental/wave/exec/ErrorCode.h +++ b/velox/experimental/wave/exec/ErrorCode.h @@ -18,23 +18,4 @@ #include "velox/experimental/wave/vector/Operand.h" -namespace facebook::velox::wave { - -/// -enum class ErrorCode : uint8_t { - // All operations completed. - kOk = 0, - - // Catchall for runtime errors. - kError, - - kInsuffcientMemory, -}; - -/// Contains a count of active lanes and a per lane error code. 
-struct BlockStatus {
-  int32_t numRows{0};
-  ErrorCode errors[kBlockSize];
-};
-
-} // namespace facebook::velox::wave
+namespace facebook::velox::wave {} // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/exec/ExprKernel.cu b/velox/experimental/wave/exec/ExprKernel.cu
index 825b652179173..9837b79c7c9a2 100644
--- a/velox/experimental/wave/exec/ExprKernel.cu
+++ b/velox/experimental/wave/exec/ExprKernel.cu
@@ -16,10 +16,13 @@
 
 #include "velox/experimental/wave/exec/ExprKernel.h"
 
+#include <gflags/gflags.h>
 #include "velox/experimental/wave/common/Block.cuh"
 #include "velox/experimental/wave/common/CudaUtil.cuh"
 #include "velox/experimental/wave/exec/WaveCore.cuh"
 
+DEFINE_bool(kernel_gdb, false, "Run kernels sequentially for debugging");
+
 namespace facebook::velox::wave {
 
 template <typename T>
 __device__ inline T opFunc_kPlus(T left, T right) {
@@ -30,7 +33,7 @@
 template <typename T, typename OpFunc>
 __device__ inline void binaryOpKernel(
     OpFunc func,
-    IBinary& op,
+    IBinary& instr,
     Operand** operands,
     int32_t blockBase,
     char* shared,
@@ -38,9 +41,15 @@
   if (threadIdx.x >= status->numRows) {
     return;
   }
-  flatResult(operands, op.result, blockBase, shared) = func(
-      getOperand(operands, op.left, blockBase, shared),
-      getOperand(operands, op.right, blockBase, shared));
+  T left;
+  T right;
+  if (operandOrNull(operands, instr.left, blockBase, shared, left) &&
+      operandOrNull(operands, instr.right, blockBase, shared, right)) {
+    flatResult<T>(
+        operands, instr.result, blockBase, shared) = func(left, right);
+  } else {
+    resultNull(operands, instr.result, blockBase, shared);
+  }
 }
 
 __device__ void filterKernel(
@@ -78,13 +87,45 @@
 }
 
 __device__ void wrapKernel(
-    IWrap& wrap,
+    const IWrap& wrap,
     Operand** operands,
     int32_t blockBase,
-    int32_t& numRows) {}
+    int32_t numRows) {
+  Operand* op = operands[wrap.indices];
+  auto* filterIndices = reinterpret_cast<int32_t*>(op->base);
+  if (filterIndices[blockBase + numRows - 1] == numRows + blockBase - 1) {
+    // There is no cardinality change.
+    return;
+  }
+  bool rowActive = threadIdx.x < numRows;
+  for (auto column = 0; column < wrap.numColumns; ++column) {
+    int32_t newIndex;
+    int32_t** opIndices;
+    bool remap = false;
+    if (rowActive) {
+      auto opIndex = wrap.columns[column];
+      auto* op = operands[opIndex];
+      opIndices = &op->indices[blockBase / kBlockSize];
+      remap = *opIndices != nullptr;
+      if (remap) {
+        newIndex =
+            (*opIndices)[filterIndices[blockBase + threadIdx.x] - blockBase];
+      } else if (threadIdx.x == 0) {
+        *opIndices = filterIndices + blockBase;
+      }
+    }
+    // All threads hit this.
+    __syncthreads();
+    if (remap) {
+      // remap can be true only on active rows.
+ (*opIndices)[threadIdx.x] = newIndex; + } + } + __syncthreads(); +} #define BINARY_TYPES(opCode, OP) \ - case OP_MIX(opCode, ScalarType::kInt64): \ + case OP_MIX(opCode, WaveTypeKind::BIGINT): \ binaryOpKernel( \ [](auto left, auto right) { return left OP right; }, \ instruction->_.binary, \ @@ -108,9 +149,11 @@ __global__ void waveBaseKernel( auto* operands = programOperands[programIndex]; auto* status = &blockStatusArray[blockIdx.x - baseIndices[blockIdx.x]]; int32_t blockBase = (blockIdx.x - baseIndices[blockIdx.x]) * blockDim.x; - for (auto i = 0; i < program->numInstructions; ++i) { - auto instruction = program->instructions[i]; + auto instruction = program->instructions; + for (;;) { switch (instruction->opCode) { + case OpCode::kReturn: + return; case OpCode::kFilter: filterKernel( instruction->_.filter, @@ -125,7 +168,20 @@ __global__ void waveBaseKernel( break; BINARY_TYPES(OpCode::kPlus, +); + BINARY_TYPES(OpCode::kLT, <); } + ++instruction; + } +} + +int32_t instructionSharedMemory(const Instruction& instruction) { + using ScanAlgorithm = cub::BlockScan; + + switch (instruction.opCode) { + case OpCode::kFilter: + return sizeof(ScanAlgorithm::TempStorage); + default: + return 0; } } @@ -144,6 +200,9 @@ void WaveKernelStream::call( sharedSize, alias ? alias->stream()->stream : stream()->stream>>>( bases, programIdx, programs, operands, status); + if (FLAGS_kernel_gdb) { + (alias ? alias : this)->wait(); + } } } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/ExprKernel.h b/velox/experimental/wave/exec/ExprKernel.h index ec4a2b2747669..40432ebea86fc 100644 --- a/velox/experimental/wave/exec/ExprKernel.h +++ b/velox/experimental/wave/exec/ExprKernel.h @@ -29,16 +29,6 @@ /// be allocated dynamically at kernel invocation. namespace facebook::velox::wave { -/// Mixed with opcode to switch between instantiations of instructions for -/// different types. -enum class ScalarType { - kInt32, - kInt64, - kReal, - kDouble, - kString, -}; - /// Opcodes for common instruction set. First all instructions that /// do not have operand type variants, then all the ones that /// do. For type templated instructions, the case label is opcode * @@ -47,6 +37,9 @@ enum class OpCode { // First all OpCodes that have no operand type specialization. kFilter = 0, kWrap, + kLiteral, + kNegate, + kReturn, // From here, only OpCodes that have variants for scalar types. kPlus, @@ -61,9 +54,12 @@ enum class OpCode { kNE, }; +constexpr int32_t kLastScalarKind = static_cast(WaveTypeKind::HUGEINT); -#define OP_MIX(op, t) \ - static_cast(static_cast(t) + 8 * static_cast(op)) +#define OP_MIX(op, t) \ + static_cast( \ + static_cast(t) + \ + (kLastScalarKind + 1) * static_cast(op)) struct IBinary { OperandIndex left; @@ -71,9 +67,6 @@ struct IBinary { OperandIndex result; // If set, apply operation to lanes where there is a non-zero byte in this. OperandIndex predicate{kEmpty}; - // If true, inverts the meaning of 'predicate', so that the operation is - // perfformed on lanes with a zero byte bit. Xored with predicate[idx]. - uint8_t invert{0}; }; struct IFilter { @@ -84,39 +77,33 @@ struct IFilter { struct IWrap { // The indices to wrap on top of 'columns'. OperandIndex indices; - - // Number of items in 'columns', 'targetColumns', 'nuwIndices', - // 'mayShareIndices'. int32_t numColumns; // The columns to wrap. OperandIndex* columns; - // The post wrap columns. 
If the original is not wrapped, these - // have the base of original and indices to wrap and posssibly new - // nulls from 'newNulls'. If the original is wrapped and - // newIndices[i] is non-nullptr, the combined indices from the - // existing wrap and 'indices are stored in - // 'newIndices'. 'newIndices[i]' is the indices of - // targetColumn[i]. If 'newIndices[i]' is nullptr, the new indices - // overwrite the indices in 'column[i]' and the indices are - // referenced from targetColunns[i]'. - OperandIndex* targetColumns; - - OperandIndex* newIndices; - - // If mayShareIndices[i]' is an index of a previous entry in 'columns' and - // columns[mayshareIndices[i]] shares indices of columns[i], then - // targetColumns[i] has indices of targetColumn[mayShareIndices[i]]. If the - // wrappings were not the same, indices are obtained from newIndices[i]. - int32_t* mayShareIndices; }; +struct ILiteral { + OperandIndex literal; + OperandIndex result; + OperandIndex predicate; +}; + +struct INegate { + OperandIndex value; + OperandIndex result; + OperandIndex predicate; +}; +struct IReturn {}; + struct Instruction { OpCode opCode; union { IBinary binary; IFilter filter; IWrap wrap; + ILiteral literal; + INegate negate; } _; }; @@ -125,10 +112,13 @@ struct ThreadBlockProgram { // across the ThreadBlockPrograms. int32_t sharedMemorySize{0}; int32_t numInstructions; - - Instruction** instructions; + // Array of instructions. Ends in a kReturn. + Instruction* instructions; }; +/// Returns the shared memory size for instruction for kBlockSize. +int32_t instructionSharedMemory(const Instruction& instruction); + /// A stream for invoking ExprKernel. class WaveKernelStream : public Stream { public: diff --git a/velox/experimental/wave/exec/Instruction.h b/velox/experimental/wave/exec/Instruction.h index 18616132133c6..cb6b5653507a9 100644 --- a/velox/experimental/wave/exec/Instruction.h +++ b/velox/experimental/wave/exec/Instruction.h @@ -24,7 +24,17 @@ namespace facebook::velox::wave { /// Abstract representation of Wave instructions. These translate to a device /// side ThreadBlockProgram right before execution. +template +T addBytes(U* p, int32_t bytes) { + return reinterpret_cast(reinterpret_cast(p) + bytes); +} + +/// Represents an input/output of an instruction or WaveOperator on host. The +/// device-side Operator is made at launch time based on this. struct AbstractOperand { + static constexpr int32_t kNoConstant = ~0; + static constexpr int32_t kNoWrap = ~0; + AbstractOperand(int32_t id, const TypePtr& type, std::string label) : id(id), type(type), label(label) {} @@ -39,42 +49,108 @@ struct AbstractOperand { // Label for debugging, e.g. column name or Expr::toString output. std::string label; + // The Operand of this is nullable if the Operand at some nullableIf_ is + // nullable. + std::vector nullableIf; + // Vector with constant value, else nullptr. VectorPtr constant; // True if bits in nulls or boolean values are as a bit field. Need widening // to byte on device. bool flagsAsBits{false}; + + // Offset of the literal from the block of literals after the instructions. + // The base array in Operand will be set to 'constantOffset + end of last + // instruction'. + int32_t literalOffset{kNoConstant}; + // true if null literal. + bool literalNull{false}; + + // True if the data needs no null flags. Applies to some intermediates like + // selected rows or flags or values of compile-time known non-nulls. 
+ bool notNull{false}; + + // True if nullability depends on the run-time nullability of Operands this + // depends on. These are in 'nullableIf'. + bool conditionalNonNull{false}; + + // if true, nullability is set in WaveStream at the time of launching. Given + // by e.g. file metadata but not set at plan time. + bool sourceNullable{false}; + + // Ordinal of the wrap instruction that first wraps this. All operands wrapped + // by the same wrap share 'Operand.indices'. All Operands that are wrapped at + // some point get indices when first created. When they get wrapped, there is + // one wrap for all Operands with the same 'wrappedAt' + int32_t wrappedAt{kNoWrap}; + + std::string toString() const; }; struct AbstractInstruction { AbstractInstruction(OpCode opCode) : opCode(opCode) {} + virtual ~AbstractInstruction() = default; + template T& as() { return *reinterpret_cast(this); } OpCode opCode; + + virtual std::string toString() const { + return fmt::format("OpCode {}", static_cast(opCode)); + } +}; + +struct AbstractReturn : public AbstractInstruction { + AbstractReturn() : AbstractInstruction(OpCode::kReturn) {} }; struct AbstractFilter : public AbstractInstruction { + AbstractFilter(AbstractOperand* flags, AbstractOperand* indices) + : AbstractInstruction(OpCode::kFilter), flags(flags), indices(indices) {} + AbstractOperand* flags; AbstractOperand* indices; + + std::string toString() const override; }; struct AbstractWrap : public AbstractInstruction { - AbstractOperand indices; + AbstractWrap(AbstractOperand* indices, int32_t id) + : AbstractInstruction(OpCode::kWrap), indices(indices), id(id) {} + AbstractOperand* indices; std::vector source; std::vector target; + const int32_t id; + // Offset of array of affected operand indices in the literals section of the + // TB program. Filled in by first pass of making the TB program. + int32_t literalOffset{-1}; + void addWrap(AbstractOperand* sourceOp, AbstractOperand* targetOp = nullptr) { - if (std::find(source.begin(), source.end(), sourceOp) != source.end()) { - return; + int newWrap = AbstractOperand::kNoWrap; + if (targetOp) { + targetOp->wrappedAt = id; + } else if (sourceOp->wrappedAt == AbstractOperand::kNoWrap) { + sourceOp->wrappedAt = id; + } + + for (auto i = 0; i < source.size(); ++i) { + // If the operand has the same wrap as another one here, do nothing. + if (source[i]->wrappedAt == sourceOp->wrappedAt || + (targetOp && target[i]->wrappedAt == targetOp->wrappedAt)) { + return; + } } source.push_back(sourceOp); target.push_back(targetOp ? 
targetOp : sourceOp); } + + std::string toString() const override; }; struct AbstractBinary : public AbstractInstruction { @@ -82,14 +158,49 @@ struct AbstractBinary : public AbstractInstruction { OpCode opCode, AbstractOperand* left, AbstractOperand* right, - AbstractOperand* result) - : AbstractInstruction(opCode), left(left), right(right), result(result) {} + AbstractOperand* result, + AbstractOperand* predicate = nullptr) + : AbstractInstruction(opCode), + left(left), + right(right), + result(result), + predicate(predicate) {} AbstractOperand* left; AbstractOperand* right; AbstractOperand* result; - AbstractOperand* predicate{nullptr}; - bool invert{false}; + AbstractOperand* predicate; + + std::string toString() const override; +}; + +struct AbstractLiteral : public AbstractInstruction { + AbstractLiteral( + const VectorPtr& constant, + AbstractOperand* result, + AbstractOperand* predicate) + : AbstractInstruction(OpCode::kLiteral), + constant(constant), + result(result), + predicate(predicate) {} + VectorPtr constant; + AbstractOperand* result; + AbstractOperand* predicate; +}; + +struct AbstractUnary : public AbstractInstruction { + AbstractUnary( + OpCode opcode, + AbstractOperand* input, + AbstractOperand* result, + AbstractOperand* predicate = nullptr) + : AbstractInstruction(opcode), + input(input), + result(result), + predicate(predicate) {} + AbstractOperand* input; + AbstractOperand* result; + AbstractOperand* predicate; }; } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/Project.cpp b/velox/experimental/wave/exec/Project.cpp index c9cbd8edfb4e8..ac0cb5aba90dc 100644 --- a/velox/experimental/wave/exec/Project.cpp +++ b/velox/experimental/wave/exec/Project.cpp @@ -21,6 +21,10 @@ namespace facebook::velox::wave { +AbstractWrap* Project::findWrap() const { + return filterWrap_; +} + void Project::schedule(WaveStream& stream, int32_t maxRows) { for (auto& level : levels_) { std::vector> exes(level.size()); @@ -35,7 +39,7 @@ void Project::schedule(WaveStream& stream, int32_t maxRows) { range, [&](Stream* out, folly::Range exes) { auto inputControl = driver_->inputControl(stream, id_); auto control = stream.prepareProgramLaunch( - id_, maxRows, exes, blocksPerExe, false, out); + id_, maxRows, exes, blocksPerExe, inputControl, out); reinterpret_cast(out)->call( out, exes.size() * blocksPerExe, @@ -53,8 +57,10 @@ void Project::finalize(CompileState& state) { for (auto& level : levels_) { for (auto& program : level) { program->prepareForDevice(state.arena()); - for (auto& pair : program->localAndOutput()) { - computedSet_.add(pair.first->id); + for (auto& pair : program->output()) { + if (true /*isProjected(id)*/) { + computedSet_.add(pair.first->id); + } } } } diff --git a/velox/experimental/wave/exec/Project.h b/velox/experimental/wave/exec/Project.h index 2a6137a831666..9e68d1ab5d0ff 100644 --- a/velox/experimental/wave/exec/Project.h +++ b/velox/experimental/wave/exec/Project.h @@ -24,8 +24,13 @@ class Project : public WaveOperator { CompileState& state, RowTypePtr outputType, std::vector operands, - std::vector> levels) - : WaveOperator(state, outputType, ""), levels_(std::move(levels)) {} + std::vector> levels, + AbstractWrap* filterWrap = nullptr) + : WaveOperator(state, outputType, ""), + levels_(std::move(levels)), + filterWrap_(filterWrap) {} + + AbstractWrap* findWrap() const override; bool isStreaming() const override { return true; @@ -38,7 +43,7 @@ class Project : public WaveOperator { void finalize(CompileState& state) override; std::string 
toString() const override {
-    return "Project";
+    return fmt::format("Project {}", WaveOperator::toString());
   }
 
   const OperandSet& syncSet() const override {
@@ -48,6 +53,7 @@ class Project : public WaveOperator {
  private:
   std::vector<std::vector<ProgramPtr>> levels_;
   OperandSet computedSet_;
+  AbstractWrap* filterWrap_{nullptr};
 };
 
 } // namespace facebook::velox::wave
diff --git a/velox/experimental/wave/exec/TableScan.cpp b/velox/experimental/wave/exec/TableScan.cpp
index 3ae6c04509ac5..0c1d78b3e3421 100644
--- a/velox/experimental/wave/exec/TableScan.cpp
+++ b/velox/experimental/wave/exec/TableScan.cpp
@@ -186,6 +186,7 @@ bool TableScan::isFinished() const {
 }
 
 void TableScan::addDynamicFilter(
+    const core::PlanNodeId& producer,
     column_index_t outputChannel,
    const std::shared_ptr<common::Filter>& filter) {
   if (dataSource_) {
diff --git a/velox/experimental/wave/exec/TableScan.h b/velox/experimental/wave/exec/TableScan.h
index 81b925ea8449a..b00d53e3d3a4f 100644
--- a/velox/experimental/wave/exec/TableScan.h
+++ b/velox/experimental/wave/exec/TableScan.h
@@ -25,13 +25,16 @@
 
 namespace facebook::velox::wave {
 
-class TableScan : public WaveOperator {
+class TableScan : public WaveSourceOperator {
  public:
   TableScan(
       CompileState& state,
       int32_t operatorId,
       const core::TableScanNode& tableScanNode)
-      : WaveOperator(state, tableScanNode.outputType(), tableScanNode.id()),
+      : WaveSourceOperator(
+            state,
+            tableScanNode.outputType(),
+            tableScanNode.id()),
         tableHandle_(tableScanNode.tableHandle()),
         columnHandles_(tableScanNode.assignments()),
         driverCtx_(state.driver().driverCtx()),
@@ -47,11 +50,11 @@
     connector_ = connector::getConnector(tableHandle_->connectorId());
   }
 
-  int32_t canAdvance() override {
+  int32_t canAdvance(WaveStream& stream) override {
     if (!dataSource_) {
       return 0;
     }
-    return waveDataSource_->canAdvance();
+    return waveDataSource_->canAdvance(stream);
   }
 
   void schedule(WaveStream& stream, int32_t maxRows = 0) override {
@@ -75,6 +78,7 @@
   }
 
   void addDynamicFilter(
+      const core::PlanNodeId& producer,
      column_index_t outputChannel,
      const std::shared_ptr<common::Filter>& filter) override;
 
diff --git a/velox/experimental/wave/exec/ToWave.cpp b/velox/experimental/wave/exec/ToWave.cpp
index cae8022830a4d..6b9f69bd8d46c 100644
--- a/velox/experimental/wave/exec/ToWave.cpp
+++ b/velox/experimental/wave/exec/ToWave.cpp
@@ -72,21 +72,22 @@ AbstractOperand* CompileState::newOperand(
     const TypePtr& type,
     const std::string& label) {
   operands_.push_back(
-      std::make_unique<AbstractOperand>(operandCounter_++, type, ""));
+      std::make_unique<AbstractOperand>(operandCounter_++, type, label));
   auto op = operands_.back().get();
   return op;
 }
 
-AbstractOperand* CompileState::addIdentityProjections(Value value) {
+AbstractOperand* CompileState::addIdentityProjections(AbstractOperand* source) {
   AbstractOperand* result = nullptr;
-  for (auto i = 0; i < operators_.size(); ++i) {
-    if (auto operand = operators_[i]->defines(value)) {
-      result = operand;
-      continue;
-    }
-    if (!result) {
-      continue;
-    }
+
+  int32_t latest = 0;
+  auto it = operandOperatorIndex_.find(source);
+  VELOX_CHECK(
+      it != operandOperatorIndex_.end(),
+      "The operand being projected through must be defined first");
+  latest = it->second;
+  result = source;
+  for (auto i = latest; i < operators_.size(); ++i) {
     if (auto wrap = operators_[i]->findWrap()) {
       if (operators_[i]->isExpanding()) {
         auto newResult = newOperand(*result);
@@ -102,16 +103,18 @@ AbstractOperand*
CompileState::findCurrentValue(Value value) { auto it = projectedTo_.find(value); + AbstractOperand* source; if (it == projectedTo_.end()) { auto originIt = definedBy_.find(value); if (originIt == definedBy_.end()) { return nullptr; } + source = originIt->second; // The operand is defined earlier, so must get translated through // cardinality changes. Or if it is not defined earlier, it is defined in // the WaveOperator being constructed, in which case,i.e. the operand in // 'definedBy_'. - auto projected = addIdentityProjections(value); + auto projected = addIdentityProjections(source); return projected ? projected : originIt->second; } return it->second; @@ -122,6 +125,9 @@ std::optional binaryOpCode(const Expr& expr) { if (name == "plus") { return OpCode::kPlus; } + if (name == "lt") { + return OpCode::kLT; + } return std::nullopt; } @@ -131,6 +137,17 @@ Program* CompileState::newProgram() { return program.get(); } +Program* CompileState::programOf(AbstractOperand* op, bool create) { + auto it = definedIn_.find(op); + if (it == definedIn_.end()) { + if (!create) { + return nullptr; + } + return newProgram(); + } + return it->second; +} + void CompileState::addInstruction( std::unique_ptr instruction, AbstractOperand* result, @@ -165,6 +182,32 @@ void CompileState::addInstruction( definedIn_[result] = program; } +bool maybeNotNull(const AbstractOperand* op) { + if (!op) { + return true; + } + return op->conditionalNonNull || op->notNull || op->sourceNullable; +} + +void CompileState::addNullableIf( + const AbstractOperand* op, + std::vector& nullableIf) { + for (auto id : op->nullableIf) { + if (std::find(nullableIf.begin(), nullableIf.end(), id) == + nullableIf.end()) { + nullableIf.push_back(id); + } + } +} + +void CompileState::setConditionalNullable(AbstractBinary& binary) { + if (maybeNotNull(binary.left) && maybeNotNull(binary.right)) { + binary.result->conditionalNonNull = true; + addNullableIf(binary.left, binary.result->nullableIf); + addNullableIf(binary.right, binary.result->nullableIf); + } +} + AbstractOperand* CompileState::addExpr(const Expr& expr) { auto value = toValue(expr); auto current = findCurrentValue(value); @@ -175,9 +218,23 @@ AbstractOperand* CompileState::addExpr(const Expr& expr) { if (auto* field = dynamic_cast(&expr)) { VELOX_FAIL("Should have been defined"); } else if (auto* constant = dynamic_cast(&expr)) { - VELOX_UNSUPPORTED("No constants"); + if (predicate_) { + auto result = newOperand(constant->type(), constant->toString()); + currentProgram_->add(std::make_unique( + constant->value(), result, predicate_)); + return result; + } else { + auto op = newOperand(constant->value()->type(), constant->toString()); + op->constant = constant->value(); + if (constant->value()->isNullAt(0)) { + op->literalNull = true; + } else { + op->notNull = true; + } + return op; + } } else if (dynamic_cast(&expr)) { - VELOX_UNSUPPORTED("No special forms"); + VELOX_UNSUPPORTED("No special forms: {}", expr.toString(1)); } auto opCode = binaryOpCode(expr); if (!opCode.has_value()) { @@ -188,6 +245,8 @@ AbstractOperand* CompileState::addExpr(const Expr& expr) { auto rightOp = addExpr(*expr.inputs()[1]); auto instruction = std::make_unique(opCode.value(), leftOp, rightOp, result); + setConditionalNullable(*instruction); + auto leftProgram = definedIn_[leftOp]; auto rightProgram = definedIn_[rightOp]; std::vector sources; @@ -209,6 +268,7 @@ std::vector CompileState::addExprSet( std::vector result; for (auto i = begin; i < end; ++i) { result.push_back(addExpr(*exprs[i])); + 
programOf(result.back())->addLabel(exprs[i]->toString(true)); } return result; } @@ -217,7 +277,7 @@ std::vector> CompileState::makeLevels( int32_t startIndex) { std::vector> levels; folly::F14FastSet toAdd; - for (auto i = 0; i < allPrograms_.size(); ++i) { + for (auto i = startIndex; i < allPrograms_.size(); ++i) { toAdd.insert(allPrograms_[i].get()); } while (!toAdd.empty()) { @@ -254,23 +314,61 @@ int32_t findOutputChannel( VELOX_FAIL("Expr without output channel"); } +void CompileState::addFilter(const Expr& expr, const RowTypePtr& outputType) { + int32_t numPrograms = allPrograms_.size(); + auto condition = addExpr(expr); + auto indices = newOperand(INTEGER(), "indices"); + indices->notNull = true; + auto program = programOf(condition); + program->addLabel(expr.toString(true)); + program->markOutput(indices->id); + program->add(std::make_unique(condition, indices)); + auto wrapUnique = std::make_unique(indices, wrapCounter_++); + auto wrap = wrapUnique.get(); + program->add(std::move(wrapUnique)); + auto levels = makeLevels(numPrograms); + operators_.push_back(std::make_unique( + *this, outputType, std::vector{}, levels, wrap)); +} + void CompileState::addFilterProject( exec::Operator* op, - RowTypePtr outputType, + RowTypePtr& outputType, int32_t& nodeIndex) { auto filterProject = reinterpret_cast(op); + outputType = driverFactory_.planNodes[nodeIndex]->outputType(); auto data = filterProject->exprsAndProjection(); - VELOX_CHECK(!data.hasFilter); + auto& identityProjections = filterProject->identityProjections(); + int32_t firstProjection = 0; + if (data.hasFilter) { + addFilter(*data.exprs->exprs()[0], outputType); + firstProjection = 1; + ++nodeIndex; + outputType = driverFactory_.planNodes[nodeIndex]->outputType(); + } int32_t numPrograms = allPrograms_.size(); - auto operands = addExprSet(*data.exprs, 0, data.exprs->exprs().size()); + auto operands = + addExprSet(*data.exprs, firstProjection, data.exprs->exprs().size()); + std::vector> pairs; for (auto i = 0; i < operands.size(); ++i) { - int32_t channel = findOutputChannel(*data.resultProjections, i); + int32_t channel = + findOutputChannel(*data.resultProjections, i + firstProjection); auto subfield = toSubfield(outputType->nameOf(channel)); - definedBy_[Value(subfield)] = operands[i]; + auto program = programOf(operands[i], false); + if (program) { + program->markOutput(operands[i]->id); + definedIn_[operands[i]] = program; + } + Value value(subfield); + definedBy_[value] = operands[i]; + pairs.push_back(std::make_pair(value, operands[i])); } auto levels = makeLevels(numPrograms); operators_.push_back( std::make_unique(*this, outputType, operands, levels)); + for (auto& [value, operand] : pairs) { + operators_.back()->defined(value, operand); + } } bool CompileState::reserveMemory() { @@ -314,8 +412,6 @@ bool CompileState::addOperator( if (!reserveMemory()) { return false; } - - outputType = driverFactory_.planNodes[nodeIndex]->outputType(); addFilterProject(op, outputType, nodeIndex); } else if (name == "Aggregation") { if (!reserveMemory()) { @@ -346,9 +442,11 @@ bool CompileState::addOperator( bool isProjectedThrough( const std::vector& projectedThrough, - int32_t i) { + int32_t i, + int32_t& inputChannel) { for (auto& projection : projectedThrough) { if (projection.outputChannel == i) { + inputChannel = projection.inputChannel; return true; } } @@ -366,20 +464,52 @@ bool CompileState::compile() { // Make sure operator states are initialized. We will need to inspect some of // them during the transformation. 
driver_.initializeOperators(); + RowTypePtr inputType; for (; operatorIndex < operators.size(); ++operatorIndex) { + int32_t previousNumOperators = operators_.size(); + auto& identity = operators[operatorIndex]->identityProjections(); + // The columns that are projected through are renamed. They may also get an + // indirection after the new operator is placed. + std::vector> identityProjected; + for (auto& projection : identity) { + identityProjected.push_back(std::make_pair( + findCurrentValue( + Value(toSubfield(inputType->nameOf(projection.inputChannel)))), + projection.outputChannel)); + } if (!addOperator(operators[operatorIndex], nodeIndex, outputType)) { break; } ++nodeIndex; - auto& identity = operators[operatorIndex]->identityProjections(); - for (auto i = 0; i < outputType->size(); ++i) { - Value value = Value(toSubfield(outputType->nameOf(i))); - if (isProjectedThrough(identity, i)) { - continue; + for (auto newIndex = previousNumOperators; newIndex < operators_.size(); + ++newIndex) { + for (auto i = 0; i < outputType->size(); ++i) { + auto& name = outputType->nameOf(i); + Value value = Value(toSubfield(name)); + int32_t inputChannel; + if (isProjectedThrough(identity, i, inputChannel)) { + continue; + } + auto operand = operators_[newIndex]->defines(value); + if (!operand && + (operators_[newIndex]->isSource() || + !operators_[newIndex]->isStreaming())) { + operand = operators_[newIndex]->definesSubfield( + *this, outputType->childAt(i), name, newIndex == 0); + } + if (operand) { + operators_[newIndex]->addOutputId(operand->id); + definedBy_[value] = operand; + operandOperatorIndex_[operand] = operators_.size() - 1; + } } - auto operand = operators_.back()->defines(value); - definedBy_[value] = operand; } + for (auto& [op, channel] : identityProjected) { + Value value(toSubfield(outputType->nameOf(channel))); + auto newOp = addIdentityProjections(op); + projectedTo_[value] = newOp; + } + inputType = outputType; } if (operators_.empty()) { return false; diff --git a/velox/experimental/wave/exec/ToWave.h b/velox/experimental/wave/exec/ToWave.h index 814943f85325b..d90ff1639c8f6 100644 --- a/velox/experimental/wave/exec/ToWave.h +++ b/velox/experimental/wave/exec/ToWave.h @@ -53,7 +53,7 @@ class CompileState { Value toValue(const exec::Expr& expr); - AbstractOperand* addIdentityProjections(Value value); + AbstractOperand* addIdentityProjections(AbstractOperand* source); AbstractOperand* findCurrentValue(Value value); AbstractOperand* addExpr(const exec::Expr& expr); @@ -82,9 +82,11 @@ class CompileState { bool addOperator(exec::Operator* op, int32_t& nodeIndex, RowTypePtr& outputType); + void addFilter(const exec::Expr& expr, const RowTypePtr& outputType); + void addFilterProject( exec::Operator* op, - RowTypePtr outputType, + RowTypePtr& outputType, int32_t& nodeIndex); bool reserveMemory(); @@ -101,17 +103,26 @@ class CompileState { const AbstractOperand* result, const std::vector& inputs); + void setConditionalNullable(AbstractBinary& binary); + + void addNullableIf( + const AbstractOperand* op, + std::vector& nullableIf); + + Program* programOf(AbstractOperand* op, bool create = true); + const std::shared_ptr& aggregateFunctionRegistry(); std::unique_ptr arena_; // The operator and output operand where the Value is first defined. - folly::F14FastMap - definedBy_; + DefinesMap definedBy_; // The Operand where Value is available after all projections placed to date. 
-  folly::F14FastMap
-      projectedTo_;
+  DefinesMap projectedTo_;
+
+  // Index of WaveOperator producing the operand.
+  folly::F14FastMap<AbstractOperand*, int32_t> operandOperatorIndex_;
 
   folly::F14FastMap<AbstractOperand*, Program*> definedIn_;
 
@@ -130,8 +141,12 @@
   // The program being generated.
   std::shared_ptr<Program> currentProgram_;
 
+  // Boolean to select the instruction. Set for conditional sections.
+  AbstractOperand* predicate_{nullptr};
+
   // Sequence number for operands.
   int32_t operandCounter_{0};
+  int32_t wrapCounter_{0};
 
   std::shared_ptr aggregateFunctionRegistry_;
 
diff --git a/velox/experimental/wave/exec/Values.cpp b/velox/experimental/wave/exec/Values.cpp
index e78f0a791a2f8..76cfd20d2ca2d 100644
--- a/velox/experimental/wave/exec/Values.cpp
+++ b/velox/experimental/wave/exec/Values.cpp
@@ -21,11 +21,11 @@
 namespace facebook::velox::wave {
 
 Values::Values(CompileState& state, const core::ValuesNode& values)
-    : WaveOperator(state, values.outputType(), values.id()),
+    : WaveSourceOperator(state, values.outputType(), values.id()),
       values_(values.values()),
       roundsLeft_(values.repeatTimes()) {}
 
-int32_t Values::canAdvance() {
+int32_t Values::canAdvance(WaveStream& stream) {
   if (current_ < values_.size()) {
     return values_[current_]->size();
   }
@@ -52,10 +52,16 @@ void Values::schedule(WaveStream& stream, int32_t maxRows) {
   for (auto i = 0; i < subfields_.size(); ++i) {
     sources.push_back(data->childAt(i).get());
   }
+  int32_t counter = 0;
+  outputIds_.forEach([&](auto id) {
+    stream.setNullable(*stream.operandAt(id), sources[counter]->mayHaveNulls());
+    ++counter;
+  });
   folly::Range empty(nullptr, nullptr);
   auto numBlocks = bits::roundUp(data->size(), kBlockSize) / kBlockSize;
+  stream.setNumRows(data->size());
   stream.prepareProgramLaunch(
-      id_, data->size(), empty, numBlocks, true, nullptr);
+      id_, data->size(), empty, numBlocks, nullptr, nullptr);
   vectorsToDevice(
       folly::Range(sources.data(), sources.size()), outputIds_, stream);
 }
diff --git a/velox/experimental/wave/exec/Values.h b/velox/experimental/wave/exec/Values.h
index b7c80229d89b6..880f46a0a0e26 100644
--- a/velox/experimental/wave/exec/Values.h
+++ b/velox/experimental/wave/exec/Values.h
@@ -20,11 +20,11 @@
 
 namespace facebook::velox::wave {
 
-class Values : public WaveOperator {
+class Values : public WaveSourceOperator {
  public:
   Values(CompileState& state, const core::ValuesNode& values);
 
-  int32_t canAdvance() override;
+  int32_t canAdvance(WaveStream& stream) override;
 
   bool isStreaming() const override {
     return true;
diff --git a/velox/experimental/wave/exec/Vectors.cpp b/velox/experimental/wave/exec/Vectors.cpp
index 1a279055d8110..f8947ec58edf7 100644
--- a/velox/experimental/wave/exec/Vectors.cpp
+++ b/velox/experimental/wave/exec/Vectors.cpp
@@ -119,18 +119,8 @@ void vectorsToDevice(
     transferVector(
         source[i], i, transfers, waveVectors, operandVector, arena, bytes);
   }
-  auto operands = arena.allocate(operandVector.size());
-  memcpy(
-      operands->as(),
-      operandVector.data(),
-      operandVector.size() * sizeof(Operand));
-  operandVector.clear();
   Executable::startTransfer(
-      ids,
-      std::move(operands),
-      std::move(waveVectors),
-      std::move(transfers),
-      stream);
+      ids, std::move(waveVectors), std::move(transfers), stream);
 }
 
 // Patches the position 'offset' in 'code' to be a new uninitialized device
diff --git a/velox/experimental/wave/exec/Wave.cpp b/velox/experimental/wave/exec/Wave.cpp
index bc2c6f3ae2807..54d51cb5095f4 100644
--- a/velox/experimental/wave/exec/Wave.cpp
+++ b/velox/experimental/wave/exec/Wave.cpp
@@ -19,6 +19,27 @@ namespace facebook::velox::wave
{ +std::string WaveTime::toString() const { + if (micros < 20) { + return fmt::format("{} ({} clocks)", succinctNanos(micros * 1000), clocks); + } + return succinctNanos(micros * 1000); +} + +void WaveStats::add(const WaveStats& other) { + numWaves += other.numWaves; + numKernels += other.numKernels; + numThreadBlocks += other.numThreadBlocks; + numPrograms += other.numPrograms; + numThreads += other.numThreads; + numSync += other.numSync; + bytesToDevice += other.bytesToDevice; + bytesToHost += other.bytesToHost; + hostOnlyTime += other.hostOnlyTime; + hostParallelTime += other.hostParallelTime; + waitTime += other.waitTime; +} + const SubfieldMap*& threadSubfieldMap() { thread_local const SubfieldMap* subfields; return subfields; @@ -35,25 +56,25 @@ std::string definesToString(const DefinesMap* map) { return out.str(); } -OperandId pathToOperand( +AbstractOperand* pathToOperand( const DefinesMap& map, std::vector>& path) { if (path.empty()) { - return kNoOperand; + return nullptr; } common::Subfield field(std::move(path)); const auto subfieldMap = threadSubfieldMap(); auto it = threadSubfieldMap()->find(field.toString()); if (it == subfieldMap->end()) { - return kNoOperand; + return nullptr; } Value value(it->second.get()); auto valueIt = map.find(value); path = std::move(field.path()); if (valueIt == map.end()) { - return kNoOperand; + return nullptr; } - return valueIt->second->id; + return valueIt->second; } WaveVector* Executable::operandVector(OperandId id) { @@ -107,6 +128,31 @@ WaveStream::~WaveStream() { } } +void WaveStream::setState(WaveStream::State state) { + if (state == state_) { + return; + } + WaveTime nowTime = WaveTime::now(); + switch (state_) { + case State::kNotRunning: + break; + case State::kHost: + stats_.hostOnlyTime += nowTime - start_; + break; + case State::kParallel: + stats_.hostParallelTime += nowTime - start_; + break; + case State::kWait: + stats_.waitTime += nowTime - start_; + break; + } + start_ = nowTime; + state_ = state; + if (state_ == State::kWait) { + ++stats_.numSync; + } +} + std::mutex WaveStream::reserveMutex_; std::vector> WaveStream::streamsForReuse_; std::vector> WaveStream::eventsForReuse_; @@ -175,6 +221,35 @@ void WaveStream::releaseEvent(std::unique_ptr&& event) { eventsForReuse_.push_back(std::move(event)); } +void WaveStream::markHostOutputOperand(const AbstractOperand& op) { + hostOutputOperands_.add(op.id); + auto nullable = isNullable(op); + auto alignment = WaveVector::alignment(op.type); + hostReturnSize_ = bits::roundUp(hostReturnSize_, alignment); + hostReturnSize_ += WaveVector::backingSize(op.type, numRows_, nullable); +} + +void WaveStream::setReturnData(bool needStatus) { + if (!needStatus && hostReturnSize_ == 0) { + return; + } +} + +void WaveStream::resultToHost() { + if (streams_.size() == 1) { + if (hostReturnDataUsed_ > 0) { + streams_[0]->deviceToHostAsync( + hostReturnData_->as(), + deviceReturnData_->as(), + hostReturnDataUsed_); + } + hostReturnEvent_ = newEvent(); + hostReturnEvent_->record(*streams_[0]); + } else { + VELOX_NYI(); + } +} + namespace { // Copies from pageable host to unified address. Multithreaded memcpy is // probably best. 
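Aside: WaveStream::setState above implements a simple exclusive-state time accounting: on every transition, the wall time elapsed since the previous transition is charged to the state being left, and each entry into kWait counts as one synchronization. A self-contained sketch of the same pattern follows; it is an illustration only, not part of the diff, and the type names and std::chrono clock are stand-ins for the Wave-specific WaveTime machinery.

    #include <chrono>
    #include <cstdint>

    enum class State { kNotRunning, kHost, kParallel, kWait };

    struct Stats {
      int64_t hostOnlyNanos{0};
      int64_t hostParallelNanos{0};
      int64_t waitNanos{0};
      int64_t numSync{0};
    };

    class StateClock {
     public:
      void setState(State state) {
        if (state == state_) {
          return;
        }
        auto now = std::chrono::steady_clock::now();
        auto elapsed =
            std::chrono::duration_cast<std::chrono::nanoseconds>(now - start_)
                .count();
        // Charge the elapsed interval to the state being left.
        switch (state_) {
          case State::kNotRunning:
            break; // Nothing to attribute before the first transition.
          case State::kHost:
            stats_.hostOnlyNanos += elapsed;
            break;
          case State::kParallel:
            stats_.hostParallelNanos += elapsed;
            break;
          case State::kWait:
            stats_.waitNanos += elapsed;
            break;
        }
        start_ = now;
        state_ = state;
        if (state_ == State::kWait) {
          ++stats_.numSync; // Each entry into kWait counts as one sync.
        }
      }

     private:
      State state_{State::kNotRunning};
      Stats stats_;
      std::chrono::steady_clock::time_point start_{
          std::chrono::steady_clock::now()};
    };

Because the states are mutually exclusive, the three buckets plus untracked kNotRunning time always sum to the stream's wall time, which is what makes the hostOnlyTime/hostParallelTime/waitTime stats directly comparable.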
@@ -188,17 +263,20 @@ void copyData(std::vector& transfers) { void Executable::startTransfer( OperandSet outputOperands, - WaveBufferPtr&& operands, std::vector&& outputVectors, std::vector&& transfers, WaveStream& waveStream) { auto exe = std::make_unique(); + auto numBlocks = bits::roundUp(waveStream.numRows(), kBlockSize) / kBlockSize; + exe->waveStream = &waveStream; exe->outputOperands = outputOperands; + WaveStream::ExeLaunchInfo info; + waveStream.exeLaunchInfo(*exe, numBlocks, info); exe->output = std::move(outputVectors); exe->transfers = std::move(transfers); - exe->deviceData.push_back(operands); - exe->operands = operands->as(); - exe->outputOperands = outputOperands; + exe->deviceData.push_back(waveStream.arena().allocate(info.totalBytes)); + auto start = exe->deviceData[0]->as(); + exe->operands = waveStream.fillOperands(*exe, start, info)[0]; copyData(exe->transfers); auto* device = waveStream.device(); waveStream.installExecutables( @@ -206,6 +284,7 @@ void Executable::startTransfer( [&](Stream* stream, folly::Range executables) { for (auto& transfer : executables[0]->transfers) { stream->prefetch(device, transfer.to, transfer.size); + waveStream.stats().bytesToDevice += transfer.size; } waveStream.markLaunch(*stream, *executables[0]); }); @@ -220,9 +299,11 @@ void WaveStream::installExecutables( OperandSetHasher, OperandSetComparer> dependences; + VELOX_CHECK_NULL(hostReturnEvent_); for (auto& exeUnique : executables) { executables_.push_back(std::move(exeUnique)); auto exe = executables_.back().get(); + exe->waveStream = this; VELOX_CHECK(exe->stream == nullptr); OperandSet streamSet; exe->inputOperands.forEach([&](int32_t id) { @@ -243,13 +324,17 @@ void WaveStream::installExecutables( } // exes with no dependences go on a new stream. Streams with dependent compute - // get an event. The dependent computes ggo on new streams that first wait for + // get an event. The dependent computes go on new streams that first wait for // the events. folly::F14FastMap streamEvents; for (auto& [ids, exeVector] : dependences) { folly::Range exes(exeVector.data(), exeVector.size()); std::vector required; ids.forEach([&](int32_t id) { required.push_back(streams_[id].get()); }); + if (required.size() == 1) { + launch(required[0], exes); + continue; + } if (required.empty()) { auto stream = newStream(); launch(stream, exes); @@ -275,9 +360,12 @@ bool WaveStream::isArrived( int32_t sleepMicro, int32_t timeoutMicro) { OperandSet waitSet; + if (hostReturnEvent_) { + return hostReturnEvent_->query(); + } ids.forEach([&](int32_t id) { auto exe = operandToExecutable_[id]; - VELOX_CHECK_NOT_NULL(exe); + VELOX_CHECK_NOT_NULL(exe, "No exe produces operand {} in stream", id); if (!exe->stream) { return; } @@ -315,9 +403,161 @@ bool WaveStream::isArrived( return false; } -template -T addBytes(U* p, int32_t bytes) { - return reinterpret_cast(reinterpret_cast(p) + bytes); +void WaveStream::ensureVector( + const AbstractOperand& op, + WaveVectorPtr& vector, + int32_t numRows) { + if (!vector) { + vector = std::make_unique(op.type, arena()); + } + bool nullable = isNullable(op); + if (false /*hostOutputOperands_.contains(op.id)*/) { + VELOX_NYI(); + } else { + vector->resize(numRows < 0 ? 
numRows_ : numRows, nullable); + } +} + +bool WaveStream::isNullable(const AbstractOperand& op) const { + bool notNull = op.notNull; + if (!notNull) { + if (op.sourceNullable) { + notNull = !operandNullable_[op.id]; + } else { + notNull = true; + for (auto i : op.nullableIf) { + if (operandNullable_[i]) { + notNull = false; + break; + } + } + } + } + return !notNull; +} + +void WaveStream::exeLaunchInfo( + Executable& exe, + int32_t numBlocks, + ExeLaunchInfo& info) { + // The exe has an Operand* for each input/local/output/literal + // op. It has an Operand for each local/output/literal op. It has + // an array of numBlock int32_t*'s for every distinct wrapAt in + // its local/output operands where the wrapAt does not occur in + // any of the input Operands. + info.numBlocks = numBlocks; + info.numInput = exe.inputOperands.size(); + exe.inputOperands.forEach([&](auto id) { + auto op = operandAt(id); + auto* inputExe = operandExecutable(op->id); + if (op->wrappedAt != AbstractOperand::kNoWrap) { + auto* indices = inputExe->wraps[op->wrappedAt]; + VELOX_CHECK_NOT_NULL(indices); + info.inputWrap[op->wrappedAt] = indices; + } + }); + + exe.localOperands.forEach([&](auto id) { + auto op = operandAt(id); + if (op->wrappedAt != AbstractOperand::kNoWrap) { + if (info.inputWrap.find(id) == info.inputWrap.end()) { + if (info.localWrap.find(op->wrappedAt) == info.localWrap.end()) { + info.localWrap[op->wrappedAt] = reinterpret_cast( + info.localWrap.size() * numBlocks * sizeof(void*)); + } + } + } + }); + exe.outputOperands.forEach([&](auto id) { + auto op = operandAt(id); + if (op->wrappedAt != AbstractOperand::kNoWrap) { + if (info.inputWrap.find(id) == info.inputWrap.end()) { + if (info.localWrap.find(op->wrappedAt) == info.localWrap.end()) { + info.localWrap[op->wrappedAt] = reinterpret_cast( + info.localWrap.size() * numBlocks * sizeof(void*)); + } + } + } + }); + auto numLiteral = exe.literals ? exe.literals->size() : 0; + info.numLocalOps = + exe.localOperands.size() + exe.outputOperands.size() + numLiteral; + info.totalBytes = + // Pointer to Operand for input and local Operands. + sizeof(void*) * (info.numLocalOps + exe.inputOperands.size()) + + // Flat array of Operand for all but input. + sizeof(Operand) * info.numLocalOps + + // Space for the 'indices' for each distinct wrappedAt. 
+ (info.localWrap.size() * numBlocks * sizeof(void*)); +} + +Operand** +WaveStream::fillOperands(Executable& exe, char* start, ExeLaunchInfo& info) { + Operand** operandPtrBegin = addBytes(start, 0); + exe.inputOperands.forEach([&](int32_t id) { + auto* inputExe = operandToExecutable_[id]; + int32_t ordinal = inputExe->outputOperands.ordinal(id); + *operandPtrBegin = + &inputExe->operands[inputExe->firstOutputOperandIdx + ordinal]; + ++operandPtrBegin; + }); + Operand* operandBegin = addBytes( + start, (info.numInput + info.numLocalOps) * sizeof(void*)); + int32_t* indicesBegin = + addBytes(operandBegin, info.numLocalOps * sizeof(Operand)); + for (auto& [id, ptr] : info.localWrap) { + info.localWrap[id] = + addBytes(indicesBegin, reinterpret_cast(ptr)); + } + exe.wraps = std::move(info.localWrap); + for (auto& [id, ptr] : info.inputWrap) { + exe.wraps[id] = ptr; + } + exe.intermediates.resize(exe.localOperands.size()); + int32_t fill = 0; + exe.localOperands.forEach([&](auto id) { + auto op = operandAt(id); + ensureVector(*op, exe.intermediates[fill]); + auto vec = exe.intermediates[fill].get(); + ++fill; + vec->toOperand(operandBegin); + if (op->wrappedAt != AbstractOperand::kNoWrap) { + operandBegin->indices = exe.wraps[op->wrappedAt]; + VELOX_CHECK_NOT_NULL(operandBegin->indices); + } + *operandPtrBegin = operandBegin; + ++operandPtrBegin; + ++operandBegin; + }); + exe.firstOutputOperandIdx = exe.intermediates.size(); + exe.output.resize(exe.outputOperands.size()); + fill = 0; + exe.outputOperands.forEach([&](auto id) { + auto op = operandAt(id); + ensureVector(*op, exe.output[fill]); + auto vec = exe.output[fill].get(); + ++fill; + vec->toOperand(operandBegin); + if (op->wrappedAt != AbstractOperand::kNoWrap) { + operandBegin->indices = exe.wraps[op->wrappedAt]; + VELOX_CHECK_NOT_NULL(operandBegin->indices); + } + *operandPtrBegin = operandBegin; + ++operandPtrBegin; + ++operandBegin; + }); + + auto numConstants = exe.literals ? exe.literals->size() : 0; + if (numConstants) { + memcpy(operandBegin, exe.literals->data(), numConstants * sizeof(Operand)); + for (auto i = 0; i < numConstants; ++i) { + *operandPtrBegin = operandBegin; + ++operandPtrBegin; + ++operandBegin; + } + } + + return addBytes(start, 0); } LaunchControl* WaveStream::prepareProgramLaunch( @@ -325,42 +565,41 @@ LaunchControl* WaveStream::prepareProgramLaunch( int32_t inputRows, folly::Range exes, int32_t blocksPerExe, - bool initStatus, + const LaunchControl* inputControl, Stream* stream) { static_assert(Operand::kPointersInOperand * sizeof(void*) == sizeof(Operand)); - int32_t shared = 0; // First calculate total size. // 2 int arrays: blockBase, programIdx. - int32_t numBlocks = std::min(1, exes.size()) * blocksPerExe; + int32_t numBlocks = std::max(1, exes.size()) * blocksPerExe; int32_t size = 2 * numBlocks * sizeof(int32_t); + std::vector info(exes.size()); auto exeOffset = size; // 2 pointers per exe: TB program and start of its param array. size += exes.size() * sizeof(void*) * 2; auto operandOffset = size; - // Exe dependent sizes for parameters. - int32_t numTotalOps = 0; - for (auto& exe : exes) { - markLaunch(*stream, *exe); - shared = std::max(shared, exe->programShared->sharedMemorySize()); - int32_t numIn = exe->inputOperands.size(); - int numOps = numIn + exe->intermediates.size() + exe->outputOperands.size(); - numTotalOps += numOps; - size += numOps * sizeof(void*) + (numOps - numIn) * sizeof(Operand); + // Exe dependent sizes for operands. 
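fillOperands() above carves one contiguous allocation into three regions: an Operand* table (inputs first, then the exe's own operands and literals), the flat Operand structs for everything the exe defines, and one int32_t* slot per thread block for each wrap the exe introduces. A sketch of the same size computation, with LaunchSizes as a made-up aggregate for illustration:

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative offset computation for one executable's operand block,
// following exeLaunchInfo()/fillOperands() above. Field names are assumed
// for the sketch; only the arithmetic mirrors the patch.
struct LaunchSizes {
  int32_t numInput;    // Operand* slots that alias producer executables
  int32_t numLocalOps; // local + output + literal operands
  int32_t numWraps;    // distinct wrappedAt ids not inherited from inputs
  int32_t numBlocks;   // thread blocks in the launch
};

int32_t totalBytes(const LaunchSizes& s, int32_t operandSize) {
  // [Operand* x (numInput + numLocalOps)] [Operand x numLocalOps]
  // [int32_t* x numWraps * numBlocks]
  return sizeof(void*) * (s.numInput + s.numLocalOps) +
      operandSize * s.numLocalOps +
      sizeof(void*) * s.numWraps * s.numBlocks;
}

int main() {
  // 2 inputs, 3 locals/outputs, 1 wrap, 4 thread blocks, 64-byte Operand:
  // 5*8 + 3*64 + 4*8 = 264 bytes.
  LaunchSizes s{2, 3, 1, 4};
  printf("block size = %d bytes\n", totalBytes(s, 64));
}
```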
+ int32_t operandBytes = 0; + int32_t shared = 0; + for (auto i = 0; i < exes.size(); ++i) { + exeLaunchInfo(*exes[i], numBlocks, info[i]); + operandBytes += info[i].totalBytes; + markLaunch(*stream, *exes[i]); + shared = std::max(shared, exes[i]->programShared->sharedMemorySize()); } + size += operandBytes; int32_t statusOffset = 0; - if (initStatus) { + if (!inputControl) { statusOffset = size; // Pointer to return block for each tB. size += blocksPerExe * sizeof(BlockStatus); } auto buffer = arena_.allocate(size); + memset(buffer->as(), 0, size); - auto controlUnique = std::make_unique(); + auto controlUnique = std::make_unique(key, inputRows); auto& control = *controlUnique; - control.key = key; - control.inputRows = inputRows; control.sharedMemorySize = shared; // Now we fill in the various arrays and put their start addresses in // 'control'. @@ -371,11 +610,7 @@ LaunchControl* WaveStream::prepareProgramLaunch( control.programIdx, numBlocks * sizeof(int32_t)); control.operands = addBytes(control.programs, exes.size() * sizeof(void*)); - int32_t fill = 0; - Operand** operandPtrBegin = addBytes(start, operandOffset); - Operand* operandArrayBegin = - addBytes(operandPtrBegin, numTotalOps * sizeof(void*)); - if (initStatus) { + if (!inputControl) { // If the launch produces new statuses (as opposed to updating status of a // previous launch), there is an array with a status for each TB. If there // are multiple exes, they all share the same error codes. A launch can have @@ -383,93 +618,136 @@ LaunchControl* WaveStream::prepareProgramLaunch( // Writing errors is not serialized but each lane with at least one error // will show one error. control.status = addBytes(start, statusOffset); - memset(control.status, 0, blocksPerExe * sizeof(BlockStatus)); + // Memory is already set to all 0. for (auto i = 0; i < blocksPerExe; ++i) { auto status = &control.status[i]; status->numRows = i == blocksPerExe - 1 ? inputRows % kBlockSize : kBlockSize; } } else { - control.status = nullptr; - } - for (auto exeIdx = 0; exeIdx < exes.size(); ++exeIdx) { - auto exe = exes[exeIdx]; - int32_t numIn = exe->inputOperands.size(); - int32_t numLocal = exe->intermediates.size() + exe->outputOperands.size(); - control.programs[exeIdx] = exe->program; - control.operands[exeIdx] = operandPtrBegin; - // We get the actual input operands for the exe from the exes this depends - // on - exe->inputOperands.forEach([&](int32_t id) { - auto* inputExe = operandToExecutable_[id]; - int32_t ordinal = inputExe->outputOperands.ordinal(id); - *operandPtrBegin = &inputExe->operands[ordinal]; - ++operandPtrBegin; - }); - // We install the intermediates and outputs from the WaveVectors in the exe. - exe->operands = operandArrayBegin; - for (auto& vec : exe->intermediates) { - *operandPtrBegin = operandArrayBegin; - vec->toOperand(operandArrayBegin); - ++operandPtrBegin; - ++operandArrayBegin; - } - for (auto& vec : exe->output) { - *operandPtrBegin = operandArrayBegin; - vec->toOperand(operandArrayBegin); - ++operandPtrBegin; - ++operandArrayBegin; - } + control.status = inputControl->status; + } + char* operandStart = addBytes(start, operandOffset); + int32_t fill = 0; + for (auto i = 0; i < exes.size(); ++i) { + control.programs[i] = exes[i]->program; + + auto operandPtrs = fillOperands(*exes[i], operandStart, info[i]); + control.operands[i] = operandPtrs; + // The operands defined by the exe start after the input operands and are + // all consecutive. 
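The blockBase and programIdx arrays filled in the loop above let every thread block find its program and its block-relative position with two loads. A host-side sketch of the decode a kernel would do (names are assumptions of the sketch):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

struct Decoded {
  int32_t program;    // index into LaunchControl::programs
  int32_t blockInExe; // thread block ordinal within its executable
};

// blockBase[blockIdx.x] holds the first global block of the same program,
// so the block-relative ordinal is blockIdx.x - blockBase[blockIdx.x].
Decoded decode(
    const std::vector<int32_t>& blockBase,
    const std::vector<int32_t>& programIdx,
    int32_t blockIdxX) {
  return {programIdx[blockIdxX], blockIdxX - blockBase[blockIdxX]};
}

int main() {
  // Two executables, 4 blocks each: blocks 0..3 run program 0, 4..7 program 1.
  std::vector<int32_t> blockBase{0, 0, 0, 0, 4, 4, 4, 4};
  std::vector<int32_t> programIdx{0, 0, 0, 0, 1, 1, 1, 1};
  Decoded d = decode(blockBase, programIdx, 5);
  printf("program %d, block-in-exe %d\n", d.program, d.blockInExe); // 1, 1
}
```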
+ exes[i]->operands = operandPtrs[exes[i]->inputOperands.size()]; + operandStart += info[i].totalBytes; for (auto tbIdx = 0; tbIdx < blocksPerExe; ++tbIdx) { - control.blockBase[fill] = exeIdx * blocksPerExe; - control.programIdx[fill] = exeIdx; + control.blockBase[fill] = i * blocksPerExe; + control.programIdx[fill] = i; + ++fill; } } + if (!exes.empty()) { + ++stats_.numKernels; + } + stats_.numPrograms += exes.size(); + stats_.numThreadBlocks += blocksPerExe * exes.size(); + stats_.numThreads += numRows_ * exes.size(); + control.deviceData = std::move(buffer); launchControl_[key].push_back(std::move(controlUnique)); return &control; } -void WaveStream::getOutput( +int32_t WaveStream::getOutput( + int32_t operatorId, + memory::MemoryPool& pool, folly::Range operands, - WaveVectorPtr* waveVectors) { + VectorPtr* vectors) { + auto it = launchControl_.find(operatorId); + VELOX_CHECK(it != launchControl_.end()); + auto* control = it->second[0].get(); + auto* status = control->status; + auto numBlocks = bits::roundUp(control->inputRows, kBlockSize) / kBlockSize; + if (operands.empty()) { + return statusNumRows(status, numBlocks); + } for (auto i = 0; i < operands.size(); ++i) { auto id = operands[i]; auto exe = operandExecutable(id); VELOX_CHECK_NOT_NULL(exe); auto ordinal = exe->outputOperands.ordinal(id); - waveVectors[i] = std::move(exe->output[ordinal]); - if (waveVectors[i] == nullptr) { + auto waveVectorPtr = &exe->output[ordinal]; + if (!waveVectorPtr->get()) { exe->ensureLazyArrived(operands); - waveVectors[i] = std::move(exe->output[ordinal]); - VELOX_CHECK_NOT_NULL(waveVectors[i]); + VELOX_CHECK_NOT_NULL( + waveVectorPtr->get(), "Lazy load should have filled in the result"); } + vectors[i] = waveVectorPtr->get()->toVelox( + &pool, + numBlocks, + status, + &exe->operands[exe->firstOutputOperandIdx + ordinal]); } + return vectors[0]->size(); } -ScalarType typeKindCode(TypeKind kind) { - switch (kind) { - case TypeKind::BIGINT: - return ScalarType::kInt64; - default: - VELOX_UNSUPPORTED("Bad TypeKind {}", kind); - } +WaveTypeKind typeKindCode(TypeKind kind) { + return static_cast(kind); } +#define IN_HEAD(abstract, physical, _op) \ + auto* abstractInst = &instruction->as(); \ + space->opCode = _op; \ + auto physicalInst = new (&space->_) physical(); + +#define IN_OPERAND(member) \ + physicalInst->member = operandIndex(abstractInst->member) + void Program::prepareForDevice(GpuArena& arena) { - int32_t codeSize = 0; - int32_t sharedMemorySize = 0; + VELOX_CHECK(!instructions_.empty()); + if (instructions_.back()->opCode != OpCode::kReturn) { + instructions_.push_back(std::make_unique()); + } + int32_t codeSize = sizeof(Instruction) * instructions_.size(); for (auto& instruction : instructions_) switch (instruction->opCode) { - case OpCode::kPlus: { + case OpCode::kFilter: { + auto& filter = instruction->as(); + markInput(filter.flags); + markResult(filter.indices); + + break; + } + case OpCode::kWrap: { + auto& wrap = instruction->as(); + markInput(wrap.indices); + std::vector indices(wrap.target.size()); + wrap.literalOffset = addLiteral(indices.data(), indices.size()); + for (auto i = 0; i < wrap.target.size(); ++i) { + auto target = wrap.target[i]; + markInput(wrap.source[i]); + if (target != wrap.source[i]) { + markResult(target); + } + } + break; + } + case OpCode::kPlus: + case OpCode::kLT: { auto& bin = instruction->as(); markInput(bin.left); markInput(bin.right); markResult(bin.result); markInput(bin.predicate); - codeSize += sizeof(Instruction); break; } + case OpCode::kNegate: 
{ + auto& un = instruction->as(); + markInput(un.input); + markResult(un.result); + markInput(un.predicate); + break; + } + case OpCode::kReturn: + break; default: VELOX_UNSUPPORTED( "OpCode {}", static_cast(instruction->opCode)); @@ -477,58 +755,94 @@ void Program::prepareForDevice(GpuArena& arena) { sortSlots(); arena_ = &arena; deviceData_ = arena.allocate( - codeSize + instructions_.size() * sizeof(void*) + - sizeof(ThreadBlockProgram)); + codeSize + literalArea_.size() + sizeof(ThreadBlockProgram)); + uintptr_t end = reinterpret_cast( + deviceData_->as() + deviceData_->size()); program_ = deviceData_->as(); - auto instructionArray = addBytes(program_, sizeof(*program_)); - program_->sharedMemorySize = sharedMemorySize; + auto instructionArray = addBytes(program_, sizeof(*program_)); program_->numInstructions = instructions_.size(); program_->instructions = instructionArray; - Instruction* space = addBytes( - instructionArray, instructions_.size() * sizeof(void*)); + Instruction* space = instructionArray; + deviceLiterals_ = reinterpret_cast(space) + + sizeof(Instruction) * instructions_.size(); + VELOX_CHECK_LE( + reinterpret_cast(deviceLiterals_) + literalArea_.size(), end); + memcpy(deviceLiterals_, literalArea_.data(), literalArea_.size()); + for (auto& instruction : instructions_) { - *instructionArray = space; - ++instructionArray; switch (instruction->opCode) { - case OpCode::kPlus: { - auto& bin = instruction->as(); - auto typeCode = typeKindCode(bin.left->type->kind()); - // Comstructed on host, no vtable. - space->opCode = OP_MIX(instruction->opCode, typeCode); - new (&space->_.binary) IBinary(); - space->_.binary.left = operandIndex(bin.left); - space->_.binary.right = operandIndex(bin.right); - space->_.binary.result = operandIndex(bin.result); - ++space; + case OpCode::kPlus: + case OpCode::kLT: { + IN_HEAD( + AbstractBinary, + IBinary, + OP_MIX( + instruction->opCode, + instruction->as().left->type->kind())); + + IN_OPERAND(left); + IN_OPERAND(right); + IN_OPERAND(result); + IN_OPERAND(predicate); + break; + } + case OpCode::kFilter: { + IN_HEAD(AbstractFilter, IFilter, OpCode::kFilter); + IN_OPERAND(flags); + IN_OPERAND(indices); + break; + } + case OpCode::kWrap: { + IN_HEAD(AbstractWrap, IWrap, OpCode::kWrap); + IN_OPERAND(indices); + physicalInst->numColumns = abstractInst->source.size(); + physicalInst->columns = reinterpret_cast( + deviceLiterals_ + abstractInst->literalOffset); + for (auto i = 0; i < abstractInst->source.size(); ++i) { + physicalInst->columns[i] = operandIndex(abstractInst->source[i]); + } + break; + } + case OpCode::kReturn: { + IN_HEAD(AbstractReturn, IReturn, OpCode::kReturn); break; } default: VELOX_UNSUPPORTED("Bad OpCode"); } + sharedMemorySize_ = + std::max(sharedMemorySize_, instructionSharedMemory(*space)); + ++space; + VELOX_CHECK_LE( + reinterpret_cast(space), + reinterpret_cast(deviceLiterals_)); + } + program_->sharedMemorySize = sharedMemorySize_; + literalOperands_.resize(literal_.size()); + for (auto& [op, index] : literal_) { + literalToOperand(op, literalOperands_[index - firstLiteralIdx_]); } } -void Program::sortSlots() { - // Assigns offsets to input and local/output slots so that all - // input is first and output next and within input and output, the - // slots are ordered with lower operand id first. So, if inputs - // are slots 88 and 22 and outputs are 77 and 33, then the - // complete order is 22, 88, 33, 77. 
- std::vector ids; - for (auto& pair : input_) { - ids.push_back(pair.first); - } - std::sort( - ids.begin(), - ids.end(), - [](AbstractOperand*& left, AbstractOperand*& right) { - return left->id < right->id; - }); - for (auto i = 0; i < ids.size(); ++i) { - input_[ids[i]] = i; +void Program::literalToOperand(AbstractOperand* abstractOp, Operand& op) { + op.indexMask = 0; + op.indices = nullptr; + if (abstractOp->literalNull) { + op.nulls = + reinterpret_cast(deviceLiterals_ + abstractOp->literalOffset); + } else { + op.base = deviceLiterals_ + abstractOp->literalOffset; } - ids.clear(); - for (auto& pair : local_) { +} + +namespace { +// Sorts 'map' by id. Inserts back into map with second as ordinal number +// starting at 'startAt'. Returns 1 + the highest assigned number. +int32_t sortAndRenumber( + int32_t startAt, + folly::F14FastMap& map) { + std::vector ids; + for (auto& pair : map) { ids.push_back(pair.first); } std::sort( @@ -538,32 +852,114 @@ void Program::sortSlots() { return left->id < right->id; }); for (auto i = 0; i < ids.size(); ++i) { - local_[ids[i]] = i + input_.size(); + map[ids[i]] = i + startAt; } + return startAt + ids.size(); +} +} // namespace + +void Program::sortSlots() { + // Assigns offsets to input and local/output slots so that all + // input is first and output next and within input and output, the + // slots are ordered with lower operand id first. So, if inputs + // are slots 88 and 22 and outputs are 77 and 33, then the + // complete order is 22, 88, 33, 77. Constants are sorted after everything + // else. + + auto start = sortAndRenumber(0, input_); + start = sortAndRenumber(start, local_); + start = sortAndRenumber(start, output_); + firstLiteralIdx_ = start; + sortAndRenumber(start, literal_); } OperandIndex Program::operandIndex(AbstractOperand* op) const { + if (!op) { + return kEmpty; + } auto it = input_.find(op); if (it != input_.end()) { return it->second; } it = local_.find(op); - if (it == local_.end()) { - VELOX_FAIL("Bad operand, offset not known"); + if (it != local_.end()) { + return it->second; + } + it = output_.find(op); + if (it != local_.end()) { + return it->second; + } + + it = literal_.find(op); + if (it != literal_.end()) { + return it->second; + } + VELOX_FAIL("Operand not found"); +} + +template +int32_t Program::addLiteral(T* value, int32_t count) { + nextLiteral_ = bits::roundUp(nextLiteral_, sizeof(T)); + auto start = nextLiteral_; + nextLiteral_ += sizeof(T) * count; + literalArea_.resize(nextLiteral_); + memcpy(literalArea_.data() + start, value, sizeof(T) * count); + return start; +} + +template +int32_t Program::addLiteralTyped(AbstractOperand* op) { + if (op->literalOffset != AbstractOperand::kNoConstant) { + return op->literalOffset; + } + using T = typename TypeTraits::NativeType; + if (op->constant->isNullAt(0)) { + op->literalNull = true; + char zero = 0; + return op->literalOffset = addLiteral(&zero, 1); + } + T value = op->constant->as>()->valueAt(0); + if constexpr (std::is_same_v) { + int64_t inlined = 0; + StringView* stringView = reinterpret_cast(&value); + if (stringView->size() <= 6) { + int64_t inlined = static_cast(stringView->size()) << 48; + memcpy( + reinterpret_cast(&inlined) + 2, + stringView->data(), + stringView->size()); + op->literalOffset = addLiteral(&inlined, 1); + } else { + int64_t zero = 0; + op->literalOffset = addLiteral(&zero, 1); + addLiteral(stringView->data(), stringView->size()); + } + } else { + op->literalOffset = addLiteral(&value, 1); } - return it->second; + return 
op->literalOffset; } void Program::markInput(AbstractOperand* op) { if (!op) { return; } - if (!local_.count(op)) { + if (op->constant) { + VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( + addLiteralTyped, op->constant->type()->kind(), op); + literal_[op] = literal_.size(); + return; + } + if (!local_.count(op) && !output_.count(op)) { input_[op] = input_.size(); } } void Program::markResult(AbstractOperand* op) { + if (outputIds_.contains(op->id)) { + output_[op] = outputIds_.ordinal(op->id); + return; + } if (!local_.count(op)) { local_[op] = local_.size(); } @@ -588,22 +984,81 @@ std::unique_ptr Program::getExecutable( exe->inputOperands.add(pair.first->id); } for (auto& pair : local_) { + exe->localOperands.add(pair.first->id); + } + for (auto& pair : output_) { exe->outputOperands.add(pair.first->id); } - exe->output.resize(local_.size()); + + exe->literals = &literalOperands_; exe->releaser = [](std::unique_ptr& ptr) { auto program = ptr->programShared.get(); ptr->reuse(); program->releaseExe(std::move(ptr)); }; + } + return exe; +} + +std::string AbstractOperand::toString() const { + if (constant) { + return fmt::format( + "", constant->toString(0), type->toString()); + } + return fmt::format("<{}: {} {}>", id, label, type->toString()); +} - } // We have an exe, whether new or reused. Check the vectors. - int32_t nth = 0; - exe->outputOperands.forEach([&](int32_t id) { - ensureWaveVector( - exe->output[nth], operands[id]->type, maxRows, true, *arena_); - ++nth; +std::string Executable::toString() const { + std::stringstream out; + out << "{Exe produces "; + bool first = true; + outputOperands.forEach([&](auto id) { + if (!first) { + out << ", "; + }; + first = false; + out << waveStream->operandAt(id)->toString(); }); - return exe; + if (programShared) { + out << std::endl; + out << "program " << programShared->label(); + } + return out.str(); } + +std::string Program::toString() const { + std::stringstream out; + out << "{ program" << std::endl; + for (auto& instruction : instructions_) { + out << instruction->toString() << std::endl; + } + out << "}" << std::endl; + return out.str(); +} + +std::string AbstractFilter::toString() const { + return fmt::format("filter {} -> {}", flags->toString(), indices->toString()); + ; +} + +std::string AbstractWrap::toString() const { + std::stringstream out; + out << "wrap indices=" << indices->toString() << " {"; + for (auto& op : source) { + out << op->toString() << " "; + } + out << "}"; + return out.str(); +} + +std::string AbstractBinary::toString() const { + return fmt::format( + "{} = {} {} {} {}", + result->toString(), + left->toString(), + static_cast(opCode), + right->toString(), + predicate ? fmt::format(" if {}", predicate->toString()) : ""); +} + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/Wave.h b/velox/experimental/wave/exec/Wave.h index 64754caafb14e..d9e6506e27660 100644 --- a/velox/experimental/wave/exec/Wave.h +++ b/velox/experimental/wave/exec/Wave.h @@ -28,6 +28,79 @@ namespace facebook::velox::wave { +/// A host side time point for measuring wait and launch prepare latency. Counts +/// both wall microseconds and clocks. 
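The string case of addLiteralTyped() above inlines short literals into a single int64: the length is shifted into bits 48..63 and the payload is copied starting at byte 2 of the word. A self-contained illustration of that packing (function and variable names are local to the sketch):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Packs a string of at most 6 bytes the way addLiteralTyped() inlines a
// short StringView literal: size in bits 48..63, payload at byte 2 onward.
int64_t inlineShortString(const char* data, int32_t size) {
  int64_t inlined = static_cast<int64_t>(size) << 48;
  std::memcpy(reinterpret_cast<char*>(&inlined) + 2, data, size);
  return inlined;
}

int main() {
  int64_t packed = inlineShortString("abc", 3);
  printf("size = %d\n", static_cast<int>(packed >> 48)); // 3
  char out[4] = {};
  std::memcpy(out, reinterpret_cast<char*>(&packed) + 2, 3);
  printf("payload = %s\n", out); // abc
}
```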
+struct WaveTime {
+  size_t micros{0};
+  uint64_t clocks{0};
+
+  static WaveTime now() {
+    return {getCurrentTimeMicro(), folly::hardware_timestamp()};
+  }
+
+  WaveTime operator-(const WaveTime right) const {
+    return {micros - right.micros, clocks - right.clocks};
+  }
+
+  WaveTime operator+(const WaveTime right) const {
+    return {right.micros + micros, right.clocks + clocks};
+  }
+  void operator+=(const WaveTime& other) {
+    micros += other.micros;
+    clocks += other.clocks;
+  }
+  std::string toString() const;
+};
+
+class WaveTimer {
+ public:
+  WaveTimer(WaveTime& accumulator)
+      : accumulator_(accumulator), start_(WaveTime::now()) {}
+  ~WaveTimer() {
+    accumulator_ = accumulator_ + (WaveTime::now() - start_);
+  }
+
+ private:
+  WaveTime& accumulator_;
+  WaveTime start_;
+};
+
+struct WaveStats {
+  /// Count of WaveStreams.
+  int64_t numWaves{1};
+
+  // Count of kernel launches.
+  int64_t numKernels{0};
+
+  // Count of thread blocks in all kernel launches.
+  int64_t numThreadBlocks{0};
+
+  /// Number of programs. One launch typically has several programs, roughly
+  /// one per output column.
+  int64_t numPrograms{0};
+
+  /// Number of starting lanes in kernel launches. This is not exactly thread
+  /// blocks times block size because the last block per program is not full.
+  int64_t numThreads{0};
+
+  /// Data transfer from host to device.
+  int64_t bytesToDevice{0};
+
+  int64_t bytesToHost{0};
+
+  /// Number of times the host syncs with device.
+  int64_t numSync{0};
+
+  /// Time a host thread runs without activity on device, e.g. after a sync or
+  /// before first launch.
+  WaveTime hostOnlyTime;
+  /// Time a host thread runs after kernel launch preparing the next kernel.
+  WaveTime hostParallelTime;
+  /// Time a host thread waits for device.
+  WaveTime waitTime;
+
+  void add(const WaveStats& other);
+};
+
 // A value a kernel can depend on. Either a dedupped exec::Expr or a dedupped
 // subfield. Subfield between operators, Expr inside an Expr.
 struct Value {
@@ -38,13 +111,8 @@ struct Value {
   ~Value() = default;
 
   bool operator==(const Value& other) const {
-    if (expr == other.expr && subfield == other.subfield) {
-      return true;
-    };
-    if (subfield && other.subfield && *subfield == *other.subfield) {
-      return true;
-    }
-    return false;
+    // Both exprs and subfields are deduplicated.
+    return expr == other.expr && subfield == other.subfield;
   }
 
   const exec::Expr* expr;
@@ -53,6 +121,7 @@ struct Value {
 
 struct ValueHasher {
   size_t operator()(const Value& value) const {
+    // Hash the addresses because both exprs and subfields are deduplicated.
    return folly::hasher<uint64_t>()(
               reinterpret_cast<uint64_t>(value.subfield)) ^
        folly::hasher<uint64_t>()(reinterpret_cast<uint64_t>(value.expr));
@@ -74,7 +143,7 @@ using DefinesMap =
 /// Translates a set of path steps to an AbstractOperand or nullptr if
 /// none found. The path is not const because it is temporarily
 /// moved into a Subfield. Not thread safe for 'path'.
-OperandId pathToOperand(
+AbstractOperand* pathToOperand(
     const DefinesMap& map,
     std::vector<std::unique_ptr<common::Subfield::PathElement>>& path);
 
@@ -124,7 +193,6 @@ struct Executable {
   /// addTransfer().
   static void startTransfer(
       OperandSet outputOperands,
-      WaveBufferPtr&& operands,
      std::vector<WaveVectorPtr>&& outputVectors,
      std::vector<Transfer>&& transfers,
       WaveStream& stream);
@@ -146,8 +214,12 @@ struct Executable {
   void reuse() {
     operands = nullptr;
     stream = nullptr;
+    wraps.clear();
   }
-  // The containing WaveStream, if needed.
+
+  virtual std::string toString() const;
+
+  // The containing WaveStream.
   WaveStream* waveStream{nullptr};
 
   // The Program this is an invocation of.
nullptr if 'this' represents a data @@ -168,18 +240,29 @@ struct Executable { // Operand ids for outputs. OperandSet outputOperands; - // Unified memory Operand structs for intermediates/outputs. These + // Unified memory Operand structs for intermediates/outputs/literals. These // are a contiguous array of Operand in LaunchControl of 'this' Operand* operands; + // Index of first output operand in 'operands'. + int32_t firstOutputOperandIdx{-1}; + + // Map from wrapAt in AbstractOperand to device side 'indices' with one + // int32_t* per thread block. + folly::F14FastMap wraps; + + // Host side array of literals. These refer to literal data in device side + // ThreadBlockProgram. These are copied at the end of 'operands' at launch. + const std::vector* literals; + // Backing memory for intermediate Operands. Free when 'this' arrives. If // scheduling follow up work that is synchronized with arrival of 'this', the // intermediates can be moved to the dependent executable at time of // scheduling. std::vector intermediates; - // Backing device memory for 'output' Can be moved to intermediates or - // output of a dependent executables. + // Backing device memory for 'output'. These are accessed by dependent + // executables and must not be written to until out of scope. std::vector output; // If this represents data transfer, the ranges to transfer. @@ -203,6 +286,11 @@ class Program : public std::enable_shared_from_this { instructions_.push_back(std::move(instruction)); } + /// Specifies that Operand with 'id' is used by a dependent operation. + void markOutput(OperandId id) { + outputIds_.add(id); + } + const std::vector& dependsOn() const { return dependsOn_; } @@ -215,8 +303,8 @@ class Program : public std::enable_shared_from_this { dependsOn_.push_back(source); } - // Initializes executableImage and relocation information and places for - // parameters. + // Initializes executableImage and relocation information and places + // the result on device. void prepareForDevice(GpuArena& arena); std::unique_ptr getExecutable( @@ -247,11 +335,31 @@ class Program : public std::enable_shared_from_this { return sharedMemorySize_; } - const folly::F14FastMap& localAndOutput() const { - return local_; + const folly::F14FastMap& output() const { + return output_; + } + + const std::string& label() const { + return label_; + } + + void addLabel(const std::string& label) { + label_ = label_ + " " + label; } + std::string toString() const; + private: + template + int32_t addLiteralTyped(AbstractOperand* op); + /// Returns a starting offset to a constant with 'count' elements of T, + /// initialized from 'value[]' The values are copied to device side + /// ThreadBlockProgram. + template + int32_t addLiteral(T* value, int32_t count); + + void literalToOperand(AbstractOperand* abstractOp, Operand& op); + GpuArena* arena_{nullptr}; std::vector dependsOn_; DefinesMap produces_; @@ -260,7 +368,8 @@ class Program : public std::enable_shared_from_this { // Adds 'op' to 'input' if it is not produced by one in 'local' void markInput(AbstractOperand* op); - // Adds 'op' to 'local_' + + // Adds 'op' to 'local_' or 'output_'. void markResult(AbstractOperand* op); void sortSlots(); @@ -269,8 +378,28 @@ class Program : public std::enable_shared_from_this { // Input Operand to offset in operands array. folly::F14FastMap input_; - // Local/output Operand offset in operands array. + /// Set of OperandIds for outputs. These must come after intermediates in + /// Operands array. 
+  OperandSet outputIds_;
+
+  // Local Operand offset in operands array.
   folly::F14FastMap<AbstractOperand*, int32_t> local_;
+  // Output Operand offset in operands array.
+  folly::F14FastMap<AbstractOperand*, int32_t> output_;
+
+  // OperandIdx for first literal operand.
+  int32_t firstLiteralIdx_{-1};
+
+  // Constant Operand to offset in operands array.
+  folly::F14FastMap<AbstractOperand*, int32_t> literal_;
+
+  // Offset of first unused constant area byte from start of constant area.
+  int32_t nextLiteral_{0};
+
+  // Binary data for constants to be embedded in ThreadBlockProgram. Must be
+  // relocatable, i.e. does not contain non-relative pointers within the
+  // constant area.
+  std::string literalArea_;
 
   // Owns device side 'threadBlockProgram_'
   WaveBufferPtr deviceData_;
@@ -279,6 +408,15 @@ class Program : public std::enable_shared_from_this<Program> {
   ThreadBlockProgram* program_;
 
   int32_t sharedMemorySize_{0};
+
+  // Host side image of device side Operands that reference 'constantArea_'.
+  // These are copied at the end of the operand block created at kernel launch.
+  std::vector<Operand> literalOperands_;
+
+  std::string label_;
+
+  // Start of device side constant area.
+  char* deviceLiterals_{nullptr};
 
   // Serializes 'prepared_'. Access on WaveStream is single threaded but
   // sharing Programs across WaveDrivers makes sense, so make the preallocated
   // resource thread safe.
@@ -295,7 +433,25 @@ struct LaunchControl;
 /// Represents consecutive data dependent kernel launches.
 class WaveStream {
  public:
-  WaveStream(GpuArena& arena) : arena_(arena) {}
+  /// Describes what 'this' is doing for purposes of stats collection.
+  enum class State {
+    // Not runnable, e.g. another WaveStream is being processed by WaveDriver.
+    kNotRunning,
+    // Running on host only, e.g. preparing for first kernel launch.
+    kHost,
+    // Running on host with device side work submitted.
+    kParallel,
+    // Waiting on host thread for device results.
+    kWait
+  };
+
+  WaveStream(
+      GpuArena& arena,
+      GpuArena& hostArena,
+      const std::vector<std::unique_ptr<AbstractOperand>>* operands)
+      : arena_(arena), hostArena_(hostArena), operands_(operands) {
+    operandNullable_.resize(operands_->size(), true);
+  }
 
   ~WaveStream();
 
@@ -310,9 +466,48 @@ class WaveStream {
     return arena_;
   }
 
-  void getOutput(
+  void setNullable(const AbstractOperand& op, bool nullable) {
+    operandNullable_[op.id] = nullable;
+  }
+
+  int32_t numRows() const {
+    return numRows_;
+  }
+
+  // Sets the size of top-level vectors to be prepared for the next launch.
+  void setNumRows(int32_t numRows) {
+    numRows_ = numRows;
+  }
+
+  /// Sets 'vector' to a WaveVector of suitable type, size and
+  /// nullability. May reuse 'vector' if not nullptr. The size comes
+  /// from setNumRows() if not given as parameter.
+  void ensureVector(
+      const AbstractOperand& operand,
+      WaveVectorPtr& vector,
+      int32_t numRows = -1);
+
+  /// Marks 'op' as being later copied to host. Allocates these together.
+  void markHostOutputOperand(const AbstractOperand& op);
+
+  /// Finalizes return state. setNumRows and markHostOutputOperand may not be
+  /// called after this. If 'needStatus' is false and no columns are marked for
+  /// host return there is no need for any data transfer at the end of the
+  /// stream.
+  void setReturnData(bool needStatus);
+
+  /// Enqueues copy of device side results to host.
+  void resultToHost();
+
+  /// Updates 'vectors' to reference the data in 'operands'. 'operatorId' is
+  /// the id of the last WaveOperator. It identifies the LaunchControl with the
+  /// final BlockStatus with errors and cardinalities. Returns the number of
+  /// rows after possible selection.
+  int32_t getOutput(
+      int32_t operatorId,
+      memory::MemoryPool& pool,
      folly::Range<const OperandId*> operands,
-      WaveVectorPtr* waveVectors);
+      VectorPtr* vectors);
 
   Executable* operandExecutable(OperandId id) {
     auto it = operandToExecutable_.find(id);
@@ -385,7 +580,7 @@ class WaveStream {
       int32_t inputRows,
      folly::Range<Executable**> exes,
       int32_t blocksPerExe,
-      bool initstatus,
+      const LaunchControl* inputStatus,
       Stream* stream);
 
  const std::vector<std::unique_ptr<LaunchControl>>& launchControls(
@@ -393,7 +588,46 @@ class WaveStream {
     return launchControl_[key];
   }
 
+  void addLaunchControl(int32_t key, std::unique_ptr<LaunchControl> control) {
+    launchControl_[key].push_back(std::move(control));
+  }
+
+  const AbstractOperand* operandAt(int32_t id) {
+    VELOX_CHECK_LT(id, operands_->size());
+    return (*operands_)[id].get();
+  }
+
+  // Describes an exe in a multi-program launch.
+  struct ExeLaunchInfo {
+    int32_t numBlocks;
+    int32_t numInput{0};
+    int32_t numLocalOps{0};
+    int32_t numLocalWrap{0};
+    int32_t totalBytes{0};
+    folly::F14FastMap<int32_t, int32_t**> inputWrap;
+    folly::F14FastMap<int32_t, int32_t**> localWrap;
+  };
+
+  void
+  exeLaunchInfo(Executable& exe, int32_t blocksPerExe, ExeLaunchInfo& info);
+
+  Operand** fillOperands(Executable& exe, char* start, ExeLaunchInfo& info);
+
+  /// Sets the state for stats collection.
+  void setState(WaveStream::State state);
+
+  const WaveStats& stats() const {
+    return stats_;
+  }
+
+  WaveStats& stats() {
+    return stats_;
+  }
+
  private:
+  // True if 'op' is nullable in the context of 'this'.
+  bool isNullable(const AbstractOperand& op) const;
+
   Event* newEvent();
 
  static std::unique_ptr<Event> eventFromReserve();
@@ -408,15 +642,28 @@ class WaveStream {
   static void clearReusable();
 
   GpuArena& arena_;
+  GpuArena& hostArena_;
+
+  const std::vector<std::unique_ptr<AbstractOperand>>* const operands_;
+
+  // True at '[i]' if in this stream 'operands_[i]' should have null flags.
+  std::vector<bool> operandNullable_;
+
+  // Number of rows to allocate for top level vectors for the next kernel
+  // launch.
+  int32_t numRows_{0};
+
  folly::F14FastMap<OperandId, Executable*> operandToExecutable_;
  std::vector<std::unique_ptr<Executable>> executables_;
 
   // Currently active streams, each at the position given by its
   // stream->userData().
  std::vector<std::unique_ptr<Stream>> streams_;
+
   // The most recent event recorded on the pairwise corresponding element of
   // 'streams_'.
  std::vector<Event*> lastEvent_;
 
+  // If status return copy has been initiated, then this is the event to sync
+  // with before accessing 'hostReturnData_'.
+  Event* hostReturnEvent_{nullptr};
 
   // All events recorded on any stream. Events, once seen realized, are moved
   // back to reserve from here.
@@ -428,6 +675,34 @@ class WaveStream {
       launchControl_;
 
   folly::F14FastMap extraData_;
+
+  // Ids of operands that need their memory to be in the host return area.
+  OperandSet hostOutputOperands_;
+
+  // Offset of the operand in 'hostReturnData_' and 'deviceReturnData_'.
+  folly::F14FastMap hostReturnOffset_;
+
+  // Size of data returned at end of stream.
+  int64_t hostReturnSize_{0};
+
+  int64_t hostReturnDataUsed_{0};
+
+  // Device side data for all returnable data, like BlockStatus and Vector
+  // bodies to be copied to host.
+  WaveBufferPtr deviceReturnData_;
+
+  // Host pinned memory to which 'deviceReturnData' is copied.
+  WaveBufferPtr hostReturnData_;
+
+  // Pointer to statuses inside 'hostReturnData_'.
+  BlockStatus* hostStatus_{nullptr};
+
+  // Time when host side activity last started on 'this'.
+  WaveTime start_;
+
+  State state_{State::kNotRunning};
+
+  WaveStats stats_;
 };
 
 /// Describes all the control data for launching a kernel executing
@@ -443,25 +718,33 @@ class WaveStream {
 //// WaveVectors in each exe. Array of TB return status blocks, one
 //// per TB.
 struct LaunchControl {
-  int32_t key;
+  LaunchControl(int32_t _key, int32_t _inputRows)
+      : key(_key), inputRows(_inputRows) {}
+
+  // Id of the initiating operator.
+  const int32_t key;
 
-  int32_t inputRows;
+  // Number of rows the programs get as input. Initializes the BlockStatus'es
+  // on device in prepareProgramLaunch().
+  const int32_t inputRows;
 
-  /// The first thread block with the program.
-  int32_t* blockBase;
+  /// The first thread block with the program. Subscript is blockIdx.x.
+  int32_t* blockBase{nullptr};
 
   // The ordinal of the program. All blocks with the same program have the same
-  // number here.
-  int32_t* programIdx;
+  // number here. Subscript is blockIdx.x.
+  int32_t* programIdx{nullptr};
 
-  // The TB program for each exe.
-  ThreadBlockProgram** programs;
+  // The TB program for each exe. The subscript is programIdx[blockIdx.x].
+  ThreadBlockProgram** programs{nullptr};
 
   // For each exe, the start of the array of Operand*. Instructions reference
-  // operands via offset in this array.//
-  Operand*** operands;
+  // operands via offset in this array. The subscript is
+  // programIdx[blockIdx.x].
+  Operand*** operands{nullptr};
 
-  // the status return block for each TB.
-  BlockStatus* status;
+  // The status return block for each TB. The subscript is blockIdx.x -
+  // (blockBase[blockIdx.x] / kBlockSize). Shared between all programs.
+  BlockStatus* status{nullptr};
 
   int32_t sharedMemorySize{0};
 
   // Storage for all the above in a contiguous unified memory piece.
diff --git a/velox/experimental/wave/exec/WaveCore.cuh b/velox/experimental/wave/exec/WaveCore.cuh
index 058b89bb18627..d000cf5081315 100644
--- a/velox/experimental/wave/exec/WaveCore.cuh
+++ b/velox/experimental/wave/exec/WaveCore.cuh
@@ -27,8 +27,45 @@ __device__ inline T& flatValue(void* base, int32_t blockBase) {
   return reinterpret_cast<T*>(base)[blockBase + threadIdx.x];
 }
 
-__device__ inline bool isNull(Operand* op, int32_t blockBase) {
-  return op->nulls == nullptr || !op->nulls[blockBase + threadIdx.x];
+template <typename T>
+__device__ T& sharedMemoryOperand(char* shared, OperandIndex op) {
+  return reinterpret_cast<T*>(
+      shared + ((op & kSharedOperandMask) << 1))[blockIdx.x];
+}
+
+/// Returns true if operand is non null. Sets 'value' to the value of the
+/// operand.
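A shared-memory OperandIndex, as decoded by sharedMemoryOperand() above, packs three fields into 16 bits: bit 15 marks a shared-memory operand, kSharedOperandMask holds a 13-bit offset that is shifted left by one, and kSharedNullMask selects one of the per-block null regions. A worked decode using the constants from Operand.h later in this patch (the example index value is hypothetical):

```cpp
#include <cstdint>
#include <cstdio>

using OperandIndex = uint16_t;
constexpr OperandIndex kMinSharedMemIndex = 0x8000;
constexpr uint16_t kSharedNullMask = 3;         // selects a null region
constexpr uint16_t kSharedOperandMask = 0x7ffc; // 13-bit offset, shift by 1

int main() {
  // Bit 15 set, value offset bits 0x0124, null region 2; this mirrors the
  // decoding done by sharedMemoryOperand()/operandOrNull().
  OperandIndex opIdx = kMinSharedMemIndex | 0x0124 | 2;
  if (opIdx > kMinSharedMemIndex) {
    int nullRegion = opIdx & kSharedNullMask;           // 2: second null area
    int byteOffset = (opIdx & kSharedOperandMask) << 1; // 0x248 = 584
    printf("null region %d, value offset %d\n", nullRegion, byteOffset);
  }
}
```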
+template +__device__ inline bool operandOrNull( + Operand** operands, + OperandIndex opIdx, + int32_t blockBase, + char* shared, + T& value) { + if (opIdx > kMinSharedMemIndex) { + uint16_t mask = opIdx & kSharedNullMask; + if (mask > 0 && shared[kBlockSize * (mask - 1) + blockIdx.x] == kNull) { + return false; + } + value = sharedMemoryOperand(shared, opIdx); + return true; + } + auto op = operands[opIdx]; + int32_t index = threadIdx.x; + if (auto indicesInOp = op->indices) { + auto indices = indicesInOp[blockBase / kBlockSize]; + if (indices) { + index = indices[index]; + } else { + index += blockBase; + } + } else { + index = (index + blockBase) & op->indexMask; + } + if (op->nulls && op->nulls[index] == kNull) { + return false; + } + value = reinterpret_cast(op->base)[index]; + return true; } template @@ -38,8 +75,7 @@ __device__ inline T getOperand( int32_t blockBase, char* shared) { if (opIdx > kMinSharedMemIndex) { - return reinterpret_cast( - shared + opIdx - kMinSharedMemIndex)[blockIdx.x]; + return sharedMemoryOperand(shared, opIdx); } auto op = operands[opIdx]; int32_t index = (threadIdx.x + blockBase) & op->indexMask; @@ -68,6 +104,21 @@ __device__ inline T value(Operand* op, int index) { return reinterpret_cast(op->base)[index]; } +/// Sets the lane's result to null for opIdx. +__device__ inline void resultNull( + Operand** operands, + OperandIndex opIdx, + int32_t blockBase, + char* shared) { + if (opIdx >= kMinSharedMemIndex) { + auto offset = (opIdx & kSharedNullMask) - 1; + shared[(kBlockSize * offset) + blockIdx.x] = kNull; + } else { + auto* op = operands[opIdx]; + op->nulls[blockBase + threadIdx.x] = kNull; + } +} + template __device__ inline T& flatResult( Operand** operands, @@ -75,8 +126,10 @@ __device__ inline T& flatResult( int32_t blockBase, char* shared) { if (opIdx >= kMinSharedMemIndex) { - return reinterpret_cast( - shared + opIdx - kMinSharedMemIndex)[threadIdx.x]; + if (auto mask = (opIdx & kSharedNullMask)) { + shared[(kBlockSize * (mask - 1)) + blockIdx.x] = kNotNull; + } + return sharedMemoryOperand(shared, opIdx); } auto* op = operands[opIdx]; if (op->nulls) { diff --git a/velox/experimental/wave/exec/WaveDataSource.h b/velox/experimental/wave/exec/WaveDataSource.h index 6272b824ea315..ad478c002f27a 100644 --- a/velox/experimental/wave/exec/WaveDataSource.h +++ b/velox/experimental/wave/exec/WaveDataSource.h @@ -42,7 +42,7 @@ class WaveDataSource { virtual void addSplit(std::shared_ptr split) = 0; - virtual int32_t canAdvance() = 0; + virtual int32_t canAdvance(WaveStream& stream) = 0; virtual void schedule(WaveStream& stream, int32_t maxRows = 0) = 0; diff --git a/velox/experimental/wave/exec/WaveDriver.cpp b/velox/experimental/wave/exec/WaveDriver.cpp index 18347d01ea7cb..93802e64ec6ca 100644 --- a/velox/experimental/wave/exec/WaveDriver.cpp +++ b/velox/experimental/wave/exec/WaveDriver.cpp @@ -41,6 +41,9 @@ WaveDriver::WaveDriver( subfields_(std::move(subfields)), operands_(std::move(operands)) { VELOX_CHECK(!waveOperators.empty()); + auto returnBatchSize = 10000 * outputType_->size() * 10; + hostArena_ = std::make_unique( + returnBatchSize * 10, getHostAllocator(getDevice())); pipelines_.emplace_back(); for (auto& op : waveOperators) { op->setDriver(this); @@ -49,6 +52,7 @@ WaveDriver::WaveDriver( } pipelines_.back().operators.push_back(std::move(op)); } + pipelines_.back().needStatus = true; } RowVectorPtr WaveDriver::getOutput() { @@ -69,6 +73,7 @@ RowVectorPtr WaveDriver::getOutput() { ++it; continue; } + 
stream->setState(WaveStream::State::kNotRunning); RowVectorPtr result; if (i + 1 < pipelines_.size()) { auto waveResult = makeWaveResult(op.outputType(), *stream, lastSet); @@ -80,6 +85,7 @@ RowVectorPtr WaveDriver::getOutput() { VLOG(1) << "Final output size: " << result->size(); } if (streamAtEnd(*stream)) { + waveStats_.add(stream->stats()); it = streams.erase(it); } else { ++it; @@ -97,6 +103,7 @@ RowVectorPtr WaveDriver::getOutput() { } if (!running) { VLOG(1) << "No more output"; + updateStats(); finished_ = true; return nullptr; } @@ -127,19 +134,19 @@ RowVectorPtr WaveDriver::makeResult( const OperandSet& lastSet) { auto& last = *pipelines_.back().operators.back(); auto& rowType = last.outputType(); + auto operatorId = last.operatorId(); std::vector children(rowType->size()); + int32_t numRows = stream.getOutput( + operatorId, *operatorCtx_->pool(), resultOrder_, children.data()); auto result = std::make_shared( operatorCtx_->pool(), rowType, BufferPtr(nullptr), - last.outputSize(stream), + numRows, std::move(children)); - int32_t nthChild = 0; - std::vector waveVectors(resultOrder_.size()); - stream.getOutput(resultOrder_, waveVectors.data()); - for (auto& item : waveVectors) { - result->childAt(nthChild++) = item->toVelox(operatorCtx_->pool()); - }; + if (!numRows) { + return nullptr; + } return result; } @@ -150,25 +157,32 @@ void WaveDriver::startMore() { if (blockingReason_ != exec::BlockingReason::kNotBlocked) { return; } - if (auto rows = ops[0]->canAdvance()) { + auto stream = + std::make_unique(*arena_, *hostArena_, &operands()); + stream->setState(WaveStream::State::kHost); + + if (auto rows = ops[0]->canAdvance(*stream)) { VLOG(1) << "Advance " << rows << " rows in pipeline " << i; - auto stream = std::make_unique(*arena_); + stream->setNumRows(rows); + if (i == pipelines_.size() - 1) { + for (auto i : resultOrder_) { + stream->markHostOutputOperand(*operands_[i]); + } + } + stream->setReturnData(pipelines_[i].needStatus); for (auto& op : ops) { op->schedule(*stream, rows); } - if (i == pipelines_.size() - 1) { - prefetchReturn(*stream); + if (pipelines_[i].needStatus) { + stream->resultToHost(); } + stream->setState(WaveStream::State::kNotRunning); pipelines_[i].streams.push_back(std::move(stream)); break; } } } -void WaveDriver::prefetchReturn(WaveStream& stream) { - // Schedule return buffers from last op to be on host side. 
-} - LaunchControl* WaveDriver::inputControl( WaveStream& stream, int32_t operatorId) { @@ -200,4 +214,39 @@ std::string WaveDriver::toString() const { return out.str(); } +void WaveDriver::updateStats() { + auto lockedStats = stats_.wlock(); + lockedStats->addRuntimeStat( + "wave.numWaves", RuntimeCounter(waveStats_.numWaves)); + lockedStats->addRuntimeStat( + "wave.numKernels", RuntimeCounter(waveStats_.numKernels)); + lockedStats->addRuntimeStat( + "wave.numThreadBlocks", RuntimeCounter(waveStats_.numThreadBlocks)); + lockedStats->addRuntimeStat( + "wave.numThreads", RuntimeCounter(waveStats_.numThreads)); + lockedStats->addRuntimeStat( + "wave.numPrograms", RuntimeCounter(waveStats_.numPrograms)); + lockedStats->addRuntimeStat( + "wave.numSync", RuntimeCounter(waveStats_.numSync)); + lockedStats->addRuntimeStat( + "wave.bytesToDevice", + RuntimeCounter(waveStats_.bytesToDevice, RuntimeCounter::Unit::kBytes)); + lockedStats->addRuntimeStat( + "wave.bytesToHost", + RuntimeCounter(waveStats_.bytesToHost, RuntimeCounter::Unit::kBytes)); + lockedStats->addRuntimeStat( + "wave.hostOnlyTime", + RuntimeCounter( + waveStats_.hostOnlyTime.micros * 1000, RuntimeCounter::Unit::kNanos)); + lockedStats->addRuntimeStat( + "wave.hostParallelTime", + RuntimeCounter( + waveStats_.hostParallelTime.micros * 1000, + RuntimeCounter::Unit::kNanos)); + lockedStats->addRuntimeStat( + "wave.waitTime", + RuntimeCounter( + waveStats_.waitTime.micros * 1000, RuntimeCounter::Unit::kNanos)); +} + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/WaveDriver.h b/velox/experimental/wave/exec/WaveDriver.h index ea77b336f24eb..c80a5d8a2493c 100644 --- a/velox/experimental/wave/exec/WaveDriver.h +++ b/velox/experimental/wave/exec/WaveDriver.h @@ -57,6 +57,10 @@ class WaveDriver : public exec::SourceOperator { return *arena_; } + GpuArena& hostArena() const { + return *hostArena_; + } + const std::vector>& operands() { return operands_; } @@ -73,9 +77,11 @@ class WaveDriver : public exec::SourceOperator { std::string toString() const override; void addDynamicFilter( + const core::PlanNodeId& producer, column_index_t outputChannel, const std::shared_ptr& filter) override { - pipelines_[0].operators[0]->addDynamicFilter(outputChannel, filter); + pipelines_[0].operators[0]->addDynamicFilter( + producer, outputChannel, filter); } exec::OperatorCtx* operatorCtx() const { @@ -99,8 +105,7 @@ class WaveDriver : public exec::SourceOperator { // and there is space in the arena. void startMore(); - // Enqueus a prefetch from device to host for the buffers of output vectors. - void prefetchReturn(WaveStream& stream); + void updateStats(); std::unique_ptr arena_; std::unique_ptr deviceArena_; @@ -121,6 +126,10 @@ class WaveDriver : public exec::SourceOperator { // independently of each other. This is bounded by device memory and the // speed at which the source can produce new batches. std::list> streams; + /// True if status copy to host is needed after the last kernel. True if + /// returns vectors to host or if can produce multiple batches of output for + /// one input. + bool needStatus{false}; }; std::vector pipelines_; @@ -135,6 +144,7 @@ class WaveDriver : public exec::SourceOperator { SubfieldMap subfields_; // Operands handed over by compilation. 
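Taken together, getOutput() and startMore() above give each pipeline a small scheduling loop: reap streams whose work has arrived, then start a new stream while the source can still advance. A schematic standalone version (Stream and Pipeline here are simplified stand-ins, not the Wave classes):

```cpp
#include <cstdint>
#include <list>
#include <memory>

// Simplified stand-ins for WaveStream and WaveDriver::Pipeline.
struct Stream {
  bool arrived{false}; // all scheduled device work has completed
};

struct Pipeline {
  std::list<std::unique_ptr<Stream>> streams;
  // True if BlockStatus must be copied back after the last kernel.
  bool needStatus{false};
};

// One round of the driver loop: harvest arrived streams, then start a new
// one while the source can advance, mirroring getOutput()/startMore().
bool runOnce(Pipeline& pipeline, int32_t advanceableRows) {
  for (auto it = pipeline.streams.begin(); it != pipeline.streams.end();) {
    if ((*it)->arrived) {
      // The real driver merges the stream's WaveStats here before erasing.
      it = pipeline.streams.erase(it);
    } else {
      ++it;
    }
  }
  if (advanceableRows > 0) {
    pipeline.streams.push_back(std::make_unique<Stream>());
  }
  return !pipeline.streams.empty(); // still running?
}
```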
std::vector> operands_; + WaveStats waveStats_; }; } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/WaveHiveDataSource.cpp b/velox/experimental/wave/exec/WaveHiveDataSource.cpp index ddf5224287c6a..d6120f90b774f 100644 --- a/velox/experimental/wave/exec/WaveHiveDataSource.cpp +++ b/velox/experimental/wave/exec/WaveHiveDataSource.cpp @@ -94,8 +94,8 @@ void WaveHiveDataSource::addSplit( splitReader_->prepareSplit(metadataFilter_, runtimeStats_); } -int32_t WaveHiveDataSource::canAdvance() { - return splitReader_ != nullptr ? splitReader_->canAdvance() : 0; +int32_t WaveHiveDataSource::canAdvance(WaveStream& stream) { + return splitReader_ != nullptr ? splitReader_->canAdvance(stream) : 0; } void WaveHiveDataSource::schedule(WaveStream& stream, int32_t maxRows) { diff --git a/velox/experimental/wave/exec/WaveHiveDataSource.h b/velox/experimental/wave/exec/WaveHiveDataSource.h index 45ad75621d304..01d2719b21c91 100644 --- a/velox/experimental/wave/exec/WaveHiveDataSource.h +++ b/velox/experimental/wave/exec/WaveHiveDataSource.h @@ -46,7 +46,7 @@ class WaveHiveDataSource : public WaveDataSource { void setFromDataSource(std::shared_ptr dataSource) override; - int32_t canAdvance() override; + int32_t canAdvance(WaveStream& stream) override; void schedule(WaveStream& stream, int32_t maxRows) override; diff --git a/velox/experimental/wave/exec/WaveOperator.cpp b/velox/experimental/wave/exec/WaveOperator.cpp index 66ed0f2dc506f..3ed89becfe652 100644 --- a/velox/experimental/wave/exec/WaveOperator.cpp +++ b/velox/experimental/wave/exec/WaveOperator.cpp @@ -24,35 +24,39 @@ WaveOperator::WaveOperator( CompileState& state, const RowTypePtr& type, const std::string& planNodeId) - : id_(state.numOperators()), planNodeId_(planNodeId), outputType_(type) { - definesSubfields(state, outputType_); -} + : id_(state.numOperators()), planNodeId_(planNodeId), outputType_(type) {} -void WaveOperator::definesSubfields( +AbstractOperand* WaveOperator::definesSubfield( CompileState& state, const TypePtr& type, - const std::string& parentPath) { + const std::string& parentPath, + bool sourceNullable) { switch (type->kind()) { case TypeKind::ROW: { auto& row = type->as(); for (auto i = 0; i < type->size(); ++i) { auto& child = row.childAt(i); auto name = row.nameOf(i); - auto field = state.toSubfield(name); - subfields_.push_back(field); - types_.push_back(child); - auto operand = state.findCurrentValue(Value(field)); - if (!operand) { - operand = state.newOperand(child, name); - } - outputIds_.add(operand->id); - defines_[Value(field)] = operand; + std::string childPath = fmt::format("{}.{}", parentPath, name); + definesSubfield(state, child, childPath, sourceNullable); } } [[fallthrough]]; // TODO:Add cases for nested types. 
default: { - return; + auto field = state.toSubfield(parentPath); + subfields_.push_back(field); + types_.push_back(type); + auto operand = state.findCurrentValue(Value(field)); + if (!operand) { + operand = state.newOperand(type, parentPath); + } + if (sourceNullable && !operand->notNull && !operand->conditionalNonNull) { + operand->sourceNullable = true; + } + defines_[Value(field)] = operand; + + return operand; } } } @@ -61,4 +65,10 @@ folly::Synchronized& WaveOperator::stats() { return driver_->stats(); } +std::string WaveOperator::toString() const { + std::stringstream out; + out << "Id: " << id_ << " produces " << outputIds_.toString() << std::endl; + return out.str(); +} + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/WaveOperator.h b/velox/experimental/wave/exec/WaveOperator.h index 70f98e357246c..f0f157bcdd790 100644 --- a/velox/experimental/wave/exec/WaveOperator.h +++ b/velox/experimental/wave/exec/WaveOperator.h @@ -53,6 +53,10 @@ class WaveOperator { return isExpanding_; } + virtual bool isSource() const { + return false; + } + virtual bool isStreaming() const = 0; virtual void enqueue(WaveVectorPtr) { @@ -74,7 +78,7 @@ class WaveOperator { /// Returns how many rows of output are available from 'this'. Source /// operators and cardinality increasing operators must return a correct /// answer if they are ready to produce data. Others should return 0. - virtual int32_t canAdvance() { + virtual int32_t canAdvance(WaveStream& stream) { return 0; } @@ -93,12 +97,13 @@ class WaveOperator { VELOX_FAIL("Override for source or blocking operator"); } - virtual std::string toString() const = 0; + virtual std::string toString() const; - void definesSubfields( + AbstractOperand* definesSubfield( CompileState& state, const TypePtr& type, - const std::string& parentPath = ""); + const std::string& parentPath = "", + bool sourceNullable = false); /// Returns the operand if this is defined by 'this'. AbstractOperand* defines(Value value) { @@ -109,6 +114,11 @@ class WaveOperator { return it->second; } + /// Marks 'operand' as defined here. + void defined(Value value, AbstractOperand* op) { + defines_[value] = op; + } + void setDriver(WaveDriver* driver) { driver_ = driver; } @@ -124,12 +134,16 @@ class WaveOperator { return outputIds_; } + void addOutputId(OperandId id) { + outputIds_.add(id); + } + // The set of output operands that must have arrived for there to be a result. virtual const OperandSet& syncSet() const { return outputIds_; } - /// Called once on each Operator, fiest to last, after no more + /// Called once on each Operator, first to last, after no more /// Operators will be added to the WaveDriver plan. Can be used for /// e.g. making executable images of Programs since their content /// and dependences will no longer change. 
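definesSubfield() above recurses into ROW children and forms a dotted path per leaf, registering one AbstractOperand for each. A standalone sketch of the naming it produces (Field and leafPaths are illustrative only; the patch itself builds the child path with fmt::format("{}.{}", parentPath, name)):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Minimal stand-in for a row type: a name plus children (leaves have none).
struct Field {
  std::string name;
  std::vector<Field> children;
};

// Collects dotted leaf paths the way definesSubfield() extends parentPath
// for nested ROWs.
void leafPaths(
    const Field& field,
    const std::string& parent,
    std::vector<std::string>& out) {
  std::string path = parent.empty() ? field.name : parent + "." + field.name;
  if (field.children.empty()) {
    out.push_back(path);
    return;
  }
  for (auto& child : field.children) {
    leafPaths(child, path, out);
  }
}

int main() {
  Field row{"", {{"c0", {}}, {"c1", {{"a", {}}, {"b", {}}}}}};
  std::vector<std::string> paths;
  for (auto& child : row.children) {
    leafPaths(child, "", paths);
  }
  for (auto& p : paths) {
    printf("%s\n", p.c_str()); // c0, c1.a, c1.b
  }
}
```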
@@ -144,8 +158,9 @@ class WaveOperator { } virtual void addDynamicFilter( - column_index_t outputChannel, - const std::shared_ptr& filter) { + const core::PlanNodeId& /*producer*/, + column_index_t /*outputChannel*/, + const std::shared_ptr& /*filter*/) { VELOX_UNSUPPORTED(); } @@ -199,4 +214,16 @@ class WaveOperator { std::vector executableMemory_; }; +class WaveSourceOperator : public WaveOperator { + public: + WaveSourceOperator( + CompileState& state, + const RowTypePtr& outputType, + const std::string& planNodeId) + : WaveOperator(state, outputType, planNodeId) {} + bool isSource() const override { + return true; + } +}; + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/WaveSplitReader.h b/velox/experimental/wave/exec/WaveSplitReader.h index 6ba1a7adb25a4..769f4ec500615 100644 --- a/velox/experimental/wave/exec/WaveSplitReader.h +++ b/velox/experimental/wave/exec/WaveSplitReader.h @@ -56,7 +56,7 @@ class WaveSplitReader { virtual bool emptySplit() = 0; - virtual int32_t canAdvance() = 0; + virtual int32_t canAdvance(WaveStream& stream) = 0; virtual void schedule(WaveStream& stream, int32_t maxRows) = 0; diff --git a/velox/experimental/wave/exec/tests/AggregationTest.cpp b/velox/experimental/wave/exec/tests/AggregationTest.cpp index 499cc36bad55a..61d8400e3c38c 100644 --- a/velox/experimental/wave/exec/tests/AggregationTest.cpp +++ b/velox/experimental/wave/exec/tests/AggregationTest.cpp @@ -227,9 +227,3 @@ TEST_F(AggregationTest, tpchQ1) { } // namespace } // namespace facebook::velox::wave - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - folly::Init follyInit(&argc, &argv); - return RUN_ALL_TESTS(); -} diff --git a/velox/experimental/wave/exec/tests/CMakeLists.txt b/velox/experimental/wave/exec/tests/CMakeLists.txt index c6e8f81145e87..008261b7f1d44 100644 --- a/velox/experimental/wave/exec/tests/CMakeLists.txt +++ b/velox/experimental/wave/exec/tests/CMakeLists.txt @@ -15,7 +15,7 @@ add_subdirectory(utils) add_executable(velox_wave_exec_test FilterProjectTest.cpp TableScanTest.cpp - Main.cpp) + AggregationTest.cpp Main.cpp) add_test(velox_wave_exec_test velox_wave_exec_test) diff --git a/velox/experimental/wave/exec/tests/FilterProjectTest.cpp b/velox/experimental/wave/exec/tests/FilterProjectTest.cpp index 3b7fdb3a94ce9..230eba2fad8c0 100644 --- a/velox/experimental/wave/exec/tests/FilterProjectTest.cpp +++ b/velox/experimental/wave/exec/tests/FilterProjectTest.cpp @@ -67,6 +67,26 @@ class FilterProjectTest : public OperatorTestBase { auto task = assertQuery(plan, "SELECT c0, c1, c0 + c1 FROM tmp"); } + std::shared_ptr assertFilterProject( + const std::string& filter, + const std::vector& projections, + const std::vector& vectors) { + auto plan = PlanBuilder() + .values(vectors) + .filter(filter) + .project(projections) + .planNode(); + std::stringstream sql; + sql << "SELECT "; + for (auto i = 0; i < projections.size(); ++i) { + sql << " " << projections[i] << (i == projections.size() - 1 ? 
"" : ","); + } + sql << " FROM tmp WHERE " << filter; + + auto task = assertQuery(plan, sql.str()); + return task; + } + std::shared_ptr rowType_{ ROW({"c0", "c1", "c2", "c3"}, {BIGINT(), BIGINT(), SMALLINT(), DOUBLE()})}; @@ -96,3 +116,19 @@ TEST_F(FilterProjectTest, project) { assertProject(vectors); } + +TEST_F(FilterProjectTest, filterProject) { + std::vector vectors; + for (int32_t i = 0; i < 1; ++i) { + auto vector = std::dynamic_pointer_cast( + BatchMaker::createBatch(rowType_, 100, *pool_)); + makeNotNull(vector, 1000000000); + vectors.push_back(vector); + } + createDuckDbTable(vectors); + + assertFilterProject( + "c0 < 400000000", + std::vector{"c0", "c1", "c1 + c0 as s", "c2", "c3"}, + vectors); +} diff --git a/velox/experimental/wave/exec/tests/Main.cpp b/velox/experimental/wave/exec/tests/Main.cpp index 8bf768f54a66b..a54054a9a2667 100644 --- a/velox/experimental/wave/exec/tests/Main.cpp +++ b/velox/experimental/wave/exec/tests/Main.cpp @@ -17,7 +17,11 @@ #include #include +#include #include +#include "velox/experimental/wave/common/Cuda.h" + +DEFINE_bool(list_kernels, false, "Print register use of kernels"); // This main is needed for some tests on linux. int main(int argc, char** argv) { @@ -25,5 +29,8 @@ int main(int argc, char** argv) { // Signal handler required for ThreadDebugInfoTest facebook::velox::process::addDefaultFatalSignalHandler(); folly::Init init{&argc, &argv, false}; + if (FLAGS_list_kernels) { + facebook::velox::wave::printKernels(); + } return RUN_ALL_TESTS(); } diff --git a/velox/experimental/wave/exec/tests/TableScanTest.cpp b/velox/experimental/wave/exec/tests/TableScanTest.cpp index 0e6bbebdbd41e..955a3413f2b8b 100644 --- a/velox/experimental/wave/exec/tests/TableScanTest.cpp +++ b/velox/experimental/wave/exec/tests/TableScanTest.cpp @@ -52,6 +52,7 @@ class TableScanTest : public virtual HiveConnectorTestBase { void TearDown() override { wave::test::Table::dropAll(); + HiveConnectorTestBase::TearDown(); } std::vector makeVectors( @@ -69,6 +70,20 @@ class TableScanTest : public virtual HiveConnectorTestBase { return vectors; } + void makeNotNull( + RowVectorPtr row, + int64_t mod = std::numeric_limits::max()) { + for (auto i = 0; i < row->type()->size(); ++i) { + auto child = row->childAt(i); + if (auto ints = child->as>()) { + for (auto i = 0; i < child->size(); ++i) { + ints->set(i, ints->valueAt(i) % mod); + } + } + child->clearNulls(0, row->size()); + } + } + wave::test::SplitVector makeTable( const std::string& name, std::vector& rows) { @@ -144,10 +159,31 @@ TEST_F(TableScanTest, basic) { auto plan = tableScanNode(type); auto task = assertQuery(plan, splits, "SELECT * FROM tmp"); - // A quick sanity check for memory usage reporting. Check that peak total - // memory usage for the project node is > 0. 
auto planStats = toPlanStats(task->taskStats()); auto scanNodeId = plan->id(); auto it = planStats.find(scanNodeId); ASSERT_TRUE(it != planStats.end()); } + +TEST_F(TableScanTest, filter) { + auto type = + ROW({"c0", "c1", "c2", "c3"}, {BIGINT(), BIGINT(), BIGINT(), BIGINT()}); + auto vectors = makeVectors(type, 1, 1'000); + for (auto& vector : vectors) { + makeNotNull(vector, 1000000000); + } + auto splits = makeTable("test", vectors); + createDuckDbTable(vectors); + + auto plan = PlanBuilder(pool_.get()) + .tableScan(type) + .filter("c0 < 500000000") + .project({"c0", "c1 + 100000000 as c1", "c2", "c3"}) + .filter("c1 < 500000000") + .project({"c0", "c1", "c2 + 1", "c3", "c3 + 2"}) + .planNode(); + auto task = assertQuery( + plan, + splits, + "SELECT c0, c1 + 100000000, c2 + 1, c3, c3 + 2 FROM tmp where c0 < 500000000 and c1 + 100000000 < 500000000"); +} diff --git a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp index 487c1cded066d..d4ae6a8cab049 100644 --- a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp +++ b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp @@ -43,7 +43,7 @@ WaveTestSplitReader::WaveTestSplitReader( true); } -int32_t WaveTestSplitReader::canAdvance() { +int32_t WaveTestSplitReader::canAdvance(WaveStream& stream) { if (!stripe_) { return 0; } diff --git a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h index 726a36677de10..2881f6de03f1e 100644 --- a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h +++ b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h @@ -34,7 +34,7 @@ class WaveTestSplitReader : public WaveSplitReader { return !stripe_ || stripe_->columns[0]->numValues == 0; } - int32_t canAdvance() override; + int32_t canAdvance(WaveStream& stream) override; void schedule(WaveStream& stream, int32_t maxRows = 0) override; diff --git a/velox/experimental/wave/vector/Operand.h b/velox/experimental/wave/vector/Operand.h index cc3cdb8b75b9f..6a67798d0ae40 100644 --- a/velox/experimental/wave/vector/Operand.h +++ b/velox/experimental/wave/vector/Operand.h @@ -81,10 +81,18 @@ constexpr OperandId kNoOperand = ~0; using OperandIndex = uint16_t; constexpr OperandIndex kEmpty = ~0; -// operand indices above this are offsets into TB shared memory arrays. The -// value to use is the item at blockIx.x. +// operand indices above this are offsets into TB shared memory arrays. constexpr OperandIndex kMinSharedMemIndex = 0x8000; +// Number of nullable locals in shared memory. Each has kBlockSize null bytes at +// the start of the TB shared memory. 0 means no nulls. 1 means first kBlockSize +// bytes are nulls, 2 means second kBlockSize bytes are null flags etc. +constexpr uint16_t kSharedNullMask = 3; + +/// Start of the parameter array in the TB shared memory. 13 bits. Shift 1 left +/// to get offset. +constexpr uint16_t kSharedOperandMask = 0x7ffc; + /// Describes an operand for a Wave kernel instruction. The same /// insttruction is interpreted by multiple thread blocks in the /// kernel invocation. When accessing an operand, we have the base @@ -108,15 +116,35 @@ struct Operand { // Array of flat base values. Cast to pod type or StringView. void* base; + // Array of null indicators. No nulls if nullptr. A 1 means not-null, for + // consistency with Velox. + uint8_t* nulls; + // If non-nullptr, provides index into 'base. 
Subscripted with the // blockIdx - idx of first bllock wit this instruction // stream. Different thread blocks may or may not have indices for // a given operand. int32_t** indices; +}; - // Array of null indicators. No nulls if nullptr. A 1 means not-null, for - // consistency with Velox. - uint8_t* nulls; +/// Per-lane error code. +enum class ErrorCode : uint8_t { + // All operations completed. + kOk = 0, + + // Catchall for runtime errors. + kError, + + kInsufficientMemory, }; +/// Contains a count of active lanes and a per lane error code. +struct BlockStatus { + int32_t numRows{0}; + ErrorCode errors[kBlockSize]; +}; + +/// Returns the number of active rows in 'status' for 'numBlocks'. +int32_t statusNumRows(const BlockStatus* status, int32_t numBlocks); + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/vector/WaveVector.cpp b/velox/experimental/wave/vector/WaveVector.cpp index 544cfa536cd34..94a5cb4fffa73 100644 --- a/velox/experimental/wave/vector/WaveVector.cpp +++ b/velox/experimental/wave/vector/WaveVector.cpp @@ -24,7 +24,8 @@ namespace facebook::velox::wave { WaveVector::WaveVector( const TypePtr& type, GpuArena& arena, - std::vector> children) + std::vector> children, + bool notNull) : type_(type), kind_(type_->kind()), arena_(&arena), @@ -47,31 +48,33 @@ WaveVector::WaveVector( } } -void WaveVector::resize(vector_size_t size, bool nullable) { - if (size > size_) { - int64_t bytes; - if (type_->kind() == TypeKind::VARCHAR) { - bytes = sizeof(StringView) * size; - } else { - bytes = type_->cppSizeInBytes() * size; - } - if (!values_ || bytes > values_->capacity()) { - values_ = arena_->allocateBytes(bytes); - } - if (nullable) { - if (!nulls_ || nulls_->capacity() < size) { - nulls_ = arena_->allocateBytes(size); - } +void WaveVector::resize( + vector_size_t size, + bool nullable, + WaveBufferPtr* backing, + int64_t* backingOffset) { + auto capacity = values_ ? values_->capacity() : 0; + size_ = size; + int32_t bytesNeeded = backingSize(type_, size, nullable); + if (bytesNeeded > capacity) { + if (backing) { + values_ = WaveBufferView::create( + (*backing)->as() + *backingOffset, bytesNeeded, *backing); + *backingOffset += bytesNeeded; } else { - nulls_.reset(); + values_ = arena_->allocateBytes(bytesNeeded); } - size_ = size; + } + if (nullable) { + nulls_ = values_->as() + bytesNeeded - size; + } else { + nulls_ = nullptr; } } void WaveVector::toOperand(Operand* operand) const { operand->size = size_; - operand->nulls = nulls_ ? 
nulls_->as() : nullptr; + operand->nulls = nulls_; if (encoding_ == VectorEncoding::Simple::CONSTANT) { operand->indexMask = 0; operand->base = values_->as(); @@ -97,25 +100,32 @@ void toBits(uint64_t* words, int32_t numBytes) { } } +namespace { +class NoReleaser { + public: + void addRef() const {}; + void release() const {}; +}; + template static VectorPtr toVeloxTyped( vector_size_t size, velox::memory::MemoryPool* pool, const TypePtr& type, const WaveBufferPtr& values, - const WaveBufferPtr& nulls) { + const uint8_t* nulls) { using T = typename TypeTraits::NativeType; BufferPtr nullsView; if (nulls) { - nullsView = WaveBufferView::create(nulls); + nullsView = BufferView::create(nulls, size, NoReleaser()); toBits( const_cast(nullsView->as()), nullsView->capacity()); } BufferPtr valuesView; if (values) { - valuesView = WaveBufferView::create(values); + valuesView = VeloxWaveBufferView::create(values); } return std::make_shared>( @@ -127,9 +137,95 @@ static VectorPtr toVeloxTyped( std::vector()); } -VectorPtr WaveVector::toVelox(memory::MemoryPool* pool) { - return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH_ALL( +bool isDenselyFilled(const BlockStatus* status, int32_t numBlocks) { + for (int32_t i = 0; i < numBlocks - 1; ++i) { + if (status[i].numRows != kBlockSize) { + return false; + } + } + return true; +} +} // namespace + +int32_t statusNumRows(const BlockStatus* status, int32_t numBlocks) { + int32_t numRows = 0; + for (auto i = 0; i < numBlocks; ++i) { + numRows += status[i].numRows; + } + return numRows; +} + +// static +int32_t WaveVector::alignment(const TypePtr& type) { + switch (type->kind()) { + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + return sizeof(void*); + default: + return type->cppSizeInBytes(); + } +} + +// static +int64_t +WaveVector::backingSize(const TypePtr& type, int32_t size, bool nullable) { + int64_t bytes; + if (type->kind() == TypeKind::VARCHAR) { + bytes = sizeof(StringView) * size; + } else { + bytes = type->cppSizeInBytes() * size; + } + return bits::roundUp(bytes, sizeof(void*)) + (nullable ? size : 0); +} + +VectorPtr WaveVector::toVelox( + memory::MemoryPool* pool, + int32_t numBlocks, + const BlockStatus* status, + const Operand* operand) { + auto base = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH_ALL( toVeloxTyped, type_->kind(), size_, pool, type_, values_, nulls_); + if (!status || !operand) { + return base; + } + + // Translate the BlockStatus and indices in Operand to a host side dictionary + // wrap. + int maxRow = std::min(size_, numBlocks * kBlockSize); + numBlocks = bits::roundUp(maxRow, kBlockSize) / kBlockSize; + int numActive = statusNumRows(status, numBlocks); + auto operandIndices = operand->indices; + if (!operandIndices) { + // Vector sizes are >= active in status because they are allocated before + // the row count in status becomes known. + VELOX_CHECK_LE( + numActive, + size_, + "If there is no indirection in Operand, vector size must be <= BlockStatus"); + // If all blocks except last are filled we return base without wrap. + if (isDenselyFilled(status, numBlocks)) { + return base; + } + } + auto indices = AlignedBuffer::allocate(numActive, pool); + auto rawIndices = indices->asMutable(); + int32_t fill = 0; + for (auto block = 0; block < numBlocks; ++block) { + auto blockIndices = operandIndices ? 
operandIndices[block] : nullptr; + if (!blockIndices) { + for (auto i = 0; i < status[block].numRows; ++i) { + rawIndices[fill++] = block * kBlockSize + i; + } + } else { + memcpy( + rawIndices + fill, + blockIndices, + status[block].numRows * sizeof(int32_t)); + fill += status[block].numRows; + } + } + return BaseVector::wrapInDictionary( + BufferPtr(nullptr), indices, numActive, base); } } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/vector/WaveVector.h b/velox/experimental/wave/vector/WaveVector.h index d128bbd3c46af..aed7cfc32f45f 100644 --- a/velox/experimental/wave/vector/WaveVector.h +++ b/velox/experimental/wave/vector/WaveVector.h @@ -53,13 +53,14 @@ class WaveVector { } // Constructs a vector. Resize can be used to create buffers for a given size. - WaveVector(const TypePtr& type, GpuArena& arena) - : type_(type), kind_(type_->kind()), arena_(&arena) {} + WaveVector(const TypePtr& type, GpuArena& arena, bool notNull = false) + : type_(type), kind_(type_->kind()), arena_(&arena), notNull_(notNull) {} WaveVector( const TypePtr& type, GpuArena& arena, - std::vector> children); + std::vector> children, + bool notNull = false); const TypePtr& type() const { return type_; @@ -69,15 +70,28 @@ class WaveVector { return size_; } - void resize(vector_size_t sie, bool nullable = true); + /// Sets the size to 'size'. Allocates the backing memory from + /// 'arena_'. If 'backing' is non-nullptr, uses '*backing' for + /// backing store, starting at offset *backingOffset'. Returns the + /// offset of the first unused byte in '*backingOffset'. Leaves + /// contents uninitialized in all cases. + void resize( + vector_size_t size, + bool nullable = true, + WaveBufferPtr* backing = nullptr, + int64_t* backingOffset = nullptr); + + /// Returns the needed alignment for backing memory. + static int32_t alignment(const TypePtr& type); + + /// Returns the size in bytes for 'size' elements of 'type', including nulls + /// if 'nullable' is true. Does not include string buffers. + static int64_t backingSize(const TypePtr& type, int32_t size, bool nullable); bool mayHaveNulls() const { return nulls_ != nullptr; } - // Makes sure there is space for nulls. Initial value is undefined. - void ensureNulls(); - // Frees all allocated buffers. resize() can be used to populate the buffers // with a selected size. void clear(); @@ -95,15 +109,19 @@ class WaveVector { } uint8_t* nulls() { - if (nulls_) { - return nulls_->as(); - } - return nullptr; + return nulls_; } /// Returns a Velox vector giving a view on device side data. The device - /// buffers stay live while referenced by Velox. - VectorPtr toVelox(memory::MemoryPool* pool); + /// buffers stay live while referenced by Velox. If there is a selection, + /// numBlocks is the number of kBlockSize blocks the vector was allocated for, + /// BlockStatus gives the row counts per block and Operand gives the + /// dictionary indices representing the selection. + VectorPtr toVelox( + memory::MemoryPool* pool, + int32_t numBlocks = -1, + const BlockStatus* status = nullptr, + const Operand* operand = nullptr); /// Sets 'operand' to point to the buffers of 'this'. void toOperand(Operand* operand) const; @@ -126,31 +144,24 @@ class WaveVector { vector_size_t size_{0}; - // Values array, cast to pod type or StringView + // Values array, cast to pod type or StringView. If there are nulls, the null + // flags are in this buffer after the values, starting at 'null_' WaveBufferPtr values_; - // Nulls buffer, nullptr if no nulls. 
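How the pieces above compose: BlockStatus carries per-block row counts out of a kernel, the new resize() overload places a vector inside a shared backing allocation, and the extended toVelox() turns a selection into a host-side dictionary wrap. A sketch for one nullable BIGINT column, assuming 'arena' (a GpuArena), 'pool' (a memory::MemoryPool*), 'operand' (a const Operand*) and 'numRows' are supplied by the caller.

// Place the vector into a backing buffer sized by backingSize().
auto numBlocks = bits::roundUp(numRows, kBlockSize) / kBlockSize;
int64_t bytes = WaveVector::backingSize(BIGINT(), numRows, /*nullable=*/true);
WaveBufferPtr backing = arena.allocateBytes(bytes);
int64_t offset = 0;
WaveVector vector(BIGINT(), arena);
vector.resize(numRows, /*nullable=*/true, &backing, &offset);
// 'offset' is now the first unused byte; more vectors could be packed here so
// they move between device and host as one unit. Per the layout above, nulls()
// points at the tail of the values buffer.

std::vector<BlockStatus> status(numBlocks);  // Filled in by the kernel.

// Densely filled blocks come back as the base vector without a wrap; otherwise
// toVelox() builds dictionary indices from BlockStatus and Operand::indices.
VectorPtr host = vector.toVelox(pool, numBlocks, status.data(), operand);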
- WaveBufferPtr nulls_; + // Nulls, points to the tail of 'values'. nullptr if no nulls. + uint8_t* nulls_{nullptr}; // If dictionary or if wrapped in a selection, vector of indices into // 'values'. WaveBufferPtr indices_; - // Thread block level sizes. For each kBlockSize values, contains - // one int16_t that indicates how many of 'values' or 'indices' have - // a value. - WaveBufferPtr blockSizes_; - // Thread block level pointers inside 'indices_'. the ith entry is nullptr - // if the ith thread block has no row number mapping (all rows pass or none - // pass). - WaveBufferPtr blockIndices_; - // Lengths and offsets for array/map elements. WaveBufferPtr lengths_; WaveBufferPtr offsets_; // Members of a array/map/struct vector. std::vector> children_; + bool notNull_{false}; }; using WaveVectorPtr = std::unique_ptr; @@ -170,11 +181,17 @@ struct WaveReleaser { }; // A BufferView for velox::BaseVector for a view on unified memory. -class WaveBufferView : public BufferView { +class VeloxWaveBufferView : public BufferView { public: - static BufferPtr create(WaveBufferPtr buffer) { + /// Takes an additional reference to buffer. 'offset' and 'size' + /// allow sharing one allocation for many views. This is done when many + /// vectors have to be moved as a unit between device and host. + static BufferPtr + create(WaveBufferPtr buffer, int64_t offset = 0, int32_t size = -1) { return BufferView::create( - buffer->as(), buffer->capacity(), WaveReleaser(buffer)); + buffer->as() + offset, + size == -1 ? buffer->capacity() - offset : size, + WaveReleaser(buffer)); } }; diff --git a/velox/expression/CMakeLists.txt b/velox/expression/CMakeLists.txt index 3166487983811..c06c5be23fcb5 100644 --- a/velox/expression/CMakeLists.txt +++ b/velox/expression/CMakeLists.txt @@ -58,6 +58,5 @@ add_subdirectory(signature_parser) if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) -elseif(${VELOX_BUILD_TEST_UTILS}) - add_subdirectory(tests/utils) + add_subdirectory(fuzzer) endif() diff --git a/velox/expression/CastExpr.cpp b/velox/expression/CastExpr.cpp index 6bb3eec0c9b8c..0e6e6fb40d4ad 100644 --- a/velox/expression/CastExpr.cpp +++ b/velox/expression/CastExpr.cpp @@ -918,6 +918,8 @@ void CastExpr::evalSpecialForm( inTopLevel = true; if (nullOnFailure()) { ScopedVarSetter holder{context.mutableThrowOnError(), false}; + ScopedVarSetter captureErrorDetails( + context.mutableCaptureErrorDetails(), false); apply(rows, input, context, fromType, toType, result); } else { apply(rows, input, context, fromType, toType, result); diff --git a/velox/expression/EvalCtx.cpp b/velox/expression/EvalCtx.cpp index 13c5aa520483d..f22f2fe376fa8 100644 --- a/velox/expression/EvalCtx.cpp +++ b/velox/expression/EvalCtx.cpp @@ -170,8 +170,35 @@ namespace { auto throwError(const std::exception_ptr& exceptionPtr) { std::rethrow_exception(toVeloxException(exceptionPtr)); } + +std::exception_ptr toVeloxUserError(const std::string& message) { + return std::make_exception_ptr(VeloxUserError( + __FILE__, + __LINE__, + __FUNCTION__, + "", + message, + error_source::kErrorSourceUser, + error_code::kInvalidArgument, + false /*retriable*/)); +} + } // namespace +void EvalCtx::setStatus(vector_size_t index, Status status) { + VELOX_CHECK(!status.ok(), "Status must be an error"); + + static std::exception_ptr kUserError = toVeloxUserError(""); + + if (status.isUserError()) { + setVeloxExceptionError( + index, + captureErrorDetails_ ? 
toVeloxUserError(status.message()) : kUserError); + } else { + VELOX_FAIL(status.message()); + } +} + void EvalCtx::setError( vector_size_t index, const std::exception_ptr& exceptionPtr) { diff --git a/velox/expression/EvalCtx.h b/velox/expression/EvalCtx.h index 1f4d0de95cb46..8285553de40a8 100644 --- a/velox/expression/EvalCtx.h +++ b/velox/expression/EvalCtx.h @@ -78,6 +78,9 @@ class EvalCtx { void restore(ContextSaver& saver); + // @param status Must indicate an error. Cannot be "ok". + void setStatus(vector_size_t index, Status status); + // If exceptionPtr is known to be a VeloxException use setVeloxExceptionError // instead. void setError(vector_size_t index, const std::exception_ptr& exceptionPtr); @@ -189,6 +192,8 @@ class EvalCtx { errors_.reset(); } + /// Boolean indicating whether exceptions that occur during expression + /// evaluation should be thrown directly or saved for later processing. bool throwOnError() const { return throwOnError_; } @@ -197,6 +202,19 @@ class EvalCtx { return &throwOnError_; } + /// Boolean indicating whether to capture details when storing exceptions for + /// later processing (throwOnError_ == false). + /// + /// Conjunct expressions (AND, OR) require capturing error details, while TRY + /// and TRY_CAST expressions do not. + bool captureErrorDetails() const { + return captureErrorDetails_; + } + + bool* mutableCaptureErrorDetails() { + return &captureErrorDetails_; + } + bool nullsPruned() const { return nullsPruned_; } @@ -352,6 +370,8 @@ class EvalCtx { bool nullsPruned_{false}; bool throwOnError_{true}; + bool captureErrorDetails_{true}; + // True if the current set of rows will not grow, e.g. not under and IF or OR. bool isFinalSelection_{true}; diff --git a/velox/expression/Expr.cpp b/velox/expression/Expr.cpp index ed10a44bd9da6..05921fa432d36 100644 --- a/velox/expression/Expr.cpp +++ b/velox/expression/Expr.cpp @@ -696,14 +696,15 @@ std::string onTopLevelException(VeloxException::Type exceptionType, void* arg) { basePath = FLAGS_velox_save_input_on_expression_system_failure_path.c_str(); } if (strlen(basePath) == 0) { - return context->expr()->toString(); + return fmt::format("Top-level Expression: {}", context->expr()->toString()); } // Save input vector to a file. context->persistDataAndSql(basePath); return fmt::format( - "{}. Input data: {}. SQL expression: {}. All SQL expressions: {}.", + "Top-level Expression: {}. Input data: {}. SQL expression: {}." + " All SQL expressions: {}. ", context->expr()->toString(), context->dataPath(), context->sqlPath(), @@ -745,8 +746,9 @@ void Expr::evalFlatNoNullsImpl( const ExprSet* parentExprSet) { ExprExceptionContext exprExceptionContext{this, context.row(), parentExprSet}; ExceptionContextSetter exceptionContext( - {parentExprSet ? onTopLevelException : onException, - parentExprSet ? (void*)&exprExceptionContext : this}); + {.messageFunc = parentExprSet ? onTopLevelException : onException, + .arg = parentExprSet ? (void*)&exprExceptionContext : this, + .isEssential = parentExprSet != nullptr}); if (!rows.hasSelections()) { checkOrSetEmptyResult(type(), context.pool(), result); @@ -798,8 +800,9 @@ void Expr::eval( // exception. ExprExceptionContext exprExceptionContext{this, context.row(), parentExprSet}; ExceptionContextSetter exceptionContext( - {parentExprSet ? onTopLevelException : onException, - parentExprSet ? (void*)&exprExceptionContext : this}); + {.messageFunc = parentExprSet ? onTopLevelException : onException, + .arg = parentExprSet ?
(void*)&exprExceptionContext : this, + .isEssential = parentExprSet != nullptr}); if (!rows.hasSelections()) { checkOrSetEmptyResult(type(), context.pool(), result); diff --git a/velox/expression/Expr.h b/velox/expression/Expr.h index e999a36caaa84..14a947c36206a 100644 --- a/velox/expression/Expr.h +++ b/velox/expression/Expr.h @@ -819,14 +819,10 @@ class ExprSetListener { const ExprSetCompletionEvent& event) = 0; /// Called when a batch of rows encounters errors processing one or more - /// rows in a try expression to provide information about these errors. This - /// function must neither change rows nor errors. - /// @param rows Rows where errors exist. - /// @param errors Error vector produced inside the try expression. - virtual void onError( - const SelectivityVector& rows, - const ErrorVector& errors, - const std::string& queryId) = 0; + /// rows in a try expression to provide information about these errors. + /// @param numRows Number of rows with errors. + /// @param queryId Query ID. + virtual void onError(vector_size_t numRows, const std::string& queryId) = 0; }; /// Return the ExprSetListeners having been registered. diff --git a/velox/expression/SimpleFunctionAdapter.h b/velox/expression/SimpleFunctionAdapter.h index b6b0aae5f59ca..8c0bb54d5b12c 100644 --- a/velox/expression/SimpleFunctionAdapter.h +++ b/velox/expression/SimpleFunctionAdapter.h @@ -16,11 +16,13 @@ #pragma once +#include #include #include #include #include "velox/common/base/Portability.h" +#include "velox/common/base/Status.h" #include "velox/expression/ComplexWriterTypes.h" #include "velox/expression/DecodedArgs.h" #include "velox/expression/Expr.h" @@ -195,6 +197,10 @@ class SimpleFunctionAdapter : public VectorFunction { context.template applyToSelectedNoThrow(*rows, func); } + void setError(vector_size_t row, Status status) { + context.setStatus(row, status); + } + const SelectivityVector* rows; result_vector_t* result; VectorWriter resultWriter; @@ -618,7 +624,11 @@ class SimpleFunctionAdapter : public VectorFunction { // Result is NULL because the input contains NULL. notNull = false; } else { - notNull = doApplyNullFree<0>(row, out, readers...); + auto status = doApplyNullFree<0>(row, out, notNull, readers...); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + return; + } } writeResult(row, notNull, out); @@ -626,7 +636,12 @@ class SimpleFunctionAdapter : public VectorFunction { } else { applyContext.applyToSelectedNoThrow([&](auto row) INLINE_LAMBDA { typename return_type_traits::NativeType out{}; - bool notNull = doApplyNullFree<0>(row, out, readers...); + bool notNull; + auto status = doApplyNullFree<0>(row, out, notNull, readers...); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + return; + } writeResult(row, notNull, out); }); @@ -636,7 +651,13 @@ class SimpleFunctionAdapter : public VectorFunction { if (applyContext.allAscii) { applyContext.applyToSelectedNoThrow([&](auto row) INLINE_LAMBDA { typename return_type_traits::NativeType out{}; - bool notNull = doApplyAsciiNotNull<0>(row, out, readers...); + bool notNull; + auto status = + doApplyAsciiNotNull<0>(row, out, notNull, readers...); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + return; + } writeResult(row, notNull, out); }); return; @@ -648,13 +669,23 @@ class SimpleFunctionAdapter : public VectorFunction { // optimization (eliminating the temp) is easier to do by the // compiler (assuming the function call is inlined). 
typename return_type_traits::NativeType out{}; - bool notNull = doApplyNotNull<0>(row, out, readers...); + bool notNull; + auto status = doApplyNotNull<0>(row, out, notNull, readers...); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + return; + } writeResult(row, notNull, out); }); } else { applyContext.applyToSelectedNoThrow([&](auto row) INLINE_LAMBDA { typename return_type_traits::NativeType out{}; - bool notNull = doApply<0>(row, out, readers...); + bool notNull; + auto status = doApply<0>(row, out, notNull, readers...); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + return; + } writeResult(row, notNull, out); }); } @@ -664,36 +695,47 @@ class SimpleFunctionAdapter : public VectorFunction { // once per batch instead of once per row shows a significant // performance improvement when there are no nulls. if (applyContext.mayHaveNullsRecursive) { - applyUdf(applyContext, [&](auto& out, auto row) INLINE_LAMBDA { - auto containsNull = (readers.containsNull(row) || ...); - if (containsNull) { - // Result is NULL because the input contains NULL. - return false; - } - - return doApplyNullFree<0>(row, out, readers...); - }); + applyUdf( + applyContext, + [&](auto& out, auto& notNull, auto row) INLINE_LAMBDA { + auto containsNull = (readers.containsNull(row) || ...); + if (containsNull) { + // Result is NULL because the input contains NULL. + notNull = false; + return Status::OK(); + } + + return doApplyNullFree<0>(row, out, notNull, readers...); + }); } else { - applyUdf(applyContext, [&](auto& out, auto row) INLINE_LAMBDA { - return doApplyNullFree<0>(row, out, readers...); - }); + applyUdf( + applyContext, + [&](auto& out, auto& notNull, auto row) INLINE_LAMBDA { + return doApplyNullFree<0>(row, out, notNull, readers...); + }); } } else if (allNotNull) { if constexpr (FUNC::has_ascii) { if (applyContext.allAscii) { - applyUdf(applyContext, [&](auto& out, auto row) INLINE_LAMBDA { - return doApplyAsciiNotNull<0>(row, out, readers...); - }); + applyUdf( + applyContext, + [&](auto& out, auto& notNull, auto row) INLINE_LAMBDA { + return doApplyAsciiNotNull<0>(row, out, notNull, readers...); + }); return; } } - applyUdf(applyContext, [&](auto& out, auto row) INLINE_LAMBDA { - return doApplyNotNull<0>(row, out, readers...); - }); + applyUdf( + applyContext, + [&](auto& out, auto& notNull, auto row) INLINE_LAMBDA { + return doApplyNotNull<0>(row, out, notNull, readers...); + }); } else { - applyUdf(applyContext, [&](auto& out, auto row) INLINE_LAMBDA { - return doApply<0>(row, out, readers...); - }); + applyUdf( + applyContext, + [&](auto& out, auto& notNull, auto row) INLINE_LAMBDA { + return doApply<0>(row, out, notNull, readers...); + }); } } } @@ -709,16 +751,26 @@ class SimpleFunctionAdapter : public VectorFunction { applyContext.resultWriter.setOffset(row); // Force local copy of proxy. 
auto localWriter = currentWriter; - auto notNull = func(localWriter, row); - currentWriter = localWriter; - applyContext.resultWriter.commit(notNull); + bool notNull; + auto status = func(localWriter, notNull, row); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + } else { + currentWriter = localWriter; + applyContext.resultWriter.commit(notNull); + } }); applyContext.resultWriter.finish(); } else { applyContext.applyToSelectedNoThrow([&](auto row) INLINE_LAMBDA { applyContext.resultWriter.setOffset(row); - applyContext.resultWriter.commit( - func(applyContext.resultWriter.current(), row)); + bool notNull; + auto status = func(applyContext.resultWriter.current(), notNull, row); + if UNLIKELY (!status.ok()) { + applyContext.setError(row, status); + } else { + applyContext.resultWriter.commit(notNull); + } }); } } @@ -734,10 +786,11 @@ class SimpleFunctionAdapter : public VectorFunction { typename... Values, std::enable_if_t< POSITION = - 0> FOLLY_ALWAYS_INLINE bool + 0> FOLLY_ALWAYS_INLINE Status doApply( size_t idx, T& target, + bool& notNull, R0& currentReader, const Values&... extra) const { if (LIKELY(currentReader.isSet(idx))) { @@ -745,9 +798,10 @@ class SimpleFunctionAdapter : public VectorFunction { decltype(currentReader[idx]) v0 = currentReader[idx]; // recurse through the readers to build the arg list at compile time. - return doApply(idx, target, extra..., v0); + return doApply(idx, target, notNull, extra..., v0); } else { - return false; + notNull = false; + return Status::OK(); } } @@ -758,10 +812,11 @@ class SimpleFunctionAdapter : public VectorFunction { typename... Values, std::enable_if_t< POSITION = - 0> FOLLY_ALWAYS_INLINE bool + 0> FOLLY_ALWAYS_INLINE Status doApply( size_t idx, T& target, + bool& notNull, R0& currentReader, const Values&... extra) const { // Recurse through all the arguments to build the arg list at compile @@ -770,16 +825,17 @@ class SimpleFunctionAdapter : public VectorFunction { return doApply( idx, target, + notNull, extra..., (currentReader.isSet(idx) ? ¤tReader[idx] : nullptr)); } else { using temp_type = std::remove_reference_t; if (currentReader.isSet(idx)) { temp_type temp = currentReader[idx]; - return doApply(idx, target, extra..., &temp); + return doApply(idx, target, notNull, extra..., &temp); } else { return doApply( - idx, target, extra..., (const temp_type*)nullptr); + idx, target, notNull, extra..., (const temp_type*)nullptr); } } } @@ -791,9 +847,12 @@ class SimpleFunctionAdapter : public VectorFunction { std::enable_if_t< POSITION == FUNC::num_args && FUNC::is_default_null_behavior, int32_t> = 0> - FOLLY_ALWAYS_INLINE bool - doApply(size_t /*idx*/, T& target, const Values&... values) const { - return (*fn_).call(target, values...); + FOLLY_ALWAYS_INLINE Status doApply( + size_t /*idx*/, + T& target, + bool& notNull, + const Values&... values) const { + return (*fn_).call(target, notNull, values...); } // For NOT default null behavior, terminate with UDFHolder::callNullable. @@ -803,9 +862,12 @@ class SimpleFunctionAdapter : public VectorFunction { std::enable_if_t< POSITION == FUNC::num_args && !FUNC::is_default_null_behavior, int32_t> = 0> - FOLLY_ALWAYS_INLINE bool - doApply(size_t /*idx*/, T& target, const Values*... values) const { - return (*fn_).callNullable(target, values...); + FOLLY_ALWAYS_INLINE Status doApply( + size_t /*idx*/, + T& target, + bool& notNull, + const Values*... 
values) const { + return (*fn_).callNullable(target, notNull, values...); } // == NOT-NULL VARIANT == @@ -822,13 +884,14 @@ class SimpleFunctionAdapter : public VectorFunction { typename R0, typename... TStuff, std::enable_if_t = 0> - FOLLY_ALWAYS_INLINE bool doApplyNotNull( + FOLLY_ALWAYS_INLINE Status doApplyNotNull( size_t idx, T& target, + bool& notNull, R0& currentReader, const TStuff&... extra) const { decltype(currentReader[idx]) v0 = currentReader[idx]; - return doApplyNotNull(idx, target, extra..., v0); + return doApplyNotNull(idx, target, notNull, extra..., v0); } // For default null behavior, Terminate by with UDFHolder::call. @@ -836,9 +899,12 @@ class SimpleFunctionAdapter : public VectorFunction { size_t POSITION, typename... Values, std::enable_if_t = 0> - FOLLY_ALWAYS_INLINE bool - doApplyNotNull(size_t /*idx*/, T& target, const Values&... values) const { - return (*fn_).call(target, values...); + FOLLY_ALWAYS_INLINE Status doApplyNotNull( + size_t /*idx*/, + T& target, + bool& notNull, + const Values&... values) const { + return (*fn_).call(target, notNull, values...); } template < @@ -846,24 +912,27 @@ class SimpleFunctionAdapter : public VectorFunction { typename R0, typename... TStuff, std::enable_if_t = 0> - FOLLY_ALWAYS_INLINE bool doApplyAsciiNotNull( + FOLLY_ALWAYS_INLINE Status doApplyAsciiNotNull( size_t idx, T& target, + bool& notNull, R0& currentReader, const TStuff&... extra) const { decltype(currentReader[idx]) v0 = currentReader[idx]; - return doApplyAsciiNotNull(idx, target, extra..., v0); + return doApplyAsciiNotNull( + idx, target, notNull, extra..., v0); } template < size_t POSITION, typename... Values, std::enable_if_t = 0> - FOLLY_ALWAYS_INLINE bool doApplyAsciiNotNull( + FOLLY_ALWAYS_INLINE Status doApplyAsciiNotNull( size_t /*idx*/, T& target, + bool& notNull, const Values&... values) const { - return (*fn_).callAscii(target, values...); + return (*fn_).callAscii(target, notNull, values...); } template < @@ -871,22 +940,26 @@ class SimpleFunctionAdapter : public VectorFunction { typename R0, typename... TStuff, std::enable_if_t = 0> - FOLLY_ALWAYS_INLINE bool doApplyNullFree( + FOLLY_ALWAYS_INLINE Status doApplyNullFree( size_t idx, T& target, + bool& notNull, R0& currentReader, const TStuff&... extra) const { auto v0 = currentReader.readNullFree(idx); - return doApplyNullFree(idx, target, extra..., v0); + return doApplyNullFree(idx, target, notNull, extra..., v0); } template < size_t POSITION, typename... Values, std::enable_if_t = 0> - FOLLY_ALWAYS_INLINE bool - doApplyNullFree(size_t /*idx*/, T& target, const Values&... values) const { - return (*fn_).callNullFree(target, values...); + FOLLY_ALWAYS_INLINE Status doApplyNullFree( + size_t /*idx*/, + T& target, + bool& notNull, + const Values&... values) const { + return (*fn_).callNullFree(target, notNull, values...); } }; diff --git a/velox/expression/TryExpr.cpp b/velox/expression/TryExpr.cpp index fb840eedf9731..8850f3ac4e542 100644 --- a/velox/expression/TryExpr.cpp +++ b/velox/expression/TryExpr.cpp @@ -23,6 +23,9 @@ void TryExpr::evalSpecialForm( EvalCtx& context, VectorPtr& result) { ScopedVarSetter throwOnError(context.mutableThrowOnError(), false); + ScopedVarSetter captureErrorDetails( + context.mutableCaptureErrorDetails(), false); + // It's possible with nested TRY expressions that some rows already threw // exceptions in earlier expressions that haven't been handled yet. 
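The adapter rewiring above replaces the bool returned by every doApply* variant with a Status plus a notNull out-parameter; a non-OK Status reaches ApplyContext::setError(), which lands in EvalCtx::setStatus(), where user errors become per-row errors (detail-free under TRY, since captureErrorDetails() is then false) and anything else fails the batch via VELOX_FAIL. For orientation, a sketch of a simple function written against a Status-returning call(); the function is hypothetical and assumes UDFHolder adapts it to the (target, notNull, args...) shape used by the adapter, with Status factories from the newly included velox/common/base/Status.h.

template <typename TExec>
struct SafeDivideFunction {
  VELOX_DEFINE_FUNCTION_TYPES(TExec);

  // A non-OK Status records a per-row error instead of throwing.
  Status call(
      out_type<double>& result,
      const arg_type<double>& a,
      const arg_type<double>& b) {
    if (b == 0) {
      return Status::UserError("division by zero");
    }
    result = a / b;
    return Status::OK();
  }
};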
To avoid // incorrectly handling them here, store those errors and temporarily reset @@ -42,6 +45,9 @@ void TryExpr::evalSpecialFormSimplified( EvalCtx& context, VectorPtr& result) { ScopedVarSetter throwOnError(context.mutableThrowOnError(), false); + ScopedVarSetter captureErrorDetails( + context.mutableCaptureErrorDetails(), false); + // It's possible with nested TRY expressions that some rows already threw // exceptions in earlier expressions that haven't been handled yet. To avoid // incorrectly handling them here, store those errors and temporarily reset @@ -66,24 +72,21 @@ void applyListenersOnError( auto errors = context.errors(); VELOX_CHECK_NOT_NULL(errors); - exec::LocalSelectivityVector errorRows(context.execCtx(), errors->size()); - errorRows->clearAll(); + vector_size_t numErrors = 0; rows.applyToSelected([&](auto row) { if (row < errors->size() && !errors->isNullAt(row)) { - errorRows->setValid(row, true); + ++numErrors; } }); - errorRows->updateBounds(); - if (!errorRows->hasSelections()) { + if (numErrors == 0) { return; } exprSetListeners().withRLock([&](auto& listeners) { if (!listeners.empty()) { for (auto& listener : listeners) { - listener->onError( - *errorRows, *errors, context.execCtx()->queryCtx()->queryId()); + listener->onError(numErrors, context.execCtx()->queryCtx()->queryId()); } } }); diff --git a/velox/expression/fuzzer/ArgGenerator.h b/velox/expression/fuzzer/ArgGenerator.h new file mode 100644 index 0000000000000..4c015f7a38c79 --- /dev/null +++ b/velox/expression/fuzzer/ArgGenerator.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/expression/FunctionSignature.h" +#include "velox/vector/fuzzer/Utils.h" + +namespace facebook::velox::fuzzer { + +/// Generates random, but valid input types for a specified function signature +/// with the return type. +class ArgGenerator { + public: + virtual ~ArgGenerator() = default; + + /// Given a signature and a concrete return type returns randomly selected + /// valid input types. Returns empty vector if no input types can produce the + /// specified result type. + virtual std::vector generateArgs( + const exec::FunctionSignature& signature, + const TypePtr& returnType, + FuzzerGenerator& rng) = 0; +}; + +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/utils/ArgumentTypeFuzzer.cpp b/velox/expression/fuzzer/ArgumentTypeFuzzer.cpp similarity index 98% rename from velox/expression/tests/utils/ArgumentTypeFuzzer.cpp rename to velox/expression/fuzzer/ArgumentTypeFuzzer.cpp index 1b32b5c946746..1d7b784f29905 100644 --- a/velox/expression/tests/utils/ArgumentTypeFuzzer.cpp +++ b/velox/expression/fuzzer/ArgumentTypeFuzzer.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
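The ExprSetListener::onError() contract (changed in Expr.h above) now matches what applyListenersOnError() computes here: a bare error count plus the query id. A minimal conforming listener as a sketch; the class is hypothetical and the onCompletion() signature is assumed from ExprSetListener, which the hunk shows only in part.

class ErrorCountingListener : public exec::ExprSetListener {
 public:
  void onCompletion(
      const std::string& /*uuid*/,
      const ExprSetCompletionEvent& /*event*/) override {}

  // Row-level error details are no longer passed; only the count survives.
  void onError(vector_size_t numRows, const std::string& queryId) override {
    LOG(INFO) << "query " << queryId << ": " << numRows
              << " row(s) failed inside TRY";
  }
};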
*/ -#include "velox/expression/tests/utils/ArgumentTypeFuzzer.h" +#include "velox/expression/fuzzer/ArgumentTypeFuzzer.h" #include #include @@ -24,7 +24,7 @@ #include "velox/type/Type.h" #include "velox/vector/fuzzer/VectorFuzzer.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { std::string typeToBaseName(const TypePtr& type) { if (type->isDecimal()) { @@ -226,4 +226,4 @@ int32_t ArgumentTypeFuzzer::rand32(int32_t min, int32_t max) { return boost::random::uniform_int_distribution(min, max)(rng_); } -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/utils/ArgumentTypeFuzzer.h b/velox/expression/fuzzer/ArgumentTypeFuzzer.h similarity index 98% rename from velox/expression/tests/utils/ArgumentTypeFuzzer.h rename to velox/expression/fuzzer/ArgumentTypeFuzzer.h index f21ced70c5212..9a01ef9c5e4a2 100644 --- a/velox/expression/tests/utils/ArgumentTypeFuzzer.h +++ b/velox/expression/fuzzer/ArgumentTypeFuzzer.h @@ -22,7 +22,7 @@ #include "velox/expression/SignatureBinder.h" #include "velox/type/Type.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { /// For function signatures using type variables, generates a list of /// arguments types. Optionally, allows to specify a desired return type. If @@ -104,4 +104,4 @@ std::string typeToBaseName(const TypePtr& type); /// Return the TypeKind that corresponds to typeName. std::optional baseNameToTypeKind(const std::string& typeName); -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/fuzzer/CMakeLists.txt b/velox/expression/fuzzer/CMakeLists.txt new file mode 100644 index 0000000000000..adb2949e7b40c --- /dev/null +++ b/velox/expression/fuzzer/CMakeLists.txt @@ -0,0 +1,47 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
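The ArgGenerator interface introduced above inverts the usual direction of signature binding: given a concrete return type, propose valid input types. A deliberately trivial implementation to show the contract; the class and its rule are made up, and the empty-vector return is the documented "no input types can produce this result type" signal.

class BigintPairArgGenerator : public ArgGenerator {
 public:
  std::vector<TypePtr> generateArgs(
      const exec::FunctionSignature& /*signature*/,
      const TypePtr& returnType,
      FuzzerGenerator& /*rng*/) override {
    if (returnType->kind() != TypeKind::BIGINT) {
      return {};  // No input types can produce this result type.
    }
    return {BIGINT(), BIGINT()};
  }
};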
+ +add_library(velox_expression_test_utility ArgumentTypeFuzzer.cpp + FuzzerToolkit.cpp) + +target_link_libraries(velox_expression_test_utility velox_type + velox_expression_functions gtest) + +add_library( + velox_expression_fuzzer + ArgumentTypeFuzzer.cpp DecimalArgGeneratorBase.cpp ExpressionFuzzer.cpp + FuzzerRunner.cpp ExpressionFuzzerVerifier.cpp) + +target_link_libraries( + velox_expression_fuzzer + velox_expression_verifier + velox_type + velox_vector_fuzzer + velox_vector_test_lib + velox_function_registry + velox_expression_test_utility) + +add_executable(velox_expression_fuzzer_test ExpressionFuzzerTest.cpp) + +target_link_libraries(velox_expression_fuzzer_test velox_expression_fuzzer + velox_functions_prestosql gtest gtest_main) + +add_executable(spark_expression_fuzzer_test SparkExpressionFuzzerTest.cpp) + +target_link_libraries(spark_expression_fuzzer_test velox_expression_fuzzer + velox_functions_spark gtest gtest_main) + +if(${VELOX_BUILD_TESTING}) + add_subdirectory(tests) +endif() diff --git a/velox/expression/fuzzer/DecimalArgGeneratorBase.cpp b/velox/expression/fuzzer/DecimalArgGeneratorBase.cpp new file mode 100644 index 0000000000000..1b65b67658c56 --- /dev/null +++ b/velox/expression/fuzzer/DecimalArgGeneratorBase.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/expression/fuzzer/DecimalArgGeneratorBase.h" +#include + +namespace facebook::velox::fuzzer { +namespace { + +// Returns all the possible decimal types. 
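// For scale: with p in 1..38 and s in 0..p there are 38 * 39 / 2 + 38 = 779
// distinct DECIMAL(p, s) types, so DecimalArgGeneratorBase::initialize(2)
// enumerates 779 * 779 = 606,841 ordered pairs. Doing that once in the
// constructor keeps the cost off the per-iteration fuzzing path.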
+const std::vector& getAllTypes() { + const auto generateAllTypes = []() { + std::vector allTypes; + for (auto p = 1; p <= 38; ++p) { + for (auto s = 0; s <= p; ++s) { + allTypes.push_back(DECIMAL(p, s)); + } + } + return allTypes; + }; + + static const std::vector allTypes = generateAllTypes(); + return allTypes; +} + +uint32_t rand32(uint32_t max, FuzzerGenerator& rng) { + return boost::random::uniform_int_distribution()(rng) % max; +} +} // namespace + +std::vector DecimalArgGeneratorBase::generateArgs( + const exec::FunctionSignature& /*signature*/, + const TypePtr& returnType, + FuzzerGenerator& rng) { + auto inputs = findInputs(returnType, rng); + for (const auto& input : inputs) { + if (input == nullptr) { + return {}; + } + } + return inputs; +} + +void DecimalArgGeneratorBase::initialize(uint32_t numArgs) { + switch (numArgs) { + case 1: { + for (const auto& t : getAllTypes()) { + auto [p, s] = getDecimalPrecisionScale(*t); + if (auto returnType = toReturnType(p, s)) { + inputs_[returnType.value()].push_back({t}); + } + } + break; + } + case 2: { + for (const auto& a : getAllTypes()) { + for (const auto& b : getAllTypes()) { + auto [p1, s1] = getDecimalPrecisionScale(*a); + auto [p2, s2] = getDecimalPrecisionScale(*b); + + if (auto returnType = toReturnType(p1, s1, p2, s2)) { + inputs_[returnType.value()].push_back({a, b}); + } + } + } + break; + } + default: + VELOX_NYI( + "Initialization with {} argument types is not supported.", numArgs); + } +} + +std::vector DecimalArgGeneratorBase::findInputs( + const TypePtr& returnType, + FuzzerGenerator& rng) const { + const auto [p, s] = getDecimalPrecisionScale(*returnType); + const auto it = inputs_.find({p, s}); + if (it == inputs_.end()) { + VLOG(1) << "Cannot find input types for " << returnType->toString(); + return {}; + } + + const auto index = rand32(it->second.size(), rng); + return it->second[index]; +} +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/fuzzer/DecimalArgGeneratorBase.h b/velox/expression/fuzzer/DecimalArgGeneratorBase.h new file mode 100644 index 0000000000000..c27db1b8d264f --- /dev/null +++ b/velox/expression/fuzzer/DecimalArgGeneratorBase.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/expression/fuzzer/ArgGenerator.h" + +namespace facebook::velox::fuzzer { + +/// An argument type generator for decimal function. A map keyed on the pair of +/// precision and scale could be initialized with all possible input types. +/// Argument types are generated by looking up the map with the precision and +/// scale of return type, and randomly selecting valid input types. Derived +/// classes should call 'initialize' from the constructor and specify the number +/// of decimal arguments. They should also implement toReturnType with matching +/// number of pairs of precision and scale. 
+class DecimalArgGeneratorBase : public ArgGenerator { + public: + std::vector generateArgs( + const exec::FunctionSignature& signature, + const TypePtr& returnType, + FuzzerGenerator& rng) override; + + protected: + // Computes result type for all possible pairs of decimal input types. Stores + // the results in 'inputs_' map keyed by the precision and scale of return + // type. + // @param numArgs the number of decimal argument types. It only supports + // initialization with one or two argument types. + virtual void initialize(uint32_t numArgs); + + // Given precisions and scales of the inputs, returns precision and scale of + // the result. Returns std::nullopt if a valid return type cannot be generated + // with inputs. Used when the return type is generated with one pair of input + // precision and scale. + virtual std::optional> toReturnType(int p, int s) { + VELOX_UNREACHABLE(); + } + + // Used when the return type is generated with two pairs of input precision + // and scale. + virtual std::optional> + toReturnType(int p1, int s1, int p2, int s2) { + VELOX_UNREACHABLE(); + } + + private: + // Returns randomly selected pair of input types that produce the specified + // result type. + std::vector findInputs( + const TypePtr& returnType, + FuzzerGenerator& rng) const; + + // Maps from the precision and scale of return type to corresponding input + // types. + std::unordered_map, std::vector>> + inputs_; +}; + +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/ExpressionFuzzer.cpp b/velox/expression/fuzzer/ExpressionFuzzer.cpp similarity index 99% rename from velox/expression/tests/ExpressionFuzzer.cpp rename to velox/expression/fuzzer/ExpressionFuzzer.cpp index f8bdf6c59c874..4817d71ac4627 100644 --- a/velox/expression/tests/ExpressionFuzzer.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzer.cpp @@ -26,10 +26,10 @@ #include "velox/expression/FunctionSignature.h" #include "velox/expression/ReverseSignatureBinder.h" #include "velox/expression/SimpleFunctionRegistry.h" -#include "velox/expression/tests/ExpressionFuzzer.h" -#include "velox/expression/tests/utils/ArgumentTypeFuzzer.h" +#include "velox/expression/fuzzer/ArgumentTypeFuzzer.h" +#include "velox/expression/fuzzer/ExpressionFuzzer.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { namespace { using exec::SignatureBinder; @@ -1384,4 +1384,4 @@ RowTypePtr ExpressionFuzzer::fuzzRowReturnType(size_t size, char prefix) { return ROW(std::move(names), std::move(children)); } -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/ExpressionFuzzer.h b/velox/expression/fuzzer/ExpressionFuzzer.h similarity index 99% rename from velox/expression/tests/ExpressionFuzzer.h rename to velox/expression/fuzzer/ExpressionFuzzer.h index fcc7bee02a27a..1e0dc9a74f748 100644 --- a/velox/expression/tests/ExpressionFuzzer.h +++ b/velox/expression/fuzzer/ExpressionFuzzer.h @@ -19,13 +19,13 @@ #include "velox/core/ITypedExpr.h" #include "velox/core/QueryCtx.h" #include "velox/expression/Expr.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/expression/tests/ExpressionVerifier.h" -#include "velox/expression/tests/utils/FuzzerToolkit.h" #include "velox/functions/FunctionRegistry.h" #include "velox/vector/fuzzer/VectorFuzzer.h" #include "velox/vector/tests/utils/VectorMaker.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { // A tool that can be used to generate random expressions. 
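To make the base-class contract concrete: a hypothetical generator for a two-argument decimal function. The constructor calls initialize(2) as the comments above prescribe, and the matching two-pair toReturnType() supplies the inverse mapping; the precision/scale rule follows the common decimal-addition convention and is an assumption, not taken from this change.

class DecimalAddArgGenerator : public DecimalArgGeneratorBase {
 public:
  DecimalAddArgGenerator() {
    initialize(2);  // Enumerate all pairs of decimal input types once.
  }

 protected:
  // Assumed rule: result scale is max(s1, s2) and precision grows by one
  // integer digit; anything past precision 38 is unrepresentable.
  std::optional<std::pair<int, int>> toReturnType(int p1, int s1, int p2, int s2)
      override {
    const auto s = std::max(s1, s2);
    const auto p = std::max(p1 - s1, p2 - s2) + s + 1;
    if (p > 38) {
      return std::nullopt;
    }
    return std::make_pair(p, s);
  }
};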
class ExpressionFuzzer { @@ -418,4 +418,4 @@ class ExpressionFuzzer { friend class ExpressionFuzzerUnitTest; }; -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/ExpressionFuzzerTest.cpp b/velox/expression/fuzzer/ExpressionFuzzerTest.cpp similarity index 94% rename from velox/expression/tests/ExpressionFuzzerTest.cpp rename to velox/expression/fuzzer/ExpressionFuzzerTest.cpp index e6c0985c255a9..e9f35f02f7702 100644 --- a/velox/expression/tests/ExpressionFuzzerTest.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzerTest.cpp @@ -18,7 +18,7 @@ #include #include -#include "velox/expression/tests/FuzzerRunner.h" +#include "velox/expression/fuzzer/FuzzerRunner.h" #include "velox/functions/prestosql/registration/RegistrationFunctions.h" DEFINE_int64( @@ -27,7 +27,7 @@ DEFINE_int64( "Initial seed for random number generator used to reproduce previous " "results (0 means start with random seed)."); -using facebook::velox::test::FuzzerRunner; +using facebook::velox::fuzzer::FuzzerRunner; int main(int argc, char** argv) { facebook::velox::functions::prestosql::registerAllScalarFunctions(); @@ -64,7 +64,6 @@ int main(int argc, char** argv) { "regexp_extract", "regexp_extract_all", "regexp_like", - "map_top_n", // https://github.com/facebookincubator/velox/issues/9497 }; size_t initialSeed = FLAGS_seed == 0 ? std::time(nullptr) : FLAGS_seed; return FuzzerRunner::run(initialSeed, skipFunctions, {{}}); diff --git a/velox/expression/tests/ExpressionFuzzerVerifier.cpp b/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp similarity index 96% rename from velox/expression/tests/ExpressionFuzzerVerifier.cpp rename to velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp index 8e36fd739f21c..9ea2d1e0421c3 100644 --- a/velox/expression/tests/ExpressionFuzzerVerifier.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "velox/expression/tests/ExpressionFuzzerVerifier.h" +#include "velox/expression/fuzzer/ExpressionFuzzerVerifier.h" #include #include @@ -24,9 +24,9 @@ #include "velox/expression/Expr.h" #include "velox/expression/FunctionSignature.h" #include "velox/expression/ReverseSignatureBinder.h" -#include "velox/expression/tests/ExpressionFuzzer.h" +#include "velox/expression/fuzzer/ExpressionFuzzer.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { namespace { @@ -250,9 +250,9 @@ void ExpressionFuzzerVerifier::retryWithTry( false, // canThrow columnsToWrapInLazy) .result; - } catch (const std::exception& e) { + } catch (const std::exception&) { if (options_.findMinimalSubexpression) { - computeMinimumSubExpression( + test::computeMinimumSubExpression( {&execCtx_, {false, ""}}, *vectorFuzzer_, plans, @@ -281,9 +281,9 @@ void ExpressionFuzzerVerifier::retryWithTry( : nullptr, false, // canThrow columnsToWrapInLazy); - } catch (const std::exception& e) { + } catch (const std::exception&) { if (options_.findMinimalSubexpression) { - computeMinimumSubExpression( + test::computeMinimumSubExpression( {&execCtx_, {false, ""}}, *vectorFuzzer_, plans, @@ -339,9 +339,9 @@ void ExpressionFuzzerVerifier::go() { resultVectors ? 
BaseVector::copy(*resultVectors) : nullptr, true, // canThrow columnsToWrapInLazy); - } catch (const std::exception& e) { + } catch (const std::exception&) { if (options_.findMinimalSubexpression) { - computeMinimumSubExpression( + test::computeMinimumSubExpression( {&execCtx_, {false, ""}}, *vectorFuzzer_, plans, @@ -374,4 +374,4 @@ void ExpressionFuzzerVerifier::go() { LOG(ERROR) << "Total failed: " << numFailed; } -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/ExpressionFuzzerVerifier.h b/velox/expression/fuzzer/ExpressionFuzzerVerifier.h similarity index 95% rename from velox/expression/tests/ExpressionFuzzerVerifier.h rename to velox/expression/fuzzer/ExpressionFuzzerVerifier.h index 2f85b5d52bc71..f651ad5541430 100644 --- a/velox/expression/tests/ExpressionFuzzerVerifier.h +++ b/velox/expression/fuzzer/ExpressionFuzzerVerifier.h @@ -19,16 +19,16 @@ #include "velox/core/ITypedExpr.h" #include "velox/core/QueryCtx.h" #include "velox/expression/Expr.h" -#include "velox/expression/tests/ExpressionFuzzer.h" +#include "velox/expression/fuzzer/ExpressionFuzzer.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/expression/tests/ExpressionVerifier.h" -#include "velox/expression/tests/utils/FuzzerToolkit.h" #include "velox/functions/FunctionRegistry.h" #include "velox/vector/fuzzer/VectorFuzzer.h" #include "velox/vector/tests/utils/VectorMaker.h" DECLARE_int32(velox_fuzzer_max_level_of_nesting); -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { // A tool that utilizes ExpressionFuzzer, VectorFuzzer and ExpressionVerfier to // generate random expressions and verify the correctness of the results. It @@ -139,10 +139,8 @@ class ExpressionFuzzerVerifier { // A no-op since we cannot tie errors directly to functions where they // occurred. - void onError( - const SelectivityVector& /*rows*/, - const ::facebook::velox::ErrorVector& /*errors*/, - const std::string& /*queryId*/) override {} + void onError(vector_size_t /*numRows*/, const std::string& /*queryId*/) + override {} private: std::unordered_map& exprNameToStats_; @@ -212,4 +210,4 @@ class ExpressionFuzzerVerifier { ExpressionFuzzer expressionFuzzer_; }; -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/FuzzerRunner.cpp b/velox/expression/fuzzer/FuzzerRunner.cpp similarity index 97% rename from velox/expression/tests/FuzzerRunner.cpp rename to velox/expression/fuzzer/FuzzerRunner.cpp index ac741944dfa61..56c58a3658413 100644 --- a/velox/expression/tests/FuzzerRunner.cpp +++ b/velox/expression/fuzzer/FuzzerRunner.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "velox/expression/tests/FuzzerRunner.h" -#include "velox/expression/tests/ExpressionFuzzer.h" +#include "velox/expression/fuzzer/FuzzerRunner.h" +#include "velox/expression/fuzzer/ExpressionFuzzer.h" DEFINE_int32(steps, 10, "Number of expressions to generate and execute."); @@ -148,7 +148,7 @@ DEFINE_string( "of functions at every instance. Number of tickets must be a positive " "integer. 
Example: eq=3,floor=5"); -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { namespace { VectorFuzzer::Options getVectorFuzzerOptions() { @@ -222,4 +222,4 @@ void FuzzerRunner::runFromGtest( getExpressionFuzzerVerifierOptions(skipFunctions, queryConfigs)) .go(); } -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/FuzzerRunner.h b/velox/expression/fuzzer/FuzzerRunner.h similarity index 90% rename from velox/expression/tests/FuzzerRunner.h rename to velox/expression/fuzzer/FuzzerRunner.h index cbf3d5ac290a9..0eda0ecd1d7a9 100644 --- a/velox/expression/tests/FuzzerRunner.h +++ b/velox/expression/fuzzer/FuzzerRunner.h @@ -22,10 +22,10 @@ #include #include -#include "velox/expression/tests/ExpressionFuzzerVerifier.h" +#include "velox/expression/fuzzer/ExpressionFuzzerVerifier.h" #include "velox/functions/FunctionRegistry.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { /// FuzzerRunner leverages ExpressionFuzzerVerifier to create a gtest unit test. class FuzzerRunner { @@ -41,4 +41,4 @@ class FuzzerRunner { const std::unordered_map& queryConfigs); }; -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/utils/FuzzerToolkit.cpp b/velox/expression/fuzzer/FuzzerToolkit.cpp similarity index 97% rename from velox/expression/tests/utils/FuzzerToolkit.cpp rename to velox/expression/fuzzer/FuzzerToolkit.cpp index cbbc60b4c08c1..292f4619bb661 100644 --- a/velox/expression/tests/utils/FuzzerToolkit.cpp +++ b/velox/expression/fuzzer/FuzzerToolkit.cpp @@ -13,9 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "velox/expression/tests/utils/FuzzerToolkit.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { std::string CallableSignature::toString() const { std::string buf = name; @@ -137,4 +137,4 @@ void compareVectors( LOG(INFO) << "Two vectors match."; } -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/utils/FuzzerToolkit.h b/velox/expression/fuzzer/FuzzerToolkit.h similarity index 97% rename from velox/expression/tests/utils/FuzzerToolkit.h rename to velox/expression/fuzzer/FuzzerToolkit.h index 0411d4aaecc12..9d78d0899c82d 100644 --- a/velox/expression/tests/utils/FuzzerToolkit.h +++ b/velox/expression/fuzzer/FuzzerToolkit.h @@ -18,7 +18,7 @@ #include "velox/expression/FunctionSignature.h" #include "velox/vector/ComplexVector.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer { // Represents one available function signature. 
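 // (CallableSignature carries the function name plus one concrete set of
 // argument types and the resolved return type.)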
struct CallableSignature { @@ -111,4 +111,4 @@ void compareVectors( const std::string& leftName = "left", const std::string& rightName = "right", const std::optional& rows = std::nullopt); -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/tests/SparkExpressionFuzzerTest.cpp b/velox/expression/fuzzer/SparkExpressionFuzzerTest.cpp similarity index 95% rename from velox/expression/tests/SparkExpressionFuzzerTest.cpp rename to velox/expression/fuzzer/SparkExpressionFuzzerTest.cpp index c9531632f4137..ffba105e2e08d 100644 --- a/velox/expression/tests/SparkExpressionFuzzerTest.cpp +++ b/velox/expression/fuzzer/SparkExpressionFuzzerTest.cpp @@ -22,7 +22,7 @@ #include #include -#include "velox/expression/tests/FuzzerRunner.h" +#include "velox/expression/fuzzer/FuzzerRunner.h" #include "velox/functions/sparksql/Register.h" DEFINE_int64( @@ -31,7 +31,7 @@ DEFINE_int64( "Initial seed for random number generator " "(use it to reproduce previous results)."); -using facebook::velox::test::FuzzerRunner; +using facebook::velox::fuzzer::FuzzerRunner; int main(int argc, char** argv) { facebook::velox::functions::sparksql::registerFunctions(""); diff --git a/velox/expression/tests/ArgumentTypeFuzzerTest.cpp b/velox/expression/fuzzer/tests/ArgumentTypeFuzzerTest.cpp similarity index 99% rename from velox/expression/tests/ArgumentTypeFuzzerTest.cpp rename to velox/expression/fuzzer/tests/ArgumentTypeFuzzerTest.cpp index 595a09ccaf8d4..32971df22e0d6 100644 --- a/velox/expression/tests/ArgumentTypeFuzzerTest.cpp +++ b/velox/expression/fuzzer/tests/ArgumentTypeFuzzerTest.cpp @@ -14,14 +14,14 @@ * limitations under the License. */ -#include "velox/expression/tests/utils/ArgumentTypeFuzzer.h" +#include "velox/expression/fuzzer/ArgumentTypeFuzzer.h" #include #include "velox/expression/SignatureBinder.h" #include "velox/type/Type.h" -namespace facebook::velox::test { +namespace facebook::velox::fuzzer::test { namespace { const uint32_t kMaxVariadicArgs = 5; @@ -649,4 +649,4 @@ TEST_F(ArgumentTypeFuzzerTest, fuzzDecimalReturnType) { EXPECT_EQ(DECIMAL(10, 7)->toString(), returnType->toString()); } -} // namespace facebook::velox::test +} // namespace facebook::velox::fuzzer::test diff --git a/velox/expression/fuzzer/tests/CMakeLists.txt b/velox/expression/fuzzer/tests/CMakeLists.txt new file mode 100644 index 0000000000000..64af95929dcbc --- /dev/null +++ b/velox/expression/fuzzer/tests/CMakeLists.txt @@ -0,0 +1,24 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
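+
+# Unit tests for the expression fuzzer's building blocks: argument type
+# fuzzing, decimal argument generation, and the fuzzer itself.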
+
+add_executable(velox_expression_fuzzer_unit_test ArgumentTypeFuzzerTest.cpp DecimalArgGeneratorTest.cpp ExpressionFuzzerUnitTest.cpp)
+
+target_link_libraries(
+  velox_expression_fuzzer_unit_test
+  velox_expression_fuzzer
+  velox_functions_prestosql
+  velox_core
+  velox_expression
+  gtest
+  gtest_main)
diff --git a/velox/expression/fuzzer/tests/DecimalArgGeneratorTest.cpp b/velox/expression/fuzzer/tests/DecimalArgGeneratorTest.cpp
new file mode 100644
index 0000000000000..4de9c473ad016
--- /dev/null
+++ b/velox/expression/fuzzer/tests/DecimalArgGeneratorTest.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include "velox/expression/SignatureBinder.h"
+#include "velox/expression/fuzzer/DecimalArgGeneratorBase.h"
+
+namespace facebook::velox::fuzzer::test {
+
+class DecimalArgGeneratorTest : public testing::Test {
+ protected:
+  class UnaryArgGenerator : public DecimalArgGeneratorBase {
+   public:
+    UnaryArgGenerator() {
+      initialize(1);
+    }
+
+   protected:
+    std::optional<std::pair<int, int>> toReturnType(int p, int s) override {
+      auto precision = std::min(38, p + s + 1);
+      auto scale = std::min(s + 1, 18);
+      return {{precision, scale}};
+    }
+  };
+
+  class BinaryArgGenerator : public DecimalArgGeneratorBase {
+   public:
+    BinaryArgGenerator() {
+      initialize(2);
+    }
+
+   protected:
+    std::optional<std::pair<int, int>>
+    toReturnType(int p1, int s1, int p2, int s2) override {
+      auto s = std::max(s1, s2);
+      auto p = std::min(38, std::max(p1 - s1, p2 - s2) + std::max(s1, s2) + 1);
+      return {{p, s}};
+    }
+  };
+
+  // Assert the equivalence between the given return type and the actual type
+  // resolved from generated argument types.
+  void assertReturnType(
+      const std::shared_ptr<ArgGenerator>& generator,
+      const exec::FunctionSignature& signature,
+      const TypePtr& returnType) {
+    std::mt19937 seed{0};
+    const auto argTypes = generator->generateArgs(signature, returnType, seed);
+
+    // Resolve return type from argument types for the given signature.
+    TypePtr actualType;
+    exec::SignatureBinder binder(signature, argTypes);
+    if (binder.tryBind()) {
+      actualType = binder.tryResolveReturnType();
+    } else {
+      VELOX_FAIL("Failed to resolve return type from argument types.");
+    }
+    EXPECT_TRUE(returnType->equivalent(*actualType))
+        << "Expected type: " << returnType->toString()
+        << ", actual type: " << actualType->toString();
+  }
+
+  // Assert that no argument types can be generated for the given return type.
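+  // This is expected when no input precision/scale combination produced by
+  // toReturnType() can yield the requested return type.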
+  void assertEmptyArgs(
+      std::shared_ptr<ArgGenerator> generator,
+      const exec::FunctionSignature& signature,
+      const TypePtr& returnType) {
+    std::mt19937 seed{0};
+    const auto argTypes = generator->generateArgs(signature, returnType, seed);
+    EXPECT_TRUE(argTypes.empty());
+  }
+};
+
+TEST_F(DecimalArgGeneratorTest, unary) {
+  auto signature =
+      exec::FunctionSignatureBuilder()
+          .integerVariable("scale")
+          .integerVariable("precision")
+          .integerVariable("r_precision", "min(38, precision + scale + 1)")
+          .integerVariable("r_scale", "min(scale + 1, 18)")
+          .returnType("decimal(r_precision, r_scale)")
+          .argumentType("decimal(precision, scale)")
+          .build();
+
+  const auto generator = std::make_shared<UnaryArgGenerator>();
+  for (auto returnType : {DECIMAL(10, 2), DECIMAL(38, 18)}) {
+    assertReturnType(generator, *signature, returnType);
+  }
+  assertEmptyArgs(generator, *signature, DECIMAL(38, 20));
+}
+
+TEST_F(DecimalArgGeneratorTest, binary) {
+  auto signature =
+      exec::FunctionSignatureBuilder()
+          .integerVariable("a_scale")
+          .integerVariable("b_scale")
+          .integerVariable("a_precision")
+          .integerVariable("b_precision")
+          .integerVariable(
+              "r_precision",
+              "min(38, max(a_precision - a_scale, b_precision - b_scale) + max(a_scale, b_scale) + 1)")
+          .integerVariable("r_scale", "max(a_scale, b_scale)")
+          .returnType("decimal(r_precision, r_scale)")
+          .argumentType("decimal(a_precision, a_scale)")
+          .argumentType("decimal(b_precision, b_scale)")
+          .build();
+
+  const auto generator = std::make_shared<BinaryArgGenerator>();
+  for (auto returnType :
+       {DECIMAL(10, 2), DECIMAL(38, 20), DECIMAL(38, 38), DECIMAL(38, 0)}) {
+    assertReturnType(generator, *signature, returnType);
+  }
+}
+
+} // namespace facebook::velox::fuzzer::test
diff --git a/velox/expression/tests/ExpressionFuzzerUnitTest.cpp b/velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp
similarity index 97%
rename from velox/expression/tests/ExpressionFuzzerUnitTest.cpp
rename to velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp
index 9fd3a09b3a047..2a00931336876 100644
--- a/velox/expression/tests/ExpressionFuzzerUnitTest.cpp
+++ b/velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp
@@ -16,10 +16,10 @@
 
 #include <gtest/gtest.h>
 
-#include "velox/expression/tests/ExpressionFuzzer.h"
+#include "velox/expression/fuzzer/ExpressionFuzzer.h"
 #include "velox/functions/prestosql/registration/RegistrationFunctions.h"
 
-namespace facebook::velox::test {
+namespace facebook::velox::fuzzer::test {
 class ExpressionFuzzerUnitTest : public testing::Test {
  protected:
   static void SetUpTestCase() {
@@ -199,4 +199,4 @@ TEST_F(ExpressionFuzzerUnitTest, exprBank) {
   }
 }
 
-} // namespace facebook::velox::test
+} // namespace facebook::velox::fuzzer::test
diff --git a/velox/expression/tests/CMakeLists.txt b/velox/expression/tests/CMakeLists.txt
index 6958e22f9276d..c58c7963834e3 100644
--- a/velox/expression/tests/CMakeLists.txt
+++ b/velox/expression/tests/CMakeLists.txt
@@ -12,11 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-add_subdirectory(utils)
-
 add_executable(
   velox_expression_test
-  ArgumentTypeFuzzerTest.cpp
   ArrayViewTest.cpp
   ArrayWriterTest.cpp
   CastExprTest.cpp
@@ -88,29 +85,6 @@ target_link_libraries(
   velox_expression_verifier velox_vector_test_lib velox_vector_fuzzer
   velox_type velox_expression_test_utility)
 
-add_library(velox_expression_fuzzer ExpressionFuzzer.cpp FuzzerRunner.cpp
-            ExpressionFuzzerVerifier.cpp)
-
-target_link_libraries(
-  velox_expression_fuzzer
-  velox_expression_verifier
-  velox_type
-  velox_vector_fuzzer
-  velox_vector_test_lib
-  velox_function_registry
-  velox_expression_test_utility)
-
-add_executable(velox_expression_fuzzer_unit_test ExpressionFuzzerUnitTest.cpp)
-
-target_link_libraries(
-  velox_expression_fuzzer_unit_test
-  velox_expression_fuzzer
-  velox_functions_prestosql
-  velox_core
-  velox_expression
-  gtest
-  gtest_main)
-
 add_library(velox_expression_runner ExpressionRunner.cpp)
 
 target_link_libraries(
   velox_expression_runner velox_expression_verifier velox_functions_prestosql
@@ -150,13 +124,3 @@ target_link_libraries(
   velox_vector_test_lib
   gtest
   gtest_main)
-
-add_executable(velox_expression_fuzzer_test ExpressionFuzzerTest.cpp)
-
-target_link_libraries(velox_expression_fuzzer_test velox_expression_fuzzer
-                      velox_functions_prestosql gtest gtest_main)
-
-add_executable(spark_expression_fuzzer_test SparkExpressionFuzzerTest.cpp)
-
-target_link_libraries(spark_expression_fuzzer_test velox_expression_fuzzer
-                      velox_functions_spark gtest gtest_main)
diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp
index aa0b07ff9fb89..e7660ef058060 100644
--- a/velox/expression/tests/CastExprTest.cpp
+++ b/velox/expression/tests/CastExprTest.cpp
@@ -710,7 +710,6 @@ TEST_F(CastExprTest, dateToTimestamp) {
 }
 
 TEST_F(CastExprTest, timestampToDate) {
-  setTimezone("");
   std::vector<std::optional<Timestamp>> inputTimestamps = {
       Timestamp(0, 0),
       Timestamp(946684800, 0),
@@ -766,6 +765,10 @@ TEST_F(CastExprTest, timestampInvalid) {
 }
 
 TEST_F(CastExprTest, timestampAdjustToTimezone) {
+  // Empty timezone is assumed to be GMT.
+  testCast<std::string, Timestamp>(
+      "timestamp", {"1970-01-01"}, {Timestamp(0, 0)});
+
   setTimezone("America/Los_Angeles");
 
   // Expect unix epochs to be converted to LA timezone (8h offset).
@@ -789,21 +792,10 @@ TEST_F(CastExprTest, timestampAdjustToTimezone) {
       std::nullopt,
       Timestamp(957164400, 0),
   });
-
-  // Empty timezone is assumed to be GMT.
- setTimezone(""); - testCast( - "timestamp", {"1970-01-01"}, {Timestamp(0, 0)}); } TEST_F(CastExprTest, timestampAdjustToTimezoneInvalid) { - auto testFunc = [&]() { - testCast( - "timestamp", {"1970-01-01"}, {Timestamp(1, 0)}); - }; - - setTimezone("bla"); - EXPECT_THROW(testFunc(), std::runtime_error); + VELOX_ASSERT_USER_THROW(setTimezone("bla"), "Unknown time zone: 'bla'"); } TEST_F(CastExprTest, date) { diff --git a/velox/expression/tests/ExprStatsTest.cpp b/velox/expression/tests/ExprStatsTest.cpp index b892b8d575702..70394b4ce9176 100644 --- a/velox/expression/tests/ExprStatsTest.cpp +++ b/velox/expression/tests/ExprStatsTest.cpp @@ -158,10 +158,8 @@ struct Event { class TestListener : public exec::ExprSetListener { public: - explicit TestListener( - std::vector& events, - std::vector& exceptions) - : events_{events}, exceptions_{exceptions}, exceptionCount_{0} {} + explicit TestListener(std::vector& events) + : events_{events}, exceptionCount_{0} {} void onCompletion( const std::string& uuid, @@ -169,21 +167,8 @@ class TestListener : public exec::ExprSetListener { events_.push_back({uuid, event.stats, event.sqls}); } - void onError( - const SelectivityVector& rows, - const ::facebook::velox::ErrorVector& errors, - const std::string& /*queryId*/) override { - rows.applyToSelected([&](auto row) { - exceptionCount_++; - - try { - auto exception = - *std::static_pointer_cast(errors.valueAt(row)); - std::rethrow_exception(exception); - } catch (const std::exception& e) { - exceptions_.push_back(e.what()); - } - }); + void onError(vector_size_t numRows, const std::string& /*queryId*/) override { + exceptionCount_ += numRows; } int exceptionCount() const { @@ -193,12 +178,10 @@ class TestListener : public exec::ExprSetListener { void reset() { exceptionCount_ = 0; events_.clear(); - exceptions_.clear(); } private: std::vector& events_; - std::vector& exceptions_; int exceptionCount_; }; @@ -207,8 +190,7 @@ TEST_F(ExprStatsTest, listener) { // Register a listener to receive stats on ExprSet destruction. std::vector events; - std::vector exceptions; - auto listener = std::make_shared(events, exceptions); + auto listener = std::make_shared(events); ASSERT_TRUE(exec::registerExprSetListener(listener)); ASSERT_FALSE(exec::registerExprSetListener(listener)); @@ -308,8 +290,7 @@ TEST_F(ExprStatsTest, specialForms) { // Register a listener to receive stats on ExprSet destruction. std::vector events; - std::vector exceptions; - auto listener = std::make_shared(events, exceptions); + auto listener = std::make_shared(events); ASSERT_TRUE(exec::registerExprSetListener(listener)); auto data = makeRowVector({ @@ -367,8 +348,7 @@ TEST_F(ExprStatsTest, specialForms) { TEST_F(ExprStatsTest, errorLog) { // Register a listener to log exceptions. std::vector events; - std::vector exceptions; - auto listener = std::make_shared(events, exceptions); + auto listener = std::make_shared(events); ASSERT_TRUE(exec::registerExprSetListener(listener)); auto data = makeRowVector( @@ -386,14 +366,6 @@ TEST_F(ExprStatsTest, errorLog) { // Expect errors at rows 2 and 4. ASSERT_EQ(2, listener->exceptionCount()); - ASSERT_EQ(2, exceptions.size()); - for (const auto& exception : exceptions) { - ASSERT_TRUE( - exception.find("Context: cast((c0) as INTEGER)") != std::string::npos); - ASSERT_TRUE( - exception.find("Error Code: INVALID_ARGUMENT") != std::string::npos); - ASSERT_TRUE(exception.find("Stack trace:") != std::string::npos); - } // Test with multiple try expressions. Expect errors at rows 1, 2, 4, and 6. 
// The second row in c1 does not cause an additional error because the @@ -405,7 +377,6 @@ TEST_F(ExprStatsTest, errorLog) { evaluate(*exprSet, data); ASSERT_EQ(4, listener->exceptionCount()); - ASSERT_EQ(4, exceptions.size()); // Test with nested try expressions. Expect errors at rows 2, 3, 4, and 6. Row // 5 in c2 does not cause an error because the corresponding row in c0 is @@ -416,15 +387,6 @@ TEST_F(ExprStatsTest, errorLog) { evaluate(*exprSet, data); ASSERT_EQ(4, listener->exceptionCount()); - ASSERT_EQ(4, exceptions.size()); - ASSERT_TRUE( - exceptions[0].find("Error Code: INVALID_ARGUMENT") != std::string::npos); - ASSERT_TRUE( - exceptions[1].find("Error Code: INVALID_ARGUMENT") != std::string::npos); - ASSERT_TRUE( - exceptions[2].find("Error Code: ARITHMETIC_ERROR") != std::string::npos); - ASSERT_TRUE( - exceptions[3].find("Error Code: ARITHMETIC_ERROR") != std::string::npos); // Test with no error. listener->reset(); @@ -432,7 +394,6 @@ TEST_F(ExprStatsTest, errorLog) { evaluate(*exprSet, data); ASSERT_EQ(0, listener->exceptionCount()); - ASSERT_EQ(0, exceptions.size()); ASSERT_TRUE(exec::unregisterExprSetListener(listener)); } @@ -442,8 +403,7 @@ TEST_F(ExprStatsTest, complexConstants) { // '__complex_constant(c#)' pseudo functions. std::vector events; - std::vector exceptions; - auto listener = std::make_shared(events, exceptions); + auto listener = std::make_shared(events); ASSERT_TRUE(exec::registerExprSetListener(listener)); std::vector expressions = { @@ -456,7 +416,7 @@ TEST_F(ExprStatsTest, complexConstants) { } ASSERT_EQ(1, events.size()); - ASSERT_EQ(0, exceptions.size()); + ASSERT_EQ(0, listener->exceptionCount()); ASSERT_EQ(1, events[0].sqls.size()); ASSERT_EQ("__complex_constant(c0)", events[0].sqls[0]); diff --git a/velox/expression/tests/ExprTest.cpp b/velox/expression/tests/ExprTest.cpp index 52f6abc4d4bb1..79c28eb99dae6 100644 --- a/velox/expression/tests/ExprTest.cpp +++ b/velox/expression/tests/ExprTest.cpp @@ -267,7 +267,7 @@ class ExprTest : public testing::Test, public VectorTestBase { VELOX_CHECK(startPos != std::string::npos); startPos += strlen(key); auto endPos = context.find(".", startPos); - VELOX_CHECK(endPos != std::string::npos); + VELOX_CHECK(endPos != std::string::npos, context); return context.substr(startPos, endPos - startPos); } @@ -298,15 +298,16 @@ class ExprTest : public testing::Test, public VectorTestBase { } void verifyDataAndSqlPaths(const VeloxException& e, const VectorPtr& data) { - auto inputPath = extractInputPath(e.topLevelContext()); + auto inputPath = extractInputPath(e.additionalContext()); auto copy = restoreVector(inputPath); assertEqualVectors(data, copy); - auto sqlPath = extractSqlPath(e.topLevelContext()); + auto sqlPath = extractSqlPath(e.additionalContext()); auto sql = readSqlFromFile(sqlPath); ASSERT_NO_THROW(compileExpression(sql, asRowType(data->type()))); - auto allSqlsPath = extractAllExprSqlPath(e.topLevelContext()); + LOG(ERROR) << e.additionalContext(); + auto allSqlsPath = extractAllExprSqlPath(e.additionalContext()); auto allSqls = readSqlFromFile(allSqlsPath); ASSERT_NO_THROW(compileMultipleExprs(allSqls, asRowType(data->type()))); } @@ -334,20 +335,22 @@ class ExprTest : public testing::Test, public VectorTestBase { return sql; } - void assertError( + std::exception_ptr assertError( const std::string& expression, const VectorPtr& input, const std::string& context, - const std::string& topLevelContext, + const std::string& additionalContext, const std::string& message) { try { evaluate(expression, 
makeRowVector({input})); - ASSERT_TRUE(false) << "Expected an error"; + EXPECT_TRUE(false) << "Expected an error"; } catch (VeloxException& e) { - ASSERT_EQ(message, e.message()); - ASSERT_EQ(context, trimInputPath(e.context())); - ASSERT_EQ(topLevelContext, trimInputPath(e.topLevelContext())); + EXPECT_EQ(context, trimInputPath(e.context())); + EXPECT_EQ(additionalContext, trimInputPath(e.additionalContext())); + EXPECT_EQ(message, e.message()); + return e.wrappedException(); } + return nullptr; } void assertErrorSimplified( @@ -369,14 +372,14 @@ class ExprTest : public testing::Test, public VectorTestBase { const std::string& expression, const VectorPtr& input, const std::string& context, - const std::string& topLevelContext, + const std::string& additionalContext, const std::string& message) { try { evaluate(expression, makeRowVector({input})); EXPECT_TRUE(false) << "Expected an error"; } catch (VeloxException& e) { EXPECT_EQ(context, trimInputPath(e.context())); - EXPECT_EQ(topLevelContext, trimInputPath(e.topLevelContext())); + EXPECT_EQ(additionalContext, trimInputPath(e.additionalContext())); EXPECT_EQ(message, e.message()); return e.wrappedException(); } @@ -2410,7 +2413,9 @@ TEST_P(ParameterizedExprTest, exceptionContext) { FAIL() << "Expected an exception"; } catch (const VeloxException& e) { ASSERT_EQ("always_throws(c0)", e.context()); - ASSERT_EQ("plus(always_throws(c0), c1)", e.topLevelContext()); + ASSERT_EQ( + "Top-level Expression: plus(always_throws(c0), c1)", + e.additionalContext()); } try { @@ -2419,8 +2424,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", - e.topLevelContext()); + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", + e.additionalContext()); } try { @@ -2429,8 +2434,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((c1) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((c1) as BIGINT), 0:BIGINT))", - e.topLevelContext()); + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((c1) as BIGINT), 0:BIGINT))", + e.additionalContext()); } // Enable saving vector and expression SQL for system errors only. 
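 // Each failing evaluation below is expected to save its input vector and
 // expression SQL; verifyDataAndSqlPaths() reloads both and checks that they
 // round-trip.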
@@ -2444,7 +2449,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("runtime_error(c0)", e.context()); ASSERT_EQ( - "plus(runtime_error(c0), c1)", trimInputPath(e.topLevelContext())); + "Top-level Expression: plus(runtime_error(c0), c1)", + trimInputPath(e.additionalContext())); verifyDataAndSqlPaths(e, data); } @@ -2454,8 +2460,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", - e.topLevelContext()) + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", + e.additionalContext()) << e.errorSource(); } @@ -2465,8 +2471,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", - e.topLevelContext()); + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", + e.additionalContext()); } // Enable saving vector and expression SQL for all errors. @@ -2480,7 +2486,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("always_throws(c0)", e.context()); ASSERT_EQ( - "plus(always_throws(c0), c1)", trimInputPath(e.topLevelContext())); + "Top-level Expression: plus(always_throws(c0), c1)", + trimInputPath(e.additionalContext())); verifyDataAndSqlPaths(e, data); } @@ -2490,8 +2497,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", - trimInputPath(e.topLevelContext())); + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((plus(c0, c1)) as BIGINT), 0:BIGINT))", + trimInputPath(e.additionalContext())); verifyDataAndSqlPaths(e, data); } @@ -2501,8 +2508,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((c1) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((c1) as BIGINT), 0:BIGINT))", - trimInputPath(e.topLevelContext())); + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((c1) as BIGINT), 0:BIGINT))", + trimInputPath(e.additionalContext())); verifyDataAndSqlPaths(e, data); } @@ -2512,8 +2519,8 @@ TEST_P(ParameterizedExprTest, exceptionContext) { } catch (const VeloxException& e) { ASSERT_EQ("mod(cast((c1) as BIGINT), 0:BIGINT)", e.context()); ASSERT_EQ( - "plus(cast((c0) as BIGINT), mod(cast((c1) as BIGINT), 0:BIGINT))", - trimInputPath(e.topLevelContext())); + "Top-level Expression: plus(cast((c0) as BIGINT), mod(cast((c1) as BIGINT), 0:BIGINT))", + trimInputPath(e.additionalContext())); verifyDataAndSqlPaths(e, data); } } @@ -2536,19 +2543,19 @@ TEST_P(ParameterizedExprTest, stdExceptionContext) { registerFunction( {"throw_invalid_argument"}); - auto wrappedEx = assertWrappedException( + auto wrappedEx = assertError( "throw_invalid_argument(c0) + 5", data, "throw_invalid_argument(c0)", - "plus(throw_invalid_argument(c0), 5:BIGINT)", + "Top-level Expression: plus(throw_invalid_argument(c0), 5:BIGINT)", "This is a test"); ASSERT_THROW(std::rethrow_exception(wrappedEx), std::invalid_argument); - wrappedEx = 
assertWrappedException( + wrappedEx = assertError( "throw_invalid_argument(c0 + 5)", data, - "throw_invalid_argument(plus(c0, 5:BIGINT))", - "Same as context.", + "Top-level Expression: throw_invalid_argument(plus(c0, 5:BIGINT))", + "", "This is a test"); ASSERT_THROW(std::rethrow_exception(wrappedEx), std::invalid_argument); } @@ -2947,15 +2954,15 @@ TEST_P(ParameterizedExprTest, castExceptionContext) { assertError( "cast(c0 as bigint)", makeFlatVector({"1a"}), - "cast((c0) as BIGINT)", - "Same as context.", + "Top-level Expression: cast((c0) as BIGINT)", + "", "Cannot cast VARCHAR '1a' to BIGINT. Non-whitespace character found after end of conversion: \"a\""); assertError( "cast(c0 as timestamp)", makeFlatVector(std::vector{1}), - "cast((c0) as TIMESTAMP)", - "Same as context.", + "Top-level Expression: cast((c0) as TIMESTAMP)", + "", "Cannot cast TINYINT '1' to TIMESTAMP. Conversion to Timestamp is not supported"); } @@ -2964,7 +2971,7 @@ TEST_P(ParameterizedExprTest, switchExceptionContext) { "case c0 when 7 then c0 / 0 else 0 end", makeFlatVector(std::vector{7}), "divide(c0, 0:BIGINT)", - "switch(eq(c0, 7:BIGINT), divide(c0, 0:BIGINT), 0:BIGINT)", + "Top-level Expression: switch(eq(c0, 7:BIGINT), divide(c0, 0:BIGINT), 0:BIGINT)", "division by zero"); } @@ -2975,7 +2982,7 @@ TEST_P(ParameterizedExprTest, conjunctExceptionContext) { "if (c0 % 409 < 300 and c0 / 0 < 30, 1, 2)", data, "divide(c0, 0:BIGINT)", - "switch(and(lt(mod(c0, 409:BIGINT), 300:BIGINT), lt(divide(c0, 0:BIGINT), 30:BIGINT)), 1:BIGINT, 2:BIGINT)", + "Top-level Expression: switch(and(lt(mod(c0, 409:BIGINT), 300:BIGINT), lt(divide(c0, 0:BIGINT), 30:BIGINT)), 1:BIGINT, 2:BIGINT)", "division by zero"); } @@ -2987,7 +2994,7 @@ TEST_P(ParameterizedExprTest, lambdaExceptionContext) { "filter(c0, x -> (x / 0 > 1))", array, "divide(x, 0:BIGINT)", - "filter(c0, (x) -> gt(divide(x, 0:BIGINT), 1:BIGINT))", + "Top-level Expression: filter(c0, (x) -> gt(divide(x, 0:BIGINT), 1:BIGINT))", "division by zero"); } @@ -3529,7 +3536,7 @@ TEST_P(ParameterizedExprTest, applyFunctionNoResult) { "always_throws_vector_function(c0) AND true", makeFlatVector({1, 2, 3}), "always_throws_vector_function(c0)", - "and(always_throws_vector_function(c0), true:BOOLEAN)", + "Top-level Expression: and(always_throws_vector_function(c0), true:BOOLEAN)", TestingAlwaysThrowsVectorFunction::kVeloxErrorMessage); exec::registerVectorFunction( @@ -3541,7 +3548,7 @@ TEST_P(ParameterizedExprTest, applyFunctionNoResult) { "no_op(c0) AND true", makeFlatVector({1, 2, 3}), "no_op(c0)", - "and(no_op(c0), true:BOOLEAN)", + "Top-level Expression: and(no_op(c0), true:BOOLEAN)", "Function neither returned results nor threw exception."); } @@ -3671,8 +3678,8 @@ TEST_P(ParameterizedExprTest, stdExceptionInVectorFunction) { assertError( "always_throws_vector_function(c0)", makeFlatVector({1, 2, 3}), - "always_throws_vector_function(c0)", - "Same as context.", + "Top-level Expression: always_throws_vector_function(c0)", + "", TestingAlwaysThrowsVectorFunction::kStdErrorMessage); assertErrorSimplified( diff --git a/velox/expression/tests/ExpressionRunnerUnitTest.cpp b/velox/expression/tests/ExpressionRunnerUnitTest.cpp index a6107543885be..2c56715b197f3 100644 --- a/velox/expression/tests/ExpressionRunnerUnitTest.cpp +++ b/velox/expression/tests/ExpressionRunnerUnitTest.cpp @@ -15,12 +15,12 @@ */ #include -#include "FuzzerRunner.h" #include "velox/dwio/common/tests/utils/BatchMaker.h" #include "velox/exec/tests/utils/TempFilePath.h" #include "velox/expression/Expr.h" #include 
"velox/expression/SignatureBinder.h" -#include "velox/expression/tests/ExpressionFuzzer.h" +#include "velox/expression/fuzzer/ExpressionFuzzer.h" +#include "velox/expression/fuzzer/FuzzerRunner.h" #include "velox/expression/tests/ExpressionRunner.h" #include "velox/functions/prestosql/registration/RegistrationFunctions.h" #include "velox/vector/VectorSaver.h" diff --git a/velox/expression/tests/ExpressionVerifier.cpp b/velox/expression/tests/ExpressionVerifier.cpp index 9fd070a8657d6..1d3b74cf82fb1 100644 --- a/velox/expression/tests/ExpressionVerifier.cpp +++ b/velox/expression/tests/ExpressionVerifier.cpp @@ -41,7 +41,7 @@ void logRowVector(const RowVectorPtr& rowVector) { } } // namespace -ResultOrError ExpressionVerifier::verify( +fuzzer::ResultOrError ExpressionVerifier::verify( const std::vector& plans, const RowVectorPtr& rowVector, VectorPtr&& resultVector, @@ -127,7 +127,7 @@ ResultOrError ExpressionVerifier::verify( if (copiedInput) { // Flatten the input vector as an optimization if its very deeply nested. - compareVectors( + fuzzer::compareVectors( copiedInput, BaseVector::copy(*inputRowVector), "Copy of original input", @@ -162,7 +162,7 @@ ResultOrError ExpressionVerifier::verify( exprSetSimplified.eval(rows, evalCtxSimplified, simplifiedEvalResult); // Flatten the input vector as an optimization if its very deeply nested. - compareVectors( + fuzzer::compareVectors( copy, BaseVector::copy(*rowVector), "Copy of original input", @@ -183,14 +183,14 @@ ResultOrError ExpressionVerifier::verify( if (exceptionCommonPtr || exceptionSimplifiedPtr) { // Throws in case exceptions are not compatible. If they are compatible, // return false to signal that the expression failed. - compareExceptions(exceptionCommonPtr, exceptionSimplifiedPtr); + fuzzer::compareExceptions(exceptionCommonPtr, exceptionSimplifiedPtr); return {nullptr, exceptionCommonPtr}; } else { // Throws in case output is different. VELOX_CHECK_EQ(commonEvalResult.size(), plans.size()); VELOX_CHECK_EQ(simplifiedEvalResult.size(), plans.size()); for (int i = 0; i < plans.size(); ++i) { - compareVectors( + fuzzer::compareVectors( commonEvalResult[i], simplifiedEvalResult[i], "common path results ", @@ -431,7 +431,7 @@ class MinimalSubExpressionFinder { results ? BaseVector::copy(*results) : nullptr, true, // canThrow columnsToWrapInLazy); - } catch (const std::exception& e) { + } catch (const std::exception&) { success = false; } FLAGS_minloglevel = 0; diff --git a/velox/expression/tests/ExpressionVerifier.h b/velox/expression/tests/ExpressionVerifier.h index be7cb52e680cb..240768b2aa68f 100644 --- a/velox/expression/tests/ExpressionVerifier.h +++ b/velox/expression/tests/ExpressionVerifier.h @@ -18,7 +18,7 @@ #include "velox/core/ITypedExpr.h" #include "velox/core/QueryCtx.h" -#include "velox/expression/tests/utils/FuzzerToolkit.h" +#include "velox/expression/fuzzer/FuzzerToolkit.h" #include "velox/functions/FunctionRegistry.h" #include "velox/type/Type.h" #include "velox/vector/BaseVector.h" @@ -63,7 +63,7 @@ class ExpressionVerifier { // - exception thrown by the common path if both paths failed with compatible // exceptions. // - throws otherwise (incompatible exceptions or different results). 
-  ResultOrError verify(
+  fuzzer::ResultOrError verify(
       const std::vector<core::TypedExprPtr>& plans,
       const RowVectorPtr& rowVector,
       VectorPtr&& resultVector,
diff --git a/velox/expression/tests/SimpleFunctionTest.cpp b/velox/expression/tests/SimpleFunctionTest.cpp
index 63143e7f06a81..cd76de6bf3a67 100644
--- a/velox/expression/tests/SimpleFunctionTest.cpp
+++ b/velox/expression/tests/SimpleFunctionTest.cpp
@@ -19,8 +19,10 @@
 #include
 #include
+#include
 #include "folly/lang/Hint.h"
-#include "gtest/gtest.h"
+
+#include "velox/common/base/tests/GTestUtils.h"
 #include "velox/expression/Expr.h"
 #include "velox/expression/SimpleFunctionAdapter.h"
 #include "velox/functions/Udf.h"
@@ -1135,7 +1137,7 @@ struct StringInputIntOutputFunction {
   }
 };
 
-TEST_F(SimpleFunctionTest, TestcallAscii) {
+TEST_F(SimpleFunctionTest, callAscii) {
   registerFunction<StringInputIntOutputFunction, int32_t, Varchar>(
       {"get_input_size"});
   auto asciiInput = makeFlatVector({"abc123", "10% #\0"});
@@ -1478,4 +1480,57 @@ TEST_F(SimpleFunctionTest, decimalsWithConstraints) {
   }
 }
 
+template <typename TExec>
+struct NoThrowFunction {
+  VELOX_DEFINE_FUNCTION_TYPES(TExec);
+
+  Status call(out_type<int64_t>& out, const arg_type<int64_t>& in) {
+    if (in % 3 != 0) {
+      return Status::UserError("Input must be divisible by 3");
+    }
+
+    // Throwing exceptions is not recommended, but allowed.
+    VELOX_USER_CHECK(in % 2 == 0, "Input must be even");
+
+    if (in == 6) {
+      return Status::UnknownError("Input must not be 6");
+    }
+
+    out = in / 6;
+    return Status::OK();
+  }
+};
+
+TEST_F(SimpleFunctionTest, noThrow) {
+  registerFunction<NoThrowFunction, int64_t, int64_t>({"no_throw"});
+
+  auto result = evaluateOnce<int64_t, int64_t>("no_throw(c0)", 12);
+  EXPECT_EQ(2, result);
+
+  // Errors reported via Status.
+  VELOX_ASSERT_THROW(
+      (evaluateOnce<int64_t, int64_t>("no_throw(c0)", 10)),
+      "Input must be divisible by 3");
+
+  result = evaluateOnce<int64_t, int64_t>("try(no_throw(c0))", 10);
+  EXPECT_EQ(std::nullopt, result);
+
+  // Errors reported by throwing exceptions.
+  VELOX_ASSERT_THROW(
+      (evaluateOnce<int64_t, int64_t>("no_throw(c0)", 15)),
+      "Input must be even");
+
+  result = evaluateOnce<int64_t, int64_t>("try(no_throw(c0))", 15);
+  EXPECT_EQ(std::nullopt, result);
+
+  // Non-user errors cannot be suppressed by TRY.
+  VELOX_ASSERT_THROW(
+      (evaluateOnce<int64_t, int64_t>("no_throw(c0)", 6)),
+      "Input must not be 6");
+
+  VELOX_ASSERT_THROW(
+      (evaluateOnce<int64_t, int64_t>("try(no_throw(c0))", 6)),
+      "Input must not be 6");
+}
+
 } // namespace
diff --git a/velox/flag_definitions/flags.cpp b/velox/flag_definitions/flags.cpp
index d25ae44713f1b..b26eeb5d4c704 100644
--- a/velox/flag_definitions/flags.cpp
+++ b/velox/flag_definitions/flags.cpp
@@ -30,7 +30,7 @@ DEFINE_int32(
 
 DEFINE_bool(
     velox_time_allocations,
-    true,
+    false,
     "Record time and volume for large allocation/free");
 
 // Used in common/base/VeloxException.cpp
diff --git a/velox/functions/lib/Re2Functions.cpp b/velox/functions/lib/Re2Functions.cpp
index 451640ef20252..c8fdfd80f160e 100644
--- a/velox/functions/lib/Re2Functions.cpp
+++ b/velox/functions/lib/Re2Functions.cpp
@@ -217,7 +217,7 @@ class Re2MatchConstantPattern final : public exec::VectorFunction {
     exec::LocalDecodedVector toSearch(context, *args[0], rows);
     try {
       checkForBadPattern(re_);
-    } catch (const std::exception& e) {
+    } catch (const std::exception&) {
       context.setErrors(rows, std::current_exception());
       return;
     }
@@ -288,7 +288,7 @@ class Re2SearchAndExtractConstantPattern final : public exec::VectorFunction {
     // apply() will not be invoked if the selection is empty.
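     // A malformed constant pattern fails all selected rows at once via
     // context.setErrors() instead of throwing out of apply().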
try { checkForBadPattern(re_); - } catch (const std::exception& e) { + } catch (const std::exception&) { context.setErrors(rows, std::current_exception()); return; } @@ -312,7 +312,7 @@ class Re2SearchAndExtractConstantPattern final : public exec::VectorFunction { if (const auto groupId = getIfConstant(*args[2])) { try { checkForBadGroupId(*groupId, re_); - } catch (const std::exception& e) { + } catch (const std::exception&) { context.setErrors(rows, std::current_exception()); return; } @@ -825,7 +825,7 @@ class LikeWithRe2 final : public exec::VectorFunction { // apply() will not be invoked if the selection is empty. try { checkForBadPattern(*re_); - } catch (const std::exception& e) { + } catch (const std::exception&) { context.setErrors(rows, std::current_exception()); return; } @@ -1058,7 +1058,7 @@ class Re2ExtractAllConstantPattern final : public exec::VectorFunction { VELOX_CHECK(args.size() == 2 || args.size() == 3); try { checkForBadPattern(re_); - } catch (const std::exception& e) { + } catch (const std::exception&) { context.setErrors(rows, std::current_exception()); return; } @@ -1083,7 +1083,7 @@ class Re2ExtractAllConstantPattern final : public exec::VectorFunction { // try { checkForBadGroupId(*_groupId, re_); - } catch (const std::exception& e) { + } catch (const std::exception&) { context.setErrors(rows, std::current_exception()); return; } diff --git a/velox/functions/lib/RegistrationHelpers.h b/velox/functions/lib/RegistrationHelpers.h index e411057d8cd29..cea30c50a2a1e 100644 --- a/velox/functions/lib/RegistrationHelpers.h +++ b/velox/functions/lib/RegistrationHelpers.h @@ -73,6 +73,14 @@ void registerUnaryIntegral(const std::vector& aliases) { registerFunction(aliases); } +template