From 97f3eecfc3a659b3f70cd69dde9a49a04d845930 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Sun, 8 Sep 2024 22:02:54 -0700
Subject: [PATCH 01/41] adds streaming selection and partition

---
 cub/cub/agent/agent_select_if.cuh             |  67 ++--
 .../device/dispatch/dispatch_select_if.cuh    | 348 ++++++++++++------
 2 files changed, 286 insertions(+), 129 deletions(-)
diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index 2a9958901ff..fd9e2bbbc61 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -164,10 +164,15 @@ struct partition_distinct_output_t
  * selection)
  *
  * @tparam OffsetT
- *   Signed integer type for global offsets
+ *   Signed integer type for offsets within a partition
  *
- * @tparam ScanTileStateT
- *   The tile state class used in the decoupled look-back
+ * @tparam StreamingContextT
+ *   Type providing the context information for the current partition, with the following member functions:
+ *    input_offset() -> base offset for the input (and flags) iterator
+ *    num_previously_selected() -> base offset for the output iterator for selected items
+ *    num_previously_rejected() -> base offset for the output iterator for rejected items (partition only)
+ *    num_total_items() -> total number of items across all partitions (partition only)
+ *    update_num_selected(d_num_sel_out, num_selected) -> invoked by last CTA with number of selected
  *
  * @tparam KEEP_REJECTS
  *   Whether or not we push rejected items to the back of the output
@@ -179,6 +184,7 @@ template <typename AgentSelectIfPolicyT,
           typename SelectOpT,
           typename EqualityOpT,
           typename OffsetT,
+          typename StreamingContextT,
           bool KEEP_REJECTS,
           bool MayAlias>
 struct AgentSelectIf
@@ -304,11 +310,12 @@ struct AgentSelectIf
 
   _TempStorage& temp_storage; ///< Reference to temp_storage
   WrappedInputIteratorT d_in; ///< Input items
-  OutputIteratorWrapperT d_selected_out; ///< Unique output items
+  OutputIteratorWrapperT d_selected_out; ///< Output iterator for the selected items
   WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable)
   InequalityWrapper<EqualityOpT> inequality_op; ///< T inequality operator
   SelectOpT select_op; ///< Selection operator
   OffsetT num_items; ///< Total number of input items
+  StreamingContextT streaming_context; ///< Context for the current partition
 
   //---------------------------------------------------------------------
   // Constructor
@@ -343,7 +350,8 @@ struct AgentSelectIf
     OutputIteratorWrapperT d_selected_out,
     SelectOpT select_op,
     EqualityOpT equality_op,
-    OffsetT num_items)
+    OffsetT num_items,
+    StreamingContextT streaming_context)
       : temp_storage(temp_storage.Alias())
       , d_in(d_in)
       , d_selected_out(d_selected_out)
@@ -351,6 +359,7 @@ struct AgentSelectIf
       , inequality_op(equality_op)
       , select_op(select_op)
       , num_items(num_items)
+      , streaming_context(streaming_context)
   {}
 
   //---------------------------------------------------------------------
@@ -395,6 +404,7 @@ struct AgentSelectIf
     CTA_SYNC();
 
     FlagT flags[ITEMS_PER_THREAD];
+    auto d_base_flags = d_flags_in + streaming_context.input_offset();
     if (IS_LAST_TILE)
     {
       // Initialize the out-of-bounds flags
@@ -404,11 +414,11 @@ struct AgentSelectIf
         selection_flags[ITEM] = true;
       }
       // Guarded loads
-      BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items);
+      BlockLoadFlags(temp_storage.load_flags).Load(d_base_flags + tile_offset, flags, num_tile_items);
     }
     else
     {
-      BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags);
+      BlockLoadFlags(temp_storage.load_flags).Load(d_base_flags + tile_offset, flags);
     }
 
 #pragma unroll
@@ -513,6 +523,9 @@ struct AgentSelectIf
     OffsetT (&selection_indices)[ITEMS_PER_THREAD],
     OffsetT num_selections)
   {
+    // Get the output iterator with a given base offset
+    auto d_base_selected_out = d_selected_out + streaming_context.num_previously_selected();
+
 // Scatter flagged items
 #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
@@ -521,7 +534,7 @@ struct AgentSelectIf
       {
         if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections)
         {
-          d_selected_out[selection_indices[ITEM]] = items[ITEM];
+          d_base_selected_out[selection_indices[ITEM]] = items[ITEM];
         }
       }
     }
@@ -568,9 +581,11 @@ struct AgentSelectIf
 
     CTA_SYNC();
 
+    // Get the output iterator with a given base offset
+    auto d_base_selected_out = d_selected_out + streaming_context.num_previously_selected();
     for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
     {
-      d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item];
+      d_base_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item];
     }
   }
 
@@ -683,8 +698,11 @@ struct AgentSelectIf
     int tile_num_rejections,
     OffsetT num_selections_prefix,
     OffsetT num_rejected_prefix,
-    detail::partition_distinct_output_t<SelectedItT, RejectedItT> partitioned_out_it_wrapper)
+    detail::partition_distinct_output_t<SelectedItT, RejectedItT> partitioned_out_wrapper)
   {
+    auto selected_out_it = partitioned_out_wrapper.selected_it + streaming_context.num_previously_selected();
+    auto rejected_out_it = partitioned_out_wrapper.rejected_it + streaming_context.num_previously_rejected();
+
 #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
     {
@@ -700,11 +718,11 @@ struct AgentSelectIf
       {
         if (item_idx >= tile_num_rejections)
         {
-          partitioned_out_it_wrapper.selected_it[scatter_offset] = item;
+          selected_out_it[scatter_offset] = item;
         }
         else
         {
-          partitioned_out_it_wrapper.rejected_it[scatter_offset] = item;
+          rejected_out_it[scatter_offset] = item;
         }
       }
     }
@@ -723,16 +741,19 @@ struct AgentSelectIf
     OffsetT num_rejected_prefix,
     PartitionedOutputItT partitioned_out_it)
   {
+    using total_offset_t           = typename StreamingContextT::total_num_items_t;
+    auto const selected_base_begin = streaming_context.num_previously_selected();
+    auto const rejected_base_end   = streaming_context.num_total_items() - streaming_context.num_previously_rejected();
 #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
     {
       int item_idx      = (ITEM * BLOCK_THREADS) + threadIdx.x;
       int rejection_idx = item_idx;
       int selection_idx = item_idx - tile_num_rejections;
-      OffsetT scatter_offset =
+      total_offset_t scatter_offset =
         (item_idx < tile_num_rejections)
-          ? num_items - num_rejected_prefix - rejection_idx - 1
-          : num_selections_prefix + selection_idx;
+          ? rejected_base_end - static_cast<total_offset_t>(num_rejected_prefix - rejection_idx - 1)
+          : selected_base_begin + num_selections_prefix + selection_idx;
 
       InputT item = temp_storage.raw_exchange.Alias()[item_idx];
 
@@ -771,13 +792,14 @@ struct AgentSelectIf
     OffsetT selection_indices[ITEMS_PER_THREAD];
 
     // Load items
+    auto d_base_in = d_in + streaming_context.input_offset();
     if (IS_LAST_TILE)
     {
-      BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
+      BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items, num_tile_items);
     }
     else
     {
-      BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
+      BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items);
     }
 
     // Initialize selection_flags
@@ -850,13 +872,14 @@ struct AgentSelectIf
     OffsetT selection_indices[ITEMS_PER_THREAD];
 
     // Load items
+    auto d_base_in = d_in + streaming_context.input_offset();
     if (IS_LAST_TILE)
     {
-      BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
+      BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items, num_tile_items);
     }
     else
     {
-      BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
+      BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items);
     }
 
     // Initialize selection_flags
@@ -960,7 +983,7 @@ struct AgentSelectIf
     auto tile_state_wrapper = MemoryOrderedTileStateT{tile_state};
 
     // Blocks are launched in increasing order, so just assign one tile per block
-    int tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
+    int tile_idx        = blockIdx.x;
     OffsetT tile_offset = static_cast<OffsetT>(tile_idx) * static_cast<OffsetT>(TILE_ITEMS);
 
     if (tile_idx < num_tiles - 1)
@@ -976,8 +999,8 @@ struct AgentSelectIf
 
       if (threadIdx.x == 0)
       {
-        // Output the total number of items selection_flags
-        *d_num_selected_out = num_selections;
+        // Update the number of selected items with this partition's selections
+        streaming_context.update_num_selected(d_num_selected_out, num_selections);
       }
     }
   }
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 50a2022184a..35717403c03 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -65,6 +65,85 @@ CUB_NAMESPACE_BEGIN
 
 namespace detail
 {
+
+template <typename TotalNumItemsT>
+class streaming_select_context_t
+{
+private:
+  TotalNumItemsT first_partition = true;
+  TotalNumItemsT last_partition  = false;
+  TotalNumItemsT total_num_items{};
+  TotalNumItemsT total_previous_num_items{};
+
+  // We use a double-buffer for keeping track of the number of previously selected items
+  // <partition index>: <buffer providing previously selected> -> <buffer to which num. selected is written>
+  // 0: '0'       -> buffer 1
+  // 1: buffer 1  -> buffer 0
+  // 2: buffer 0  -> buffer 1
+  // ...
+  TotalNumItemsT selector              = 0x00U;
+  TotalNumItemsT* d_num_selected_dbuff = nullptr;
+
+public:
+  using total_num_items_t = TotalNumItemsT;
+
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE streaming_select_context_t(
+    TotalNumItemsT* d_num_selected_dbuff, TotalNumItemsT total_num_items, bool is_last_partition)
+      : last_partition(is_last_partition)
+      , total_num_items(total_num_items)
+      , d_num_selected_dbuff(d_num_selected_dbuff)
+  {}
+
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void advance(TotalNumItemsT num_items, bool next_partition_is_the_last)
+  {
+    first_partition = false;
+    last_partition  = next_partition_is_the_last;
+    selector ^= 0x01;
+    total_previous_num_items += num_items;
+  };
+
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT input_offset()
+  {
+    return first_partition ? TotalNumItemsT{0} : total_previous_num_items;
+  };
+
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected()
+  {
+    if (threadIdx.x == 0 && blockIdx.x < 2)
+    {
+    }
+    return first_partition ? TotalNumItemsT{0} : d_num_selected_dbuff[selector];
+  };
+
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected()
+  {
+    if (threadIdx.x == 0 && blockIdx.x < 2)
+    {
+    }
+    return first_partition ? TotalNumItemsT{0} : (total_previous_num_items - num_previously_selected());
+  };
+
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items()
+  {
+    return total_num_items;
+  };
+
+  template <typename NumSelectedIteratorT, typename OffsetT>
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void
+  update_num_selected(NumSelectedIteratorT d_num_selected_out, OffsetT num_selections)
+  {
+    if (last_partition)
+    {
+      *d_num_selected_out = num_previously_selected() + static_cast<TotalNumItemsT>(num_selections);
+    }
+    else
+    {
+      d_num_selected_dbuff[(selector ^ 0x01U)] =
+        num_previously_selected() + static_cast<TotalNumItemsT>(num_selections);
+    }
+  };
+};
+
 /**
  * @brief Wrapper that partially specializes the `AgentSelectIf` on the non-type name parameter `KeepRejects`.
  */
@@ -79,7 +158,8 @@ struct agent_select_if_wrapper_t
             typename SelectedOutputIteratorT,
             typename SelectOpT,
             typename EqualityOpT,
-            typename OffsetT>
+            typename OffsetT,
+            typename StreamingContextT>
   struct agent_t
       : public AgentSelectIf<AgentSelectIfPolicyT,
                              InputIteratorT,
@@ -88,6 +168,7 @@ struct agent_select_if_wrapper_t
                              SelectOpT,
                              EqualityOpT,
                              OffsetT,
+                             StreamingContextT,
                              KeepRejects,
                              MayAlias>
   {
@@ -98,6 +179,7 @@ struct agent_select_if_wrapper_t
                         SelectOpT,
                         EqualityOpT,
                         OffsetT,
+                        StreamingContextT,
                         KeepRejects,
                         MayAlias>::AgentSelectIf;
   };
@@ -140,7 +222,15 @@ struct agent_select_if_wrapper_t
  *   to be used for selection)
  *
  * @tparam OffsetT
- *   Signed integer type for global offsets
+ *   Signed integer type for offsets within a partition
+ *
+ * @tparam StreamingContextT
+ *   Type providing the context information for the current partition, with the following member functions:
+ *    input_offset() -> base offset for the input (and flags) iterator
+ *    num_previously_selected() -> base offset for the output iterator for selected items
+ *    num_previously_rejected() -> base offset for the output iterator for rejected items (partition only)
+ *    num_total_items() -> total number of items across all partitions (partition only)
+ *    update_num_selected(d_num_sel_out, num_selected) -> invoked by last CTA with number of selected
  *
  * @tparam KEEP_REJECTS
  *   Whether or not we push rejected items to the back of the output
@@ -172,6 +262,9 @@ struct agent_select_if_wrapper_t
  * @param[in] num_tiles
  *   Total number of tiles for the entire problem
  *
+ * @param[in] streaming_context
+ *   The context information for the current partition
+ *
  * @param[in] vsmem
  *   Memory to support virtual shared memory
  */
@@ -184,6 +277,7 @@ template <typename ChainedPolicyT,
           typename SelectOpT,
           typename EqualityOpT,
           typename OffsetT,
+          typename StreamingContextT,
           bool KEEP_REJECTS,
           bool MayAlias>
 __launch_bounds__(int(
@@ -195,7 +289,8 @@ __launch_bounds__(int(
     SelectedOutputIteratorT,
     SelectOpT,
     EqualityOpT,
-    OffsetT>::agent_policy_t::BLOCK_THREADS))
+    OffsetT,
+    StreamingContextT>::agent_policy_t::BLOCK_THREADS))
   CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSelectSweepKernel(
     InputIteratorT d_in,
     FlagsInputIteratorT d_flags,
@@ -206,6 +301,7 @@ __launch_bounds__(int(
     EqualityOpT equality_op,
     OffsetT num_items,
     int num_tiles,
+    StreamingContextT streaming_context,
     cub::detail::vsmem_t vsmem)
 {
   using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t<
@@ -216,7 +312,8 @@ __launch_bounds__(int(
     SelectedOutputIteratorT,
     SelectOpT,
     EqualityOpT,
-    OffsetT>;
+    OffsetT,
+    StreamingContextT>;
 
   using AgentSelectIfPolicyT = typename VsmemHelperT::agent_policy_t;
 
@@ -230,7 +327,7 @@ __launch_bounds__(int(
   typename AgentSelectIfT::TempStorage& temp_storage = VsmemHelperT::get_temp_storage(static_temp_storage, vsmem);
 
   // Process tiles
-  AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items)
+  AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items, streaming_context)
     .ConsumeRange(num_tiles, tile_status, d_num_selected_out);
 
   // If applicable, hints to discard modified cache lines for vsmem
@@ -290,7 +387,16 @@ struct DispatchSelectIf : SelectedPolicy
   /******************************************************************************
    * Types and constants
    ******************************************************************************/
-  using ScanTileStateT = ScanTileState<OffsetT>;
+
+  // Offset type used to instantiate the stream compaction-kernel and agent to index the items within one partition
+  using PerPartitionOffsetT = ::cuda::std::int32_t;
+
+  // Offset type large enough to represent any index within the input and output iterators
+  using NumTotalItemsT = ::cuda::std::int64_t;
+
+  using streaming_context_t = detail::streaming_select_context_t<NumTotalItemsT>;
+
+  using ScanTileStateT = ScanTileState<PerPartitionOffsetT>;
 
   static constexpr int INIT_KERNEL_THREADS = 128;
 
@@ -408,15 +514,42 @@ struct DispatchSelectIf : SelectedPolicy
       SelectedOutputIteratorT,
       SelectOpT,
       EqualityOpT,
-      OffsetT>;
+      PerPartitionOffsetT,
+      streaming_context_t>;
+
+    // Return for empty problem (also needed to avoid division by zero)
+    // TODO(elstehle): In this case d_num_selected_out will never be written. Maybe we want to write it despite?
+    if (num_items == 0)
+    {
+      // If this was just to query temporary storage requirements, return non-empty bytes
+      if (d_temp_storage == nullptr)
+      {
+        temp_storage_bytes = std::size_t{1};
+      }
+      return cudaSuccess;
+    }
 
     cudaError error = cudaSuccess;
 
     constexpr auto block_threads    = VsmemHelperT::agent_policy_t::BLOCK_THREADS;
     constexpr auto items_per_thread = VsmemHelperT::agent_policy_t::ITEMS_PER_THREAD;
-    constexpr int tile_size         = block_threads * items_per_thread;
-    int num_tiles                   = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));
-    const auto vsmem_size           = num_tiles * VsmemHelperT::vsmem_per_block;
+    constexpr auto tile_size        = static_cast<OffsetT>(block_threads * items_per_thread);
+
+    // The maximum number of items for which we will ever invoke the kernel (i.e. largest partition size)
+    auto const max_partition_size =
+      num_items > static_cast<OffsetT>(cuda::std::numeric_limits<PerPartitionOffsetT>::max())
+        ? static_cast<OffsetT>(cuda::std::numeric_limits<PerPartitionOffsetT>::max())
+        : num_items;
+
+    // The number of partitions required to "iterate" over the total input
+    auto const num_partitions = cub::DivideAndRoundUp(num_items, max_partition_size);
+
+    // The maximum number of tiles for which we will ever invoke the kernel
+    auto const max_num_tiles_per_invocation =
+      static_cast<OffsetT>(cub::DivideAndRoundUp(max_partition_size, tile_size));
+
+    // The amount of virtual shared memory to allocate
+    const auto vsmem_size = max_num_tiles_per_invocation * VsmemHelperT::vsmem_per_block;
 
     do
     {
@@ -429,17 +562,19 @@ struct DispatchSelectIf : SelectedPolicy
       }
 
       // Specify temporary storage allocation requirements
-      size_t allocation_sizes[2] = {0ULL, vsmem_size};
+      ::cuda::std::size_t streaming_selection_storage_bytes =
+        (num_partitions > 1) ? 2 * sizeof(NumTotalItemsT) : ::cuda::std::size_t{0};
+      ::cuda::std::size_t allocation_sizes[3] = {0ULL, vsmem_size, streaming_selection_storage_bytes};
 
-      // bytes needed for tile status descriptors
-      error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]));
+      // Bytes needed for tile status descriptors
+      error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles_per_invocation, allocation_sizes[0]));
       if (cudaSuccess != error)
       {
         break;
       }
 
       // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
-      void* allocations[2] = {};
+      void* allocations[3] = {};
 
       error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
       if (cudaSuccess != error)
@@ -453,111 +588,109 @@ struct DispatchSelectIf : SelectedPolicy
         break;
       }
 
-      // Construct the tile status interface
-      ScanTileStateT tile_status;
-      error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]));
-      if (cudaSuccess != error)
-      {
-        break;
-      }
-
-      // Log scan_init_kernel configuration
-      int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS));
-
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
-      _CubLog(
-        "Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+      // Initialize the streaming context with the temporary storage for double-buffering the previously selected items
+      // and the total number (across all partitions) of items
+      streaming_context_t streaming_context{
+        reinterpret_cast<NumTotalItemsT*>(allocations[2]), num_items, (num_partitions <= 1)};
 
-      // Invoke scan_init_kernel to initialize tile descriptors
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
-        .doit(scan_init_kernel, tile_status, num_tiles, d_num_selected_out);
-
-      // Check for failure to launch
-      error = CubDebug(cudaPeekAtLastError());
-      if (cudaSuccess != error)
+      // Iterate over the partitions until all input is processed
+      for (OffsetT partition_idx = 0; partition_idx < num_partitions; partition_idx++)
       {
-        break;
-      }
-
-      // Sync the stream if specified to flush runtime errors
-      error = CubDebug(detail::DebugSyncStream(stream));
-      if (cudaSuccess != error)
-      {
-        break;
-      }
+        OffsetT current_partition_offset = partition_idx * max_partition_size;
+        OffsetT current_num_items =
+          (partition_idx + 1 == num_partitions) ? (num_items - current_partition_offset) : max_partition_size;
+
+        // Construct the tile status interface
+        const auto current_num_tiles = static_cast<int>(cub::DivideAndRoundUp(current_num_items, tile_size));
+        ScanTileStateT tile_status;
+        error = CubDebug(tile_status.Init(current_num_tiles, allocations[0], allocation_sizes[0]));
+        if (cudaSuccess != error)
+        {
+          break;
+        }
 
-      // Return if empty problem
-      if (num_items == 0)
-      {
-        break;
-      }
+        // Log scan_init_kernel configuration
+        int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(current_num_tiles, INIT_KERNEL_THREADS));
 
-      // Get max x-dimension of grid
-      int max_dim_x;
-      error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal));
-      if (cudaSuccess != error)
-      {
-        break;
-      }
+#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+        _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n",
+                init_grid_size,
+                INIT_KERNEL_THREADS,
+                (long long) stream);
+#endif
+
+        // Invoke scan_init_kernel to initialize tile descriptors
+        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
+          .doit(scan_init_kernel, tile_status, current_num_tiles, d_num_selected_out);
+
+        // Check for failure to launch
+        error = CubDebug(cudaPeekAtLastError());
+        if (cudaSuccess != error)
+        {
+          break;
+        }
 
-      // Get grid size for scanning tiles
-      dim3 scan_grid_size;
-      scan_grid_size.z = 1;
-      scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x);
-      scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+        // Sync the stream if specified to flush runtime errors
+        error = CubDebug(detail::DebugSyncStream(stream));
+        if (cudaSuccess != error)
+        {
+          break;
+        }
 
 // Log select_if_kernel configuration
 #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
-      {
-        // Get SM occupancy for select_if_kernel
-        int range_select_sm_occupancy;
-        error = CubDebug(MaxSmOccupancy(range_select_sm_occupancy, // out
-                                        select_if_kernel,
-                                        block_threads));
+        {
+          // Get SM occupancy for select_if_kernel
+          int range_select_sm_occupancy;
+          error = CubDebug(MaxSmOccupancy(range_select_sm_occupancy, // out
+                                          select_if_kernel,
+                                          block_threads));
+          if (cudaSuccess != error)
+          {
+            break;
+          }
+
+          _CubLog("Invoking select_if_kernel<<<%d, %d, 0, "
+                  "%lld>>>(), %d items per thread, %d SM occupancy\n",
+                  current_num_tiles,
+                  block_threads,
+                  (long long) stream,
+                  items_per_thread,
+                  range_select_sm_occupancy);
+        }
+#endif
+
+        // Invoke select_if_kernel
+        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(current_num_tiles, block_threads, 0, stream)
+          .doit(select_if_kernel,
+                d_in,
+                d_flags,
+                d_selected_out,
+                d_num_selected_out,
+                tile_status,
+                select_op,
+                equality_op,
+                static_cast<PerPartitionOffsetT>(current_num_items),
+                current_num_tiles,
+                streaming_context,
+                cub::detail::vsmem_t{allocations[1]});
+
+        // Check for failure to launch
+        error = CubDebug(cudaPeekAtLastError());
         if (cudaSuccess != error)
         {
           break;
         }
 
-        _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, "
-                "%lld>>>(), %d items per thread, %d SM occupancy\n",
-                scan_grid_size.x,
-                scan_grid_size.y,
-                scan_grid_size.z,
-                block_threads,
-                (long long) stream,
-                items_per_thread,
-                range_select_sm_occupancy);
-      }
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
-
-      // Invoke select_if_kernel
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
-        .doit(select_if_kernel,
-              d_in,
-              d_flags,
-              d_selected_out,
-              d_num_selected_out,
-              tile_status,
-              select_op,
-              equality_op,
-              num_items,
-              num_tiles,
-              cub::detail::vsmem_t{allocations[1]});
-
-      // Check for failure to launch
-      error = CubDebug(cudaPeekAtLastError());
-      if (cudaSuccess != error)
-      {
-        break;
-      }
+        // Sync the stream if specified to flush runtime errors
+        error = CubDebug(detail::DebugSyncStream(stream));
+        if (cudaSuccess != error)
+        {
+          break;
+        }
 
-      // Sync the stream if specified to flush runtime errors
-      error = CubDebug(detail::DebugSyncStream(stream));
-      if (cudaSuccess != error)
-      {
-        break;
+        // Prepare streaming context for next partition (swap double buffers, advance number of processed items, etc.)
+        streaming_context.advance(current_num_items, (partition_idx + OffsetT{2} == num_partitions));
       }
     } while (0);
 
@@ -580,7 +713,8 @@ struct DispatchSelectIf : SelectedPolicy
         ScanTileStateT,
         SelectOpT,
         EqualityOpT,
-        OffsetT,
+        PerPartitionOffsetT,
+        streaming_context_t,
         KEEP_REJECTS,
         MayAlias>);
   }
@@ -685,7 +819,7 @@ struct DispatchSelectIf : SelectedPolicy
       num_items,
       stream);
   }
-#endif // DOXYGEN_SHOULD_SKIP_THIS
 };
+#endif // DOXYGEN_SHOULD_SKIP_THIS
 
 CUB_NAMESPACE_END

From 3085f8671b5b69db5acc8e5f297cacce4a5a84ee Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Sun, 8 Sep 2024 22:43:56 -0700
Subject: [PATCH 02/41] ensures policy lookup uses per-partition offset type

---
 .../device/dispatch/dispatch_select_if.cuh    | 40 ++++++++++++-------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 35717403c03..2a30ad2f7ed 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -66,6 +66,14 @@ CUB_NAMESPACE_BEGIN
 namespace detail
 {
 
+namespace select
+{
+// Offset type used to instantiate the stream compaction-kernel and agent to index the items within one partition
+using per_partition_offset_t = ::cuda::std::int32_t;
+
+// Offset type large enough to represent any index within the input and output iterators
+using num_total_items_t = ::cuda::std::int64_t;
+
 template <typename TotalNumItemsT>
 class streaming_select_context_t
 {
@@ -184,6 +192,7 @@ struct agent_select_if_wrapper_t
                         MayAlias>::AgentSelectIf;
   };
 };
+} // namespace select
 } // namespace detail
 
 /******************************************************************************
@@ -283,7 +292,7 @@ template <typename ChainedPolicyT,
 __launch_bounds__(int(
   cub::detail::vsmem_helper_default_fallback_policy_t<
     typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT,
-    detail::agent_select_if_wrapper_t<KEEP_REJECTS, MayAlias>::template agent_t,
+    detail::select::agent_select_if_wrapper_t<KEEP_REJECTS, MayAlias>::template agent_t,
     InputIteratorT,
     FlagsInputIteratorT,
     SelectedOutputIteratorT,
@@ -306,7 +315,7 @@ __launch_bounds__(int(
 {
   using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t<
     typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT,
-    detail::agent_select_if_wrapper_t<KEEP_REJECTS, MayAlias>::template agent_t,
+    detail::select::agent_select_if_wrapper_t<KEEP_REJECTS, MayAlias>::template agent_t,
     InputIteratorT,
     FlagsInputIteratorT,
     SelectedOutputIteratorT,
@@ -379,7 +388,7 @@ template <typename InputIteratorT,
           bool MayAlias           = false,
           typename SelectedPolicy = detail::device_select_policy_hub<cub::detail::value_t<InputIteratorT>,
                                                                      cub::detail::value_t<FlagsInputIteratorT>,
-                                                                     OffsetT,
+                                                                     detail::select::per_partition_offset_t,
                                                                      MayAlias,
                                                                      KEEP_REJECTS>>
 struct DispatchSelectIf : SelectedPolicy
@@ -389,14 +398,15 @@ struct DispatchSelectIf : SelectedPolicy
    ******************************************************************************/
 
   // Offset type used to instantiate the stream compaction-kernel and agent to index the items within one partition
-  using PerPartitionOffsetT = ::cuda::std::int32_t;
+  using per_partition_offset_t = detail::select::per_partition_offset_t;
 
   // Offset type large enough to represent any index within the input and output iterators
-  using NumTotalItemsT = ::cuda::std::int64_t;
+  using num_total_items_t = detail::select::num_total_items_t;
 
-  using streaming_context_t = detail::streaming_select_context_t<NumTotalItemsT>;
+  // Type used to provide streaming information about each partition's context
+  using streaming_context_t = detail::select::streaming_select_context_t<num_total_items_t>;
 
-  using ScanTileStateT = ScanTileState<PerPartitionOffsetT>;
+  using ScanTileStateT = ScanTileState<per_partition_offset_t>;
 
   static constexpr int INIT_KERNEL_THREADS = 128;
 
@@ -508,13 +518,13 @@ struct DispatchSelectIf : SelectedPolicy
 
     using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t<
       Policy,
-      detail::agent_select_if_wrapper_t<KEEP_REJECTS, MayAlias>::template agent_t,
+      detail::select::agent_select_if_wrapper_t<KEEP_REJECTS, MayAlias>::template agent_t,
       InputIteratorT,
       FlagsInputIteratorT,
       SelectedOutputIteratorT,
       SelectOpT,
       EqualityOpT,
-      PerPartitionOffsetT,
+      per_partition_offset_t,
       streaming_context_t>;
 
     // Return for empty problem (also needed to avoid division by zero)
@@ -537,8 +547,8 @@ struct DispatchSelectIf : SelectedPolicy
 
     // The maximum number of items for which we will ever invoke the kernel (i.e. largest partition size)
     auto const max_partition_size =
-      num_items > static_cast<OffsetT>(cuda::std::numeric_limits<PerPartitionOffsetT>::max())
-        ? static_cast<OffsetT>(cuda::std::numeric_limits<PerPartitionOffsetT>::max())
+      num_items > static_cast<OffsetT>(cuda::std::numeric_limits<per_partition_offset_t>::max())
+        ? static_cast<OffsetT>(cuda::std::numeric_limits<per_partition_offset_t>::max())
         : num_items;
 
     // The number of partitions required to "iterate" over the total input
@@ -563,7 +573,7 @@ struct DispatchSelectIf : SelectedPolicy
 
       // Specify temporary storage allocation requirements
       ::cuda::std::size_t streaming_selection_storage_bytes =
-        (num_partitions > 1) ? 2 * sizeof(NumTotalItemsT) : ::cuda::std::size_t{0};
+        (num_partitions > 1) ? 2 * sizeof(num_total_items_t) : ::cuda::std::size_t{0};
       ::cuda::std::size_t allocation_sizes[3] = {0ULL, vsmem_size, streaming_selection_storage_bytes};
 
       // Bytes needed for tile status descriptors
@@ -591,7 +601,7 @@ struct DispatchSelectIf : SelectedPolicy
       // Initialize the streaming context with the temporary storage for double-buffering the previously selected items
       // and the total number (across all partitions) of items
       streaming_context_t streaming_context{
-        reinterpret_cast<NumTotalItemsT*>(allocations[2]), num_items, (num_partitions <= 1)};
+        reinterpret_cast<num_total_items_t*>(allocations[2]), num_items, (num_partitions <= 1)};
 
       // Iterate over the partitions until all input is processed
       for (OffsetT partition_idx = 0; partition_idx < num_partitions; partition_idx++)
@@ -670,7 +680,7 @@ struct DispatchSelectIf : SelectedPolicy
                 tile_status,
                 select_op,
                 equality_op,
-                static_cast<PerPartitionOffsetT>(current_num_items),
+                static_cast<per_partition_offset_t>(current_num_items),
                 current_num_tiles,
                 streaming_context,
                 cub::detail::vsmem_t{allocations[1]});
@@ -713,7 +723,7 @@ struct DispatchSelectIf : SelectedPolicy
         ScanTileStateT,
         SelectOpT,
         EqualityOpT,
-        PerPartitionOffsetT,
+        per_partition_offset_t,
         streaming_context_t,
         KEEP_REJECTS,
         MayAlias>);

From 7f264ecf0b3a661cf69a3cbc72d6f32bf441a639 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Mon, 9 Sep 2024 22:40:51 -0700
Subject: [PATCH 03/41] mitigates perf degradation on select

---
 cub/cub/agent/agent_select_if.cuh             | 34 +++++-------
 .../device/dispatch/dispatch_select_if.cuh    | 55 ++++++++-----------
 2 files changed, 39 insertions(+), 50 deletions(-)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index fd9e2bbbc61..1dc70230863 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -315,7 +315,7 @@ struct AgentSelectIf
   InequalityWrapper<EqualityOpT> inequality_op; ///< T inequality operator
   SelectOpT select_op; ///< Selection operator
   OffsetT num_items; ///< Total number of input items
-  StreamingContextT streaming_context; ///< Context for the current partition
+  const StreamingContextT& streaming_context; ///< Context for the current partition
 
   //---------------------------------------------------------------------
   // Constructor
@@ -351,7 +351,7 @@ struct AgentSelectIf
     SelectOpT select_op,
     EqualityOpT equality_op,
     OffsetT num_items,
-    StreamingContextT streaming_context)
+    const StreamingContextT& streaming_context)
       : temp_storage(temp_storage.Alias())
       , d_in(d_in)
       , d_selected_out(d_selected_out)
@@ -404,7 +404,6 @@ struct AgentSelectIf
     CTA_SYNC();
 
     FlagT flags[ITEMS_PER_THREAD];
-    auto d_base_flags = d_flags_in + streaming_context.input_offset();
     if (IS_LAST_TILE)
     {
       // Initialize the out-of-bounds flags
@@ -414,11 +413,12 @@ struct AgentSelectIf
         selection_flags[ITEM] = true;
       }
       // Guarded loads
-      BlockLoadFlags(temp_storage.load_flags).Load(d_base_flags + tile_offset, flags, num_tile_items);
+      BlockLoadFlags(temp_storage.load_flags)
+        .Load((d_flags_in + streaming_context.input_offset()) + tile_offset, flags, num_tile_items);
     }
     else
     {
-      BlockLoadFlags(temp_storage.load_flags).Load(d_base_flags + tile_offset, flags);
+      BlockLoadFlags(temp_storage.load_flags).Load((d_flags_in + streaming_context.input_offset()) + tile_offset, flags);
     }
 
 #pragma unroll
@@ -523,9 +523,6 @@ struct AgentSelectIf
     OffsetT (&selection_indices)[ITEMS_PER_THREAD],
     OffsetT num_selections)
   {
-    // Get the output iterator with a given base offset
-    auto d_base_selected_out = d_selected_out + streaming_context.num_previously_selected();
-
 // Scatter flagged items
 #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
@@ -534,7 +531,7 @@ struct AgentSelectIf
       {
         if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections)
         {
-          d_base_selected_out[selection_indices[ITEM]] = items[ITEM];
+          *((d_selected_out + streaming_context.num_previously_selected()) + selection_indices[ITEM]) = items[ITEM];
         }
       }
     }
@@ -581,11 +578,10 @@ struct AgentSelectIf
 
     CTA_SYNC();
 
-    // Get the output iterator with a given base offset
-    auto d_base_selected_out = d_selected_out + streaming_context.num_previously_selected();
     for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
     {
-      d_base_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item];
+      *((d_selected_out + streaming_context.num_previously_selected()) + (num_selections_prefix + item)) =
+        temp_storage.raw_exchange.Alias()[item];
     }
   }
 
@@ -752,7 +748,7 @@ struct AgentSelectIf
       int selection_idx = item_idx - tile_num_rejections;
       total_offset_t scatter_offset =
         (item_idx < tile_num_rejections)
-          ? rejected_base_end - static_cast<total_offset_t>(num_rejected_prefix - rejection_idx - 1)
+          ? rejected_base_end - static_cast<total_offset_t>(num_rejected_prefix + rejection_idx + 1)
           : selected_base_begin + num_selections_prefix + selection_idx;
 
       InputT item = temp_storage.raw_exchange.Alias()[item_idx];
@@ -792,14 +788,14 @@ struct AgentSelectIf
     OffsetT selection_indices[ITEMS_PER_THREAD];
 
     // Load items
-    auto d_base_in = d_in + streaming_context.input_offset();
     if (IS_LAST_TILE)
     {
-      BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items, num_tile_items);
+      BlockLoadT(temp_storage.load_items)
+        .Load((d_in + streaming_context.input_offset()) + tile_offset, items, num_tile_items);
     }
     else
     {
-      BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items);
+      BlockLoadT(temp_storage.load_items).Load((d_in + streaming_context.input_offset()) + tile_offset, items);
     }
 
     // Initialize selection_flags
@@ -872,14 +868,14 @@ struct AgentSelectIf
     OffsetT selection_indices[ITEMS_PER_THREAD];
 
     // Load items
-    auto d_base_in = d_in + streaming_context.input_offset();
     if (IS_LAST_TILE)
     {
-      BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items, num_tile_items);
+      BlockLoadT(temp_storage.load_items)
+        .Load((d_in + streaming_context.input_offset()) + tile_offset, items, num_tile_items);
     }
     else
     {
-      BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items);
+      BlockLoadT(temp_storage.load_items).Load((d_in + streaming_context.input_offset()) + tile_offset, items);
     }
 
     // Initialize selection_flags
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 2a30ad2f7ed..27e5a68a1f3 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -75,79 +75,71 @@ using per_partition_offset_t = ::cuda::std::int32_t;
 using num_total_items_t = ::cuda::std::int64_t;
 
 template <typename TotalNumItemsT>
-class streaming_select_context_t
+class streaming_context_t
 {
 private:
-  TotalNumItemsT first_partition = true;
-  TotalNumItemsT last_partition  = false;
+  bool first_partition = true;
+  bool last_partition  = false;
   TotalNumItemsT total_num_items{};
   TotalNumItemsT total_previous_num_items{};
 
   // We use a double-buffer for keeping track of the number of previously selected items
-  // <partition index>: <buffer providing previously selected> -> <buffer to which num. selected is written>
-  // 0: '0'       -> buffer 1
-  // 1: buffer 1  -> buffer 0
-  // 2: buffer 0  -> buffer 1
-  // ...
-  TotalNumItemsT selector              = 0x00U;
-  TotalNumItemsT* d_num_selected_dbuff = nullptr;
+  TotalNumItemsT* d_num_selected_in  = nullptr;
+  TotalNumItemsT* d_num_selected_out = nullptr;
 
 public:
   using total_num_items_t = TotalNumItemsT;
 
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE streaming_select_context_t(
-    TotalNumItemsT* d_num_selected_dbuff, TotalNumItemsT total_num_items, bool is_last_partition)
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE streaming_context_t(
+    TotalNumItemsT* d_num_selected_in,
+    TotalNumItemsT* d_num_selected_out,
+    TotalNumItemsT total_num_items,
+    bool is_last_partition)
       : last_partition(is_last_partition)
       , total_num_items(total_num_items)
-      , d_num_selected_dbuff(d_num_selected_dbuff)
+      , d_num_selected_in(d_num_selected_in)
+      , d_num_selected_out(d_num_selected_out)
   {}
 
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void advance(TotalNumItemsT num_items, bool next_partition_is_the_last)
   {
+    ::cuda::std::swap(d_num_selected_in, d_num_selected_out);
     first_partition = false;
     last_partition  = next_partition_is_the_last;
-    selector ^= 0x01;
     total_previous_num_items += num_items;
   };
 
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT input_offset()
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT input_offset() const
   {
     return first_partition ? TotalNumItemsT{0} : total_previous_num_items;
   };
 
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected()
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() const
   {
-    if (threadIdx.x == 0 && blockIdx.x < 2)
-    {
-    }
-    return first_partition ? TotalNumItemsT{0} : d_num_selected_dbuff[selector];
+    return first_partition ? TotalNumItemsT{0} : *d_num_selected_in;
   };
 
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected()
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected() const
   {
-    if (threadIdx.x == 0 && blockIdx.x < 2)
-    {
-    }
     return first_partition ? TotalNumItemsT{0} : (total_previous_num_items - num_previously_selected());
   };
 
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items()
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items() const
   {
     return total_num_items;
   };
 
   template <typename NumSelectedIteratorT, typename OffsetT>
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void
-  update_num_selected(NumSelectedIteratorT d_num_selected_out, OffsetT num_selections)
+  update_num_selected(NumSelectedIteratorT user_num_selected_out_it, OffsetT num_selections) const
   {
     if (last_partition)
     {
-      *d_num_selected_out = num_previously_selected() + static_cast<TotalNumItemsT>(num_selections);
+      *user_num_selected_out_it = num_previously_selected() + static_cast<TotalNumItemsT>(num_selections);
     }
     else
     {
-      d_num_selected_dbuff[(selector ^ 0x01U)] =
-        num_previously_selected() + static_cast<TotalNumItemsT>(num_selections);
+      *d_num_selected_out = num_previously_selected() + static_cast<TotalNumItemsT>(num_selections);
     }
   };
 };
@@ -404,7 +396,7 @@ struct DispatchSelectIf : SelectedPolicy
   using num_total_items_t = detail::select::num_total_items_t;
 
   // Type used to provide streaming information about each partition's context
-  using streaming_context_t = detail::select::streaming_select_context_t<num_total_items_t>;
+  using streaming_context_t = detail::select::streaming_context_t<num_total_items_t>;
 
   using ScanTileStateT = ScanTileState<per_partition_offset_t>;
 
@@ -600,8 +592,9 @@ struct DispatchSelectIf : SelectedPolicy
 
       // Initialize the streaming context with the temporary storage for double-buffering the previously selected items
       // and the total number (across all partitions) of items
+      num_total_items_t* tmp_num_selected_out = reinterpret_cast<num_total_items_t*>(allocations[2]);
       streaming_context_t streaming_context{
-        reinterpret_cast<num_total_items_t*>(allocations[2]), num_items, (num_partitions <= 1)};
+        tmp_num_selected_out, (tmp_num_selected_out + 1), num_items, (num_partitions <= 1)};
 
       // Iterate over the partitions until all input is processed
       for (OffsetT partition_idx = 0; partition_idx < num_partitions; partition_idx++)

From 412ea3c361bfac297e7a938f1902335060c33deb Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 10 Sep 2024 00:56:15 -0700
Subject: [PATCH 04/41] makes device interfaces use i64 num_items

---
 cub/cub/device/device_partition.cuh | 32 ++++++++++++------------
 cub/cub/device/device_select.cuh    | 38 ++++++++++++++---------------
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh
index 28bfc377bdc..bd0873634dc 100644
--- a/cub/cub/device/device_partition.cuh
+++ b/cub/cub/device/device_partition.cuh
@@ -177,11 +177,11 @@ struct DevicePartition
     FlagIterator d_flags,
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
-    int num_items,
+    ::cuda::std::int64_t num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::Flagged");
-    using OffsetT    = int; // Signed integer type for global offsets
+    using OffsetT    = ::cuda::std::int64_t; // Signed integer type for global offsets
     using SelectOp   = NullType; // Selection op (not used)
     using EqualityOp = NullType; // Equality operator (not used)
     using DispatchSelectIfT =
@@ -216,7 +216,7 @@ struct DevicePartition
     FlagIterator d_flags,
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
-    int num_items,
+    ::cuda::std::int64_t num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -254,10 +254,10 @@ struct DevicePartition
   //!    {
   //!        int compare;
   //!
-  //!        CUB_RUNTIME_FUNCTION __forceinline__
+  //!        __host__ __device__ __forceinline__
   //!        explicit LessThan(int compare) : compare(compare) {}
   //!
-  //!        CUB_RUNTIME_FUNCTION __forceinline__
+  //!        __host__ __device__ __forceinline__
   //!        bool operator()(const int &a) const
   //!        {
   //!            return (a < compare);
@@ -338,12 +338,12 @@ struct DevicePartition
      InputIteratorT d_in,
      OutputIteratorT d_out,
      NumSelectedIteratorT d_num_selected_out,
-     int num_items,
+     ::cuda::std::int64_t num_items,
      SelectOp select_op,
      cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::If");
-    using OffsetT      = int; // Signed integer type for global offsets
+    using OffsetT      = ::cuda::std::int64_t; // Signed integer type for global offsets
     using FlagIterator = NullType*; // FlagT iterator type (not used)
     using EqualityOp   = NullType; // Equality operator (not used)
 
@@ -378,7 +378,7 @@ struct DevicePartition
      InputIteratorT d_in,
      OutputIteratorT d_out,
      NumSelectedIteratorT d_num_selected_out,
-     int num_items,
+     ::cuda::std::int64_t num_items,
      SelectOp select_op,
      cudaStream_t stream,
      bool debug_synchronous)
@@ -416,12 +416,12 @@ private:
     SecondOutputIteratorT d_second_part_out,
     UnselectedOutputIteratorT d_unselected_out,
     NumSelectedIteratorT d_num_selected_out,
-    int num_items,
+    ::cuda::std::int64_t num_items,
     SelectFirstPartOp select_first_part_op,
     SelectSecondPartOp select_second_part_op,
     cudaStream_t stream = 0)
   {
-    using OffsetT                      = int;
+    using OffsetT                      = ::cuda::std::int64_t;
     using DispatchThreeWayPartitionIfT = DispatchThreeWayPartitionIf<
       InputIteratorT,
       FirstOutputIteratorT,
@@ -500,10 +500,10 @@ public:
   //!    {
   //!        int compare;
   //!
-  //!        CUB_RUNTIME_FUNCTION __forceinline__
+  //!        __host__ __device__ __forceinline__
   //!        explicit LessThan(int compare) : compare(compare) {}
   //!
-  //!        CUB_RUNTIME_FUNCTION __forceinline__
+  //!        __host__ __device__ __forceinline__
   //!        bool operator()(const int &a) const
   //!        {
   //!            return a < compare;
@@ -515,10 +515,10 @@ public:
   //!    {
   //!        int compare;
   //!
-  //!        CUB_RUNTIME_FUNCTION __forceinline__
+  //!        __host__ __device__ __forceinline__
   //!        explicit GreaterThan(int compare) : compare(compare) {}
   //!
-  //!        CUB_RUNTIME_FUNCTION __forceinline__
+  //!        __host__ __device__ __forceinline__
   //!        bool operator()(const int &a) const
   //!        {
   //!            return a > compare;
@@ -641,7 +641,7 @@ public:
      SecondOutputIteratorT d_second_part_out,
      UnselectedOutputIteratorT d_unselected_out,
      NumSelectedIteratorT d_num_selected_out,
-     int num_items,
+     ::cuda::std::int64_t num_items,
      SelectFirstPartOp select_first_part_op,
      SelectSecondPartOp select_second_part_op,
      cudaStream_t stream = 0)
@@ -677,7 +677,7 @@ public:
      SecondOutputIteratorT d_second_part_out,
      UnselectedOutputIteratorT d_unselected_out,
      NumSelectedIteratorT d_num_selected_out,
-     int num_items,
+     ::cuda::std::int64_t num_items,
      SelectFirstPartOp select_first_part_op,
      SelectSecondPartOp select_second_part_op,
      cudaStream_t stream,
diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh
index 332bbe6c7d2..e24526cd821 100644
--- a/cub/cub/device/device_select.cuh
+++ b/cub/cub/device/device_select.cuh
@@ -174,12 +174,12 @@ struct DeviceSelect
     FlagIterator d_flags,
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
-    int num_items,
+    ::cuda::std::int64_t num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Flagged");
 
-    using OffsetT    = int; // Signed integer type for global offsets
+    using OffsetT    = ::cuda::std::int64_t; // Signed integer type for global offsets
     using SelectOp   = NullType; // Selection op (not used)
     using EqualityOp = NullType; // Equality operator (not used)
 
@@ -212,7 +212,7 @@ struct DeviceSelect
     FlagIterator d_flags,
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
-    int num_items,
+    ::cuda::std::int64_t num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -309,12 +309,12 @@ struct DeviceSelect
     IteratorT d_data,
     FlagIterator d_flags,
     NumSelectedIteratorT d_num_selected_out,
-    int num_items,
+    ::cuda::std::int64_t num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Flagged");
 
-    using OffsetT    = int; // Signed integer type for global offsets
+    using OffsetT    = ::cuda::std::int64_t; // Signed integer type for global offsets
     using SelectOp   = NullType; // Selection op (not used)
     using EqualityOp = NullType; // Equality operator (not used)
 
@@ -349,7 +349,7 @@ struct DeviceSelect
     IteratorT d_data,
     FlagIterator d_flags,
     NumSelectedIteratorT d_num_selected_out,
-    int num_items,
+    ::cuda::std::int64_t num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -468,13 +468,13 @@ struct DeviceSelect
      InputIteratorT d_in,
      OutputIteratorT d_out,
      NumSelectedIteratorT d_num_selected_out,
-     int num_items,
+     ::cuda::std::int64_t num_items,
      SelectOp select_op,
      cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::If");
 
-    using OffsetT      = int; // Signed integer type for global offsets
+    using OffsetT      = ::cuda::std::int64_t; // Signed integer type for global offsets
     using FlagIterator = NullType*; // FlagT iterator type (not used)
     using EqualityOp   = NullType; // Equality operator (not used)
 
@@ -506,7 +506,7 @@ struct DeviceSelect
      InputIteratorT d_in,
      OutputIteratorT d_out,
      NumSelectedIteratorT d_num_selected_out,
-     int num_items,
+     ::cuda::std::int64_t num_items,
      SelectOp select_op,
      cudaStream_t stream,
      bool debug_synchronous)
@@ -615,13 +615,13 @@ struct DeviceSelect
      size_t& temp_storage_bytes,
      IteratorT d_data,
      NumSelectedIteratorT d_num_selected_out,
-     int num_items,
+     ::cuda::std::int64_t num_items,
      SelectOp select_op,
      cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::If");
 
-    using OffsetT      = int; // Signed integer type for global offsets
+    using OffsetT      = ::cuda::std::int64_t; // Signed integer type for global offsets
     using FlagIterator = NullType*; // FlagT iterator type (not used)
     using EqualityOp   = NullType; // Equality operator (not used)
 
@@ -655,7 +655,7 @@ struct DeviceSelect
      size_t& temp_storage_bytes,
      IteratorT d_data,
      NumSelectedIteratorT d_num_selected_out,
-     int num_items,
+     ::cuda::std::int64_t num_items,
      SelectOp select_op,
      cudaStream_t stream,
      bool debug_synchronous)
@@ -756,13 +756,13 @@ struct DeviceSelect
     FlagIterator d_flags,
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
-    int num_items,
+    ::cuda::std::int64_t num_items,
     SelectOp select_op,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::FlaggedIf");
 
-    using OffsetT    = int; // Signed integer type for global offsets
+    using OffsetT    = ::cuda::std::int64_t; // Signed integer type for global offsets
     using EqualityOp = NullType; // Equality operator (not used)
 
     return DispatchSelectIf<
@@ -861,13 +861,13 @@ struct DeviceSelect
     IteratorT d_data,
     FlagIterator d_flags,
     NumSelectedIteratorT d_num_selected_out,
-    int num_items,
+    ::cuda::std::int64_t num_items,
     SelectOp select_op,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::FlaggedIf");
 
-    using OffsetT    = int; // Signed integer type for global offsets
+    using OffsetT    = ::cuda::std::int64_t; // Signed integer type for global offsets
     using EqualityOp = NullType; // Equality operator (not used)
 
     constexpr bool may_alias = true;
@@ -981,12 +981,12 @@ struct DeviceSelect
     InputIteratorT d_in,
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
-    int num_items,
+    ::cuda::std::int64_t num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Unique");
 
-    using OffsetT      = int; // Signed integer type for global offsets
+    using OffsetT      = ::cuda::std::int64_t; // Signed integer type for global offsets
     using FlagIterator = NullType*; // FlagT iterator type (not used)
     using SelectOp     = NullType; // Selection op (not used)
     using EqualityOp   = Equality; // Default == operator
@@ -1019,7 +1019,7 @@ struct DeviceSelect
     InputIteratorT d_in,
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
-    int num_items,
+    ::cuda::std::int64_t num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {

From 70d562bc6f27f03c5d158b13a3ddcccb36944c06 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 10 Sep 2024 22:58:47 -0700
Subject: [PATCH 05/41] updates select::if large num_items tests

---
 cub/test/catch2_test_device_select_if.cu | 125 +++++++++--------------
 1 file changed, 47 insertions(+), 78 deletions(-)

diff --git a/cub/test/catch2_test_device_select_if.cu b/cub/test/catch2_test_device_select_if.cu
index e38f9957d6b..5d6a5421ec7 100644
--- a/cub/test/catch2_test_device_select_if.cu
+++ b/cub/test/catch2_test_device_select_if.cu
@@ -45,48 +45,7 @@
 #include "catch2_test_helper.h"
 #include "catch2_test_launch_helper.h"
 
-// TODO replace with DeviceSelect::If interface once https://github.com/NVIDIA/cccl/issues/50 is addressed
-// Temporary wrapper that allows specializing the DeviceSelect algorithm for different offset types
-template <typename InputIteratorT,
-          typename OutputIteratorT,
-          typename NumSelectedIteratorT,
-          typename OffsetT,
-          typename SelectOp>
-CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_select_if_wrapper(
-  void* d_temp_storage,
-  std::size_t& temp_storage_bytes,
-  InputIteratorT d_in,
-  OutputIteratorT d_out,
-  NumSelectedIteratorT d_num_selected_out,
-  OffsetT num_items,
-  SelectOp select_op,
-  cudaStream_t stream = 0)
-{
-  using flag_iterator_t = cub::NullType*;
-  using equality_op_t   = cub::NullType;
-
-  return cub::DispatchSelectIf<
-    InputIteratorT,
-    flag_iterator_t,
-    OutputIteratorT,
-    NumSelectedIteratorT,
-    SelectOp,
-    equality_op_t,
-    OffsetT,
-    false>::Dispatch(d_temp_storage,
-                     temp_storage_bytes,
-                     d_in,
-                     nullptr,
-                     d_out,
-                     d_num_selected_out,
-                     select_op,
-                     equality_op_t{},
-                     num_items,
-                     stream);
-}
-
 DECLARE_LAUNCH_WRAPPER(cub::DeviceSelect::If, select_if);
-DECLARE_LAUNCH_WRAPPER(dispatch_select_if_wrapper, dispatch_select_if);
 
 // %PARAM% TEST_LAUNCH lid 0:1:2
 
@@ -152,6 +111,18 @@ struct multiply_n
   }
 };
 
+template <typename T, typename TargetT>
+struct modx_and_add_divy
+{
+  T mod;
+  T div;
+
+  __host__ __device__ TargetT operator()(T x)
+  {
+    return static_cast<TargetT>((x % mod) + (x / div));
+  }
+};
+
 using all_types =
   c2h::type_list<std::uint8_t,
                  std::uint16_t,
@@ -166,8 +137,6 @@ using all_types =
 using types = c2h::
   type_list<std::uint8_t, std::uint32_t, ulonglong4, c2h::custom_type_t<c2h::less_comparable_t, c2h::equal_comparable_t>>;
 
-using offset_types = c2h::type_list<std::int32_t, std::int64_t>;
-
 CUB_TEST("DeviceSelect::If can run with empty input", "[device][select_if]", types)
 {
   using type = typename c2h::get<0, TestType>;
@@ -393,25 +362,24 @@ CUB_TEST("DeviceSelect::If works with a different output type", "[device][select
   REQUIRE(thrust::all_of(c2h::device_policy, boundary, out.end(), equal_to_default_t{}));
 }
 
-CUB_TEST("DeviceSelect::If works for very large number of items", "[device][select_if]", offset_types)
+CUB_TEST("DeviceSelect::If works for very large number of items", "[device][select_if]")
 try
 {
   using type     = std::int64_t;
-  using offset_t = typename c2h::get<0, TestType>;
-
-  // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items
-  auto num_items_max_ull =
-    std::min(static_cast<std::size_t>(::cuda::std::numeric_limits<offset_t>::max()),
-             ::cuda::std::numeric_limits<std::uint32_t>::max() + static_cast<std::size_t>(2000000ULL));
-  offset_t num_items_max = static_cast<offset_t>(num_items_max_ull);
-  offset_t num_items_min =
-    num_items_max_ull > 10000 ? static_cast<offset_t>(num_items_max_ull - 10000ULL) : offset_t{0};
+  using offset_t = std::int64_t;
+
+  // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary
+  constexpr auto max_partition_size = static_cast<offset_t>(::cuda::std::numeric_limits<std::int32_t>::max());
+
   offset_t num_items = GENERATE_COPY(
     values({
-      num_items_max,
-      static_cast<offset_t>(num_items_max - 1),
+      offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions
+      offset_t{2} * max_partition_size, // 2 partitions
+      max_partition_size + offset_t{1}, // 2 partitions
+      max_partition_size, // 1 partitions
+      max_partition_size - offset_t{1} // 1 partitions
     }),
-    take(2, random(num_items_min, num_items_max)));
+    take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000})));
 
   // Input
   auto in = thrust::make_counting_iterator(static_cast<type>(0));
@@ -421,11 +389,10 @@ try
   offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
 
   // Run test
-  std::size_t match_every_nth = 1000000;
-  offset_t expected_num_copied =
-    static_cast<offset_t>((static_cast<std::size_t>(num_items) + match_every_nth - 1ULL) / match_every_nth);
+  offset_t match_every_nth     = 1000000;
+  offset_t expected_num_copied = (num_items + match_every_nth - offset_t{1}) / match_every_nth;
   c2h::device_vector<type> out(expected_num_copied);
-  dispatch_select_if(
+  select_if(
     in, out.begin(), d_first_num_selected_out, num_items, mod_n<offset_t>{static_cast<offset_t>(match_every_nth)});
 
   // Ensure that we created the correct output
@@ -440,29 +407,30 @@ catch (std::bad_alloc&)
   // Exceeding memory is not a failure.
 }
 
-CUB_TEST("DeviceSelect::If works for very large number of output items", "[device][select_if]", offset_types)
+CUB_TEST("DeviceSelect::If works for very large number of output items", "[device][select_if]")
 try
 {
   using type     = std::uint8_t;
-  using offset_t = typename c2h::get<0, TestType>;
-
-  // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items
-  auto num_items_max_ull =
-    std::min(static_cast<std::size_t>(::cuda::std::numeric_limits<offset_t>::max()),
-             ::cuda::std::numeric_limits<std::uint32_t>::max() + static_cast<std::size_t>(2000000ULL));
-  offset_t num_items_max = static_cast<offset_t>(num_items_max_ull);
-  offset_t num_items_min =
-    num_items_max_ull > 10000 ? static_cast<offset_t>(num_items_max_ull - 10000ULL) : offset_t{0};
+  using offset_t = std::int64_t;
+
+  // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary
+  constexpr auto max_partition_size = static_cast<offset_t>(::cuda::std::numeric_limits<std::int32_t>::max());
+
   offset_t num_items = GENERATE_COPY(
     values({
-      num_items_max,
-      static_cast<offset_t>(num_items_max - 1),
+      offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions
+      offset_t{2} * max_partition_size, // 2 partitions
+      max_partition_size + offset_t{1}, // 2 partitions
+      max_partition_size, // 1 partitions
+      max_partition_size - offset_t{1} // 1 partitions
     }),
-    take(2, random(num_items_min, num_items_max)));
+    take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000})));
 
-  // Prepare input
-  c2h::device_vector<type> in(num_items);
-  c2h::gen(CUB_SEED(1), in);
+  // Prepare input iterator: it[i] = (i%mod)+(i/div)
+  static constexpr offset_t mod = 200;
+  static constexpr offset_t div = 1000000000;
+  auto in                       = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(offset_t{0}), modx_and_add_divy<offset_t, type>{mod, div});
 
   // Prepare output
   c2h::device_vector<type> out(num_items);
@@ -472,11 +440,12 @@ try
   offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
 
   // Run test
-  dispatch_select_if(in.cbegin(), out.begin(), d_first_num_selected_out, num_items, always_true_t{});
+  select_if(in, out.begin(), d_first_num_selected_out, num_items, always_true_t{});
 
   // Ensure that we created the correct output
   REQUIRE(num_selected_out[0] == num_items);
-  REQUIRE(in == out);
+  bool all_results_correct = thrust::equal(out.cbegin(), out.cend(), in);
+  REQUIRE(all_results_correct == true);
 }
 catch (std::bad_alloc&)
 {

From b274e1b49f71a95f91406e525b4df655994ae3db Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 10 Sep 2024 23:05:19 -0700
Subject: [PATCH 06/41] fixes syntax

---
 cub/cub/device/dispatch/dispatch_select_if.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 27e5a68a1f3..d78f5218fb9 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -141,7 +141,7 @@ public:
     {
       *d_num_selected_out = num_previously_selected() + static_cast<TotalNumItemsT>(num_selections);
     }
-  };
+  }
 };
 
 /**

From d6b21ba83911564e24ea1b7f7d6bbdc6f836d877 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 11 Sep 2024 01:07:39 -0700
Subject: [PATCH 07/41] adds tests for large num_items for select::flagged

---
 cub/cub/agent/agent_select_if.cuh             |  5 +-
 cub/test/catch2_test_device_select_common.cuh | 36 ++++++++++++++
 cub/test/catch2_test_device_select_flagged.cu | 48 ++++++++++++++++++-
 cub/test/catch2_test_device_select_if.cu      | 35 +-------------
 4 files changed, 88 insertions(+), 36 deletions(-)
 create mode 100644 cub/test/catch2_test_device_select_common.cuh

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index 1dc70230863..3693822b385 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -450,11 +450,12 @@ struct AgentSelectIf
     if (IS_LAST_TILE)
     {
       // Out-of-bounds items are selection_flags
-      BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1);
+      BlockLoadFlags(temp_storage.load_flags)
+        .Load((d_flags_in + streaming_context.input_offset()) + tile_offset, flags, num_tile_items, 1);
     }
     else
     {
-      BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags);
+      BlockLoadFlags(temp_storage.load_flags).Load((d_flags_in + streaming_context.input_offset()) + tile_offset, flags);
     }
 
 // Convert flag type to selection_flags type
diff --git a/cub/test/catch2_test_device_select_common.cuh b/cub/test/catch2_test_device_select_common.cuh
new file mode 100644
index 00000000000..a686909cbcd
--- /dev/null
+++ b/cub/test/catch2_test_device_select_common.cuh
@@ -0,0 +1,36 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#pragma once
+
+template <typename T>
+struct mod_n
+{
+  T mod;
+  __host__ __device__ bool operator()(T x)
+  {
+    return (x % mod == 0) ? true : false;
+  }
+};
+
+template <typename T>
+struct multiply_n
+{
+  T multiplier;
+  __host__ __device__ T operator()(T x)
+  {
+    return x * multiplier;
+  }
+};
+
+template <typename T, typename TargetT>
+struct modx_and_add_divy
+{
+  T mod;
+  T div;
+
+  __host__ __device__ TargetT operator()(T x)
+  {
+    return static_cast<TargetT>((x % mod) + (x / div));
+  }
+};
\ No newline at end of file
diff --git a/cub/test/catch2_test_device_select_flagged.cu b/cub/test/catch2_test_device_select_flagged.cu
index f3477787ecd..0ff05b3f00c 100644
--- a/cub/test/catch2_test_device_select_flagged.cu
+++ b/cub/test/catch2_test_device_select_flagged.cu
@@ -36,6 +36,7 @@
 
 #include <algorithm>
 
+#include "catch2_test_device_select_common.cuh"
 #include "catch2_test_helper.h"
 #include "catch2_test_launch_helper.h"
 
@@ -325,7 +326,7 @@ CUB_TEST("DeviceSelect::Flagged works with flags that alias input", "[device][se
   REQUIRE(reference == out);
 }
 
-CUB_TEST("DeviceSelect::Flagged works in place", "[device][select_if]", types)
+CUB_TEST("DeviceSelect::Flagged works in place", "[device][select_flagged]", types)
 {
   using type = typename c2h::get<0, TestType>;
 
@@ -416,3 +417,48 @@ CUB_TEST("DeviceSelect::Flagged works with a different output type", "[device][s
   REQUIRE(num_selected == num_selected_out[0]);
   REQUIRE(reference == out);
 }
+
+CUB_TEST("DeviceSelect::Flagged works for very large number of items", "[device][select_flagged]")
+try
+{
+  using type     = std::int64_t;
+  using offset_t = std::int64_t;
+
+  // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary
+  constexpr auto max_partition_size = static_cast<offset_t>(::cuda::std::numeric_limits<std::int32_t>::max());
+
+  offset_t num_items = GENERATE_COPY(
+    values({
+      offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions
+      offset_t{2} * max_partition_size, // 2 partitions
+      max_partition_size + offset_t{1}, // 2 partitions
+      max_partition_size, // 1 partitions
+      max_partition_size - offset_t{1} // 1 partitions
+    }),
+    take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000})));
+
+  // Input
+  constexpr offset_t match_every_nth = 1000000;
+  auto in                            = thrust::make_counting_iterator(static_cast<type>(0));
+  auto flags_in = thrust::make_transform_iterator(in, mod_n<offset_t>{static_cast<offset_t>(match_every_nth)});
+
+  // Needs to be device accessible
+  c2h::device_vector<offset_t> num_selected_out(1, 0);
+  offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
+
+  // Run test
+  offset_t expected_num_copied = (num_items + match_every_nth - offset_t{1}) / match_every_nth;
+  c2h::device_vector<type> out(expected_num_copied);
+  select_flagged(in, flags_in, out.begin(), d_first_num_selected_out, num_items);
+
+  // Ensure that we created the correct output
+  REQUIRE(num_selected_out[0] == expected_num_copied);
+  auto expected_out_it =
+    thrust::make_transform_iterator(in, multiply_n<offset_t>{static_cast<offset_t>(match_every_nth)});
+  bool all_results_correct = thrust::equal(out.cbegin(), out.cend(), expected_out_it);
+  REQUIRE(all_results_correct == true);
+}
+catch (std::bad_alloc&)
+{
+  // Exceeding memory is not a failure.
+}
diff --git a/cub/test/catch2_test_device_select_if.cu b/cub/test/catch2_test_device_select_if.cu
index 5d6a5421ec7..bfa1d31d6ac 100644
--- a/cub/test/catch2_test_device_select_if.cu
+++ b/cub/test/catch2_test_device_select_if.cu
@@ -42,6 +42,7 @@
 
 #include <algorithm>
 
+#include "catch2_test_device_select_common.cuh"
 #include "catch2_test_helper.h"
 #include "catch2_test_launch_helper.h"
 
@@ -91,38 +92,6 @@ struct always_true_t
   }
 };
 
-template <typename T>
-struct mod_n
-{
-  T mod;
-  __host__ __device__ bool operator()(T x)
-  {
-    return (x % mod == 0) ? true : false;
-  }
-};
-
-template <typename T>
-struct multiply_n
-{
-  T multiplier;
-  __host__ __device__ T operator()(T x)
-  {
-    return x * multiplier;
-  }
-};
-
-template <typename T, typename TargetT>
-struct modx_and_add_divy
-{
-  T mod;
-  T div;
-
-  __host__ __device__ TargetT operator()(T x)
-  {
-    return static_cast<TargetT>((x % mod) + (x / div));
-  }
-};
-
 using all_types =
   c2h::type_list<std::uint8_t,
                  std::uint16_t,
@@ -389,7 +358,7 @@ try
   offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
 
   // Run test
-  offset_t match_every_nth     = 1000000;
+  constexpr offset_t match_every_nth     = 1000000;
   offset_t expected_num_copied = (num_items + match_every_nth - offset_t{1}) / match_every_nth;
   c2h::device_vector<type> out(expected_num_copied);
   select_if(

From d26eb655d15209cf1c2dec878eebfac81dfa3e39 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 11 Sep 2024 05:52:17 -0700
Subject: [PATCH 08/41] adds tests for large num_items for partition::if

---
 cub/test/catch2_test_device_partition_if.cu   | 86 +++++++++++++++----
 cub/test/catch2_test_device_select_common.cuh | 71 ++++++++++++++-
 2 files changed, 141 insertions(+), 16 deletions(-)

diff --git a/cub/test/catch2_test_device_partition_if.cu b/cub/test/catch2_test_device_partition_if.cu
index 84890a1233f..7c3c778584d 100644
--- a/cub/test/catch2_test_device_partition_if.cu
+++ b/cub/test/catch2_test_device_partition_if.cu
@@ -31,11 +31,17 @@
 #include <cub/device/device_partition.cuh>
 
 #include <thrust/distance.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/reverse_iterator.h>
+#include <thrust/iterator/tabulate_output_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/partition.h>
 #include <thrust/reverse.h>
 
 #include <algorithm>
 
+#include "catch2_test_device_select_common.cuh"
 #include "catch2_test_helper.h"
 #include "catch2_test_launch_helper.h"
 
@@ -43,21 +49,6 @@ DECLARE_LAUNCH_WRAPPER(cub::DevicePartition::If, partition_if);
 
 // %PARAM% TEST_LAUNCH lid 0:1:2
 
-template <typename T>
-struct less_than_t
-{
-  T compare;
-
-  explicit __host__ less_than_t(T compare)
-      : compare(compare)
-  {}
-
-  __host__ __device__ bool operator()(const T& a) const
-  {
-    return a < compare;
-  }
-};
-
 struct always_false_t
 {
   template <typename T>
@@ -308,3 +299,68 @@ CUB_TEST("DevicePartition::If works with a different output type", "[device][par
   REQUIRE(num_selected_out[0] == thrust::distance(reference.begin(), boundary));
   REQUIRE(reference == out);
 }
+
+CUB_TEST("DevicePartition::If works for very large number of items", "[device][partition_if]")
+try
+{
+  using type     = std::int64_t;
+  using offset_t = std::int64_t;
+
+  // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary
+  constexpr auto max_partition_size = static_cast<offset_t>(::cuda::std::numeric_limits<std::int32_t>::max());
+
+  offset_t num_items = GENERATE_COPY(
+    values({
+      offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions
+      offset_t{2} * max_partition_size, // 2 partitions
+      max_partition_size + offset_t{1}, // 2 partitions
+      max_partition_size, // 1 partitions
+      max_partition_size - offset_t{1} // 1 partitions
+    }),
+    take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000})));
+
+  auto in = thrust::make_counting_iterator(offset_t{0});
+
+  // We select the first <cut_off_index> items and reject the rest
+  const offset_t cut_off_index = num_items / 4;
+
+  // Prepare tabulate output iterator to verify results in a memory-efficient way:
+  // We use a tabulate iterator that checks whenever the partition algorithm writes an output whether that item
+  // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index.
+  static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t);
+  c2h::device_vector<std::uint32_t> correctness_flags(cub::DivideAndRoundUp(num_items, bits_per_element));
+  auto expected_selected_it = thrust::make_counting_iterator(offset_t{0});
+  auto expected_rejected_it = thrust::make_reverse_iterator(
+    thrust::make_counting_iterator(offset_t{cut_off_index}) + (num_items - cut_off_index));
+  auto expected_result_op =
+    make_index_to_expected_partition_op(expected_selected_it, expected_rejected_it, cut_off_index);
+  auto expected_result_it =
+    thrust::make_transform_iterator(thrust::make_counting_iterator(offset_t{0}), expected_result_op);
+  auto check_result_op = make_checking_write_op(expected_result_it, thrust::raw_pointer_cast(correctness_flags.data()));
+  auto check_result_it = thrust::make_tabulate_output_iterator(check_result_op);
+
+  // Needs to be device accessible
+  c2h::device_vector<offset_t> num_selected_out(1, 0);
+  offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
+
+  // Run test
+  partition_if(
+    in, check_result_it, d_first_num_selected_out, num_items, less_than_t<type>{static_cast<type>(cut_off_index)});
+
+  // Ensure that we created the correct output
+  REQUIRE(num_selected_out[0] == cut_off_index);
+  bool all_results_correct = thrust::equal(
+    correctness_flags.cbegin(),
+    correctness_flags.cbegin() + (num_items / bits_per_element),
+    thrust::make_constant_iterator(0xFFFFFFFFU));
+  REQUIRE(all_results_correct == true);
+  if (num_items % bits_per_element != 0)
+  {
+    std::uint32_t last_element_flags = (0x00000001U << (num_items % bits_per_element)) - 0x01U;
+    REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags);
+  }
+}
+catch (std::bad_alloc&)
+{
+  // Exceeding memory is not a failure.
+}
diff --git a/cub/test/catch2_test_device_select_common.cuh b/cub/test/catch2_test_device_select_common.cuh
index a686909cbcd..3ee93ee6cae 100644
--- a/cub/test/catch2_test_device_select_common.cuh
+++ b/cub/test/catch2_test_device_select_common.cuh
@@ -3,6 +3,23 @@
 
 #pragma once
 
+#include <cuda/std/type_traits>
+
+template <typename T>
+struct less_than_t
+{
+  T compare;
+
+  explicit __host__ less_than_t(T compare)
+      : compare(compare)
+  {}
+
+  __host__ __device__ bool operator()(const T& a) const
+  {
+    return a < compare;
+  }
+};
+
 template <typename T>
 struct mod_n
 {
@@ -33,4 +50,56 @@ struct modx_and_add_divy
   {
     return static_cast<TargetT>((x % mod) + (x / div));
   }
-};
\ No newline at end of file
+};
+
+template <typename SelectedItT, typename RejectedItT>
+struct index_to_expected_partition_op
+{
+  using value_t = typename ::cuda::std::iterator_traits<SelectedItT>::value_type;
+  SelectedItT expected_selected_it;
+  RejectedItT expected_rejected_it;
+  std::int64_t expected_num_selected;
+
+  template <typename OffsetT>
+  __host__ __device__ value_t operator()(OffsetT index)
+  {
+    return (index < static_cast<OffsetT>(expected_num_selected))
+           ? expected_selected_it[index]
+           : expected_rejected_it[index - expected_num_selected];
+  }
+};
+
+template <typename SelectedItT, typename RejectedItT>
+index_to_expected_partition_op<SelectedItT, RejectedItT> make_index_to_expected_partition_op(
+  SelectedItT expected_selected_it, RejectedItT expected_rejected_it, std::int64_t expected_num_selected)
+{
+  return index_to_expected_partition_op<SelectedItT, RejectedItT>{
+    expected_selected_it, expected_rejected_it, expected_num_selected};
+}
+
+template <typename ExpectedValuesItT>
+struct flag_correct_writes_op
+{
+  ExpectedValuesItT expected_it;
+  std::uint32_t* d_correctness_flags;
+
+  static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t);
+  template <typename OffsetT, typename T>
+  __host__ __device__ void operator()(OffsetT index, T val)
+  {
+    // Set bit-flag if the correct result has been written at the given index
+    if (expected_it[index] == val)
+    {
+      OffsetT uint_index     = index / static_cast<OffsetT>(bits_per_element);
+      std::uint32_t bit_flag = 0x00000001U << (index % bits_per_element);
+      atomicOr(&d_correctness_flags[uint_index], bit_flag);
+    }
+  }
+};
+
+template <typename ExpectedValuesItT>
+flag_correct_writes_op<ExpectedValuesItT>
+make_checking_write_op(ExpectedValuesItT expected_it, std::uint32_t* d_correctness_flags)
+{
+  return flag_correct_writes_op<ExpectedValuesItT>{expected_it, d_correctness_flags};
+}

From 1be152ce0f0d5cee0b8bf616382a6052aa586a73 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 11 Sep 2024 08:20:51 -0700
Subject: [PATCH 09/41] adds tests for large num_items for partition::flagged

---
 .../catch2_test_device_partition_flagged.cu   | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu
index 2317c4bfb2e..7cd5e6d10bb 100644
--- a/cub/test/catch2_test_device_partition_flagged.cu
+++ b/cub/test/catch2_test_device_partition_flagged.cu
@@ -31,11 +31,17 @@
 #include <cub/device/device_partition.cuh>
 
 #include <thrust/count.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/reverse_iterator.h>
+#include <thrust/iterator/tabulate_output_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/partition.h>
 #include <thrust/reverse.h>
 
 #include <algorithm>
 
+#include "catch2_test_device_select_common.cuh"
 #include "catch2_test_helper.h"
 #include "catch2_test_launch_helper.h"
 
@@ -371,3 +377,69 @@ CUB_TEST("DevicePartition::Flagged works with different output type", "[device][
   REQUIRE(num_selected == num_selected_out[0]);
   REQUIRE(reference == out);
 }
+
+CUB_TEST("DevicePartition::Flagged works for very large number of items", "[device][partition_flagged]")
+try
+{
+  using type     = std::int64_t;
+  using offset_t = std::int64_t;
+
+  // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary
+  constexpr auto max_partition_size = static_cast<offset_t>(::cuda::std::numeric_limits<std::int32_t>::max());
+
+  offset_t num_items = GENERATE_COPY(
+    values({
+      offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions
+      offset_t{2} * max_partition_size, // 2 partitions
+      max_partition_size + offset_t{1}, // 2 partitions
+      max_partition_size, // 1 partitions
+      max_partition_size - offset_t{1} // 1 partitions
+    }),
+    take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000})));
+
+  // We select the first <cut_off_index> items and reject the rest
+  const offset_t cut_off_index = num_items / 4;
+
+  auto in       = thrust::make_counting_iterator(offset_t{0});
+  auto in_flags = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(offset_t{0}), less_than_t<type>{static_cast<type>(cut_off_index)});
+
+  // Prepare tabulate output iterator to verify results in a memory-efficient way:
+  // We use a tabulate iterator that checks whenever the partition algorithm writes an output whether that item
+  // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index.
+  static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t);
+  c2h::device_vector<std::uint32_t> correctness_flags(cub::DivideAndRoundUp(num_items, bits_per_element));
+  auto expected_selected_it = thrust::make_counting_iterator(offset_t{0});
+  auto expected_rejected_it = thrust::make_reverse_iterator(
+    thrust::make_counting_iterator(offset_t{cut_off_index}) + (num_items - cut_off_index));
+  auto expected_result_op =
+    make_index_to_expected_partition_op(expected_selected_it, expected_rejected_it, cut_off_index);
+  auto expected_result_it =
+    thrust::make_transform_iterator(thrust::make_counting_iterator(offset_t{0}), expected_result_op);
+  auto check_result_op = make_checking_write_op(expected_result_it, thrust::raw_pointer_cast(correctness_flags.data()));
+  auto check_result_it = thrust::make_tabulate_output_iterator(check_result_op);
+
+  // Needs to be device accessible
+  c2h::device_vector<offset_t> num_selected_out(1, 0);
+  offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
+
+  // Run test
+  partition_flagged(in, in_flags, check_result_it, d_first_num_selected_out, num_items);
+
+  // Ensure that we created the correct output
+  REQUIRE(num_selected_out[0] == cut_off_index);
+  bool all_results_correct = thrust::equal(
+    correctness_flags.cbegin(),
+    correctness_flags.cbegin() + (num_items / bits_per_element),
+    thrust::make_constant_iterator(0xFFFFFFFFU));
+  REQUIRE(all_results_correct == true);
+  if (num_items % bits_per_element != 0)
+  {
+    std::uint32_t last_element_flags = (0x00000001U << (num_items % bits_per_element)) - 0x01U;
+    REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags);
+  }
+}
+catch (std::bad_alloc&)
+{
+  // Exceeding memory is not a failure.
+}

From 466f91513385a70448d088e7c5ca75280a8f0c13 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 11 Sep 2024 08:55:35 -0700
Subject: [PATCH 10/41] fixes redundant definition

---
 cub/test/catch2_test_device_select_if.cu | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/cub/test/catch2_test_device_select_if.cu b/cub/test/catch2_test_device_select_if.cu
index bfa1d31d6ac..3f7cc402b93 100644
--- a/cub/test/catch2_test_device_select_if.cu
+++ b/cub/test/catch2_test_device_select_if.cu
@@ -50,21 +50,6 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceSelect::If, select_if);
 
 // %PARAM% TEST_LAUNCH lid 0:1:2
 
-template <typename T>
-struct less_than_t
-{
-  T compare;
-
-  explicit __host__ less_than_t(T compare)
-      : compare(compare)
-  {}
-
-  __host__ __device__ bool operator()(const T& a) const
-  {
-    return a < compare;
-  }
-};
-
 struct equal_to_default_t
 {
   template <typename T>

From d0f6e7fab18e40907811aa4e11ff0c7fb3da7ad4 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 11 Sep 2024 14:20:12 -0700
Subject: [PATCH 11/41] fixes implicit conversion

---
 cub/cub/device/dispatch/dispatch_select_if.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index d78f5218fb9..27cf88025e5 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -569,7 +569,8 @@ struct DispatchSelectIf : SelectedPolicy
       ::cuda::std::size_t allocation_sizes[3] = {0ULL, vsmem_size, streaming_selection_storage_bytes};
 
       // Bytes needed for tile status descriptors
-      error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles_per_invocation, allocation_sizes[0]));
+      error =
+        CubDebug(ScanTileStateT::AllocationSize(static_cast<int>(max_num_tiles_per_invocation), allocation_sizes[0]));
       if (cudaSuccess != error)
       {
         break;

From cea85090bf5cd41c60f424f7402a379095d5d1d2 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Sun, 15 Sep 2024 22:21:18 -0700
Subject: [PATCH 12/41] fixes f32 select::if perf regression

---
 cub/cub/device/dispatch/dispatch_select_if.cuh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 27cf88025e5..67d241ce9eb 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -190,6 +190,12 @@ struct agent_select_if_wrapper_t
 /******************************************************************************
  * Kernel entry points
  *****************************************************************************/
+// TODO (elstehle) gird-private constants were introduced in CTK 11.7. The macro is temporarily placed here, do we want to make this a CCCL macro?
+#if _CCCL_CUDACC_BELOW_11_7
+#define _CUB_GRID_CONSTANT 
+#else
+#define _CUB_GRID_CONSTANT __grid_constant__
+#endif
 
 /**
  * Select kernel entry point (multi-block)
@@ -302,7 +308,7 @@ __launch_bounds__(int(
     EqualityOpT equality_op,
     OffsetT num_items,
     int num_tiles,
-    StreamingContextT streaming_context,
+    _CUB_GRID_CONSTANT StreamingContextT streaming_context,
     cub::detail::vsmem_t vsmem)
 {
   using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t<

From 96be1e8ee07dbf35a8062110469d52e45bfeb2f8 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Mon, 16 Sep 2024 11:35:35 -0700
Subject: [PATCH 13/41] fixes perf regression for partition

---
 cub/cub/agent/agent_select_if.cuh             | 25 +++---
 cub/cub/detail/choose_offset.cuh              | 46 +++++++++++
 cub/cub/device/device_partition.cuh           | 81 ++++++++++++++-----
 .../device/dispatch/dispatch_select_if.cuh    |  7 +-
 cub/test/catch2_test_util_choose_offset.cu    | 32 ++++++++
 5 files changed, 155 insertions(+), 36 deletions(-)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index 3693822b385..ebfe03dc1c2 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -681,7 +681,7 @@ struct AgentSelectIf
     CTA_SYNC();
 
     // Gather items from shared memory and scatter to global
-    ScatterPartitionsToGlobal<IS_LAST_TILE, IS_FIRST_TILE>(
+    ScatterPartitionsToGlobal<IS_LAST_TILE>(
       num_tile_items, tile_num_rejections, num_selections_prefix, num_rejected_prefix, d_selected_out);
   }
 
@@ -689,7 +689,7 @@ struct AgentSelectIf
    * @brief Second phase of scattering partitioned items to global memory. Specialized for partitioning to two
    * distinct partitions.
    */
-  template <bool IS_LAST_TILE, bool IS_FIRST_TILE, typename SelectedItT, typename RejectedItT>
+  template <bool IS_LAST_TILE, typename SelectedItT, typename RejectedItT>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
     int num_tile_items,
     int tile_num_rejections,
@@ -730,7 +730,7 @@ struct AgentSelectIf
    * iterator, where selected items are written in order from the beginning of the itereator and rejected items are
    * writtem from the iterators end backwards.
    */
-  template <bool IS_LAST_TILE, bool IS_FIRST_TILE, typename PartitionedOutputItT>
+  template <bool IS_LAST_TILE, typename PartitionedOutputItT>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
     int num_tile_items,
     int tile_num_rejections,
@@ -739,20 +739,25 @@ struct AgentSelectIf
     PartitionedOutputItT partitioned_out_it)
   {
     using total_offset_t           = typename StreamingContextT::total_num_items_t;
-    auto const selected_base_begin = streaming_context.num_previously_selected();
-    auto const rejected_base_end   = streaming_context.num_total_items() - streaming_context.num_previously_rejected();
+    
+    total_offset_t _num_rejected_prefix = streaming_context.num_total_items() - streaming_context.num_previously_rejected();
+    _num_rejected_prefix -= static_cast<total_offset_t>(num_rejected_prefix + 1);
+    total_offset_t _num_selections_prefix = streaming_context.num_previously_selected() + static_cast<total_offset_t>(num_selections_prefix);
+    
 #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-    {
+    {      
       int item_idx      = (ITEM * BLOCK_THREADS) + threadIdx.x;
+      InputT item = temp_storage.raw_exchange.Alias()[item_idx];
+      
       int rejection_idx = item_idx;
       int selection_idx = item_idx - tile_num_rejections;
+      total_offset_t scatter_rejected_index = _num_rejected_prefix - static_cast<total_offset_t>(rejection_idx);
+      total_offset_t scatter_selected_index = _num_selections_prefix + static_cast<total_offset_t>(selection_idx);
       total_offset_t scatter_offset =
         (item_idx < tile_num_rejections)
-          ? rejected_base_end - static_cast<total_offset_t>(num_rejected_prefix + rejection_idx + 1)
-          : selected_base_begin + num_selections_prefix + selection_idx;
-
-      InputT item = temp_storage.raw_exchange.Alias()[item_idx];
+          ? scatter_rejected_index
+          : scatter_selected_index;
 
       if (!IS_LAST_TILE || (item_idx < num_tile_items))
       {
diff --git a/cub/cub/detail/choose_offset.cuh b/cub/cub/detail/choose_offset.cuh
index 18fd568d9b8..28639a1bf56 100644
--- a/cub/cub/detail/choose_offset.cuh
+++ b/cub/cub/detail/choose_offset.cuh
@@ -29,6 +29,8 @@
 
 #include <cub/config.cuh>
 
+#include "cuda/std/__type_traits/is_unsigned.h"
+
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -37,6 +39,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/std/cstdint>
 #include <cuda/std/iterator>
 #include <cuda/std/type_traits>
 
@@ -93,6 +96,49 @@ struct promote_small_offset
 template <typename NumItemsT>
 using promote_small_offset_t = typename promote_small_offset<NumItemsT>::type;
 
+/**
+ * choose_signed_offset checks NumItemsT, the type of the num_items parameter, and
+ * selects the offset type to be either int32 or int64, such that the selected offset type covers the range of NumItemsT
+ * unless it was uint64, in which case int64 will be used.
+ */
+template <typename NumItemsT>
+struct choose_signed_offset
+{
+  // NumItemsT must be an integral type (but not bool).
+  static_assert(::cuda::std::is_integral<NumItemsT>::value
+                  && !::cuda::std::is_same<typename ::cuda::std::remove_cv<NumItemsT>::type, bool>::value,
+                "NumItemsT must be an integral type, but not bool");
+
+  // Signed integer type for global offsets.
+  // uint32 -> int64, else
+  // LEQ 4B -> int32, else
+  // int64
+  using type = typename ::cuda::std::conditional<
+    (::cuda::std::is_integral<NumItemsT>::value && ::cuda::std::is_unsigned<NumItemsT>::value),
+    ::cuda::std::int64_t,
+    typename ::cuda::std::conditional<(sizeof(NumItemsT) <= 4), ::cuda::std::int32_t, ::cuda::std::int64_t>::type>::type;
+
+  /**
+   * Checks if the given num_items can be covered by the selected offset type. If not, returns cudaErrorInvalidValue,
+   * otherwise returns cudaSuccess.
+   */
+  static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t is_exceeding_offset_type(NumItemsT num_items)
+  {
+    if (sizeof(NumItemsT) >= 8 && num_items > static_cast<NumItemsT>(::cuda::std::numeric_limits<type>::max()))
+    {
+      return cudaErrorInvalidValue;
+    }
+    return cudaSuccess;
+  }
+};
+
+/**
+ * choose_signed_offset_t is an alias template that checks NumItemsT, the type of the num_items parameter, and
+ * selects the corresponding signed offset type based on it.
+ */
+template <typename NumItemsT>
+using choose_signed_offset_t = typename choose_signed_offset<NumItemsT>::type;
+
 /**
  * common_iterator_value sets member type to the common_type of
  * value_type for all argument types. used to get OffsetT in
diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh
index bd0873634dc..48666f1370b 100644
--- a/cub/cub/device/device_partition.cuh
+++ b/cub/cub/device/device_partition.cuh
@@ -42,6 +42,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cub/detail/choose_offset.cuh>
 #include <cub/detail/nvtx.cuh>
 #include <cub/device/dispatch/dispatch_select_if.cuh>
 #include <cub/device/dispatch/dispatch_three_way_partition.cuh>
@@ -142,6 +143,9 @@ struct DevicePartition
   //! @tparam NumSelectedIteratorT
   //!   **[inferred]** Output iterator type for recording the number of items selected @iterator
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** Type of num_items
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -169,7 +173,11 @@ struct DevicePartition
   //!   @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //!   @endrst
-  template <typename InputIteratorT, typename FlagIterator, typename OutputIteratorT, typename NumSelectedIteratorT>
+  template <typename InputIteratorT,
+            typename FlagIterator,
+            typename OutputIteratorT,
+            typename NumSelectedIteratorT,
+            typename NumItemsT>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
@@ -177,13 +185,14 @@ struct DevicePartition
     FlagIterator d_flags,
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
-    ::cuda::std::int64_t num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::Flagged");
-    using OffsetT    = ::cuda::std::int64_t; // Signed integer type for global offsets
-    using SelectOp   = NullType; // Selection op (not used)
-    using EqualityOp = NullType; // Equality operator (not used)
+    using ChooseOffsetT = detail::choose_signed_offset<NumItemsT>;
+    using OffsetT       = typename ChooseOffsetT::type; // Signed integer type for global offsets
+    using SelectOp      = NullType; // Selection op (not used)
+    using EqualityOp    = NullType; // Equality operator (not used)
     using DispatchSelectIfT =
       DispatchSelectIf<InputIteratorT,
                        FlagIterator,
@@ -194,6 +203,13 @@ struct DevicePartition
                        OffsetT,
                        true>;
 
+    // Check if the number of items exceeds the range covered by the selected signed offset type
+    cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items);
+    if (error)
+    {
+      return error;
+    }
+
     return DispatchSelectIfT::Dispatch(
       d_temp_storage,
       temp_storage_bytes,
@@ -208,7 +224,11 @@ struct DevicePartition
   }
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename InputIteratorT, typename FlagIterator, typename OutputIteratorT, typename NumSelectedIteratorT>
+  template <typename InputIteratorT,
+            typename FlagIterator,
+            typename OutputIteratorT,
+            typename NumSelectedIteratorT,
+            typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
@@ -216,7 +236,7 @@ struct DevicePartition
     FlagIterator d_flags,
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
-    ::cuda::std::int64_t num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -254,10 +274,10 @@ struct DevicePartition
   //!    {
   //!        int compare;
   //!
-  //!        __host__ __device__ __forceinline__
+  //!        CUB_RUNTIME_FUNCTION __forceinline__
   //!        explicit LessThan(int compare) : compare(compare) {}
   //!
-  //!        __host__ __device__ __forceinline__
+  //!        CUB_RUNTIME_FUNCTION __forceinline__
   //!        bool operator()(const int &a) const
   //!        {
   //!            return (a < compare);
@@ -305,6 +325,9 @@ struct DevicePartition
   //! @tparam SelectOp
   //!   **[inferred]** Selection functor type having member `bool operator()(const T &a)`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** Type of num_items
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -331,21 +354,33 @@ struct DevicePartition
   //!   @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //!   @endrst
-  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename SelectOp>
+  template <typename InputIteratorT,
+            typename OutputIteratorT,
+            typename NumSelectedIteratorT,
+            typename SelectOp,
+            typename NumItemsT>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
   If(void* d_temp_storage,
      size_t& temp_storage_bytes,
      InputIteratorT d_in,
      OutputIteratorT d_out,
      NumSelectedIteratorT d_num_selected_out,
-     ::cuda::std::int64_t num_items,
+     NumItemsT num_items,
      SelectOp select_op,
      cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::If");
-    using OffsetT      = ::cuda::std::int64_t; // Signed integer type for global offsets
-    using FlagIterator = NullType*; // FlagT iterator type (not used)
-    using EqualityOp   = NullType; // Equality operator (not used)
+    using ChooseOffsetT = detail::choose_signed_offset<NumItemsT>;
+    using OffsetT       = typename ChooseOffsetT::type; // Signed integer type for global offsets
+    using FlagIterator  = NullType*; // FlagT iterator type (not used)
+    using EqualityOp    = NullType; // Equality operator (not used)
+
+    // Check if the number of items exceeds the range covered by the selected signed offset type
+    cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items);
+    if (error)
+    {
+      return error;
+    }
 
     using DispatchSelectIfT =
       DispatchSelectIf<InputIteratorT,
@@ -371,21 +406,25 @@ struct DevicePartition
   }
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename SelectOp>
+  template <typename InputIteratorT,
+            typename OutputIteratorT,
+            typename NumSelectedIteratorT,
+            typename SelectOp,
+            typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
   If(void* d_temp_storage,
      size_t& temp_storage_bytes,
      InputIteratorT d_in,
      OutputIteratorT d_out,
      NumSelectedIteratorT d_num_selected_out,
-     ::cuda::std::int64_t num_items,
+     NumItemsT num_items,
      SelectOp select_op,
      cudaStream_t stream,
      bool debug_synchronous)
   {
     CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
 
-    return If<InputIteratorT, OutputIteratorT, NumSelectedIteratorT, SelectOp>(
+    return If<InputIteratorT, OutputIteratorT, NumSelectedIteratorT, SelectOp, NumItemsT>(
       d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream);
   }
 #endif // DOXYGEN_SHOULD_SKIP_THIS
@@ -416,12 +455,12 @@ private:
     SecondOutputIteratorT d_second_part_out,
     UnselectedOutputIteratorT d_unselected_out,
     NumSelectedIteratorT d_num_selected_out,
-    ::cuda::std::int64_t num_items,
+    int num_items,
     SelectFirstPartOp select_first_part_op,
     SelectSecondPartOp select_second_part_op,
     cudaStream_t stream = 0)
   {
-    using OffsetT                      = ::cuda::std::int64_t;
+    using OffsetT                      = int;
     using DispatchThreeWayPartitionIfT = DispatchThreeWayPartitionIf<
       InputIteratorT,
       FirstOutputIteratorT,
@@ -641,7 +680,7 @@ public:
      SecondOutputIteratorT d_second_part_out,
      UnselectedOutputIteratorT d_unselected_out,
      NumSelectedIteratorT d_num_selected_out,
-     ::cuda::std::int64_t num_items,
+     int num_items,
      SelectFirstPartOp select_first_part_op,
      SelectSecondPartOp select_second_part_op,
      cudaStream_t stream = 0)
@@ -677,7 +716,7 @@ public:
      SecondOutputIteratorT d_second_part_out,
      UnselectedOutputIteratorT d_unselected_out,
      NumSelectedIteratorT d_num_selected_out,
-     ::cuda::std::int64_t num_items,
+     int num_items,
      SelectFirstPartOp select_first_part_op,
      SelectSecondPartOp select_second_part_op,
      cudaStream_t stream,
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 67d241ce9eb..fff8b0ca527 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -71,9 +71,6 @@ namespace select
 // Offset type used to instantiate the stream compaction-kernel and agent to index the items within one partition
 using per_partition_offset_t = ::cuda::std::int32_t;
 
-// Offset type large enough to represent any index within the input and output iterators
-using num_total_items_t = ::cuda::std::int64_t;
-
 template <typename TotalNumItemsT>
 class streaming_context_t
 {
@@ -308,7 +305,7 @@ __launch_bounds__(int(
     EqualityOpT equality_op,
     OffsetT num_items,
     int num_tiles,
-    _CUB_GRID_CONSTANT StreamingContextT streaming_context,
+    _CUB_GRID_CONSTANT const StreamingContextT streaming_context,
     cub::detail::vsmem_t vsmem)
 {
   using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t<
@@ -399,7 +396,7 @@ struct DispatchSelectIf : SelectedPolicy
   using per_partition_offset_t = detail::select::per_partition_offset_t;
 
   // Offset type large enough to represent any index within the input and output iterators
-  using num_total_items_t = detail::select::num_total_items_t;
+  using num_total_items_t = OffsetT;
 
   // Type used to provide streaming information about each partition's context
   using streaming_context_t = detail::select::streaming_context_t<num_total_items_t>;
diff --git a/cub/test/catch2_test_util_choose_offset.cu b/cub/test/catch2_test_util_choose_offset.cu
index 62b8e4757af..9da5b59239f 100644
--- a/cub/test/catch2_test_util_choose_offset.cu
+++ b/cub/test/catch2_test_util_choose_offset.cu
@@ -27,6 +27,8 @@
 
 #include <cub/detail/choose_offset.cuh>
 
+#include <cuda/std/cstdint>
+#include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
 #include "catch2_test_helper.h"
@@ -43,6 +45,36 @@ CUB_TEST("Tests choose_offset", "[util][type]")
   STATIC_REQUIRE(::cuda::std::is_same<cub::detail::choose_offset_t<std::int64_t>, unsigned long long>::value);
 }
 
+CUB_TEST("Tests choose_signed_offset", "[util][type]")
+{
+  // Uses signed 64-bit type for unsigned signed 32-bit type
+  STATIC_REQUIRE(::cuda::std::is_same<cub::detail::choose_signed_offset_t<std::uint32_t>, std::int64_t>::value);
+
+  // Uses signed 32-bit type for signed 32-bit type
+  STATIC_REQUIRE(::cuda::std::is_same<cub::detail::choose_signed_offset_t<std::int32_t>, std::int32_t>::value);
+
+  // Uses signed 32-bit type for type smaller than 32 bits
+  STATIC_REQUIRE(::cuda::std::is_same<cub::detail::choose_signed_offset_t<std::int8_t>, std::int32_t>::value);
+
+  // Uses signed 64-bit type for signed 64-bit type
+  STATIC_REQUIRE(::cuda::std::is_same<cub::detail::choose_signed_offset_t<std::int64_t>, std::int64_t>::value);
+
+  // Offset type covers maximum number representable by a signed 32-bit integer
+  REQUIRE(cudaSuccess
+          == cub::detail::choose_signed_offset<std::int32_t>::is_exceeding_offset_type(
+            ::cuda::std::numeric_limits<std::int32_t>::max()));
+
+  // Offset type covers maximum number representable by a signed 64-bit integer
+  REQUIRE(cudaSuccess
+          == cub::detail::choose_signed_offset<std::int64_t>::is_exceeding_offset_type(
+            ::cuda::std::numeric_limits<std::int64_t>::max()));
+
+  // Offset type does not support maximum number representable by an unsigned 64-bit integer
+  REQUIRE(cudaErrorInvalidValue
+          == cub::detail::choose_signed_offset<std::uint64_t>::is_exceeding_offset_type(
+            ::cuda::std::numeric_limits<std::uint64_t>::max()));
+}
+
 CUB_TEST("Tests promote_small_offset", "[util][type]")
 {
   // Uses input type for types of at least 32 bits

From a117b452312c6da8385727c52694c79431d4e274 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Mon, 16 Sep 2024 11:43:05 -0700
Subject: [PATCH 14/41] fix feature macro

---
 cub/cub/device/dispatch/dispatch_select_if.cuh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index fff8b0ca527..abfd611ca5f 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -187,11 +187,12 @@ struct agent_select_if_wrapper_t
 /******************************************************************************
  * Kernel entry points
  *****************************************************************************/
-// TODO (elstehle) gird-private constants were introduced in CTK 11.7. The macro is temporarily placed here, do we want to make this a CCCL macro?
-#if _CCCL_CUDACC_BELOW_11_7
-#define _CUB_GRID_CONSTANT 
+// TODO (elstehle) gird-private constants were introduced in CTK 11.7. The macro is temporarily placed here, do we want
+// to make this a CCCL macro?
+#if defined(_CCCL_CUDACC_BELOW_11_7)
+#  define _CUB_GRID_CONSTANT
 #else
-#define _CUB_GRID_CONSTANT __grid_constant__
+#  define _CUB_GRID_CONSTANT __grid_constant__
 #endif
 
 /**

From 480da39881591f1c4cccc600ab190c0700dc237e Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Mon, 16 Sep 2024 12:51:19 -0700
Subject: [PATCH 15/41] fixes feature macro

---
 cub/cub/device/dispatch/dispatch_select_if.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index abfd611ca5f..d1f901ec2c7 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -189,7 +189,7 @@ struct agent_select_if_wrapper_t
  *****************************************************************************/
 // TODO (elstehle) gird-private constants were introduced in CTK 11.7. The macro is temporarily placed here, do we want
 // to make this a CCCL macro?
-#if defined(_CCCL_CUDACC_BELOW_11_7)
+#if defined(_CCCL_CUDACC_BELOW_11_7) && (CUB_PTX_ARCH >= 700)
 #  define _CUB_GRID_CONSTANT
 #else
 #  define _CUB_GRID_CONSTANT __grid_constant__

From aaed489e58958ad5c967a2f865f2e95ad5d686a6 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Mon, 16 Sep 2024 13:16:47 -0700
Subject: [PATCH 16/41] fixes feature macro

---
 cub/cub/device/dispatch/dispatch_select_if.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index d1f901ec2c7..0d0b6dd3030 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -189,7 +189,7 @@ struct agent_select_if_wrapper_t
  *****************************************************************************/
 // TODO (elstehle) gird-private constants were introduced in CTK 11.7. The macro is temporarily placed here, do we want
 // to make this a CCCL macro?
-#if defined(_CCCL_CUDACC_BELOW_11_7) && (CUB_PTX_ARCH >= 700)
+#if defined(_CCCL_CUDACC_BELOW_11_7) || (CUB_PTX_ARCH < 700)
 #  define _CUB_GRID_CONSTANT
 #else
 #  define _CUB_GRID_CONSTANT __grid_constant__

From 33337380057b2175d47e66944a50491a822673b7 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Mon, 16 Sep 2024 22:00:07 -0700
Subject: [PATCH 17/41] silences msvc constant conditional warning

---
 cub/cub/detail/choose_offset.cuh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cub/cub/detail/choose_offset.cuh b/cub/cub/detail/choose_offset.cuh
index 28639a1bf56..3e3b51f9504 100644
--- a/cub/cub/detail/choose_offset.cuh
+++ b/cub/cub/detail/choose_offset.cuh
@@ -124,10 +124,13 @@ struct choose_signed_offset
    */
   static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t is_exceeding_offset_type(NumItemsT num_items)
   {
+    _CCCL_DIAG_PUSH
+    _CCCL_DIAG_SUPPRESS_MSVC(4127) /* conditional expression is constant */
     if (sizeof(NumItemsT) >= 8 && num_items > static_cast<NumItemsT>(::cuda::std::numeric_limits<type>::max()))
     {
       return cudaErrorInvalidValue;
     }
+    _CCCL_DIAG_POP
     return cudaSuccess;
   }
 };

From 66ca8fd5c604a4545d6a524f3aca10976b3ed312 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 17 Sep 2024 22:14:03 -0700
Subject: [PATCH 18/41] add support for streamin ctx dummy for partition with
 small offset types

---
 .../device/dispatch/dispatch_select_if.cuh    | 63 +++++++++++++++++--
 1 file changed, 59 insertions(+), 4 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 0d0b6dd3030..70edebbfc65 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -71,7 +71,7 @@ namespace select
 // Offset type used to instantiate the stream compaction-kernel and agent to index the items within one partition
 using per_partition_offset_t = ::cuda::std::int32_t;
 
-template <typename TotalNumItemsT>
+template <typename TotalNumItemsT, bool IsStreamingInvocation>
 class streaming_context_t
 {
 private:
@@ -141,6 +141,50 @@ public:
   }
 };
 
+template <typename TotalNumItemsT>
+class streaming_context_t<TotalNumItemsT, false>
+{
+private:
+  TotalNumItemsT total_num_items{};
+
+public:
+  using total_num_items_t = TotalNumItemsT;
+
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE
+  streaming_context_t(TotalNumItemsT*, TotalNumItemsT*, TotalNumItemsT total_num_items, bool)
+      : total_num_items(total_num_items)
+  {}
+
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void advance(TotalNumItemsT, bool) {};
+
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT input_offset() const
+  {
+    return TotalNumItemsT{0};
+  };
+
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() const
+  {
+    return TotalNumItemsT{0};
+  };
+
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected() const
+  {
+    return TotalNumItemsT{0};
+  };
+
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items() const
+  {
+    return total_num_items;
+  };
+
+  template <typename NumSelectedIteratorT, typename OffsetT>
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void
+  update_num_selected(NumSelectedIteratorT user_num_selected_out_it, OffsetT num_selections) const
+  {
+    *user_num_selected_out_it = num_selections;
+  }
+};
+
 /**
  * @brief Wrapper that partially specializes the `AgentSelectIf` on the non-type name parameter `KeepRejects`.
  */
@@ -400,7 +444,16 @@ struct DispatchSelectIf : SelectedPolicy
   using num_total_items_t = OffsetT;
 
   // Type used to provide streaming information about each partition's context
-  using streaming_context_t = detail::select::streaming_context_t<num_total_items_t>;
+  static constexpr per_partition_offset_t const partition_size =
+    cuda::std::numeric_limits<per_partition_offset_t>::max();
+
+  // If the values representable by OffsetT exceed the partition_size, we use a kernel template specialization that
+  // supports streaming (i.e., splitting the input into partitions of up to partition_size number of items)
+  static constexpr bool may_require_streaming =
+    (static_cast<::cuda::std::uint64_t>(partition_size)
+     < static_cast<::cuda::std::uint64_t>(cuda::std::numeric_limits<OffsetT>::max()));
+
+  using streaming_context_t = detail::select::streaming_context_t<num_total_items_t, may_require_streaming>;
 
   using ScanTileStateT = ScanTileState<per_partition_offset_t>;
 
@@ -542,9 +595,11 @@ struct DispatchSelectIf : SelectedPolicy
     constexpr auto tile_size        = static_cast<OffsetT>(block_threads * items_per_thread);
 
     // The maximum number of items for which we will ever invoke the kernel (i.e. largest partition size)
+    // The extra check of may_require_streaming ensures that OffsetT is larger than per_partition_offset_t to avoid
+    // truncation of partition_size
     auto const max_partition_size =
-      num_items > static_cast<OffsetT>(cuda::std::numeric_limits<per_partition_offset_t>::max())
-        ? static_cast<OffsetT>(cuda::std::numeric_limits<per_partition_offset_t>::max())
+      (may_require_streaming && num_items > static_cast<OffsetT>(partition_size))
+        ? static_cast<OffsetT>(partition_size)
         : num_items;
 
     // The number of partitions required to "iterate" over the total input

From 2781775512855fdee53dd25d3e5e116545b4eb53 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 17 Sep 2024 22:34:42 -0700
Subject: [PATCH 19/41] removes superfluous template parameter

---
 cub/cub/agent/agent_select_if.cuh | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index ebfe03dc1c2..e3a450a4a83 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -517,7 +517,7 @@ struct AgentSelectIf
   /**
    * Scatter flagged items to output offsets (specialized for direct scattering).
    */
-  template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+  template <bool IS_LAST_TILE>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterSelectedDirect(
     InputT (&items)[ITEMS_PER_THREAD],
     OffsetT (&selection_flags)[ITEMS_PER_THREAD],
@@ -556,7 +556,7 @@ struct AgentSelectIf
    * @param is_keep_rejects
    *   Marker type indicating whether to keep rejected items in the second partition
    */
-  template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+  template <bool IS_LAST_TILE>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterSelectedTwoPhase(
     InputT (&items)[ITEMS_PER_THREAD],
     OffsetT (&selection_flags)[ITEMS_PER_THREAD],
@@ -604,7 +604,7 @@ struct AgentSelectIf
    * @param num_selections
    *   Total number of selections including this tile
    */
-  template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+  template <bool IS_LAST_TILE>
   _CCCL_DEVICE _CCCL_FORCEINLINE void Scatter(
     InputT (&items)[ITEMS_PER_THREAD],
     OffsetT (&selection_flags)[ITEMS_PER_THREAD],
@@ -620,12 +620,12 @@ struct AgentSelectIf
     // greater than one
     if (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS))
     {
-      ScatterSelectedTwoPhase<IS_LAST_TILE, IS_FIRST_TILE>(
+      ScatterSelectedTwoPhase<IS_LAST_TILE>(
         items, selection_flags, selection_indices, num_tile_selections, num_selections_prefix);
     }
     else
     {
-      ScatterSelectedDirect<IS_LAST_TILE, IS_FIRST_TILE>(items, selection_flags, selection_indices, num_selections);
+      ScatterSelectedDirect<IS_LAST_TILE>(items, selection_flags, selection_indices, num_selections);
     }
   }
 
@@ -648,7 +648,7 @@ struct AgentSelectIf
    * @param is_keep_rejects
    *   Marker type indicating whether to keep rejected items in the second partition
    */
-  template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+  template <bool IS_LAST_TILE>
   _CCCL_DEVICE _CCCL_FORCEINLINE void Scatter(
     InputT (&items)[ITEMS_PER_THREAD],
     OffsetT (&selection_flags)[ITEMS_PER_THREAD],
@@ -833,7 +833,7 @@ struct AgentSelectIf
     }
 
     // Scatter flagged items
-    Scatter<IS_LAST_TILE, true>(
+    Scatter<IS_LAST_TILE>(
       items,
       selection_flags,
       selection_indices,
@@ -915,7 +915,7 @@ struct AgentSelectIf
     // previous tiles' input items, in case of in-place compaction), because this is implicitly ensured through
     // execution dependency: The scatter stage requires the offset from the prefix-sum and it can only know the
     // prefix-sum after having read that from the decoupled look-back. Scatter flagged items
-    Scatter<IS_LAST_TILE, false>(
+    Scatter<IS_LAST_TILE>(
       items,
       selection_flags,
       selection_indices,

From 73736960e47acec31659e44640bbb89220feec81 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 17 Sep 2024 22:55:55 -0700
Subject: [PATCH 20/41] adds test for different offset types for partition::if
 & ::flagged

---
 .../catch2_test_device_partition_flagged.cu   | 29 ++++++++++---------
 cub/test/catch2_test_device_partition_if.cu   | 29 ++++++++++---------
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu
index 7cd5e6d10bb..4e8f04bdecf 100644
--- a/cub/test/catch2_test_device_partition_flagged.cu
+++ b/cub/test/catch2_test_device_partition_flagged.cu
@@ -91,6 +91,9 @@ using all_types =
 
 using types = c2h::type_list<std::uint8_t, std::uint32_t, ulonglong4, c2h::custom_type_t<c2h::equal_comparable_t>>;
 
+// List of offset types to be used for testing large number of items
+using offset_types = c2h::type_list<std::int32_t, std::uint32_t, std::uint64_t>;
+
 CUB_TEST("DevicePartition::Flagged can run with empty input", "[device][partition_flagged]", types)
 {
   using type = typename c2h::get<0, TestType>;
@@ -378,24 +381,22 @@ CUB_TEST("DevicePartition::Flagged works with different output type", "[device][
   REQUIRE(reference == out);
 }
 
-CUB_TEST("DevicePartition::Flagged works for very large number of items", "[device][partition_flagged]")
+CUB_TEST("DevicePartition::Flagged works for very large number of items", "[device][partition_flagged]", offset_types)
 try
 {
   using type     = std::int64_t;
-  using offset_t = std::int64_t;
-
-  // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary
-  constexpr auto max_partition_size = static_cast<offset_t>(::cuda::std::numeric_limits<std::int32_t>::max());
-
+  using offset_t = typename c2h::get<0, TestType>;
+
+  auto num_items_max_ull =
+    std::min(static_cast<std::size_t>(::cuda::std::numeric_limits<offset_t>::max()),
+             ::cuda::std::numeric_limits<std::uint32_t>::max() + static_cast<std::size_t>(2000000ULL));
+  offset_t num_items_max = static_cast<offset_t>(num_items_max_ull);
+  offset_t num_items_min =
+    num_items_max_ull > 10000 ? static_cast<offset_t>(num_items_max_ull - 10000ULL) : offset_t{0};
   offset_t num_items = GENERATE_COPY(
-    values({
-      offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions
-      offset_t{2} * max_partition_size, // 2 partitions
-      max_partition_size + offset_t{1}, // 2 partitions
-      max_partition_size, // 1 partitions
-      max_partition_size - offset_t{1} // 1 partitions
-    }),
-    take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000})));
+    values(
+      {num_items_max, static_cast<offset_t>(num_items_max - 1), static_cast<offset_t>(1), static_cast<offset_t>(3)}),
+    take(2, random(num_items_min, num_items_max)));
 
   // We select the first <cut_off_index> items and reject the rest
   const offset_t cut_off_index = num_items / 4;
diff --git a/cub/test/catch2_test_device_partition_if.cu b/cub/test/catch2_test_device_partition_if.cu
index 7c3c778584d..a301ee6061e 100644
--- a/cub/test/catch2_test_device_partition_if.cu
+++ b/cub/test/catch2_test_device_partition_if.cu
@@ -81,6 +81,9 @@ using all_types =
 using types = c2h::
   type_list<std::uint8_t, std::uint32_t, ulonglong4, c2h::custom_type_t<c2h::less_comparable_t, c2h::equal_comparable_t>>;
 
+// List of offset types to be used for testing large number of items
+using offset_types = c2h::type_list<std::int32_t, std::uint32_t, std::uint64_t>;
+
 CUB_TEST("DevicePartition::If can run with empty input", "[device][partition_if]", types)
 {
   using type = typename c2h::get<0, TestType>;
@@ -300,24 +303,22 @@ CUB_TEST("DevicePartition::If works with a different output type", "[device][par
   REQUIRE(reference == out);
 }
 
-CUB_TEST("DevicePartition::If works for very large number of items", "[device][partition_if]")
+CUB_TEST("DevicePartition::If works for very large number of items", "[device][partition_if]", offset_types)
 try
 {
   using type     = std::int64_t;
-  using offset_t = std::int64_t;
-
-  // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary
-  constexpr auto max_partition_size = static_cast<offset_t>(::cuda::std::numeric_limits<std::int32_t>::max());
-
+  using offset_t = typename c2h::get<0, TestType>;
+
+  auto num_items_max_ull =
+    std::min(static_cast<std::size_t>(::cuda::std::numeric_limits<offset_t>::max()),
+             ::cuda::std::numeric_limits<std::uint32_t>::max() + static_cast<std::size_t>(2000000ULL));
+  offset_t num_items_max = static_cast<offset_t>(num_items_max_ull);
+  offset_t num_items_min =
+    num_items_max_ull > 10000 ? static_cast<offset_t>(num_items_max_ull - 10000ULL) : offset_t{0};
   offset_t num_items = GENERATE_COPY(
-    values({
-      offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions
-      offset_t{2} * max_partition_size, // 2 partitions
-      max_partition_size + offset_t{1}, // 2 partitions
-      max_partition_size, // 1 partitions
-      max_partition_size - offset_t{1} // 1 partitions
-    }),
-    take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000})));
+    values(
+      {num_items_max, static_cast<offset_t>(num_items_max - 1), static_cast<offset_t>(1), static_cast<offset_t>(3)}),
+    take(2, random(num_items_min, num_items_max)));
 
   auto in = thrust::make_counting_iterator(offset_t{0});
 

From 687fb9978f29be0faa20fdea507182f72594425d Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 18 Sep 2024 02:14:07 -0700
Subject: [PATCH 21/41] adds tests and support for streaming select::unique

---
 cub/cub/agent/agent_select_if.cuh             |   4 +-
 cub/cub/device/device_select.cuh              |  25 +++--
 .../device/dispatch/dispatch_select_if.cuh    |  10 ++
 cub/test/catch2_test_device_select_unique.cu  | 101 ++++++++++++++++++
 4 files changed, 132 insertions(+), 8 deletions(-)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index e3a450a4a83..42e4869318d 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -477,7 +477,7 @@ struct AgentSelectIf
     OffsetT (&selection_flags)[ITEMS_PER_THREAD],
     Int2Type<USE_DISCONTINUITY> /*select_method*/)
   {
-    if (IS_FIRST_TILE)
+    if (IS_FIRST_TILE && streaming_context.is_first_partition())
     {
       CTA_SYNC();
 
@@ -489,7 +489,7 @@ struct AgentSelectIf
       InputT tile_predecessor;
       if (threadIdx.x == 0)
       {
-        tile_predecessor = d_in[tile_offset - 1];
+        tile_predecessor = d_in[tile_offset + streaming_context.input_offset() - 1];
       }
 
       CTA_SYNC();
diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh
index e24526cd821..279878d048f 100644
--- a/cub/cub/device/device_select.cuh
+++ b/cub/cub/device/device_select.cuh
@@ -950,6 +950,9 @@ struct DeviceSelect
   //! @tparam NumSelectedIteratorT
   //!   **[inferred]** Output iterator type for recording the number of items selected @iterator
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** Type of num_items
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -974,23 +977,32 @@ struct DeviceSelect
   //!   @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //!   @endrst
-  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT,
+            typename NumItemsT>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
-    ::cuda::std::int64_t num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Unique");
 
-    using OffsetT      = ::cuda::std::int64_t; // Signed integer type for global offsets
+    using ChooseOffsetT = detail::choose_signed_offset<NumItemsT>;
+    using OffsetT       = typename ChooseOffsetT::type; // Signed integer type for global offsets
     using FlagIterator = NullType*; // FlagT iterator type (not used)
     using SelectOp     = NullType; // Selection op (not used)
     using EqualityOp   = Equality; // Default == operator
 
+    // Check if the number of items exceeds the range covered by the selected signed offset type
+    cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items);
+    if (error)
+    {
+      return error;
+    }
+
     return DispatchSelectIf<
       InputIteratorT,
       FlagIterator,
@@ -1012,20 +1024,21 @@ struct DeviceSelect
   }
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT,
+            typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
-    ::cuda::std::int64_t num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
     CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
 
-    return Unique<InputIteratorT, OutputIteratorT, NumSelectedIteratorT>(
+    return Unique<InputIteratorT, OutputIteratorT, NumSelectedIteratorT, NumItemsT>(
       d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream);
   }
 #endif // DOXYGEN_SHOULD_SKIP_THIS
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 70edebbfc65..926d6e9caac 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -111,6 +111,11 @@ public:
     return first_partition ? TotalNumItemsT{0} : total_previous_num_items;
   };
 
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT is_first_partition() const
+  {
+    return first_partition;
+  };
+
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() const
   {
     return first_partition ? TotalNumItemsT{0} : *d_num_selected_in;
@@ -162,6 +167,11 @@ public:
     return TotalNumItemsT{0};
   };
 
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT is_first_partition() const
+  {
+    return true;
+  };
+
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() const
   {
     return TotalNumItemsT{0};
diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu
index 51c6200c624..a5397457438 100644
--- a/cub/test/catch2_test_device_select_unique.cu
+++ b/cub/test/catch2_test_device_select_unique.cu
@@ -30,11 +30,14 @@
 
 #include <cub/device/device_select.cuh>
 
+#include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/tabulate_output_iterator.h>
 
 #include <algorithm>
 
+#include "catch2_test_device_select_common.cuh"
 #include "catch2_test_helper.h"
 #include "catch2_test_launch_helper.h"
 
@@ -97,6 +100,9 @@ using all_types =
 
 using types = c2h::type_list<std::uint8_t, std::uint32_t>;
 
+// List of offset types to be used for testing large number of items
+using offset_types = c2h::type_list<std::int32_t, std::uint32_t, std::uint64_t>;
+
 CUB_TEST("DeviceSelect::Unique can run with empty input", "[device][select_unique]", types)
 {
   using type = typename c2h::get<0, TestType>;
@@ -264,3 +270,98 @@ CUB_TEST("DeviceSelect::Unique works with a different output type", "[device][se
   reference.resize(num_selected_out[0]);
   REQUIRE(reference == out);
 }
+
+CUB_TEST("DeviceSelect::Unique works for very large number of items", "[device][select_unique]", offset_types)
+try
+{
+  using type     = std::int64_t;
+  using offset_t = typename c2h::get<0, TestType>;
+
+  auto num_items_max_ull =
+    std::min(static_cast<std::size_t>(::cuda::std::numeric_limits<offset_t>::max()),
+             ::cuda::std::numeric_limits<std::uint32_t>::max() + static_cast<std::size_t>(2000000ULL));
+  offset_t num_items_max = static_cast<offset_t>(num_items_max_ull);
+  offset_t num_items_min =
+    num_items_max_ull > 10000 ? static_cast<offset_t>(num_items_max_ull - 10000ULL) : offset_t{0};
+  offset_t num_items = GENERATE_COPY(
+    values(
+      {num_items_max, static_cast<offset_t>(num_items_max - 1), static_cast<offset_t>(1), static_cast<offset_t>(3)}),
+    take(2, random(num_items_min, num_items_max)));
+
+  // All unique
+  SECTION("AllUnique")
+  {
+    auto in = thrust::make_counting_iterator(offset_t{0});
+
+    // Prepare tabulate output iterator to verify results in a memory-efficient way:
+    // We use a tabulate iterator that checks whenever the algorithm writes an output whether that item
+    // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index.
+    static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t);
+    c2h::device_vector<std::uint32_t> correctness_flags(cub::DivideAndRoundUp(num_items, bits_per_element));
+    auto expected_result_it = in;
+    auto check_result_op =
+      make_checking_write_op(expected_result_it, thrust::raw_pointer_cast(correctness_flags.data()));
+    auto check_result_it = thrust::make_tabulate_output_iterator(check_result_op);
+
+    // Needs to be device accessible
+    c2h::device_vector<offset_t> num_selected_out(1, 0);
+    offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
+
+    // Run test
+    select_unique(in, check_result_it, d_first_num_selected_out, num_items);
+
+    // Ensure that we created the correct output
+    REQUIRE(num_selected_out[0] == num_items);
+    bool all_results_correct = thrust::equal(
+      correctness_flags.cbegin(),
+      correctness_flags.cbegin() + (num_items / bits_per_element),
+      thrust::make_constant_iterator(0xFFFFFFFFU));
+    REQUIRE(all_results_correct == true);
+    if (num_items % bits_per_element != 0)
+    {
+      std::uint32_t last_element_flags = (0x00000001U << (num_items % bits_per_element)) - 0x01U;
+      REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags);
+    }
+  }
+
+  // All the same -> single unique
+  SECTION("AllSame")
+  {
+    auto in = thrust::make_constant_iterator(offset_t{0});
+    constexpr offset_t expected_num_unique{1};
+
+    // Prepare tabulate output iterator to verify results in a memory-efficient way:
+    // We use a tabulate iterator that checks whenever the algorithm writes an output whether that item
+    // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index.
+    static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t);
+    c2h::device_vector<std::uint32_t> correctness_flags(cub::DivideAndRoundUp(expected_num_unique, bits_per_element));
+    auto expected_result_it = in;
+    auto check_result_op =
+      make_checking_write_op(expected_result_it, thrust::raw_pointer_cast(correctness_flags.data()));
+    auto check_result_it = thrust::make_tabulate_output_iterator(check_result_op);
+
+    // Needs to be device accessible
+    c2h::device_vector<offset_t> num_selected_out(1, 0);
+    offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
+
+    // Run test
+    select_unique(in, check_result_it, d_first_num_selected_out, num_items);
+
+    // Ensure that we created the correct output
+    REQUIRE(num_selected_out[0] == expected_num_unique);
+    bool all_results_correct = thrust::equal(
+      correctness_flags.cbegin(),
+      correctness_flags.cbegin() + (expected_num_unique / bits_per_element),
+      thrust::make_constant_iterator(0xFFFFFFFFU));
+    REQUIRE(all_results_correct == true);
+    if (expected_num_unique % bits_per_element != 0)
+    {
+      std::uint32_t last_element_flags = (0x00000001U << (expected_num_unique % bits_per_element)) - 0x01U;
+      REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags);
+    }
+  }
+}
+catch (std::bad_alloc&)
+{
+  // Exceeding memory is not a failure.
+}

From 0e3f602bcf94ca267b5740d35bbea45dfd779dff Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 18 Sep 2024 05:32:33 -0700
Subject: [PATCH 22/41] fixes msvc warning

---
 cub/test/catch2_test_device_select_unique.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu
index a5397457438..fa015a50a27 100644
--- a/cub/test/catch2_test_device_select_unique.cu
+++ b/cub/test/catch2_test_device_select_unique.cu
@@ -354,11 +354,14 @@ try
       correctness_flags.cbegin() + (expected_num_unique / bits_per_element),
       thrust::make_constant_iterator(0xFFFFFFFFU));
     REQUIRE(all_results_correct == true);
-    if (expected_num_unique % bits_per_element != 0)
+    _CCCL_DIAG_PUSH
+    _CCCL_DIAG_SUPPRESS_MSVC(4127) /* conditional expression is constant */
+    _CCCL_IF_CONSTEXPR (expected_num_unique % bits_per_element != 0)
     {
       std::uint32_t last_element_flags = (0x00000001U << (expected_num_unique % bits_per_element)) - 0x01U;
       REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags);
     }
+    _CCCL_DIAG_POP
   }
 }
 catch (std::bad_alloc&)

From 305b3073bb26dc7cc6b257c35c544934f27ab45b Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 18 Sep 2024 09:05:43 -0700
Subject: [PATCH 23/41] fixes perf for partition

---
 cub/cub/agent/agent_select_if.cuh             | 38 ++++++++++++++++++-
 .../device/dispatch/dispatch_select_if.cuh    | 16 +++-----
 2 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index 42e4869318d..5618265baa7 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -730,7 +730,41 @@ struct AgentSelectIf
    * iterator, where selected items are written in order from the beginning of the itereator and rejected items are
    * writtem from the iterators end backwards.
    */
-  template <bool IS_LAST_TILE, typename PartitionedOutputItT>
+  template <bool IS_LAST_TILE, typename PartitionedOutputItT, typename _InputT = InputT, typename ::cuda::std::enable_if<sizeof(_InputT) == 1, int>::type = 0>
+  _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
+    int num_tile_items,
+    int tile_num_rejections,
+    OffsetT num_selections_prefix,
+    OffsetT num_rejected_prefix,
+    PartitionedOutputItT partitioned_out_it)
+  {
+    using total_offset_t = typename StreamingContextT::total_num_items_t;
+
+    total_offset_t total_rejected_prefix =
+      streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected() - num_rejected_prefix;
+    total_offset_t total_selected_prefix =
+      streaming_context.num_previously_selected() + static_cast<total_offset_t>(num_selections_prefix);
+
+#pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+      int item_idx                          = (ITEM * BLOCK_THREADS) + threadIdx.x;
+      int rejection_idx                     = item_idx;
+      int selection_idx                     = item_idx - tile_num_rejections;
+      total_offset_t scatter_rejected_index = total_rejected_prefix - rejection_idx - 1;
+      total_offset_t scatter_selected_index = total_selected_prefix + selection_idx;
+      total_offset_t scatter_offset =
+        (item_idx < tile_num_rejections) ? scatter_rejected_index : scatter_selected_index;
+
+      InputT item = temp_storage.raw_exchange.Alias()[item_idx];
+      if (!IS_LAST_TILE || (item_idx < num_tile_items))
+      {
+        partitioned_out_it[scatter_offset] = item;
+      }
+    }
+  }
+
+  template <bool IS_LAST_TILE, typename PartitionedOutputItT, typename _InputT = InputT, typename ::cuda::std::enable_if<(sizeof(_InputT) > 1), int>::type = 0>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
     int num_tile_items,
     int tile_num_rejections,
@@ -740,7 +774,7 @@ struct AgentSelectIf
   {
     using total_offset_t           = typename StreamingContextT::total_num_items_t;
     
-    total_offset_t _num_rejected_prefix = streaming_context.num_total_items() - streaming_context.num_previously_rejected();
+    total_offset_t _num_rejected_prefix = streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected();
     _num_rejected_prefix -= static_cast<total_offset_t>(num_rejected_prefix + 1);
     total_offset_t _num_selections_prefix = streaming_context.num_previously_selected() + static_cast<total_offset_t>(num_selections_prefix);
     
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 926d6e9caac..07f8ed3049b 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -126,7 +126,8 @@ public:
     return first_partition ? TotalNumItemsT{0} : (total_previous_num_items - num_previously_selected());
   };
 
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items() const
+  template <typename OffsetT>
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items(OffsetT) const
   {
     return total_num_items;
   };
@@ -149,16 +150,10 @@ public:
 template <typename TotalNumItemsT>
 class streaming_context_t<TotalNumItemsT, false>
 {
-private:
-  TotalNumItemsT total_num_items{};
-
 public:
   using total_num_items_t = TotalNumItemsT;
 
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE
-  streaming_context_t(TotalNumItemsT*, TotalNumItemsT*, TotalNumItemsT total_num_items, bool)
-      : total_num_items(total_num_items)
-  {}
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE streaming_context_t(TotalNumItemsT*, TotalNumItemsT*, TotalNumItemsT, bool) {}
 
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void advance(TotalNumItemsT, bool) {};
 
@@ -182,9 +177,10 @@ public:
     return TotalNumItemsT{0};
   };
 
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items() const
+  template <typename OffsetT>
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items(OffsetT num_partition_items) const
   {
-    return total_num_items;
+    return num_partition_items;
   };
 
   template <typename NumSelectedIteratorT, typename OffsetT>

From db36ae9c5278fedfa3bb188805c2f64fac5e3f45 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 18 Sep 2024 13:32:52 -0700
Subject: [PATCH 24/41] fixes format

---
 cub/cub/agent/agent_select_if.cuh             | 38 +++++++++++--------
 cub/cub/device/device_select.cuh              | 12 +++---
 .../device/dispatch/dispatch_select_if.cuh    |  4 +-
 cub/test/catch2_test_device_select_if.cu      |  4 +-
 4 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index 5618265baa7..2c88e2a5e0a 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -730,7 +730,10 @@ struct AgentSelectIf
    * iterator, where selected items are written in order from the beginning of the itereator and rejected items are
    * writtem from the iterators end backwards.
    */
-  template <bool IS_LAST_TILE, typename PartitionedOutputItT, typename _InputT = InputT, typename ::cuda::std::enable_if<sizeof(_InputT) == 1, int>::type = 0>
+  template <bool IS_LAST_TILE,
+            typename PartitionedOutputItT,
+            typename _InputT                                                 = InputT,
+            typename ::cuda::std::enable_if<sizeof(_InputT) == 1, int>::type = 0>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
     int num_tile_items,
     int tile_num_rejections,
@@ -764,7 +767,10 @@ struct AgentSelectIf
     }
   }
 
-  template <bool IS_LAST_TILE, typename PartitionedOutputItT, typename _InputT = InputT, typename ::cuda::std::enable_if<(sizeof(_InputT) > 1), int>::type = 0>
+  template <bool IS_LAST_TILE,
+            typename PartitionedOutputItT,
+            typename _InputT                                                  = InputT,
+            typename ::cuda::std::enable_if<(sizeof(_InputT) > 1), int>::type = 0>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
     int num_tile_items,
     int tile_num_rejections,
@@ -772,26 +778,26 @@ struct AgentSelectIf
     OffsetT num_rejected_prefix,
     PartitionedOutputItT partitioned_out_it)
   {
-    using total_offset_t           = typename StreamingContextT::total_num_items_t;
-    
-    total_offset_t _num_rejected_prefix = streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected();
+    using total_offset_t = typename StreamingContextT::total_num_items_t;
+
+    total_offset_t _num_rejected_prefix =
+      streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected();
     _num_rejected_prefix -= static_cast<total_offset_t>(num_rejected_prefix + 1);
-    total_offset_t _num_selections_prefix = streaming_context.num_previously_selected() + static_cast<total_offset_t>(num_selections_prefix);
-    
+    total_offset_t _num_selections_prefix =
+      streaming_context.num_previously_selected() + static_cast<total_offset_t>(num_selections_prefix);
+
 #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-    {      
-      int item_idx      = (ITEM * BLOCK_THREADS) + threadIdx.x;
-      InputT item = temp_storage.raw_exchange.Alias()[item_idx];
-      
-      int rejection_idx = item_idx;
-      int selection_idx = item_idx - tile_num_rejections;
+    {
+      int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x;
+      InputT item  = temp_storage.raw_exchange.Alias()[item_idx];
+
+      int rejection_idx                     = item_idx;
+      int selection_idx                     = item_idx - tile_num_rejections;
       total_offset_t scatter_rejected_index = _num_rejected_prefix - static_cast<total_offset_t>(rejection_idx);
       total_offset_t scatter_selected_index = _num_selections_prefix + static_cast<total_offset_t>(selection_idx);
       total_offset_t scatter_offset =
-        (item_idx < tile_num_rejections)
-          ? scatter_rejected_index
-          : scatter_selected_index;
+        (item_idx < tile_num_rejections) ? scatter_rejected_index : scatter_selected_index;
 
       if (!IS_LAST_TILE || (item_idx < num_tile_items))
       {
diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh
index 279878d048f..dea2776eb62 100644
--- a/cub/cub/device/device_select.cuh
+++ b/cub/cub/device/device_select.cuh
@@ -977,8 +977,7 @@ struct DeviceSelect
   //!   @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //!   @endrst
-  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT,
-            typename NumItemsT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
@@ -992,9 +991,9 @@ struct DeviceSelect
 
     using ChooseOffsetT = detail::choose_signed_offset<NumItemsT>;
     using OffsetT       = typename ChooseOffsetT::type; // Signed integer type for global offsets
-    using FlagIterator = NullType*; // FlagT iterator type (not used)
-    using SelectOp     = NullType; // Selection op (not used)
-    using EqualityOp   = Equality; // Default == operator
+    using FlagIterator  = NullType*; // FlagT iterator type (not used)
+    using SelectOp      = NullType; // Selection op (not used)
+    using EqualityOp    = Equality; // Default == operator
 
     // Check if the number of items exceeds the range covered by the selected signed offset type
     cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items);
@@ -1024,8 +1023,7 @@ struct DeviceSelect
   }
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT,
-            typename NumItemsT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 07f8ed3049b..7c4ebd3a4a2 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -130,7 +130,7 @@ public:
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items(OffsetT) const
   {
     return total_num_items;
-  };
+  }
 
   template <typename NumSelectedIteratorT, typename OffsetT>
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void
@@ -181,7 +181,7 @@ public:
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items(OffsetT num_partition_items) const
   {
     return num_partition_items;
-  };
+  }
 
   template <typename NumSelectedIteratorT, typename OffsetT>
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void
diff --git a/cub/test/catch2_test_device_select_if.cu b/cub/test/catch2_test_device_select_if.cu
index 3f7cc402b93..6eb1219cae7 100644
--- a/cub/test/catch2_test_device_select_if.cu
+++ b/cub/test/catch2_test_device_select_if.cu
@@ -343,8 +343,8 @@ try
   offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
 
   // Run test
-  constexpr offset_t match_every_nth     = 1000000;
-  offset_t expected_num_copied = (num_items + match_every_nth - offset_t{1}) / match_every_nth;
+  constexpr offset_t match_every_nth = 1000000;
+  offset_t expected_num_copied       = (num_items + match_every_nth - offset_t{1}) / match_every_nth;
   c2h::device_vector<type> out(expected_num_copied);
   select_if(
     in, out.begin(), d_first_num_selected_out, num_items, mod_n<offset_t>{static_cast<offset_t>(match_every_nth)});

From 5f38f5e89e9baf351eb31c43964018ebfe50cb77 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 18 Sep 2024 20:39:18 -0700
Subject: [PATCH 25/41] fixes mixup for partition perf fix

---
 cub/cub/agent/agent_select_if.cuh | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index 2c88e2a5e0a..61020afa1f7 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -732,14 +732,14 @@ struct AgentSelectIf
    */
   template <bool IS_LAST_TILE,
             typename PartitionedOutputItT,
-            typename _InputT                                                 = InputT,
-            typename ::cuda::std::enable_if<sizeof(_InputT) == 1, int>::type = 0>
-  _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
-    int num_tile_items,
-    int tile_num_rejections,
-    OffsetT num_selections_prefix,
-    OffsetT num_rejected_prefix,
-    PartitionedOutputItT partitioned_out_it)
+            typename _InputT = InputT,
+            typename ::cuda::std::enable_if < sizeof(_InputT) > 1,
+            int > ::type = 0 > _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
+                             int num_tile_items,
+                             int tile_num_rejections,
+                             OffsetT num_selections_prefix,
+                             OffsetT num_rejected_prefix,
+                             PartitionedOutputItT partitioned_out_it)
   {
     using total_offset_t = typename StreamingContextT::total_num_items_t;
 
@@ -769,8 +769,8 @@ struct AgentSelectIf
 
   template <bool IS_LAST_TILE,
             typename PartitionedOutputItT,
-            typename _InputT                                                  = InputT,
-            typename ::cuda::std::enable_if<(sizeof(_InputT) > 1), int>::type = 0>
+            typename _InputT                                                   = InputT,
+            typename ::cuda::std::enable_if<(sizeof(_InputT) == 1), int>::type = 0>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
     int num_tile_items,
     int tile_num_rejections,

From 16a2a75333c2cd5f11ecd020b3b4e5da3659a5e1 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 18 Sep 2024 20:52:49 -0700
Subject: [PATCH 26/41] fixes syntax

---
 cub/cub/agent/agent_select_if.cuh | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index 61020afa1f7..4db3cf4dee5 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -732,14 +732,14 @@ struct AgentSelectIf
    */
   template <bool IS_LAST_TILE,
             typename PartitionedOutputItT,
-            typename _InputT = InputT,
-            typename ::cuda::std::enable_if < sizeof(_InputT) > 1,
-            int > ::type = 0 > _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
-                             int num_tile_items,
-                             int tile_num_rejections,
-                             OffsetT num_selections_prefix,
-                             OffsetT num_rejected_prefix,
-                             PartitionedOutputItT partitioned_out_it)
+            typename _InputT                                                 = InputT,
+            typename ::cuda::std::enable_if<(sizeof(_InputT) > 1), int>::type = 0>
+  _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
+    int num_tile_items,
+    int tile_num_rejections,
+    OffsetT num_selections_prefix,
+    OffsetT num_rejected_prefix,
+    PartitionedOutputItT partitioned_out_it)
   {
     using total_offset_t = typename StreamingContextT::total_num_items_t;
 
@@ -769,7 +769,7 @@ struct AgentSelectIf
 
   template <bool IS_LAST_TILE,
             typename PartitionedOutputItT,
-            typename _InputT                                                   = InputT,
+            typename _InputT                                                  = InputT,
             typename ::cuda::std::enable_if<(sizeof(_InputT) == 1), int>::type = 0>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
     int num_tile_items,

From be83ebef5f45f8d4af8f77d41668da53db38d166 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 18 Sep 2024 21:20:31 -0700
Subject: [PATCH 27/41] fixes partition:flagged perf

---
 cub/cub/agent/agent_select_if.cuh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index 4db3cf4dee5..8397a96ddd4 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -732,8 +732,8 @@ struct AgentSelectIf
    */
   template <bool IS_LAST_TILE,
             typename PartitionedOutputItT,
-            typename _InputT                                                 = InputT,
-            typename ::cuda::std::enable_if<(sizeof(_InputT) > 1), int>::type = 0>
+            typename _InputT = InputT,
+            typename ::cuda::std::enable_if<!(sizeof(_InputT) == 1 && SELECT_METHOD == USE_SELECT_OP), int>::type = 0>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
     int num_tile_items,
     int tile_num_rejections,
@@ -769,8 +769,8 @@ struct AgentSelectIf
 
   template <bool IS_LAST_TILE,
             typename PartitionedOutputItT,
-            typename _InputT                                                  = InputT,
-            typename ::cuda::std::enable_if<(sizeof(_InputT) == 1), int>::type = 0>
+            typename _InputT = InputT,
+            typename ::cuda::std::enable_if<(sizeof(_InputT) == 1 && SELECT_METHOD == USE_SELECT_OP), int>::type = 0>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
     int num_tile_items,
     int tile_num_rejections,

From d4816b6a62f5e451e24e8683c8e0897e10c9ed5c Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Thu, 19 Sep 2024 13:07:46 -0700
Subject: [PATCH 28/41] fixes perf for partition::flagged

---
 cub/cub/agent/agent_select_if.cuh | 48 +++----------------------------
 1 file changed, 4 insertions(+), 44 deletions(-)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index 8397a96ddd4..df0d663b9e1 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -730,10 +730,7 @@ struct AgentSelectIf
    * iterator, where selected items are written in order from the beginning of the itereator and rejected items are
    * writtem from the iterators end backwards.
    */
-  template <bool IS_LAST_TILE,
-            typename PartitionedOutputItT,
-            typename _InputT = InputT,
-            typename ::cuda::std::enable_if<!(sizeof(_InputT) == 1 && SELECT_METHOD == USE_SELECT_OP), int>::type = 0>
+  template <bool IS_LAST_TILE, typename PartitionedOutputItT>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
     int num_tile_items,
     int tile_num_rejections,
@@ -767,45 +764,6 @@ struct AgentSelectIf
     }
   }
 
-  template <bool IS_LAST_TILE,
-            typename PartitionedOutputItT,
-            typename _InputT = InputT,
-            typename ::cuda::std::enable_if<(sizeof(_InputT) == 1 && SELECT_METHOD == USE_SELECT_OP), int>::type = 0>
-  _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal(
-    int num_tile_items,
-    int tile_num_rejections,
-    OffsetT num_selections_prefix,
-    OffsetT num_rejected_prefix,
-    PartitionedOutputItT partitioned_out_it)
-  {
-    using total_offset_t = typename StreamingContextT::total_num_items_t;
-
-    total_offset_t _num_rejected_prefix =
-      streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected();
-    _num_rejected_prefix -= static_cast<total_offset_t>(num_rejected_prefix + 1);
-    total_offset_t _num_selections_prefix =
-      streaming_context.num_previously_selected() + static_cast<total_offset_t>(num_selections_prefix);
-
-#pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-    {
-      int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x;
-      InputT item  = temp_storage.raw_exchange.Alias()[item_idx];
-
-      int rejection_idx                     = item_idx;
-      int selection_idx                     = item_idx - tile_num_rejections;
-      total_offset_t scatter_rejected_index = _num_rejected_prefix - static_cast<total_offset_t>(rejection_idx);
-      total_offset_t scatter_selected_index = _num_selections_prefix + static_cast<total_offset_t>(selection_idx);
-      total_offset_t scatter_offset =
-        (item_idx < tile_num_rejections) ? scatter_rejected_index : scatter_selected_index;
-
-      if (!IS_LAST_TILE || (item_idx < num_tile_items))
-      {
-        partitioned_out_it[scatter_offset] = item;
-      }
-    }
-  }
-
   //---------------------------------------------------------------------
   // Cooperatively scan a device-wide sequence of tiles with other CTAs
   //---------------------------------------------------------------------
@@ -1025,7 +983,9 @@ struct AgentSelectIf
     auto tile_state_wrapper = MemoryOrderedTileStateT{tile_state};
 
     // Blocks are launched in increasing order, so just assign one tile per block
-    int tile_idx        = blockIdx.x;
+    // TODO (elstehle): replacing this term with just `blockIdx.x` degrades perf for partition. Once we get to re-tune
+    // the algorithm, we want to replace this term with `blockIdx.x`
+    int tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
     OffsetT tile_offset = static_cast<OffsetT>(tile_idx) * static_cast<OffsetT>(TILE_ITEMS);
 
     if (tile_idx < num_tiles - 1)

From 3c9788d13962b346faabfd3cb27c9c0cd17255a6 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Thu, 19 Sep 2024 21:42:26 -0700
Subject: [PATCH 29/41] switches unique to always use i64 offset types

---
 cub/cub/device/device_select.cuh             | 23 ++++------------
 cub/test/catch2_test_device_select_unique.cu | 29 ++++++++++----------
 2 files changed, 20 insertions(+), 32 deletions(-)

diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh
index dea2776eb62..35428013574 100644
--- a/cub/cub/device/device_select.cuh
+++ b/cub/cub/device/device_select.cuh
@@ -950,9 +950,6 @@ struct DeviceSelect
   //! @tparam NumSelectedIteratorT
   //!   **[inferred]** Output iterator type for recording the number of items selected @iterator
   //!
-  //! @tparam NumItemsT
-  //!   **[inferred]** Type of num_items
-  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -977,30 +974,22 @@ struct DeviceSelect
   //!   @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //!   @endrst
-  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename NumItemsT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
-    NumItemsT num_items,
+    ::cuda::std::int64_t num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Unique");
 
-    using ChooseOffsetT = detail::choose_signed_offset<NumItemsT>;
-    using OffsetT       = typename ChooseOffsetT::type; // Signed integer type for global offsets
-    using FlagIterator  = NullType*; // FlagT iterator type (not used)
-    using SelectOp      = NullType; // Selection op (not used)
-    using EqualityOp    = Equality; // Default == operator
-
-    // Check if the number of items exceeds the range covered by the selected signed offset type
-    cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items);
-    if (error)
-    {
-      return error;
-    }
+    using OffsetT      = ::cuda::std::int64_t;
+    using FlagIterator = NullType*; // FlagT iterator type (not used)
+    using SelectOp     = NullType; // Selection op (not used)
+    using EqualityOp   = Equality; // Default == operator
 
     return DispatchSelectIf<
       InputIteratorT,
diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu
index fa015a50a27..9c49ce5a10f 100644
--- a/cub/test/catch2_test_device_select_unique.cu
+++ b/cub/test/catch2_test_device_select_unique.cu
@@ -100,9 +100,6 @@ using all_types =
 
 using types = c2h::type_list<std::uint8_t, std::uint32_t>;
 
-// List of offset types to be used for testing large number of items
-using offset_types = c2h::type_list<std::int32_t, std::uint32_t, std::uint64_t>;
-
 CUB_TEST("DeviceSelect::Unique can run with empty input", "[device][select_unique]", types)
 {
   using type = typename c2h::get<0, TestType>;
@@ -271,22 +268,24 @@ CUB_TEST("DeviceSelect::Unique works with a different output type", "[device][se
   REQUIRE(reference == out);
 }
 
-CUB_TEST("DeviceSelect::Unique works for very large number of items", "[device][select_unique]", offset_types)
+CUB_TEST("DeviceSelect::Unique works for very large number of items", "[device][select_unique]")
 try
 {
   using type     = std::int64_t;
-  using offset_t = typename c2h::get<0, TestType>;
-
-  auto num_items_max_ull =
-    std::min(static_cast<std::size_t>(::cuda::std::numeric_limits<offset_t>::max()),
-             ::cuda::std::numeric_limits<std::uint32_t>::max() + static_cast<std::size_t>(2000000ULL));
-  offset_t num_items_max = static_cast<offset_t>(num_items_max_ull);
-  offset_t num_items_min =
-    num_items_max_ull > 10000 ? static_cast<offset_t>(num_items_max_ull - 10000ULL) : offset_t{0};
+  using offset_t = std::int64_t;
+
+  // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary
+  constexpr auto max_partition_size = static_cast<offset_t>(::cuda::std::numeric_limits<std::int32_t>::max());
+
   offset_t num_items = GENERATE_COPY(
-    values(
-      {num_items_max, static_cast<offset_t>(num_items_max - 1), static_cast<offset_t>(1), static_cast<offset_t>(3)}),
-    take(2, random(num_items_min, num_items_max)));
+    values({
+      offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions
+      offset_t{2} * max_partition_size, // 2 partitions
+      max_partition_size + offset_t{1}, // 2 partitions
+      max_partition_size, // 1 partitions
+      max_partition_size - offset_t{1} // 1 partitions
+    }),
+    take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000})));
 
   // All unique
   SECTION("AllUnique")

From 49a50f5ac86df36e7ecdb1da5de80a682de6a9b6 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Thu, 19 Sep 2024 23:58:02 -0700
Subject: [PATCH 30/41] adds benchmark for partition with distinct iterators

---
 cub/benchmarks/bench/partition/flagged.cu | 35 +++++++++++++---
 cub/benchmarks/bench/partition/if.cu      | 51 +++++++++++++++++------
 2 files changed, 68 insertions(+), 18 deletions(-)

diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu
index d881000701f..b3562ba8408 100644
--- a/cub/benchmarks/bench/partition/flagged.cu
+++ b/cub/benchmarks/bench/partition/flagged.cu
@@ -27,6 +27,10 @@
 
 #include <cub/device/device_partition.cuh>
 
+#include <thrust/count.h>
+
+#include <cuda/std/type_traits>
+
 #include <look_back_helper.cuh>
 #include <nvbench_helper.cuh>
 
@@ -77,16 +81,32 @@ struct policy_hub_t
 };
 #endif // TUNE_BASE
 
-template <typename T, typename OffsetT>
-void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+template <typename FlagsItT, typename T, typename OffsetT>
+void init_output_partition_buffer(
+  FlagsItT d_flags, OffsetT num_items, T* d_out, cub::detail::partition_distinct_output_t<T*, T*>& d_partition_out_buffer)
+{
+  const auto selected_elements = thrust::count(d_flags, d_flags + num_items, true);
+  d_partition_out_buffer       = cub::detail::partition_distinct_output_t<T*, T*>{d_out, d_out + selected_elements};
+}
+
+template <typename FlagsItT, typename T, typename OffsetT>
+void init_output_partition_buffer(FlagsItT, OffsetT, T* d_out, T*& d_partition_out_buffer)
+{
+  d_partition_out_buffer = d_out;
+}
+
+template <typename T, typename OffsetT, typename UseDistinctPartitionT>
+void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPartitionT>)
 {
   using input_it_t        = const T*;
   using flag_it_t         = const bool*;
-  using output_it_t       = T*;
   using num_selected_it_t = OffsetT*;
   using select_op_t       = cub::NullType;
   using equality_op_t     = cub::NullType;
   using offset_t          = OffsetT;
+  constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value;
+  using output_it_t                          = typename ::cuda::std::
+    conditional<use_distinct_out_partitions, cub::detail::partition_distinct_output_t<T*, T*>, T*>::type;
 
 #if !TUNE_BASE
   using policy_t   = policy_hub_t<T>;
@@ -127,8 +147,9 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT>)
 
   input_it_t d_in                  = thrust::raw_pointer_cast(in.data());
   flag_it_t d_flags                = thrust::raw_pointer_cast(flags.data());
-  output_it_t d_out                = thrust::raw_pointer_cast(out.data());
   num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data());
+  output_it_t d_out{};
+  init_output_partition_buffer(flags.cbegin(), elements, thrust::raw_pointer_cast(out.data()), d_out);
 
   state.add_element_count(elements);
   state.add_global_memory_reads<T>(elements);
@@ -158,8 +179,10 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   });
 }
 
-NVBENCH_BENCH_TYPES(flagged, NVBENCH_TYPE_AXES(fundamental_types, offset_types))
+using distinct_partitions = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;
+
+NVBENCH_BENCH_TYPES(flagged, NVBENCH_TYPE_AXES(fundamental_types, offset_types, distinct_partitions))
   .set_name("base")
-  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}", "DistinctPartitions{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
   .add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu
index fac7c8e5c36..5fc4f82f6d9 100644
--- a/cub/benchmarks/bench/partition/if.cu
+++ b/cub/benchmarks/bench/partition/if.cu
@@ -27,6 +27,10 @@
 
 #include <cub/device/device_partition.cuh>
 
+#include <thrust/count.h>
+
+#include <cuda/std/type_traits>
+
 #include <look_back_helper.cuh>
 #include <nvbench_helper.cuh>
 
@@ -102,16 +106,36 @@ T value_from_entropy(double percentage)
   return static_cast<T>(result);
 }
 
-template <typename T, typename OffsetT>
-void partition(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+template <typename InItT, typename T, typename OffsetT, typename SelectOpT>
+void init_output_partition_buffer(
+  InItT d_in,
+  OffsetT num_items,
+  T* d_out,
+  SelectOpT select_op,
+  cub::detail::partition_distinct_output_t<T*, T*>& d_partition_out_buffer)
 {
-  using input_it_t        = const T*;
-  using flag_it_t         = cub::NullType*;
-  using output_it_t       = T*;
-  using num_selected_it_t = OffsetT*;
-  using select_op_t       = less_then_t<T>;
-  using equality_op_t     = cub::NullType;
-  using offset_t          = OffsetT;
+  const auto selected_elements = thrust::count_if(d_in, d_in + num_items, select_op);
+  d_partition_out_buffer       = cub::detail::partition_distinct_output_t<T*, T*>{d_out, d_out + selected_elements};
+}
+
+template <typename InItT, typename T, typename OffsetT, typename SelectOpT>
+void init_output_partition_buffer(InItT, OffsetT, T* d_out, SelectOpT, T*& d_partition_out_buffer)
+{
+  d_partition_out_buffer = d_out;
+}
+
+template <typename T, typename OffsetT, typename UseDistinctPartitionT>
+void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPartitionT>)
+{
+  using input_it_t                           = const T*;
+  using flag_it_t                            = cub::NullType*;
+  using num_selected_it_t                    = OffsetT*;
+  using select_op_t                          = less_then_t<T>;
+  using equality_op_t                        = cub::NullType;
+  using offset_t                             = OffsetT;
+  constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value;
+  using output_it_t                          = typename ::cuda::std::
+    conditional<use_distinct_out_partitions, cub::detail::partition_distinct_output_t<T*, T*>, T*>::type;
 
 #if !TUNE_BASE
   using policy_t   = policy_hub_t<T>;
@@ -153,8 +177,9 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT>)
 
   input_it_t d_in                  = thrust::raw_pointer_cast(in.data());
   flag_it_t d_flags                = nullptr;
-  output_it_t d_out                = thrust::raw_pointer_cast(out.data());
   num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data());
+  output_it_t d_out{};
+  init_output_partition_buffer(in.cbegin(), elements, thrust::raw_pointer_cast(out.data()), select_op, d_out);
 
   state.add_element_count(elements);
   state.add_global_memory_reads<T>(elements);
@@ -183,8 +208,10 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   });
 }
 
-NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types))
+using distinct_partitions = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;
+
+NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types, distinct_partitions))
   .set_name("base")
-  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}", "DistinctPartitions{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
   .add_string_axis("Entropy", {"1.000", "0.544", "0.000"});

From 5b747b9f0ee1b430491f5aec5745bd0fead12121 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Fri, 20 Sep 2024 07:47:09 -0700
Subject: [PATCH 31/41] resolves merge conflicts

---
 cub/test/catch2_test_device_partition_flagged.cu | 4 +++-
 cub/test/catch2_test_device_partition_if.cu      | 4 +++-
 cub/test/catch2_test_device_select_unique.cu     | 6 ++++--
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu
index 4e8f04bdecf..0405dc010ee 100644
--- a/cub/test/catch2_test_device_partition_flagged.cu
+++ b/cub/test/catch2_test_device_partition_flagged.cu
@@ -39,6 +39,8 @@
 #include <thrust/partition.h>
 #include <thrust/reverse.h>
 
+#include <cuda/cmath>
+
 #include <algorithm>
 
 #include "catch2_test_device_select_common.cuh"
@@ -409,7 +411,7 @@ try
   // We use a tabulate iterator that checks whenever the partition algorithm writes an output whether that item
   // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index.
   static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t);
-  c2h::device_vector<std::uint32_t> correctness_flags(cub::DivideAndRoundUp(num_items, bits_per_element));
+  c2h::device_vector<std::uint32_t> correctness_flags(::cuda::ceil_div(num_items, bits_per_element));
   auto expected_selected_it = thrust::make_counting_iterator(offset_t{0});
   auto expected_rejected_it = thrust::make_reverse_iterator(
     thrust::make_counting_iterator(offset_t{cut_off_index}) + (num_items - cut_off_index));
diff --git a/cub/test/catch2_test_device_partition_if.cu b/cub/test/catch2_test_device_partition_if.cu
index a301ee6061e..5feab9de316 100644
--- a/cub/test/catch2_test_device_partition_if.cu
+++ b/cub/test/catch2_test_device_partition_if.cu
@@ -39,6 +39,8 @@
 #include <thrust/partition.h>
 #include <thrust/reverse.h>
 
+#include <cuda/cmath>
+
 #include <algorithm>
 
 #include "catch2_test_device_select_common.cuh"
@@ -329,7 +331,7 @@ try
   // We use a tabulate iterator that checks whenever the partition algorithm writes an output whether that item
   // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index.
   static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t);
-  c2h::device_vector<std::uint32_t> correctness_flags(cub::DivideAndRoundUp(num_items, bits_per_element));
+  c2h::device_vector<std::uint32_t> correctness_flags(::cuda::ceil_div(num_items, bits_per_element));
   auto expected_selected_it = thrust::make_counting_iterator(offset_t{0});
   auto expected_rejected_it = thrust::make_reverse_iterator(
     thrust::make_counting_iterator(offset_t{cut_off_index}) + (num_items - cut_off_index));
diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu
index 9c49ce5a10f..080134f73b1 100644
--- a/cub/test/catch2_test_device_select_unique.cu
+++ b/cub/test/catch2_test_device_select_unique.cu
@@ -35,6 +35,8 @@
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/tabulate_output_iterator.h>
 
+#include <cuda/cmath>
+
 #include <algorithm>
 
 #include "catch2_test_device_select_common.cuh"
@@ -296,7 +298,7 @@ try
     // We use a tabulate iterator that checks whenever the algorithm writes an output whether that item
     // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index.
     static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t);
-    c2h::device_vector<std::uint32_t> correctness_flags(cub::DivideAndRoundUp(num_items, bits_per_element));
+    c2h::device_vector<std::uint32_t> correctness_flags(::cuda::ceil_div(num_items, bits_per_element));
     auto expected_result_it = in;
     auto check_result_op =
       make_checking_write_op(expected_result_it, thrust::raw_pointer_cast(correctness_flags.data()));
@@ -333,7 +335,7 @@ try
     // We use a tabulate iterator that checks whenever the algorithm writes an output whether that item
     // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index.
     static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t);
-    c2h::device_vector<std::uint32_t> correctness_flags(cub::DivideAndRoundUp(expected_num_unique, bits_per_element));
+    c2h::device_vector<std::uint32_t> correctness_flags(::cuda::ceil_div(expected_num_unique, bits_per_element));
     auto expected_result_it = in;
     auto check_result_op =
       make_checking_write_op(expected_result_it, thrust::raw_pointer_cast(correctness_flags.data()));

From 54618155afb7a35838a5dfd4b7948cc25a8af96f Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Mon, 23 Sep 2024 10:34:16 -0700
Subject: [PATCH 32/41] fixes merge conflict

---
 cub/cub/device/dispatch/dispatch_select_if.cuh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 7c214457685..2210bba17cf 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -612,8 +612,7 @@ struct DispatchSelectIf : SelectedPolicy
     auto const num_partitions = ::cuda::ceil_div(num_items, max_partition_size);
 
     // The maximum number of tiles for which we will ever invoke the kernel
-    auto const max_num_tiles_per_invocation =
-      static_cast<OffsetT>(::cuda::ceil_div(max_partition_size, tile_size));
+    auto const max_num_tiles_per_invocation = static_cast<OffsetT>(::cuda::ceil_div(max_partition_size, tile_size));
 
     // The amount of virtual shared memory to allocate
     const auto vsmem_size = max_num_tiles_per_invocation * VsmemHelperT::vsmem_per_block;
@@ -888,7 +887,7 @@ struct DispatchSelectIf : SelectedPolicy
       num_items,
       stream);
   }
-};
 #endif // DOXYGEN_SHOULD_SKIP_THIS
+};
 
 CUB_NAMESPACE_END

From 18cd2d7777e9298c0fe672d4b18e330c237c8dd8 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Mon, 23 Sep 2024 10:45:42 -0700
Subject: [PATCH 33/41] makes sass identical to main for i32 partition

---
 cub/cub/agent/agent_select_if.cuh | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index df0d663b9e1..a5e7370aa56 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -740,21 +740,19 @@ struct AgentSelectIf
   {
     using total_offset_t = typename StreamingContextT::total_num_items_t;
 
-    total_offset_t total_rejected_prefix =
-      streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected() - num_rejected_prefix;
-    total_offset_t total_selected_prefix =
-      streaming_context.num_previously_selected() + static_cast<total_offset_t>(num_selections_prefix);
-
 #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
     {
-      int item_idx                          = (ITEM * BLOCK_THREADS) + threadIdx.x;
-      int rejection_idx                     = item_idx;
-      int selection_idx                     = item_idx - tile_num_rejections;
-      total_offset_t scatter_rejected_index = total_rejected_prefix - rejection_idx - 1;
-      total_offset_t scatter_selected_index = total_selected_prefix + selection_idx;
+      int item_idx      = (ITEM * BLOCK_THREADS) + threadIdx.x;
+      int rejection_idx = item_idx;
+      int selection_idx = item_idx - tile_num_rejections;
       total_offset_t scatter_offset =
-        (item_idx < tile_num_rejections) ? scatter_rejected_index : scatter_selected_index;
+        (item_idx < tile_num_rejections)
+          ? (streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected()
+             - static_cast<total_offset_t>(num_rejected_prefix) - static_cast<total_offset_t>(rejection_idx)
+             - total_offset_t{1})
+          : (streaming_context.num_previously_selected() + static_cast<total_offset_t>(num_selections_prefix)
+             + static_cast<total_offset_t>(selection_idx));
 
       InputT item = temp_storage.raw_exchange.Alias()[item_idx];
       if (!IS_LAST_TILE || (item_idx < num_tile_items))

From 6ccdaf9d66b430228f562bae0d3d723dc5172afa Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 1 Oct 2024 10:16:27 -0700
Subject: [PATCH 34/41] updates thrust copy_if to always use i64 offset types

---
 thrust/thrust/system/cuda/detail/copy_if.h | 23 ++++++++--------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/thrust/thrust/system/cuda/detail/copy_if.h b/thrust/thrust/system/cuda/detail/copy_if.h
index 19dd014f59a..034e850e0e2 100644
--- a/thrust/thrust/system/cuda/detail/copy_if.h
+++ b/thrust/thrust/system/cuda/detail/copy_if.h
@@ -219,19 +219,16 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if(
   cudaError_t status        = cudaSuccess;
   size_t temp_storage_bytes = 0;
 
-  // 32-bit offset-type dispatch
-  using dispatch32_t = DispatchCopyIf<MayAlias, Derived, InputIt, StencilIt, OutputIt, Predicate, std::int32_t>;
-
   // 64-bit offset-type dispatch
+  // Since https://github.com/NVIDIA/cccl/pull/2400, cub::DeviceSelect is using a streaming approach that splits up
+  // inputs larger than INT_MAX into partitions of up to `INT_MAX` items each, repeatedly invoking the respective
+  // algorithm. With that approach, we can always use i64 offset types for DispatchSelectIf, because there's only very
+  // limited performance upside for using i32 offset types. This avoids potentially duplicate kernel compilation.
   using dispatch64_t = DispatchCopyIf<MayAlias, Derived, InputIt, StencilIt, OutputIt, Predicate, std::int64_t>;
 
   // Query temporary storage requirements
-  THRUST_INDEX_TYPE_DISPATCH2(
-    status,
-    dispatch32_t::dispatch,
-    dispatch64_t::dispatch,
-    num_items,
-    (policy, nullptr, temp_storage_bytes, first, stencil, output, predicate, num_items_fixed));
+  status = dispatch64_t::dispatch(
+    policy, nullptr, temp_storage_bytes, first, stencil, output, predicate, static_cast<std::int64_t>(num_items));
   cuda_cub::throw_on_error(status, "copy_if failed on 1st step");
 
   // Allocate temporary storage.
@@ -239,12 +236,8 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if(
   void* temp_storage = static_cast<void*>(tmp.data().get());
 
   // Run algorithm
-  THRUST_INDEX_TYPE_DISPATCH2(
-    status,
-    dispatch32_t::dispatch,
-    dispatch64_t::dispatch,
-    num_items,
-    (policy, temp_storage, temp_storage_bytes, first, stencil, output, predicate, num_items_fixed));
+  dispatch64_t::dispatch(
+    policy, temp_storage, temp_storage_bytes, first, stencil, output, predicate, static_cast<std::int64_t>(num_items));
   cuda_cub::throw_on_error(status, "copy_if failed on 2nd step");
 
   return output;

From 95de26f23d78c4d803dbfe48c2a38b8622268ec7 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 1 Oct 2024 10:17:00 -0700
Subject: [PATCH 35/41] fixes formatting

---
 cub/benchmarks/bench/partition/flagged.cu | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu
index b3562ba8408..ab2fd83dca7 100644
--- a/cub/benchmarks/bench/partition/flagged.cu
+++ b/cub/benchmarks/bench/partition/flagged.cu
@@ -83,7 +83,10 @@ struct policy_hub_t
 
 template <typename FlagsItT, typename T, typename OffsetT>
 void init_output_partition_buffer(
-  FlagsItT d_flags, OffsetT num_items, T* d_out, cub::detail::partition_distinct_output_t<T*, T*>& d_partition_out_buffer)
+  FlagsItT d_flags,
+  OffsetT num_items,
+  T* d_out,
+  cub::detail::partition_distinct_output_t<T*, T*>& d_partition_out_buffer)
 {
   const auto selected_elements = thrust::count(d_flags, d_flags + num_items, true);
   d_partition_out_buffer       = cub::detail::partition_distinct_output_t<T*, T*>{d_out, d_out + selected_elements};
@@ -98,12 +101,12 @@ void init_output_partition_buffer(FlagsItT, OffsetT, T* d_out, T*& d_partition_o
 template <typename T, typename OffsetT, typename UseDistinctPartitionT>
 void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPartitionT>)
 {
-  using input_it_t        = const T*;
-  using flag_it_t         = const bool*;
-  using num_selected_it_t = OffsetT*;
-  using select_op_t       = cub::NullType;
-  using equality_op_t     = cub::NullType;
-  using offset_t          = OffsetT;
+  using input_it_t                           = const T*;
+  using flag_it_t                            = const bool*;
+  using num_selected_it_t                    = OffsetT*;
+  using select_op_t                          = cub::NullType;
+  using equality_op_t                        = cub::NullType;
+  using offset_t                             = OffsetT;
   constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value;
   using output_it_t                          = typename ::cuda::std::
     conditional<use_distinct_out_partitions, cub::detail::partition_distinct_output_t<T*, T*>, T*>::type;

From 8750240c20e16d2f118f011572e52f9df551832a Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Thu, 3 Oct 2024 07:52:40 -0700
Subject: [PATCH 36/41] minor style improvements

---
 cub/cub/agent/agent_select_if.cuh             |  3 +++
 .../device/dispatch/dispatch_select_if.cuh    | 14 +++++------
 .../catch2_test_device_partition_flagged.cu   | 10 +-------
 cub/test/catch2_test_device_partition_if.cu   | 10 +-------
 cub/test/catch2_test_device_select_common.cuh | 25 ++++++++++++++++---
 cub/test/catch2_test_device_select_unique.cu  | 23 ++---------------
 6 files changed, 36 insertions(+), 49 deletions(-)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index a5e7370aa56..e765be68a18 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -342,6 +342,9 @@ struct AgentSelectIf
    *
    * @param num_items
    *   Total number of input items
+   *
+   * @param streaming_context
+   *   Context for the current partition
    */
   _CCCL_DEVICE _CCCL_FORCEINLINE AgentSelectIf(
     TempStorage& temp_storage,
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 2210bba17cf..bff0252d357 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -457,7 +457,7 @@ struct DispatchSelectIf : SelectedPolicy
   // supports streaming (i.e., splitting the input into partitions of up to partition_size number of items)
   static constexpr bool may_require_streaming =
     (static_cast<::cuda::std::uint64_t>(partition_size)
-     < static_cast<::cuda::std::uint64_t>(cuda::std::numeric_limits<OffsetT>::max()));
+     < static_cast<::cuda::std::uint64_t>(::cuda::std::numeric_limits<OffsetT>::max()));
 
   using streaming_context_t = detail::select::streaming_context_t<num_total_items_t, may_require_streaming>;
 
@@ -674,7 +674,7 @@ struct DispatchSelectIf : SelectedPolicy
         error = CubDebug(tile_status.Init(current_num_tiles, allocations[0], allocation_sizes[0]));
         if (cudaSuccess != error)
         {
-          break;
+          return error;
         }
 
         // Log scan_init_kernel configuration
@@ -695,14 +695,14 @@ struct DispatchSelectIf : SelectedPolicy
         error = CubDebug(cudaPeekAtLastError());
         if (cudaSuccess != error)
         {
-          break;
+          return error;
         }
 
         // Sync the stream if specified to flush runtime errors
         error = CubDebug(detail::DebugSyncStream(stream));
         if (cudaSuccess != error)
         {
-          break;
+          return error;
         }
 
 // Log select_if_kernel configuration
@@ -715,7 +715,7 @@ struct DispatchSelectIf : SelectedPolicy
                                           block_threads));
           if (cudaSuccess != error)
           {
-            break;
+            return error;
           }
 
           _CubLog("Invoking select_if_kernel<<<%d, %d, 0, "
@@ -747,14 +747,14 @@ struct DispatchSelectIf : SelectedPolicy
         error = CubDebug(cudaPeekAtLastError());
         if (cudaSuccess != error)
         {
-          break;
+          return error;
         }
 
         // Sync the stream if specified to flush runtime errors
         error = CubDebug(detail::DebugSyncStream(stream));
         if (cudaSuccess != error)
         {
-          break;
+          return error;
         }
 
         // Prepare streaming context for next partition (swap double buffers, advance number of processed items, etc.)
diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu
index 0405dc010ee..2d2c1ecaa06 100644
--- a/cub/test/catch2_test_device_partition_flagged.cu
+++ b/cub/test/catch2_test_device_partition_flagged.cu
@@ -431,16 +431,8 @@ try
 
   // Ensure that we created the correct output
   REQUIRE(num_selected_out[0] == cut_off_index);
-  bool all_results_correct = thrust::equal(
-    correctness_flags.cbegin(),
-    correctness_flags.cbegin() + (num_items / bits_per_element),
-    thrust::make_constant_iterator(0xFFFFFFFFU));
+  bool all_results_correct = are_all_flags_set(correctness_flags, num_items);
   REQUIRE(all_results_correct == true);
-  if (num_items % bits_per_element != 0)
-  {
-    std::uint32_t last_element_flags = (0x00000001U << (num_items % bits_per_element)) - 0x01U;
-    REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags);
-  }
 }
 catch (std::bad_alloc&)
 {
diff --git a/cub/test/catch2_test_device_partition_if.cu b/cub/test/catch2_test_device_partition_if.cu
index 5feab9de316..1db5774da42 100644
--- a/cub/test/catch2_test_device_partition_if.cu
+++ b/cub/test/catch2_test_device_partition_if.cu
@@ -352,16 +352,8 @@ try
 
   // Ensure that we created the correct output
   REQUIRE(num_selected_out[0] == cut_off_index);
-  bool all_results_correct = thrust::equal(
-    correctness_flags.cbegin(),
-    correctness_flags.cbegin() + (num_items / bits_per_element),
-    thrust::make_constant_iterator(0xFFFFFFFFU));
+  bool all_results_correct = are_all_flags_set(correctness_flags, num_items);
   REQUIRE(all_results_correct == true);
-  if (num_items % bits_per_element != 0)
-  {
-    std::uint32_t last_element_flags = (0x00000001U << (num_items % bits_per_element)) - 0x01U;
-    REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags);
-  }
 }
 catch (std::bad_alloc&)
 {
diff --git a/cub/test/catch2_test_device_select_common.cuh b/cub/test/catch2_test_device_select_common.cuh
index 3ee93ee6cae..39f82d04612 100644
--- a/cub/test/catch2_test_device_select_common.cuh
+++ b/cub/test/catch2_test_device_select_common.cuh
@@ -3,8 +3,12 @@
 
 #pragma once
 
+#include <thrust/iterator/constant_iterator.h>
+
 #include <cuda/std/type_traits>
 
+#include "catch2_test_helper.h"
+
 template <typename T>
 struct less_than_t
 {
@@ -70,7 +74,7 @@ struct index_to_expected_partition_op
 };
 
 template <typename SelectedItT, typename RejectedItT>
-index_to_expected_partition_op<SelectedItT, RejectedItT> make_index_to_expected_partition_op(
+static index_to_expected_partition_op<SelectedItT, RejectedItT> make_index_to_expected_partition_op(
   SelectedItT expected_selected_it, RejectedItT expected_rejected_it, std::int64_t expected_num_selected)
 {
   return index_to_expected_partition_op<SelectedItT, RejectedItT>{
@@ -98,8 +102,23 @@ struct flag_correct_writes_op
 };
 
 template <typename ExpectedValuesItT>
-flag_correct_writes_op<ExpectedValuesItT>
-make_checking_write_op(ExpectedValuesItT expected_it, std::uint32_t* d_correctness_flags)
+flag_correct_writes_op<ExpectedValuesItT> static make_checking_write_op(
+  ExpectedValuesItT expected_it, std::uint32_t* d_correctness_flags)
 {
   return flag_correct_writes_op<ExpectedValuesItT>{expected_it, d_correctness_flags};
 }
+
+static bool are_all_flags_set(c2h::device_vector<std::uint32_t>& flag_vector, std::size_t num_flags_to_check)
+{
+  static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t);
+  bool all_flags_set                     = thrust::equal(
+    flag_vector.cbegin(),
+    flag_vector.cbegin() + (num_flags_to_check / bits_per_element),
+    thrust::make_constant_iterator(0xFFFFFFFFU));
+  if (num_flags_to_check % bits_per_element != 0)
+  {
+    std::uint32_t last_element_flags = (0x00000001U << (num_flags_to_check % bits_per_element)) - 0x01U;
+    all_flags_set = all_flags_set && (flag_vector[num_flags_to_check / bits_per_element] == last_element_flags);
+  }
+  return all_flags_set;
+}
diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu
index 080134f73b1..47afc2fc2a7 100644
--- a/cub/test/catch2_test_device_select_unique.cu
+++ b/cub/test/catch2_test_device_select_unique.cu
@@ -313,16 +313,8 @@ try
 
     // Ensure that we created the correct output
     REQUIRE(num_selected_out[0] == num_items);
-    bool all_results_correct = thrust::equal(
-      correctness_flags.cbegin(),
-      correctness_flags.cbegin() + (num_items / bits_per_element),
-      thrust::make_constant_iterator(0xFFFFFFFFU));
+    bool all_results_correct = are_all_flags_set(correctness_flags, num_items);
     REQUIRE(all_results_correct == true);
-    if (num_items % bits_per_element != 0)
-    {
-      std::uint32_t last_element_flags = (0x00000001U << (num_items % bits_per_element)) - 0x01U;
-      REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags);
-    }
   }
 
   // All the same -> single unique
@@ -350,19 +342,8 @@ try
 
     // Ensure that we created the correct output
     REQUIRE(num_selected_out[0] == expected_num_unique);
-    bool all_results_correct = thrust::equal(
-      correctness_flags.cbegin(),
-      correctness_flags.cbegin() + (expected_num_unique / bits_per_element),
-      thrust::make_constant_iterator(0xFFFFFFFFU));
+    bool all_results_correct = are_all_flags_set(correctness_flags, expected_num_unique);
     REQUIRE(all_results_correct == true);
-    _CCCL_DIAG_PUSH
-    _CCCL_DIAG_SUPPRESS_MSVC(4127) /* conditional expression is constant */
-    _CCCL_IF_CONSTEXPR (expected_num_unique % bits_per_element != 0)
-    {
-      std::uint32_t last_element_flags = (0x00000001U << (expected_num_unique % bits_per_element)) - 0x01U;
-      REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags);
-    }
-    _CCCL_DIAG_POP
   }
 }
 catch (std::bad_alloc&)

From 8abd5e900e139ed854854620df4a6679e7fb29ce Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 8 Oct 2024 02:55:31 -0700
Subject: [PATCH 37/41] addresses review comments

---
 cub/cub/agent/agent_select_if.cuh              |  1 +
 cub/cub/detail/choose_offset.cuh               | 10 ++++------
 cub/cub/device/device_select.cuh               |  6 +++---
 cub/cub/device/dispatch/dispatch_select_if.cuh |  7 +++----
 thrust/thrust/system/cuda/detail/copy_if.h     |  2 +-
 5 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index e765be68a18..ff09051be7e 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -169,6 +169,7 @@ struct partition_distinct_output_t
  * @tparam StreamingContextT
  *   Type providing the context information for the current partition, with the following member functions:
  *    input_offset() -> base offset for the input (and flags) iterator
+ *    is_first_partition() -> [Select::Unique-only] whether this is the first partition
  *    num_previously_selected() -> base offset for the output iterator for selected items
  *    num_previously_rejected() -> base offset for the output iterator for rejected items (partition only)
  *    num_total_items() -> total number of items across all partitions (partition only)
diff --git a/cub/cub/detail/choose_offset.cuh b/cub/cub/detail/choose_offset.cuh
index 3e3b51f9504..b87b02f2fcb 100644
--- a/cub/cub/detail/choose_offset.cuh
+++ b/cub/cub/detail/choose_offset.cuh
@@ -29,8 +29,6 @@
 
 #include <cub/config.cuh>
 
-#include "cuda/std/__type_traits/is_unsigned.h"
-
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -63,7 +61,7 @@ struct choose_offset
                 "NumItemsT must be an integral type, but not bool");
 
   // Unsigned integer type for global offsets.
-  using type = typename ::cuda::std::conditional<sizeof(NumItemsT) <= 4, std::uint32_t, unsigned long long>::type;
+  using type = typename ::cuda::std::_If<sizeof(NumItemsT) <= 4, std::uint32_t, unsigned long long>::type;
 };
 
 /**
@@ -86,7 +84,7 @@ struct promote_small_offset
                 "NumItemsT must be an integral type, but not bool");
 
   // Unsigned integer type for global offsets.
-  using type = typename ::cuda::std::conditional<sizeof(NumItemsT) < 4, std::int32_t, NumItemsT>::type;
+  using type = typename ::cuda::std::_If<sizeof(NumItemsT) < 4, std::int32_t, NumItemsT>::type;
 };
 
 /**
@@ -113,10 +111,10 @@ struct choose_signed_offset
   // uint32 -> int64, else
   // LEQ 4B -> int32, else
   // int64
-  using type = typename ::cuda::std::conditional<
+  using type = typename ::cuda::std::_If<
     (::cuda::std::is_integral<NumItemsT>::value && ::cuda::std::is_unsigned<NumItemsT>::value),
     ::cuda::std::int64_t,
-    typename ::cuda::std::conditional<(sizeof(NumItemsT) <= 4), ::cuda::std::int32_t, ::cuda::std::int64_t>::type>::type;
+    typename ::cuda::std::_If<(sizeof(NumItemsT) <= 4), ::cuda::std::int32_t, ::cuda::std::int64_t>::type>::type;
 
   /**
    * Checks if the given num_items can be covered by the selected offset type. If not, returns cudaErrorInvalidValue,
diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh
index 35428013574..22c9380ebe1 100644
--- a/cub/cub/device/device_select.cuh
+++ b/cub/cub/device/device_select.cuh
@@ -1012,20 +1012,20 @@ struct DeviceSelect
   }
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename NumItemsT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
-    NumItemsT num_items,
+    ::cuda::std::int64_t num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
     CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
 
-    return Unique<InputIteratorT, OutputIteratorT, NumSelectedIteratorT, NumItemsT>(
+    return Unique<InputIteratorT, OutputIteratorT, NumSelectedIteratorT>(
       d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream);
   }
 #endif // DOXYGEN_SHOULD_SKIP_THIS
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index bff0252d357..f8cb1bc292c 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -116,12 +116,12 @@ public:
     return first_partition;
   };
 
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() const
+  _CCCL_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() const
   {
     return first_partition ? TotalNumItemsT{0} : *d_num_selected_in;
   };
 
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected() const
+  _CCCL_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected() const
   {
     return first_partition ? TotalNumItemsT{0} : (total_previous_num_items - num_previously_selected());
   };
@@ -450,8 +450,7 @@ struct DispatchSelectIf : SelectedPolicy
   using num_total_items_t = OffsetT;
 
   // Type used to provide streaming information about each partition's context
-  static constexpr per_partition_offset_t const partition_size =
-    cuda::std::numeric_limits<per_partition_offset_t>::max();
+  static constexpr per_partition_offset_t partition_size = ::cuda::std::numeric_limits<per_partition_offset_t>::max();
 
   // If the values representable by OffsetT exceed the partition_size, we use a kernel template specialization that
   // supports streaming (i.e., splitting the input into partitions of up to partition_size number of items)
diff --git a/thrust/thrust/system/cuda/detail/copy_if.h b/thrust/thrust/system/cuda/detail/copy_if.h
index 034e850e0e2..d8676a9749a 100644
--- a/thrust/thrust/system/cuda/detail/copy_if.h
+++ b/thrust/thrust/system/cuda/detail/copy_if.h
@@ -236,7 +236,7 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if(
   void* temp_storage = static_cast<void*>(tmp.data().get());
 
   // Run algorithm
-  dispatch64_t::dispatch(
+  status = dispatch64_t::dispatch(
     policy, temp_storage, temp_storage_bytes, first, stencil, output, predicate, static_cast<std::int64_t>(num_items));
   cuda_cub::throw_on_error(status, "copy_if failed on 2nd step");
 

From f8edfb47c4c5907b6c7531078af7e250db464449 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 8 Oct 2024 03:21:53 -0700
Subject: [PATCH 38/41] fixes conditional type usage

---
 cub/cub/detail/choose_offset.cuh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cub/cub/detail/choose_offset.cuh b/cub/cub/detail/choose_offset.cuh
index b87b02f2fcb..7dc5c6fd6fb 100644
--- a/cub/cub/detail/choose_offset.cuh
+++ b/cub/cub/detail/choose_offset.cuh
@@ -61,7 +61,7 @@ struct choose_offset
                 "NumItemsT must be an integral type, but not bool");
 
   // Unsigned integer type for global offsets.
-  using type = typename ::cuda::std::_If<sizeof(NumItemsT) <= 4, std::uint32_t, unsigned long long>::type;
+  using type = ::cuda::std::_If<(sizeof(NumItemsT) <= 4), std::uint32_t, unsigned long long>;
 };
 
 /**
@@ -84,7 +84,7 @@ struct promote_small_offset
                 "NumItemsT must be an integral type, but not bool");
 
   // Unsigned integer type for global offsets.
-  using type = typename ::cuda::std::_If<sizeof(NumItemsT) < 4, std::int32_t, NumItemsT>::type;
+  using type = ::cuda::std::_If<(sizeof(NumItemsT) < 4), std::int32_t, NumItemsT>;
 };
 
 /**
@@ -111,10 +111,10 @@ struct choose_signed_offset
   // uint32 -> int64, else
   // LEQ 4B -> int32, else
   // int64
-  using type = typename ::cuda::std::_If<
-    (::cuda::std::is_integral<NumItemsT>::value && ::cuda::std::is_unsigned<NumItemsT>::value),
-    ::cuda::std::int64_t,
-    typename ::cuda::std::_If<(sizeof(NumItemsT) <= 4), ::cuda::std::int32_t, ::cuda::std::int64_t>::type>::type;
+  using type =
+    ::cuda::std::_If<(::cuda::std::is_integral<NumItemsT>::value && ::cuda::std::is_unsigned<NumItemsT>::value),
+                     ::cuda::std::int64_t,
+                     ::cuda::std::_If<(sizeof(NumItemsT) <= 4), ::cuda::std::int32_t, ::cuda::std::int64_t>>;
 
   /**
    * Checks if the given num_items can be covered by the selected offset type. If not, returns cudaErrorInvalidValue,

From fe112dab7dc6aa17900953bf1dfd655d2673d962 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 8 Oct 2024 06:08:01 -0700
Subject: [PATCH 39/41] makes tests on empty input more robust

---
 cub/test/catch2_test_device_partition_flagged.cu | 2 +-
 cub/test/catch2_test_device_partition_if.cu      | 2 +-
 cub/test/catch2_test_device_select_flagged.cu    | 2 +-
 cub/test/catch2_test_device_select_if.cu         | 2 +-
 cub/test/catch2_test_device_select_unique.cu     | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu
index 2d2c1ecaa06..a5dfe079fa5 100644
--- a/cub/test/catch2_test_device_partition_flagged.cu
+++ b/cub/test/catch2_test_device_partition_flagged.cu
@@ -106,7 +106,7 @@ CUB_TEST("DevicePartition::Flagged can run with empty input", "[device][partitio
   c2h::device_vector<char> flags(num_items);
 
   // Needs to be device accessible
-  c2h::device_vector<int> num_selected_out(1, 0);
+  c2h::device_vector<int> num_selected_out(1, 42);
   int* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
 
   partition_flagged(in.begin(), flags.begin(), out.begin(), d_num_selected_out, num_items);
diff --git a/cub/test/catch2_test_device_partition_if.cu b/cub/test/catch2_test_device_partition_if.cu
index 1db5774da42..b6721b3c9e9 100644
--- a/cub/test/catch2_test_device_partition_if.cu
+++ b/cub/test/catch2_test_device_partition_if.cu
@@ -95,7 +95,7 @@ CUB_TEST("DevicePartition::If can run with empty input", "[device][partition_if]
   c2h::device_vector<type> out(num_items);
 
   // Needs to be device accessible
-  c2h::device_vector<int> num_selected_out(1, 0);
+  c2h::device_vector<int> num_selected_out(1, 42);
   int* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
 
   partition_if(in.begin(), out.begin(), d_num_selected_out, num_items, always_true_t{});
diff --git a/cub/test/catch2_test_device_select_flagged.cu b/cub/test/catch2_test_device_select_flagged.cu
index 0ff05b3f00c..84dd38f0c3e 100644
--- a/cub/test/catch2_test_device_select_flagged.cu
+++ b/cub/test/catch2_test_device_select_flagged.cu
@@ -96,7 +96,7 @@ CUB_TEST("DeviceSelect::Flagged can run with empty input", "[device][select_flag
   c2h::device_vector<int> flags(num_items);
 
   // Needs to be device accessible
-  c2h::device_vector<int> num_selected_out(1, 0);
+  c2h::device_vector<int> num_selected_out(1, 42);
   int* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
 
   select_flagged(in.begin(), flags.begin(), out.begin(), d_num_selected_out, num_items);
diff --git a/cub/test/catch2_test_device_select_if.cu b/cub/test/catch2_test_device_select_if.cu
index 6eb1219cae7..4cd6a043700 100644
--- a/cub/test/catch2_test_device_select_if.cu
+++ b/cub/test/catch2_test_device_select_if.cu
@@ -100,7 +100,7 @@ CUB_TEST("DeviceSelect::If can run with empty input", "[device][select_if]", typ
   c2h::device_vector<type> out(num_items);
 
   // Needs to be device accessible
-  c2h::device_vector<int> num_selected_out(1, 0);
+  c2h::device_vector<int> num_selected_out(1, 42);
   int* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
 
   select_if(in.begin(), out.begin(), d_num_selected_out, num_items, always_true_t{});
diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu
index 47afc2fc2a7..ddecf4664e6 100644
--- a/cub/test/catch2_test_device_select_unique.cu
+++ b/cub/test/catch2_test_device_select_unique.cu
@@ -111,7 +111,7 @@ CUB_TEST("DeviceSelect::Unique can run with empty input", "[device][select_uniqu
   c2h::device_vector<type> out(num_items);
 
   // Needs to be device accessible
-  c2h::device_vector<int> num_selected_out(1, 0);
+  c2h::device_vector<int> num_selected_out(1, 42);
   int* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
 
   select_unique(in.begin(), out.begin(), d_num_selected_out, num_items);

From 9475d95373c2b13c50fbff4e86ea813724048425 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 8 Oct 2024 09:05:40 -0700
Subject: [PATCH 40/41] restores empty problem behaviour

---
 .../device/dispatch/dispatch_select_if.cuh    | 25 ++++++++-----------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index f8cb1bc292c..0ddfe163852 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -580,19 +580,6 @@ struct DispatchSelectIf : SelectedPolicy
       EqualityOpT,
       per_partition_offset_t,
       streaming_context_t>;
-
-    // Return for empty problem (also needed to avoid division by zero)
-    // TODO(elstehle): In this case d_num_selected_out will never be written. Maybe we want to write it despite?
-    if (num_items == 0)
-    {
-      // If this was just to query temporary storage requirements, return non-empty bytes
-      if (d_temp_storage == nullptr)
-      {
-        temp_storage_bytes = std::size_t{1};
-      }
-      return cudaSuccess;
-    }
-
     cudaError error = cudaSuccess;
 
     constexpr auto block_threads    = VsmemHelperT::agent_policy_t::BLOCK_THREADS;
@@ -607,8 +594,9 @@ struct DispatchSelectIf : SelectedPolicy
         ? static_cast<OffsetT>(partition_size)
         : num_items;
 
-    // The number of partitions required to "iterate" over the total input
-    auto const num_partitions = ::cuda::ceil_div(num_items, max_partition_size);
+    // The number of partitions required to "iterate" over the total input (ternary to avoid div-by-zero)
+    auto const num_partitions =
+      (max_partition_size == 0) ? static_cast<OffsetT>(1) : ::cuda::ceil_div(num_items, max_partition_size);
 
     // The maximum number of tiles for which we will ever invoke the kernel
     auto const max_num_tiles_per_invocation = static_cast<OffsetT>(::cuda::ceil_div(max_partition_size, tile_size));
@@ -704,6 +692,13 @@ struct DispatchSelectIf : SelectedPolicy
           return error;
         }
 
+        // No more items to process (note, we do not want to return early for num_items==0, because we need to make sure
+        // that `scan_init_kernel` has written '0' to d_num_selected_out)
+        if (current_num_items == 0)
+        {
+          return cudaSuccess;
+        }
+
 // Log select_if_kernel configuration
 #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
         {

From 6f0bd0013337c2a51ffcae920d099b6fe5afc09d Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 8 Oct 2024 09:05:47 -0700
Subject: [PATCH 41/41] adds comment on const ref

---
 cub/cub/agent/agent_select_if.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index ff09051be7e..ea2d1c24b90 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -316,6 +316,8 @@ struct AgentSelectIf
   InequalityWrapper<EqualityOpT> inequality_op; ///< T inequality operator
   SelectOpT select_op; ///< Selection operator
   OffsetT num_items; ///< Total number of input items
+
+  // Note: This is a const reference because we have seen double-digit percentage perf regressions otherwise
   const StreamingContextT& streaming_context; ///< Context for the current partition
 
   //---------------------------------------------------------------------