From 97f3eecfc3a659b3f70cd69dde9a49a04d845930 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Sun, 8 Sep 2024 22:02:54 -0700 Subject: [PATCH 01/41] adds streaming selection and partition --- cub/cub/agent/agent_select_if.cuh | 67 ++-- .../device/dispatch/dispatch_select_if.cuh | 348 ++++++++++++------ 2 files changed, 286 insertions(+), 129 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 2a9958901ff..fd9e2bbbc61 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -164,10 +164,15 @@ struct partition_distinct_output_t * selection) * * @tparam OffsetT - * Signed integer type for global offsets + * Signed integer type for offsets within a partition * - * @tparam ScanTileStateT - * The tile state class used in the decoupled look-back + * @tparam StreamingContextT + * Type providing the context information for the current partition, with the following member functions: + * input_offset() -> base offset for the input (and flags) iterator + * num_previously_selected() -> base offset for the output iterator for selected items + * num_previously_rejected() -> base offset for the output iterator for rejected items (partition only) + * num_total_items() -> total number of items across all partitions (partition only) + * update_num_selected(d_num_sel_out, num_selected) -> invoked by last CTA with number of selected * * @tparam KEEP_REJECTS * Whether or not we push rejected items to the back of the output @@ -179,6 +184,7 @@ template struct AgentSelectIf @@ -304,11 +310,12 @@ struct AgentSelectIf _TempStorage& temp_storage; ///< Reference to temp_storage WrappedInputIteratorT d_in; ///< Input items - OutputIteratorWrapperT d_selected_out; ///< Unique output items + OutputIteratorWrapperT d_selected_out; ///< Output iterator for the selected items WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable) InequalityWrapper inequality_op; ///< T inequality operator SelectOpT select_op; ///< Selection operator OffsetT num_items; ///< Total number of input items + StreamingContextT streaming_context; ///< Context for the current partition //--------------------------------------------------------------------- // Constructor @@ -343,7 +350,8 @@ struct AgentSelectIf OutputIteratorWrapperT d_selected_out, SelectOpT select_op, EqualityOpT equality_op, - OffsetT num_items) + OffsetT num_items, + StreamingContextT streaming_context) : temp_storage(temp_storage.Alias()) , d_in(d_in) , d_selected_out(d_selected_out) @@ -351,6 +359,7 @@ struct AgentSelectIf , inequality_op(equality_op) , select_op(select_op) , num_items(num_items) + , streaming_context(streaming_context) {} //--------------------------------------------------------------------- @@ -395,6 +404,7 @@ struct AgentSelectIf CTA_SYNC(); FlagT flags[ITEMS_PER_THREAD]; + auto d_base_flags = d_flags_in + streaming_context.input_offset(); if (IS_LAST_TILE) { // Initialize the out-of-bounds flags @@ -404,11 +414,11 @@ struct AgentSelectIf selection_flags[ITEM] = true; } // Guarded loads - BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items); + BlockLoadFlags(temp_storage.load_flags).Load(d_base_flags + tile_offset, flags, num_tile_items); } else { - BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags); + BlockLoadFlags(temp_storage.load_flags).Load(d_base_flags + tile_offset, flags); } #pragma unroll @@ -513,6 +523,9 @@ struct AgentSelectIf OffsetT (&selection_indices)[ITEMS_PER_THREAD], OffsetT num_selections) { + // Get the output iterator with a given base offset + auto d_base_selected_out = d_selected_out + streaming_context.num_previously_selected(); + // Scatter flagged items #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) @@ -521,7 +534,7 @@ struct AgentSelectIf { if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections) { - d_selected_out[selection_indices[ITEM]] = items[ITEM]; + d_base_selected_out[selection_indices[ITEM]] = items[ITEM]; } } } @@ -568,9 +581,11 @@ struct AgentSelectIf CTA_SYNC(); + // Get the output iterator with a given base offset + auto d_base_selected_out = d_selected_out + streaming_context.num_previously_selected(); for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) { - d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item]; + d_base_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item]; } } @@ -683,8 +698,11 @@ struct AgentSelectIf int tile_num_rejections, OffsetT num_selections_prefix, OffsetT num_rejected_prefix, - detail::partition_distinct_output_t partitioned_out_it_wrapper) + detail::partition_distinct_output_t partitioned_out_wrapper) { + auto selected_out_it = partitioned_out_wrapper.selected_it + streaming_context.num_previously_selected(); + auto rejected_out_it = partitioned_out_wrapper.rejected_it + streaming_context.num_previously_rejected(); + #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { @@ -700,11 +718,11 @@ struct AgentSelectIf { if (item_idx >= tile_num_rejections) { - partitioned_out_it_wrapper.selected_it[scatter_offset] = item; + selected_out_it[scatter_offset] = item; } else { - partitioned_out_it_wrapper.rejected_it[scatter_offset] = item; + rejected_out_it[scatter_offset] = item; } } } @@ -723,16 +741,19 @@ struct AgentSelectIf OffsetT num_rejected_prefix, PartitionedOutputItT partitioned_out_it) { + using total_offset_t = typename StreamingContextT::total_num_items_t; + auto const selected_base_begin = streaming_context.num_previously_selected(); + auto const rejected_base_end = streaming_context.num_total_items() - streaming_context.num_previously_rejected(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; int rejection_idx = item_idx; int selection_idx = item_idx - tile_num_rejections; - OffsetT scatter_offset = + total_offset_t scatter_offset = (item_idx < tile_num_rejections) - ? num_items - num_rejected_prefix - rejection_idx - 1 - : num_selections_prefix + selection_idx; + ? rejected_base_end - static_cast(num_rejected_prefix - rejection_idx - 1) + : selected_base_begin + num_selections_prefix + selection_idx; InputT item = temp_storage.raw_exchange.Alias()[item_idx]; @@ -771,13 +792,14 @@ struct AgentSelectIf OffsetT selection_indices[ITEMS_PER_THREAD]; // Load items + auto d_base_in = d_in + streaming_context.input_offset(); if (IS_LAST_TILE) { - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items, num_tile_items); } else { - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items); } // Initialize selection_flags @@ -850,13 +872,14 @@ struct AgentSelectIf OffsetT selection_indices[ITEMS_PER_THREAD]; // Load items + auto d_base_in = d_in + streaming_context.input_offset(); if (IS_LAST_TILE) { - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items, num_tile_items); } else { - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items); } // Initialize selection_flags @@ -960,7 +983,7 @@ struct AgentSelectIf auto tile_state_wrapper = MemoryOrderedTileStateT{tile_state}; // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + int tile_idx = blockIdx.x; OffsetT tile_offset = static_cast(tile_idx) * static_cast(TILE_ITEMS); if (tile_idx < num_tiles - 1) @@ -976,8 +999,8 @@ struct AgentSelectIf if (threadIdx.x == 0) { - // Output the total number of items selection_flags - *d_num_selected_out = num_selections; + // Update the number of selected items with this partition's selections + streaming_context.update_num_selected(d_num_selected_out, num_selections); } } } diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 50a2022184a..35717403c03 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -65,6 +65,85 @@ CUB_NAMESPACE_BEGIN namespace detail { + +template +class streaming_select_context_t +{ +private: + TotalNumItemsT first_partition = true; + TotalNumItemsT last_partition = false; + TotalNumItemsT total_num_items{}; + TotalNumItemsT total_previous_num_items{}; + + // We use a double-buffer for keeping track of the number of previously selected items + // : -> + // 0: '0' -> buffer 1 + // 1: buffer 1 -> buffer 0 + // 2: buffer 0 -> buffer 1 + // ... + TotalNumItemsT selector = 0x00U; + TotalNumItemsT* d_num_selected_dbuff = nullptr; + +public: + using total_num_items_t = TotalNumItemsT; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE streaming_select_context_t( + TotalNumItemsT* d_num_selected_dbuff, TotalNumItemsT total_num_items, bool is_last_partition) + : last_partition(is_last_partition) + , total_num_items(total_num_items) + , d_num_selected_dbuff(d_num_selected_dbuff) + {} + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void advance(TotalNumItemsT num_items, bool next_partition_is_the_last) + { + first_partition = false; + last_partition = next_partition_is_the_last; + selector ^= 0x01; + total_previous_num_items += num_items; + }; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT input_offset() + { + return first_partition ? TotalNumItemsT{0} : total_previous_num_items; + }; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() + { + if (threadIdx.x == 0 && blockIdx.x < 2) + { + } + return first_partition ? TotalNumItemsT{0} : d_num_selected_dbuff[selector]; + }; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected() + { + if (threadIdx.x == 0 && blockIdx.x < 2) + { + } + return first_partition ? TotalNumItemsT{0} : (total_previous_num_items - num_previously_selected()); + }; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items() + { + return total_num_items; + }; + + template + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void + update_num_selected(NumSelectedIteratorT d_num_selected_out, OffsetT num_selections) + { + if (last_partition) + { + *d_num_selected_out = num_previously_selected() + static_cast(num_selections); + } + else + { + d_num_selected_dbuff[(selector ^ 0x01U)] = + num_previously_selected() + static_cast(num_selections); + } + }; +}; + /** * @brief Wrapper that partially specializes the `AgentSelectIf` on the non-type name parameter `KeepRejects`. */ @@ -79,7 +158,8 @@ struct agent_select_if_wrapper_t typename SelectedOutputIteratorT, typename SelectOpT, typename EqualityOpT, - typename OffsetT> + typename OffsetT, + typename StreamingContextT> struct agent_t : public AgentSelectIf { @@ -98,6 +179,7 @@ struct agent_select_if_wrapper_t SelectOpT, EqualityOpT, OffsetT, + StreamingContextT, KeepRejects, MayAlias>::AgentSelectIf; }; @@ -140,7 +222,15 @@ struct agent_select_if_wrapper_t * to be used for selection) * * @tparam OffsetT - * Signed integer type for global offsets + * Signed integer type for offsets within a partition + * + * @tparam StreamingContextT + * Type providing the context information for the current partition, with the following member functions: + * input_offset() -> base offset for the input (and flags) iterator + * num_previously_selected() -> base offset for the output iterator for selected items + * num_previously_rejected() -> base offset for the output iterator for rejected items (partition only) + * num_total_items() -> total number of items across all partitions (partition only) + * update_num_selected(d_num_sel_out, num_selected) -> invoked by last CTA with number of selected * * @tparam KEEP_REJECTS * Whether or not we push rejected items to the back of the output @@ -172,6 +262,9 @@ struct agent_select_if_wrapper_t * @param[in] num_tiles * Total number of tiles for the entire problem * + * @param[in] streaming_context + * The context information for the current partition + * * @param[in] vsmem * Memory to support virtual shared memory */ @@ -184,6 +277,7 @@ template __launch_bounds__(int( @@ -195,7 +289,8 @@ __launch_bounds__(int( SelectedOutputIteratorT, SelectOpT, EqualityOpT, - OffsetT>::agent_policy_t::BLOCK_THREADS)) + OffsetT, + StreamingContextT>::agent_policy_t::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSelectSweepKernel( InputIteratorT d_in, FlagsInputIteratorT d_flags, @@ -206,6 +301,7 @@ __launch_bounds__(int( EqualityOpT equality_op, OffsetT num_items, int num_tiles, + StreamingContextT streaming_context, cub::detail::vsmem_t vsmem) { using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< @@ -216,7 +312,8 @@ __launch_bounds__(int( SelectedOutputIteratorT, SelectOpT, EqualityOpT, - OffsetT>; + OffsetT, + StreamingContextT>; using AgentSelectIfPolicyT = typename VsmemHelperT::agent_policy_t; @@ -230,7 +327,7 @@ __launch_bounds__(int( typename AgentSelectIfT::TempStorage& temp_storage = VsmemHelperT::get_temp_storage(static_temp_storage, vsmem); // Process tiles - AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items) + AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items, streaming_context) .ConsumeRange(num_tiles, tile_status, d_num_selected_out); // If applicable, hints to discard modified cache lines for vsmem @@ -290,7 +387,16 @@ struct DispatchSelectIf : SelectedPolicy /****************************************************************************** * Types and constants ******************************************************************************/ - using ScanTileStateT = ScanTileState; + + // Offset type used to instantiate the stream compaction-kernel and agent to index the items within one partition + using PerPartitionOffsetT = ::cuda::std::int32_t; + + // Offset type large enough to represent any index within the input and output iterators + using NumTotalItemsT = ::cuda::std::int64_t; + + using streaming_context_t = detail::streaming_select_context_t; + + using ScanTileStateT = ScanTileState; static constexpr int INIT_KERNEL_THREADS = 128; @@ -408,15 +514,42 @@ struct DispatchSelectIf : SelectedPolicy SelectedOutputIteratorT, SelectOpT, EqualityOpT, - OffsetT>; + PerPartitionOffsetT, + streaming_context_t>; + + // Return for empty problem (also needed to avoid division by zero) + // TODO(elstehle): In this case d_num_selected_out will never be written. Maybe we want to write it despite? + if (num_items == 0) + { + // If this was just to query temporary storage requirements, return non-empty bytes + if (d_temp_storage == nullptr) + { + temp_storage_bytes = std::size_t{1}; + } + return cudaSuccess; + } cudaError error = cudaSuccess; constexpr auto block_threads = VsmemHelperT::agent_policy_t::BLOCK_THREADS; constexpr auto items_per_thread = VsmemHelperT::agent_policy_t::ITEMS_PER_THREAD; - constexpr int tile_size = block_threads * items_per_thread; - int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); - const auto vsmem_size = num_tiles * VsmemHelperT::vsmem_per_block; + constexpr auto tile_size = static_cast(block_threads * items_per_thread); + + // The maximum number of items for which we will ever invoke the kernel (i.e. largest partition size) + auto const max_partition_size = + num_items > static_cast(cuda::std::numeric_limits::max()) + ? static_cast(cuda::std::numeric_limits::max()) + : num_items; + + // The number of partitions required to "iterate" over the total input + auto const num_partitions = cub::DivideAndRoundUp(num_items, max_partition_size); + + // The maximum number of tiles for which we will ever invoke the kernel + auto const max_num_tiles_per_invocation = + static_cast(cub::DivideAndRoundUp(max_partition_size, tile_size)); + + // The amount of virtual shared memory to allocate + const auto vsmem_size = max_num_tiles_per_invocation * VsmemHelperT::vsmem_per_block; do { @@ -429,17 +562,19 @@ struct DispatchSelectIf : SelectedPolicy } // Specify temporary storage allocation requirements - size_t allocation_sizes[2] = {0ULL, vsmem_size}; + ::cuda::std::size_t streaming_selection_storage_bytes = + (num_partitions > 1) ? 2 * sizeof(NumTotalItemsT) : ::cuda::std::size_t{0}; + ::cuda::std::size_t allocation_sizes[3] = {0ULL, vsmem_size, streaming_selection_storage_bytes}; - // bytes needed for tile status descriptors - error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + // Bytes needed for tile status descriptors + error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles_per_invocation, allocation_sizes[0])); if (cudaSuccess != error) { break; } // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) - void* allocations[2] = {}; + void* allocations[3] = {}; error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) @@ -453,111 +588,109 @@ struct DispatchSelectIf : SelectedPolicy break; } - // Construct the tile status interface - ScanTileStateT tile_status; - error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0])); - if (cudaSuccess != error) - { - break; - } - - // Log scan_init_kernel configuration - int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); - -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG - _CubLog( - "Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG + // Initialize the streaming context with the temporary storage for double-buffering the previously selected items + // and the total number (across all partitions) of items + streaming_context_t streaming_context{ + reinterpret_cast(allocations[2]), num_items, (num_partitions <= 1)}; - // Invoke scan_init_kernel to initialize tile descriptors - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) - .doit(scan_init_kernel, tile_status, num_tiles, d_num_selected_out); - - // Check for failure to launch - error = CubDebug(cudaPeekAtLastError()); - if (cudaSuccess != error) + // Iterate over the partitions until all input is processed + for (OffsetT partition_idx = 0; partition_idx < num_partitions; partition_idx++) { - break; - } - - // Sync the stream if specified to flush runtime errors - error = CubDebug(detail::DebugSyncStream(stream)); - if (cudaSuccess != error) - { - break; - } + OffsetT current_partition_offset = partition_idx * max_partition_size; + OffsetT current_num_items = + (partition_idx + 1 == num_partitions) ? (num_items - current_partition_offset) : max_partition_size; + + // Construct the tile status interface + const auto current_num_tiles = static_cast(cub::DivideAndRoundUp(current_num_items, tile_size)); + ScanTileStateT tile_status; + error = CubDebug(tile_status.Init(current_num_tiles, allocations[0], allocation_sizes[0])); + if (cudaSuccess != error) + { + break; + } - // Return if empty problem - if (num_items == 0) - { - break; - } + // Log scan_init_kernel configuration + int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(current_num_tiles, INIT_KERNEL_THREADS)); - // Get max x-dimension of grid - int max_dim_x; - error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); - if (cudaSuccess != error) - { - break; - } +#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG + _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", + init_grid_size, + INIT_KERNEL_THREADS, + (long long) stream); +#endif + + // Invoke scan_init_kernel to initialize tile descriptors + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) + .doit(scan_init_kernel, tile_status, current_num_tiles, d_num_selected_out); + + // Check for failure to launch + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) + { + break; + } - // Get grid size for scanning tiles - dim3 scan_grid_size; - scan_grid_size.z = 1; - scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x); - scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + // Sync the stream if specified to flush runtime errors + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) + { + break; + } // Log select_if_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG - { - // Get SM occupancy for select_if_kernel - int range_select_sm_occupancy; - error = CubDebug(MaxSmOccupancy(range_select_sm_occupancy, // out - select_if_kernel, - block_threads)); + { + // Get SM occupancy for select_if_kernel + int range_select_sm_occupancy; + error = CubDebug(MaxSmOccupancy(range_select_sm_occupancy, // out + select_if_kernel, + block_threads)); + if (cudaSuccess != error) + { + break; + } + + _CubLog("Invoking select_if_kernel<<<%d, %d, 0, " + "%lld>>>(), %d items per thread, %d SM occupancy\n", + current_num_tiles, + block_threads, + (long long) stream, + items_per_thread, + range_select_sm_occupancy); + } +#endif + + // Invoke select_if_kernel + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(current_num_tiles, block_threads, 0, stream) + .doit(select_if_kernel, + d_in, + d_flags, + d_selected_out, + d_num_selected_out, + tile_status, + select_op, + equality_op, + static_cast(current_num_items), + current_num_tiles, + streaming_context, + cub::detail::vsmem_t{allocations[1]}); + + // Check for failure to launch + error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } - _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, " - "%lld>>>(), %d items per thread, %d SM occupancy\n", - scan_grid_size.x, - scan_grid_size.y, - scan_grid_size.z, - block_threads, - (long long) stream, - items_per_thread, - range_select_sm_occupancy); - } -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG - - // Invoke select_if_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) - .doit(select_if_kernel, - d_in, - d_flags, - d_selected_out, - d_num_selected_out, - tile_status, - select_op, - equality_op, - num_items, - num_tiles, - cub::detail::vsmem_t{allocations[1]}); - - // Check for failure to launch - error = CubDebug(cudaPeekAtLastError()); - if (cudaSuccess != error) - { - break; - } + // Sync the stream if specified to flush runtime errors + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) + { + break; + } - // Sync the stream if specified to flush runtime errors - error = CubDebug(detail::DebugSyncStream(stream)); - if (cudaSuccess != error) - { - break; + // Prepare streaming context for next partition (swap double buffers, advance number of processed items, etc.) + streaming_context.advance(current_num_items, (partition_idx + OffsetT{2} == num_partitions)); } } while (0); @@ -580,7 +713,8 @@ struct DispatchSelectIf : SelectedPolicy ScanTileStateT, SelectOpT, EqualityOpT, - OffsetT, + PerPartitionOffsetT, + streaming_context_t, KEEP_REJECTS, MayAlias>); } @@ -685,7 +819,7 @@ struct DispatchSelectIf : SelectedPolicy num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS }; +#endif // DOXYGEN_SHOULD_SKIP_THIS CUB_NAMESPACE_END From 3085f8671b5b69db5acc8e5f297cacce4a5a84ee Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Sun, 8 Sep 2024 22:43:56 -0700 Subject: [PATCH 02/41] ensures policy lookup uses per-partition offset type --- .../device/dispatch/dispatch_select_if.cuh | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 35717403c03..2a30ad2f7ed 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -66,6 +66,14 @@ CUB_NAMESPACE_BEGIN namespace detail { +namespace select +{ +// Offset type used to instantiate the stream compaction-kernel and agent to index the items within one partition +using per_partition_offset_t = ::cuda::std::int32_t; + +// Offset type large enough to represent any index within the input and output iterators +using num_total_items_t = ::cuda::std::int64_t; + template class streaming_select_context_t { @@ -184,6 +192,7 @@ struct agent_select_if_wrapper_t MayAlias>::AgentSelectIf; }; }; +} // namespace select } // namespace detail /****************************************************************************** @@ -283,7 +292,7 @@ template ::template agent_t, + detail::select::agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, @@ -306,7 +315,7 @@ __launch_bounds__(int( { using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT, - detail::agent_select_if_wrapper_t::template agent_t, + detail::select::agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, @@ -379,7 +388,7 @@ template , cub::detail::value_t, - OffsetT, + detail::select::per_partition_offset_t, MayAlias, KEEP_REJECTS>> struct DispatchSelectIf : SelectedPolicy @@ -389,14 +398,15 @@ struct DispatchSelectIf : SelectedPolicy ******************************************************************************/ // Offset type used to instantiate the stream compaction-kernel and agent to index the items within one partition - using PerPartitionOffsetT = ::cuda::std::int32_t; + using per_partition_offset_t = detail::select::per_partition_offset_t; // Offset type large enough to represent any index within the input and output iterators - using NumTotalItemsT = ::cuda::std::int64_t; + using num_total_items_t = detail::select::num_total_items_t; - using streaming_context_t = detail::streaming_select_context_t; + // Type used to provide streaming information about each partition's context + using streaming_context_t = detail::select::streaming_select_context_t; - using ScanTileStateT = ScanTileState; + using ScanTileStateT = ScanTileState; static constexpr int INIT_KERNEL_THREADS = 128; @@ -508,13 +518,13 @@ struct DispatchSelectIf : SelectedPolicy using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< Policy, - detail::agent_select_if_wrapper_t::template agent_t, + detail::select::agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, SelectOpT, EqualityOpT, - PerPartitionOffsetT, + per_partition_offset_t, streaming_context_t>; // Return for empty problem (also needed to avoid division by zero) @@ -537,8 +547,8 @@ struct DispatchSelectIf : SelectedPolicy // The maximum number of items for which we will ever invoke the kernel (i.e. largest partition size) auto const max_partition_size = - num_items > static_cast(cuda::std::numeric_limits::max()) - ? static_cast(cuda::std::numeric_limits::max()) + num_items > static_cast(cuda::std::numeric_limits::max()) + ? static_cast(cuda::std::numeric_limits::max()) : num_items; // The number of partitions required to "iterate" over the total input @@ -563,7 +573,7 @@ struct DispatchSelectIf : SelectedPolicy // Specify temporary storage allocation requirements ::cuda::std::size_t streaming_selection_storage_bytes = - (num_partitions > 1) ? 2 * sizeof(NumTotalItemsT) : ::cuda::std::size_t{0}; + (num_partitions > 1) ? 2 * sizeof(num_total_items_t) : ::cuda::std::size_t{0}; ::cuda::std::size_t allocation_sizes[3] = {0ULL, vsmem_size, streaming_selection_storage_bytes}; // Bytes needed for tile status descriptors @@ -591,7 +601,7 @@ struct DispatchSelectIf : SelectedPolicy // Initialize the streaming context with the temporary storage for double-buffering the previously selected items // and the total number (across all partitions) of items streaming_context_t streaming_context{ - reinterpret_cast(allocations[2]), num_items, (num_partitions <= 1)}; + reinterpret_cast(allocations[2]), num_items, (num_partitions <= 1)}; // Iterate over the partitions until all input is processed for (OffsetT partition_idx = 0; partition_idx < num_partitions; partition_idx++) @@ -670,7 +680,7 @@ struct DispatchSelectIf : SelectedPolicy tile_status, select_op, equality_op, - static_cast(current_num_items), + static_cast(current_num_items), current_num_tiles, streaming_context, cub::detail::vsmem_t{allocations[1]}); @@ -713,7 +723,7 @@ struct DispatchSelectIf : SelectedPolicy ScanTileStateT, SelectOpT, EqualityOpT, - PerPartitionOffsetT, + per_partition_offset_t, streaming_context_t, KEEP_REJECTS, MayAlias>); From 7f264ecf0b3a661cf69a3cbc72d6f32bf441a639 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 9 Sep 2024 22:40:51 -0700 Subject: [PATCH 03/41] mitigates perf degradation on select --- cub/cub/agent/agent_select_if.cuh | 34 +++++------- .../device/dispatch/dispatch_select_if.cuh | 55 ++++++++----------- 2 files changed, 39 insertions(+), 50 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index fd9e2bbbc61..1dc70230863 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -315,7 +315,7 @@ struct AgentSelectIf InequalityWrapper inequality_op; ///< T inequality operator SelectOpT select_op; ///< Selection operator OffsetT num_items; ///< Total number of input items - StreamingContextT streaming_context; ///< Context for the current partition + const StreamingContextT& streaming_context; ///< Context for the current partition //--------------------------------------------------------------------- // Constructor @@ -351,7 +351,7 @@ struct AgentSelectIf SelectOpT select_op, EqualityOpT equality_op, OffsetT num_items, - StreamingContextT streaming_context) + const StreamingContextT& streaming_context) : temp_storage(temp_storage.Alias()) , d_in(d_in) , d_selected_out(d_selected_out) @@ -404,7 +404,6 @@ struct AgentSelectIf CTA_SYNC(); FlagT flags[ITEMS_PER_THREAD]; - auto d_base_flags = d_flags_in + streaming_context.input_offset(); if (IS_LAST_TILE) { // Initialize the out-of-bounds flags @@ -414,11 +413,12 @@ struct AgentSelectIf selection_flags[ITEM] = true; } // Guarded loads - BlockLoadFlags(temp_storage.load_flags).Load(d_base_flags + tile_offset, flags, num_tile_items); + BlockLoadFlags(temp_storage.load_flags) + .Load((d_flags_in + streaming_context.input_offset()) + tile_offset, flags, num_tile_items); } else { - BlockLoadFlags(temp_storage.load_flags).Load(d_base_flags + tile_offset, flags); + BlockLoadFlags(temp_storage.load_flags).Load((d_flags_in + streaming_context.input_offset()) + tile_offset, flags); } #pragma unroll @@ -523,9 +523,6 @@ struct AgentSelectIf OffsetT (&selection_indices)[ITEMS_PER_THREAD], OffsetT num_selections) { - // Get the output iterator with a given base offset - auto d_base_selected_out = d_selected_out + streaming_context.num_previously_selected(); - // Scatter flagged items #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) @@ -534,7 +531,7 @@ struct AgentSelectIf { if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections) { - d_base_selected_out[selection_indices[ITEM]] = items[ITEM]; + *((d_selected_out + streaming_context.num_previously_selected()) + selection_indices[ITEM]) = items[ITEM]; } } } @@ -581,11 +578,10 @@ struct AgentSelectIf CTA_SYNC(); - // Get the output iterator with a given base offset - auto d_base_selected_out = d_selected_out + streaming_context.num_previously_selected(); for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) { - d_base_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item]; + *((d_selected_out + streaming_context.num_previously_selected()) + (num_selections_prefix + item)) = + temp_storage.raw_exchange.Alias()[item]; } } @@ -752,7 +748,7 @@ struct AgentSelectIf int selection_idx = item_idx - tile_num_rejections; total_offset_t scatter_offset = (item_idx < tile_num_rejections) - ? rejected_base_end - static_cast(num_rejected_prefix - rejection_idx - 1) + ? rejected_base_end - static_cast(num_rejected_prefix + rejection_idx + 1) : selected_base_begin + num_selections_prefix + selection_idx; InputT item = temp_storage.raw_exchange.Alias()[item_idx]; @@ -792,14 +788,14 @@ struct AgentSelectIf OffsetT selection_indices[ITEMS_PER_THREAD]; // Load items - auto d_base_in = d_in + streaming_context.input_offset(); if (IS_LAST_TILE) { - BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items, num_tile_items); + BlockLoadT(temp_storage.load_items) + .Load((d_in + streaming_context.input_offset()) + tile_offset, items, num_tile_items); } else { - BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items); + BlockLoadT(temp_storage.load_items).Load((d_in + streaming_context.input_offset()) + tile_offset, items); } // Initialize selection_flags @@ -872,14 +868,14 @@ struct AgentSelectIf OffsetT selection_indices[ITEMS_PER_THREAD]; // Load items - auto d_base_in = d_in + streaming_context.input_offset(); if (IS_LAST_TILE) { - BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items, num_tile_items); + BlockLoadT(temp_storage.load_items) + .Load((d_in + streaming_context.input_offset()) + tile_offset, items, num_tile_items); } else { - BlockLoadT(temp_storage.load_items).Load(d_base_in + tile_offset, items); + BlockLoadT(temp_storage.load_items).Load((d_in + streaming_context.input_offset()) + tile_offset, items); } // Initialize selection_flags diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 2a30ad2f7ed..27e5a68a1f3 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -75,79 +75,71 @@ using per_partition_offset_t = ::cuda::std::int32_t; using num_total_items_t = ::cuda::std::int64_t; template -class streaming_select_context_t +class streaming_context_t { private: - TotalNumItemsT first_partition = true; - TotalNumItemsT last_partition = false; + bool first_partition = true; + bool last_partition = false; TotalNumItemsT total_num_items{}; TotalNumItemsT total_previous_num_items{}; // We use a double-buffer for keeping track of the number of previously selected items - // : -> - // 0: '0' -> buffer 1 - // 1: buffer 1 -> buffer 0 - // 2: buffer 0 -> buffer 1 - // ... - TotalNumItemsT selector = 0x00U; - TotalNumItemsT* d_num_selected_dbuff = nullptr; + TotalNumItemsT* d_num_selected_in = nullptr; + TotalNumItemsT* d_num_selected_out = nullptr; public: using total_num_items_t = TotalNumItemsT; - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE streaming_select_context_t( - TotalNumItemsT* d_num_selected_dbuff, TotalNumItemsT total_num_items, bool is_last_partition) + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE streaming_context_t( + TotalNumItemsT* d_num_selected_in, + TotalNumItemsT* d_num_selected_out, + TotalNumItemsT total_num_items, + bool is_last_partition) : last_partition(is_last_partition) , total_num_items(total_num_items) - , d_num_selected_dbuff(d_num_selected_dbuff) + , d_num_selected_in(d_num_selected_in) + , d_num_selected_out(d_num_selected_out) {} _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void advance(TotalNumItemsT num_items, bool next_partition_is_the_last) { + ::cuda::std::swap(d_num_selected_in, d_num_selected_out); first_partition = false; last_partition = next_partition_is_the_last; - selector ^= 0x01; total_previous_num_items += num_items; }; - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT input_offset() + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT input_offset() const { return first_partition ? TotalNumItemsT{0} : total_previous_num_items; }; - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() const { - if (threadIdx.x == 0 && blockIdx.x < 2) - { - } - return first_partition ? TotalNumItemsT{0} : d_num_selected_dbuff[selector]; + return first_partition ? TotalNumItemsT{0} : *d_num_selected_in; }; - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected() + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected() const { - if (threadIdx.x == 0 && blockIdx.x < 2) - { - } return first_partition ? TotalNumItemsT{0} : (total_previous_num_items - num_previously_selected()); }; - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items() + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items() const { return total_num_items; }; template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void - update_num_selected(NumSelectedIteratorT d_num_selected_out, OffsetT num_selections) + update_num_selected(NumSelectedIteratorT user_num_selected_out_it, OffsetT num_selections) const { if (last_partition) { - *d_num_selected_out = num_previously_selected() + static_cast(num_selections); + *user_num_selected_out_it = num_previously_selected() + static_cast(num_selections); } else { - d_num_selected_dbuff[(selector ^ 0x01U)] = - num_previously_selected() + static_cast(num_selections); + *d_num_selected_out = num_previously_selected() + static_cast(num_selections); } }; }; @@ -404,7 +396,7 @@ struct DispatchSelectIf : SelectedPolicy using num_total_items_t = detail::select::num_total_items_t; // Type used to provide streaming information about each partition's context - using streaming_context_t = detail::select::streaming_select_context_t; + using streaming_context_t = detail::select::streaming_context_t; using ScanTileStateT = ScanTileState; @@ -600,8 +592,9 @@ struct DispatchSelectIf : SelectedPolicy // Initialize the streaming context with the temporary storage for double-buffering the previously selected items // and the total number (across all partitions) of items + num_total_items_t* tmp_num_selected_out = reinterpret_cast(allocations[2]); streaming_context_t streaming_context{ - reinterpret_cast(allocations[2]), num_items, (num_partitions <= 1)}; + tmp_num_selected_out, (tmp_num_selected_out + 1), num_items, (num_partitions <= 1)}; // Iterate over the partitions until all input is processed for (OffsetT partition_idx = 0; partition_idx < num_partitions; partition_idx++) From 412ea3c361bfac297e7a938f1902335060c33deb Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 10 Sep 2024 00:56:15 -0700 Subject: [PATCH 04/41] makes device interfaces use i64 num_items --- cub/cub/device/device_partition.cuh | 32 ++++++++++++------------ cub/cub/device/device_select.cuh | 38 ++++++++++++++--------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index 28bfc377bdc..bd0873634dc 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -177,11 +177,11 @@ struct DevicePartition FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::Flagged"); - using OffsetT = int; // Signed integer type for global offsets + using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) using DispatchSelectIfT = @@ -216,7 +216,7 @@ struct DevicePartition FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, cudaStream_t stream, bool debug_synchronous) { @@ -254,10 +254,10 @@ struct DevicePartition //! { //! int compare; //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! explicit LessThan(int compare) : compare(compare) {} //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! bool operator()(const int &a) const //! { //! return (a < compare); @@ -338,12 +338,12 @@ struct DevicePartition InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, SelectOp select_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::If"); - using OffsetT = int; // Signed integer type for global offsets + using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets using FlagIterator = NullType*; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) @@ -378,7 +378,7 @@ struct DevicePartition InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, SelectOp select_op, cudaStream_t stream, bool debug_synchronous) @@ -416,12 +416,12 @@ private: SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream = 0) { - using OffsetT = int; + using OffsetT = ::cuda::std::int64_t; using DispatchThreeWayPartitionIfT = DispatchThreeWayPartitionIf< InputIteratorT, FirstOutputIteratorT, @@ -500,10 +500,10 @@ public: //! { //! int compare; //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! explicit LessThan(int compare) : compare(compare) {} //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! bool operator()(const int &a) const //! { //! return a < compare; @@ -515,10 +515,10 @@ public: //! { //! int compare; //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! explicit GreaterThan(int compare) : compare(compare) {} //! - //! CUB_RUNTIME_FUNCTION __forceinline__ + //! __host__ __device__ __forceinline__ //! bool operator()(const int &a) const //! { //! return a > compare; @@ -641,7 +641,7 @@ public: SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream = 0) @@ -677,7 +677,7 @@ public: SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream, diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 332bbe6c7d2..e24526cd821 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -174,12 +174,12 @@ struct DeviceSelect FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Flagged"); - using OffsetT = int; // Signed integer type for global offsets + using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) @@ -212,7 +212,7 @@ struct DeviceSelect FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, cudaStream_t stream, bool debug_synchronous) { @@ -309,12 +309,12 @@ struct DeviceSelect IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Flagged"); - using OffsetT = int; // Signed integer type for global offsets + using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) @@ -349,7 +349,7 @@ struct DeviceSelect IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, cudaStream_t stream, bool debug_synchronous) { @@ -468,13 +468,13 @@ struct DeviceSelect InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, SelectOp select_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::If"); - using OffsetT = int; // Signed integer type for global offsets + using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets using FlagIterator = NullType*; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) @@ -506,7 +506,7 @@ struct DeviceSelect InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, SelectOp select_op, cudaStream_t stream, bool debug_synchronous) @@ -615,13 +615,13 @@ struct DeviceSelect size_t& temp_storage_bytes, IteratorT d_data, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, SelectOp select_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::If"); - using OffsetT = int; // Signed integer type for global offsets + using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets using FlagIterator = NullType*; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) @@ -655,7 +655,7 @@ struct DeviceSelect size_t& temp_storage_bytes, IteratorT d_data, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, SelectOp select_op, cudaStream_t stream, bool debug_synchronous) @@ -756,13 +756,13 @@ struct DeviceSelect FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, SelectOp select_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::FlaggedIf"); - using OffsetT = int; // Signed integer type for global offsets + using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets using EqualityOp = NullType; // Equality operator (not used) return DispatchSelectIf< @@ -861,13 +861,13 @@ struct DeviceSelect IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, SelectOp select_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::FlaggedIf"); - using OffsetT = int; // Signed integer type for global offsets + using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets using EqualityOp = NullType; // Equality operator (not used) constexpr bool may_alias = true; @@ -981,12 +981,12 @@ struct DeviceSelect InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Unique"); - using OffsetT = int; // Signed integer type for global offsets + using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets using FlagIterator = NullType*; // FlagT iterator type (not used) using SelectOp = NullType; // Selection op (not used) using EqualityOp = Equality; // Default == operator @@ -1019,7 +1019,7 @@ struct DeviceSelect InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + ::cuda::std::int64_t num_items, cudaStream_t stream, bool debug_synchronous) { From 70d562bc6f27f03c5d158b13a3ddcccb36944c06 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 10 Sep 2024 22:58:47 -0700 Subject: [PATCH 05/41] updates select::if large num_items tests --- cub/test/catch2_test_device_select_if.cu | 125 +++++++++-------------- 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/cub/test/catch2_test_device_select_if.cu b/cub/test/catch2_test_device_select_if.cu index e38f9957d6b..5d6a5421ec7 100644 --- a/cub/test/catch2_test_device_select_if.cu +++ b/cub/test/catch2_test_device_select_if.cu @@ -45,48 +45,7 @@ #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" -// TODO replace with DeviceSelect::If interface once https://github.com/NVIDIA/cccl/issues/50 is addressed -// Temporary wrapper that allows specializing the DeviceSelect algorithm for different offset types -template -CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_select_if_wrapper( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - OffsetT num_items, - SelectOp select_op, - cudaStream_t stream = 0) -{ - using flag_iterator_t = cub::NullType*; - using equality_op_t = cub::NullType; - - return cub::DispatchSelectIf< - InputIteratorT, - flag_iterator_t, - OutputIteratorT, - NumSelectedIteratorT, - SelectOp, - equality_op_t, - OffsetT, - false>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - nullptr, - d_out, - d_num_selected_out, - select_op, - equality_op_t{}, - num_items, - stream); -} - DECLARE_LAUNCH_WRAPPER(cub::DeviceSelect::If, select_if); -DECLARE_LAUNCH_WRAPPER(dispatch_select_if_wrapper, dispatch_select_if); // %PARAM% TEST_LAUNCH lid 0:1:2 @@ -152,6 +111,18 @@ struct multiply_n } }; +template +struct modx_and_add_divy +{ + T mod; + T div; + + __host__ __device__ TargetT operator()(T x) + { + return static_cast((x % mod) + (x / div)); + } +}; + using all_types = c2h::type_list>; -using offset_types = c2h::type_list; - CUB_TEST("DeviceSelect::If can run with empty input", "[device][select_if]", types) { using type = typename c2h::get<0, TestType>; @@ -393,25 +362,24 @@ CUB_TEST("DeviceSelect::If works with a different output type", "[device][select REQUIRE(thrust::all_of(c2h::device_policy, boundary, out.end(), equal_to_default_t{})); } -CUB_TEST("DeviceSelect::If works for very large number of items", "[device][select_if]", offset_types) +CUB_TEST("DeviceSelect::If works for very large number of items", "[device][select_if]") try { using type = std::int64_t; - using offset_t = typename c2h::get<0, TestType>; - - // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items - auto num_items_max_ull = - std::min(static_cast(::cuda::std::numeric_limits::max()), - ::cuda::std::numeric_limits::max() + static_cast(2000000ULL)); - offset_t num_items_max = static_cast(num_items_max_ull); - offset_t num_items_min = - num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; + using offset_t = std::int64_t; + + // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary + constexpr auto max_partition_size = static_cast(::cuda::std::numeric_limits::max()); + offset_t num_items = GENERATE_COPY( values({ - num_items_max, - static_cast(num_items_max - 1), + offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions + offset_t{2} * max_partition_size, // 2 partitions + max_partition_size + offset_t{1}, // 2 partitions + max_partition_size, // 1 partitions + max_partition_size - offset_t{1} // 1 partitions }), - take(2, random(num_items_min, num_items_max))); + take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000}))); // Input auto in = thrust::make_counting_iterator(static_cast(0)); @@ -421,11 +389,10 @@ try offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); // Run test - std::size_t match_every_nth = 1000000; - offset_t expected_num_copied = - static_cast((static_cast(num_items) + match_every_nth - 1ULL) / match_every_nth); + offset_t match_every_nth = 1000000; + offset_t expected_num_copied = (num_items + match_every_nth - offset_t{1}) / match_every_nth; c2h::device_vector out(expected_num_copied); - dispatch_select_if( + select_if( in, out.begin(), d_first_num_selected_out, num_items, mod_n{static_cast(match_every_nth)}); // Ensure that we created the correct output @@ -440,29 +407,30 @@ catch (std::bad_alloc&) // Exceeding memory is not a failure. } -CUB_TEST("DeviceSelect::If works for very large number of output items", "[device][select_if]", offset_types) +CUB_TEST("DeviceSelect::If works for very large number of output items", "[device][select_if]") try { using type = std::uint8_t; - using offset_t = typename c2h::get<0, TestType>; - - // Clamp 64-bit offset type problem sizes to just slightly larger than 2^32 items - auto num_items_max_ull = - std::min(static_cast(::cuda::std::numeric_limits::max()), - ::cuda::std::numeric_limits::max() + static_cast(2000000ULL)); - offset_t num_items_max = static_cast(num_items_max_ull); - offset_t num_items_min = - num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; + using offset_t = std::int64_t; + + // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary + constexpr auto max_partition_size = static_cast(::cuda::std::numeric_limits::max()); + offset_t num_items = GENERATE_COPY( values({ - num_items_max, - static_cast(num_items_max - 1), + offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions + offset_t{2} * max_partition_size, // 2 partitions + max_partition_size + offset_t{1}, // 2 partitions + max_partition_size, // 1 partitions + max_partition_size - offset_t{1} // 1 partitions }), - take(2, random(num_items_min, num_items_max))); + take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000}))); - // Prepare input - c2h::device_vector in(num_items); - c2h::gen(CUB_SEED(1), in); + // Prepare input iterator: it[i] = (i%mod)+(i/div) + static constexpr offset_t mod = 200; + static constexpr offset_t div = 1000000000; + auto in = thrust::make_transform_iterator( + thrust::make_counting_iterator(offset_t{0}), modx_and_add_divy{mod, div}); // Prepare output c2h::device_vector out(num_items); @@ -472,11 +440,12 @@ try offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); // Run test - dispatch_select_if(in.cbegin(), out.begin(), d_first_num_selected_out, num_items, always_true_t{}); + select_if(in, out.begin(), d_first_num_selected_out, num_items, always_true_t{}); // Ensure that we created the correct output REQUIRE(num_selected_out[0] == num_items); - REQUIRE(in == out); + bool all_results_correct = thrust::equal(out.cbegin(), out.cend(), in); + REQUIRE(all_results_correct == true); } catch (std::bad_alloc&) { From b274e1b49f71a95f91406e525b4df655994ae3db Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 10 Sep 2024 23:05:19 -0700 Subject: [PATCH 06/41] fixes syntax --- cub/cub/device/dispatch/dispatch_select_if.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 27e5a68a1f3..d78f5218fb9 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -141,7 +141,7 @@ public: { *d_num_selected_out = num_previously_selected() + static_cast(num_selections); } - }; + } }; /** From d6b21ba83911564e24ea1b7f7d6bbdc6f836d877 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 11 Sep 2024 01:07:39 -0700 Subject: [PATCH 07/41] adds tests for large num_items for select::flagged --- cub/cub/agent/agent_select_if.cuh | 5 +- cub/test/catch2_test_device_select_common.cuh | 36 ++++++++++++++ cub/test/catch2_test_device_select_flagged.cu | 48 ++++++++++++++++++- cub/test/catch2_test_device_select_if.cu | 35 +------------- 4 files changed, 88 insertions(+), 36 deletions(-) create mode 100644 cub/test/catch2_test_device_select_common.cuh diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 1dc70230863..3693822b385 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -450,11 +450,12 @@ struct AgentSelectIf if (IS_LAST_TILE) { // Out-of-bounds items are selection_flags - BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1); + BlockLoadFlags(temp_storage.load_flags) + .Load((d_flags_in + streaming_context.input_offset()) + tile_offset, flags, num_tile_items, 1); } else { - BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags); + BlockLoadFlags(temp_storage.load_flags).Load((d_flags_in + streaming_context.input_offset()) + tile_offset, flags); } // Convert flag type to selection_flags type diff --git a/cub/test/catch2_test_device_select_common.cuh b/cub/test/catch2_test_device_select_common.cuh new file mode 100644 index 00000000000..a686909cbcd --- /dev/null +++ b/cub/test/catch2_test_device_select_common.cuh @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +template +struct mod_n +{ + T mod; + __host__ __device__ bool operator()(T x) + { + return (x % mod == 0) ? true : false; + } +}; + +template +struct multiply_n +{ + T multiplier; + __host__ __device__ T operator()(T x) + { + return x * multiplier; + } +}; + +template +struct modx_and_add_divy +{ + T mod; + T div; + + __host__ __device__ TargetT operator()(T x) + { + return static_cast((x % mod) + (x / div)); + } +}; \ No newline at end of file diff --git a/cub/test/catch2_test_device_select_flagged.cu b/cub/test/catch2_test_device_select_flagged.cu index f3477787ecd..0ff05b3f00c 100644 --- a/cub/test/catch2_test_device_select_flagged.cu +++ b/cub/test/catch2_test_device_select_flagged.cu @@ -36,6 +36,7 @@ #include +#include "catch2_test_device_select_common.cuh" #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" @@ -325,7 +326,7 @@ CUB_TEST("DeviceSelect::Flagged works with flags that alias input", "[device][se REQUIRE(reference == out); } -CUB_TEST("DeviceSelect::Flagged works in place", "[device][select_if]", types) +CUB_TEST("DeviceSelect::Flagged works in place", "[device][select_flagged]", types) { using type = typename c2h::get<0, TestType>; @@ -416,3 +417,48 @@ CUB_TEST("DeviceSelect::Flagged works with a different output type", "[device][s REQUIRE(num_selected == num_selected_out[0]); REQUIRE(reference == out); } + +CUB_TEST("DeviceSelect::Flagged works for very large number of items", "[device][select_flagged]") +try +{ + using type = std::int64_t; + using offset_t = std::int64_t; + + // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary + constexpr auto max_partition_size = static_cast(::cuda::std::numeric_limits::max()); + + offset_t num_items = GENERATE_COPY( + values({ + offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions + offset_t{2} * max_partition_size, // 2 partitions + max_partition_size + offset_t{1}, // 2 partitions + max_partition_size, // 1 partitions + max_partition_size - offset_t{1} // 1 partitions + }), + take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000}))); + + // Input + constexpr offset_t match_every_nth = 1000000; + auto in = thrust::make_counting_iterator(static_cast(0)); + auto flags_in = thrust::make_transform_iterator(in, mod_n{static_cast(match_every_nth)}); + + // Needs to be device accessible + c2h::device_vector num_selected_out(1, 0); + offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + + // Run test + offset_t expected_num_copied = (num_items + match_every_nth - offset_t{1}) / match_every_nth; + c2h::device_vector out(expected_num_copied); + select_flagged(in, flags_in, out.begin(), d_first_num_selected_out, num_items); + + // Ensure that we created the correct output + REQUIRE(num_selected_out[0] == expected_num_copied); + auto expected_out_it = + thrust::make_transform_iterator(in, multiply_n{static_cast(match_every_nth)}); + bool all_results_correct = thrust::equal(out.cbegin(), out.cend(), expected_out_it); + REQUIRE(all_results_correct == true); +} +catch (std::bad_alloc&) +{ + // Exceeding memory is not a failure. +} diff --git a/cub/test/catch2_test_device_select_if.cu b/cub/test/catch2_test_device_select_if.cu index 5d6a5421ec7..bfa1d31d6ac 100644 --- a/cub/test/catch2_test_device_select_if.cu +++ b/cub/test/catch2_test_device_select_if.cu @@ -42,6 +42,7 @@ #include +#include "catch2_test_device_select_common.cuh" #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" @@ -91,38 +92,6 @@ struct always_true_t } }; -template -struct mod_n -{ - T mod; - __host__ __device__ bool operator()(T x) - { - return (x % mod == 0) ? true : false; - } -}; - -template -struct multiply_n -{ - T multiplier; - __host__ __device__ T operator()(T x) - { - return x * multiplier; - } -}; - -template -struct modx_and_add_divy -{ - T mod; - T div; - - __host__ __device__ TargetT operator()(T x) - { - return static_cast((x % mod) + (x / div)); - } -}; - using all_types = c2h::type_list out(expected_num_copied); select_if( From d26eb655d15209cf1c2dec878eebfac81dfa3e39 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 11 Sep 2024 05:52:17 -0700 Subject: [PATCH 08/41] adds tests for large num_items for partition::if --- cub/test/catch2_test_device_partition_if.cu | 86 +++++++++++++++---- cub/test/catch2_test_device_select_common.cuh | 71 ++++++++++++++- 2 files changed, 141 insertions(+), 16 deletions(-) diff --git a/cub/test/catch2_test_device_partition_if.cu b/cub/test/catch2_test_device_partition_if.cu index 84890a1233f..7c3c778584d 100644 --- a/cub/test/catch2_test_device_partition_if.cu +++ b/cub/test/catch2_test_device_partition_if.cu @@ -31,11 +31,17 @@ #include #include +#include +#include +#include +#include +#include #include #include #include +#include "catch2_test_device_select_common.cuh" #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" @@ -43,21 +49,6 @@ DECLARE_LAUNCH_WRAPPER(cub::DevicePartition::If, partition_if); // %PARAM% TEST_LAUNCH lid 0:1:2 -template -struct less_than_t -{ - T compare; - - explicit __host__ less_than_t(T compare) - : compare(compare) - {} - - __host__ __device__ bool operator()(const T& a) const - { - return a < compare; - } -}; - struct always_false_t { template @@ -308,3 +299,68 @@ CUB_TEST("DevicePartition::If works with a different output type", "[device][par REQUIRE(num_selected_out[0] == thrust::distance(reference.begin(), boundary)); REQUIRE(reference == out); } + +CUB_TEST("DevicePartition::If works for very large number of items", "[device][partition_if]") +try +{ + using type = std::int64_t; + using offset_t = std::int64_t; + + // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary + constexpr auto max_partition_size = static_cast(::cuda::std::numeric_limits::max()); + + offset_t num_items = GENERATE_COPY( + values({ + offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions + offset_t{2} * max_partition_size, // 2 partitions + max_partition_size + offset_t{1}, // 2 partitions + max_partition_size, // 1 partitions + max_partition_size - offset_t{1} // 1 partitions + }), + take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000}))); + + auto in = thrust::make_counting_iterator(offset_t{0}); + + // We select the first items and reject the rest + const offset_t cut_off_index = num_items / 4; + + // Prepare tabulate output iterator to verify results in a memory-efficient way: + // We use a tabulate iterator that checks whenever the partition algorithm writes an output whether that item + // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index. + static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t); + c2h::device_vector correctness_flags(cub::DivideAndRoundUp(num_items, bits_per_element)); + auto expected_selected_it = thrust::make_counting_iterator(offset_t{0}); + auto expected_rejected_it = thrust::make_reverse_iterator( + thrust::make_counting_iterator(offset_t{cut_off_index}) + (num_items - cut_off_index)); + auto expected_result_op = + make_index_to_expected_partition_op(expected_selected_it, expected_rejected_it, cut_off_index); + auto expected_result_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(offset_t{0}), expected_result_op); + auto check_result_op = make_checking_write_op(expected_result_it, thrust::raw_pointer_cast(correctness_flags.data())); + auto check_result_it = thrust::make_tabulate_output_iterator(check_result_op); + + // Needs to be device accessible + c2h::device_vector num_selected_out(1, 0); + offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + + // Run test + partition_if( + in, check_result_it, d_first_num_selected_out, num_items, less_than_t{static_cast(cut_off_index)}); + + // Ensure that we created the correct output + REQUIRE(num_selected_out[0] == cut_off_index); + bool all_results_correct = thrust::equal( + correctness_flags.cbegin(), + correctness_flags.cbegin() + (num_items / bits_per_element), + thrust::make_constant_iterator(0xFFFFFFFFU)); + REQUIRE(all_results_correct == true); + if (num_items % bits_per_element != 0) + { + std::uint32_t last_element_flags = (0x00000001U << (num_items % bits_per_element)) - 0x01U; + REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags); + } +} +catch (std::bad_alloc&) +{ + // Exceeding memory is not a failure. +} diff --git a/cub/test/catch2_test_device_select_common.cuh b/cub/test/catch2_test_device_select_common.cuh index a686909cbcd..3ee93ee6cae 100644 --- a/cub/test/catch2_test_device_select_common.cuh +++ b/cub/test/catch2_test_device_select_common.cuh @@ -3,6 +3,23 @@ #pragma once +#include + +template +struct less_than_t +{ + T compare; + + explicit __host__ less_than_t(T compare) + : compare(compare) + {} + + __host__ __device__ bool operator()(const T& a) const + { + return a < compare; + } +}; + template struct mod_n { @@ -33,4 +50,56 @@ struct modx_and_add_divy { return static_cast((x % mod) + (x / div)); } -}; \ No newline at end of file +}; + +template +struct index_to_expected_partition_op +{ + using value_t = typename ::cuda::std::iterator_traits::value_type; + SelectedItT expected_selected_it; + RejectedItT expected_rejected_it; + std::int64_t expected_num_selected; + + template + __host__ __device__ value_t operator()(OffsetT index) + { + return (index < static_cast(expected_num_selected)) + ? expected_selected_it[index] + : expected_rejected_it[index - expected_num_selected]; + } +}; + +template +index_to_expected_partition_op make_index_to_expected_partition_op( + SelectedItT expected_selected_it, RejectedItT expected_rejected_it, std::int64_t expected_num_selected) +{ + return index_to_expected_partition_op{ + expected_selected_it, expected_rejected_it, expected_num_selected}; +} + +template +struct flag_correct_writes_op +{ + ExpectedValuesItT expected_it; + std::uint32_t* d_correctness_flags; + + static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t); + template + __host__ __device__ void operator()(OffsetT index, T val) + { + // Set bit-flag if the correct result has been written at the given index + if (expected_it[index] == val) + { + OffsetT uint_index = index / static_cast(bits_per_element); + std::uint32_t bit_flag = 0x00000001U << (index % bits_per_element); + atomicOr(&d_correctness_flags[uint_index], bit_flag); + } + } +}; + +template +flag_correct_writes_op +make_checking_write_op(ExpectedValuesItT expected_it, std::uint32_t* d_correctness_flags) +{ + return flag_correct_writes_op{expected_it, d_correctness_flags}; +} From 1be152ce0f0d5cee0b8bf616382a6052aa586a73 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 11 Sep 2024 08:20:51 -0700 Subject: [PATCH 09/41] adds tests for large num_items for partition::flagged --- .../catch2_test_device_partition_flagged.cu | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu index 2317c4bfb2e..7cd5e6d10bb 100644 --- a/cub/test/catch2_test_device_partition_flagged.cu +++ b/cub/test/catch2_test_device_partition_flagged.cu @@ -31,11 +31,17 @@ #include #include +#include +#include +#include +#include +#include #include #include #include +#include "catch2_test_device_select_common.cuh" #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" @@ -371,3 +377,69 @@ CUB_TEST("DevicePartition::Flagged works with different output type", "[device][ REQUIRE(num_selected == num_selected_out[0]); REQUIRE(reference == out); } + +CUB_TEST("DevicePartition::Flagged works for very large number of items", "[device][partition_flagged]") +try +{ + using type = std::int64_t; + using offset_t = std::int64_t; + + // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary + constexpr auto max_partition_size = static_cast(::cuda::std::numeric_limits::max()); + + offset_t num_items = GENERATE_COPY( + values({ + offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions + offset_t{2} * max_partition_size, // 2 partitions + max_partition_size + offset_t{1}, // 2 partitions + max_partition_size, // 1 partitions + max_partition_size - offset_t{1} // 1 partitions + }), + take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000}))); + + // We select the first items and reject the rest + const offset_t cut_off_index = num_items / 4; + + auto in = thrust::make_counting_iterator(offset_t{0}); + auto in_flags = thrust::make_transform_iterator( + thrust::make_counting_iterator(offset_t{0}), less_than_t{static_cast(cut_off_index)}); + + // Prepare tabulate output iterator to verify results in a memory-efficient way: + // We use a tabulate iterator that checks whenever the partition algorithm writes an output whether that item + // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index. + static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t); + c2h::device_vector correctness_flags(cub::DivideAndRoundUp(num_items, bits_per_element)); + auto expected_selected_it = thrust::make_counting_iterator(offset_t{0}); + auto expected_rejected_it = thrust::make_reverse_iterator( + thrust::make_counting_iterator(offset_t{cut_off_index}) + (num_items - cut_off_index)); + auto expected_result_op = + make_index_to_expected_partition_op(expected_selected_it, expected_rejected_it, cut_off_index); + auto expected_result_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(offset_t{0}), expected_result_op); + auto check_result_op = make_checking_write_op(expected_result_it, thrust::raw_pointer_cast(correctness_flags.data())); + auto check_result_it = thrust::make_tabulate_output_iterator(check_result_op); + + // Needs to be device accessible + c2h::device_vector num_selected_out(1, 0); + offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + + // Run test + partition_flagged(in, in_flags, check_result_it, d_first_num_selected_out, num_items); + + // Ensure that we created the correct output + REQUIRE(num_selected_out[0] == cut_off_index); + bool all_results_correct = thrust::equal( + correctness_flags.cbegin(), + correctness_flags.cbegin() + (num_items / bits_per_element), + thrust::make_constant_iterator(0xFFFFFFFFU)); + REQUIRE(all_results_correct == true); + if (num_items % bits_per_element != 0) + { + std::uint32_t last_element_flags = (0x00000001U << (num_items % bits_per_element)) - 0x01U; + REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags); + } +} +catch (std::bad_alloc&) +{ + // Exceeding memory is not a failure. +} From 466f91513385a70448d088e7c5ca75280a8f0c13 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 11 Sep 2024 08:55:35 -0700 Subject: [PATCH 10/41] fixes redundant definition --- cub/test/catch2_test_device_select_if.cu | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/cub/test/catch2_test_device_select_if.cu b/cub/test/catch2_test_device_select_if.cu index bfa1d31d6ac..3f7cc402b93 100644 --- a/cub/test/catch2_test_device_select_if.cu +++ b/cub/test/catch2_test_device_select_if.cu @@ -50,21 +50,6 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceSelect::If, select_if); // %PARAM% TEST_LAUNCH lid 0:1:2 -template -struct less_than_t -{ - T compare; - - explicit __host__ less_than_t(T compare) - : compare(compare) - {} - - __host__ __device__ bool operator()(const T& a) const - { - return a < compare; - } -}; - struct equal_to_default_t { template From d0f6e7fab18e40907811aa4e11ff0c7fb3da7ad4 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 11 Sep 2024 14:20:12 -0700 Subject: [PATCH 11/41] fixes implicit conversion --- cub/cub/device/dispatch/dispatch_select_if.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index d78f5218fb9..27cf88025e5 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -569,7 +569,8 @@ struct DispatchSelectIf : SelectedPolicy ::cuda::std::size_t allocation_sizes[3] = {0ULL, vsmem_size, streaming_selection_storage_bytes}; // Bytes needed for tile status descriptors - error = CubDebug(ScanTileStateT::AllocationSize(max_num_tiles_per_invocation, allocation_sizes[0])); + error = + CubDebug(ScanTileStateT::AllocationSize(static_cast(max_num_tiles_per_invocation), allocation_sizes[0])); if (cudaSuccess != error) { break; From cea85090bf5cd41c60f424f7402a379095d5d1d2 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Sun, 15 Sep 2024 22:21:18 -0700 Subject: [PATCH 12/41] fixes f32 select::if perf regression --- cub/cub/device/dispatch/dispatch_select_if.cuh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 27cf88025e5..67d241ce9eb 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -190,6 +190,12 @@ struct agent_select_if_wrapper_t /****************************************************************************** * Kernel entry points *****************************************************************************/ +// TODO (elstehle) gird-private constants were introduced in CTK 11.7. The macro is temporarily placed here, do we want to make this a CCCL macro? +#if _CCCL_CUDACC_BELOW_11_7 +#define _CUB_GRID_CONSTANT +#else +#define _CUB_GRID_CONSTANT __grid_constant__ +#endif /** * Select kernel entry point (multi-block) @@ -302,7 +308,7 @@ __launch_bounds__(int( EqualityOpT equality_op, OffsetT num_items, int num_tiles, - StreamingContextT streaming_context, + _CUB_GRID_CONSTANT StreamingContextT streaming_context, cub::detail::vsmem_t vsmem) { using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< From 96be1e8ee07dbf35a8062110469d52e45bfeb2f8 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 16 Sep 2024 11:35:35 -0700 Subject: [PATCH 13/41] fixes perf regression for partition --- cub/cub/agent/agent_select_if.cuh | 25 +++--- cub/cub/detail/choose_offset.cuh | 46 +++++++++++ cub/cub/device/device_partition.cuh | 81 ++++++++++++++----- .../device/dispatch/dispatch_select_if.cuh | 7 +- cub/test/catch2_test_util_choose_offset.cu | 32 ++++++++ 5 files changed, 155 insertions(+), 36 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 3693822b385..ebfe03dc1c2 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -681,7 +681,7 @@ struct AgentSelectIf CTA_SYNC(); // Gather items from shared memory and scatter to global - ScatterPartitionsToGlobal( + ScatterPartitionsToGlobal( num_tile_items, tile_num_rejections, num_selections_prefix, num_rejected_prefix, d_selected_out); } @@ -689,7 +689,7 @@ struct AgentSelectIf * @brief Second phase of scattering partitioned items to global memory. Specialized for partitioning to two * distinct partitions. */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( int num_tile_items, int tile_num_rejections, @@ -730,7 +730,7 @@ struct AgentSelectIf * iterator, where selected items are written in order from the beginning of the itereator and rejected items are * writtem from the iterators end backwards. */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( int num_tile_items, int tile_num_rejections, @@ -739,20 +739,25 @@ struct AgentSelectIf PartitionedOutputItT partitioned_out_it) { using total_offset_t = typename StreamingContextT::total_num_items_t; - auto const selected_base_begin = streaming_context.num_previously_selected(); - auto const rejected_base_end = streaming_context.num_total_items() - streaming_context.num_previously_rejected(); + + total_offset_t _num_rejected_prefix = streaming_context.num_total_items() - streaming_context.num_previously_rejected(); + _num_rejected_prefix -= static_cast(num_rejected_prefix + 1); + total_offset_t _num_selections_prefix = streaming_context.num_previously_selected() + static_cast(num_selections_prefix); + #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { + { int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; + InputT item = temp_storage.raw_exchange.Alias()[item_idx]; + int rejection_idx = item_idx; int selection_idx = item_idx - tile_num_rejections; + total_offset_t scatter_rejected_index = _num_rejected_prefix - static_cast(rejection_idx); + total_offset_t scatter_selected_index = _num_selections_prefix + static_cast(selection_idx); total_offset_t scatter_offset = (item_idx < tile_num_rejections) - ? rejected_base_end - static_cast(num_rejected_prefix + rejection_idx + 1) - : selected_base_begin + num_selections_prefix + selection_idx; - - InputT item = temp_storage.raw_exchange.Alias()[item_idx]; + ? scatter_rejected_index + : scatter_selected_index; if (!IS_LAST_TILE || (item_idx < num_tile_items)) { diff --git a/cub/cub/detail/choose_offset.cuh b/cub/cub/detail/choose_offset.cuh index 18fd568d9b8..28639a1bf56 100644 --- a/cub/cub/detail/choose_offset.cuh +++ b/cub/cub/detail/choose_offset.cuh @@ -29,6 +29,8 @@ #include +#include "cuda/std/__type_traits/is_unsigned.h" + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -37,6 +39,7 @@ # pragma system_header #endif // no system header +#include #include #include @@ -93,6 +96,49 @@ struct promote_small_offset template using promote_small_offset_t = typename promote_small_offset::type; +/** + * choose_signed_offset checks NumItemsT, the type of the num_items parameter, and + * selects the offset type to be either int32 or int64, such that the selected offset type covers the range of NumItemsT + * unless it was uint64, in which case int64 will be used. + */ +template +struct choose_signed_offset +{ + // NumItemsT must be an integral type (but not bool). + static_assert(::cuda::std::is_integral::value + && !::cuda::std::is_same::type, bool>::value, + "NumItemsT must be an integral type, but not bool"); + + // Signed integer type for global offsets. + // uint32 -> int64, else + // LEQ 4B -> int32, else + // int64 + using type = typename ::cuda::std::conditional< + (::cuda::std::is_integral::value && ::cuda::std::is_unsigned::value), + ::cuda::std::int64_t, + typename ::cuda::std::conditional<(sizeof(NumItemsT) <= 4), ::cuda::std::int32_t, ::cuda::std::int64_t>::type>::type; + + /** + * Checks if the given num_items can be covered by the selected offset type. If not, returns cudaErrorInvalidValue, + * otherwise returns cudaSuccess. + */ + static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t is_exceeding_offset_type(NumItemsT num_items) + { + if (sizeof(NumItemsT) >= 8 && num_items > static_cast(::cuda::std::numeric_limits::max())) + { + return cudaErrorInvalidValue; + } + return cudaSuccess; + } +}; + +/** + * choose_signed_offset_t is an alias template that checks NumItemsT, the type of the num_items parameter, and + * selects the corresponding signed offset type based on it. + */ +template +using choose_signed_offset_t = typename choose_signed_offset::type; + /** * common_iterator_value sets member type to the common_type of * value_type for all argument types. used to get OffsetT in diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index bd0873634dc..48666f1370b 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -42,6 +42,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -142,6 +143,9 @@ struct DevicePartition //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -169,7 +173,11 @@ struct DevicePartition //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, size_t& temp_storage_bytes, @@ -177,13 +185,14 @@ struct DevicePartition FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::Flagged"); - using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets - using SelectOp = NullType; // Selection op (not used) - using EqualityOp = NullType; // Equality operator (not used) + using ChooseOffsetT = detail::choose_signed_offset; + using OffsetT = typename ChooseOffsetT::type; // Signed integer type for global offsets + using SelectOp = NullType; // Selection op (not used) + using EqualityOp = NullType; // Equality operator (not used) using DispatchSelectIfT = DispatchSelectIf; + // Check if the number of items exceeds the range covered by the selected signed offset type + cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items); + if (error) + { + return error; + } + return DispatchSelectIfT::Dispatch( d_temp_storage, temp_storage_bytes, @@ -208,7 +224,11 @@ struct DevicePartition } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, size_t& temp_storage_bytes, @@ -216,7 +236,7 @@ struct DevicePartition FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -254,10 +274,10 @@ struct DevicePartition //! { //! int compare; //! - //! __host__ __device__ __forceinline__ + //! CUB_RUNTIME_FUNCTION __forceinline__ //! explicit LessThan(int compare) : compare(compare) {} //! - //! __host__ __device__ __forceinline__ + //! CUB_RUNTIME_FUNCTION __forceinline__ //! bool operator()(const int &a) const //! { //! return (a < compare); @@ -305,6 +325,9 @@ struct DevicePartition //! @tparam SelectOp //! **[inferred]** Selection functor type having member `bool operator()(const T &a)` //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -331,21 +354,33 @@ struct DevicePartition //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, + NumItemsT num_items, SelectOp select_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::If"); - using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets - using FlagIterator = NullType*; // FlagT iterator type (not used) - using EqualityOp = NullType; // Equality operator (not used) + using ChooseOffsetT = detail::choose_signed_offset; + using OffsetT = typename ChooseOffsetT::type; // Signed integer type for global offsets + using FlagIterator = NullType*; // FlagT iterator type (not used) + using EqualityOp = NullType; // Equality operator (not used) + + // Check if the number of items exceeds the range covered by the selected signed offset type + cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items); + if (error) + { + return error; + } using DispatchSelectIfT = DispatchSelectIf + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, + NumItemsT num_items, SelectOp select_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - return If( + return If( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } #endif // DOXYGEN_SHOULD_SKIP_THIS @@ -416,12 +455,12 @@ private: SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, + int num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream = 0) { - using OffsetT = ::cuda::std::int64_t; + using OffsetT = int; using DispatchThreeWayPartitionIfT = DispatchThreeWayPartitionIf< InputIteratorT, FirstOutputIteratorT, @@ -641,7 +680,7 @@ public: SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, + int num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream = 0) @@ -677,7 +716,7 @@ public: SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, + int num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream, diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 67d241ce9eb..fff8b0ca527 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -71,9 +71,6 @@ namespace select // Offset type used to instantiate the stream compaction-kernel and agent to index the items within one partition using per_partition_offset_t = ::cuda::std::int32_t; -// Offset type large enough to represent any index within the input and output iterators -using num_total_items_t = ::cuda::std::int64_t; - template class streaming_context_t { @@ -308,7 +305,7 @@ __launch_bounds__(int( EqualityOpT equality_op, OffsetT num_items, int num_tiles, - _CUB_GRID_CONSTANT StreamingContextT streaming_context, + _CUB_GRID_CONSTANT const StreamingContextT streaming_context, cub::detail::vsmem_t vsmem) { using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< @@ -399,7 +396,7 @@ struct DispatchSelectIf : SelectedPolicy using per_partition_offset_t = detail::select::per_partition_offset_t; // Offset type large enough to represent any index within the input and output iterators - using num_total_items_t = detail::select::num_total_items_t; + using num_total_items_t = OffsetT; // Type used to provide streaming information about each partition's context using streaming_context_t = detail::select::streaming_context_t; diff --git a/cub/test/catch2_test_util_choose_offset.cu b/cub/test/catch2_test_util_choose_offset.cu index 62b8e4757af..9da5b59239f 100644 --- a/cub/test/catch2_test_util_choose_offset.cu +++ b/cub/test/catch2_test_util_choose_offset.cu @@ -27,6 +27,8 @@ #include +#include +#include #include #include "catch2_test_helper.h" @@ -43,6 +45,36 @@ CUB_TEST("Tests choose_offset", "[util][type]") STATIC_REQUIRE(::cuda::std::is_same, unsigned long long>::value); } +CUB_TEST("Tests choose_signed_offset", "[util][type]") +{ + // Uses signed 64-bit type for unsigned signed 32-bit type + STATIC_REQUIRE(::cuda::std::is_same, std::int64_t>::value); + + // Uses signed 32-bit type for signed 32-bit type + STATIC_REQUIRE(::cuda::std::is_same, std::int32_t>::value); + + // Uses signed 32-bit type for type smaller than 32 bits + STATIC_REQUIRE(::cuda::std::is_same, std::int32_t>::value); + + // Uses signed 64-bit type for signed 64-bit type + STATIC_REQUIRE(::cuda::std::is_same, std::int64_t>::value); + + // Offset type covers maximum number representable by a signed 32-bit integer + REQUIRE(cudaSuccess + == cub::detail::choose_signed_offset::is_exceeding_offset_type( + ::cuda::std::numeric_limits::max())); + + // Offset type covers maximum number representable by a signed 64-bit integer + REQUIRE(cudaSuccess + == cub::detail::choose_signed_offset::is_exceeding_offset_type( + ::cuda::std::numeric_limits::max())); + + // Offset type does not support maximum number representable by an unsigned 64-bit integer + REQUIRE(cudaErrorInvalidValue + == cub::detail::choose_signed_offset::is_exceeding_offset_type( + ::cuda::std::numeric_limits::max())); +} + CUB_TEST("Tests promote_small_offset", "[util][type]") { // Uses input type for types of at least 32 bits From a117b452312c6da8385727c52694c79431d4e274 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 16 Sep 2024 11:43:05 -0700 Subject: [PATCH 14/41] fix feature macro --- cub/cub/device/dispatch/dispatch_select_if.cuh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index fff8b0ca527..abfd611ca5f 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -187,11 +187,12 @@ struct agent_select_if_wrapper_t /****************************************************************************** * Kernel entry points *****************************************************************************/ -// TODO (elstehle) gird-private constants were introduced in CTK 11.7. The macro is temporarily placed here, do we want to make this a CCCL macro? -#if _CCCL_CUDACC_BELOW_11_7 -#define _CUB_GRID_CONSTANT +// TODO (elstehle) gird-private constants were introduced in CTK 11.7. The macro is temporarily placed here, do we want +// to make this a CCCL macro? +#if defined(_CCCL_CUDACC_BELOW_11_7) +# define _CUB_GRID_CONSTANT #else -#define _CUB_GRID_CONSTANT __grid_constant__ +# define _CUB_GRID_CONSTANT __grid_constant__ #endif /** From 480da39881591f1c4cccc600ab190c0700dc237e Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 16 Sep 2024 12:51:19 -0700 Subject: [PATCH 15/41] fixes feature macro --- cub/cub/device/dispatch/dispatch_select_if.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index abfd611ca5f..d1f901ec2c7 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -189,7 +189,7 @@ struct agent_select_if_wrapper_t *****************************************************************************/ // TODO (elstehle) gird-private constants were introduced in CTK 11.7. The macro is temporarily placed here, do we want // to make this a CCCL macro? -#if defined(_CCCL_CUDACC_BELOW_11_7) +#if defined(_CCCL_CUDACC_BELOW_11_7) && (CUB_PTX_ARCH >= 700) # define _CUB_GRID_CONSTANT #else # define _CUB_GRID_CONSTANT __grid_constant__ From aaed489e58958ad5c967a2f865f2e95ad5d686a6 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 16 Sep 2024 13:16:47 -0700 Subject: [PATCH 16/41] fixes feature macro --- cub/cub/device/dispatch/dispatch_select_if.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index d1f901ec2c7..0d0b6dd3030 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -189,7 +189,7 @@ struct agent_select_if_wrapper_t *****************************************************************************/ // TODO (elstehle) gird-private constants were introduced in CTK 11.7. The macro is temporarily placed here, do we want // to make this a CCCL macro? -#if defined(_CCCL_CUDACC_BELOW_11_7) && (CUB_PTX_ARCH >= 700) +#if defined(_CCCL_CUDACC_BELOW_11_7) || (CUB_PTX_ARCH < 700) # define _CUB_GRID_CONSTANT #else # define _CUB_GRID_CONSTANT __grid_constant__ From 33337380057b2175d47e66944a50491a822673b7 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 16 Sep 2024 22:00:07 -0700 Subject: [PATCH 17/41] silences msvc constant conditional warning --- cub/cub/detail/choose_offset.cuh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cub/cub/detail/choose_offset.cuh b/cub/cub/detail/choose_offset.cuh index 28639a1bf56..3e3b51f9504 100644 --- a/cub/cub/detail/choose_offset.cuh +++ b/cub/cub/detail/choose_offset.cuh @@ -124,10 +124,13 @@ struct choose_signed_offset */ static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t is_exceeding_offset_type(NumItemsT num_items) { + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4127) /* conditional expression is constant */ if (sizeof(NumItemsT) >= 8 && num_items > static_cast(::cuda::std::numeric_limits::max())) { return cudaErrorInvalidValue; } + _CCCL_DIAG_POP return cudaSuccess; } }; From 66ca8fd5c604a4545d6a524f3aca10976b3ed312 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 17 Sep 2024 22:14:03 -0700 Subject: [PATCH 18/41] add support for streamin ctx dummy for partition with small offset types --- .../device/dispatch/dispatch_select_if.cuh | 63 +++++++++++++++++-- 1 file changed, 59 insertions(+), 4 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 0d0b6dd3030..70edebbfc65 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -71,7 +71,7 @@ namespace select // Offset type used to instantiate the stream compaction-kernel and agent to index the items within one partition using per_partition_offset_t = ::cuda::std::int32_t; -template +template class streaming_context_t { private: @@ -141,6 +141,50 @@ public: } }; +template +class streaming_context_t +{ +private: + TotalNumItemsT total_num_items{}; + +public: + using total_num_items_t = TotalNumItemsT; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE + streaming_context_t(TotalNumItemsT*, TotalNumItemsT*, TotalNumItemsT total_num_items, bool) + : total_num_items(total_num_items) + {} + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void advance(TotalNumItemsT, bool) {}; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT input_offset() const + { + return TotalNumItemsT{0}; + }; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() const + { + return TotalNumItemsT{0}; + }; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected() const + { + return TotalNumItemsT{0}; + }; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items() const + { + return total_num_items; + }; + + template + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void + update_num_selected(NumSelectedIteratorT user_num_selected_out_it, OffsetT num_selections) const + { + *user_num_selected_out_it = num_selections; + } +}; + /** * @brief Wrapper that partially specializes the `AgentSelectIf` on the non-type name parameter `KeepRejects`. */ @@ -400,7 +444,16 @@ struct DispatchSelectIf : SelectedPolicy using num_total_items_t = OffsetT; // Type used to provide streaming information about each partition's context - using streaming_context_t = detail::select::streaming_context_t; + static constexpr per_partition_offset_t const partition_size = + cuda::std::numeric_limits::max(); + + // If the values representable by OffsetT exceed the partition_size, we use a kernel template specialization that + // supports streaming (i.e., splitting the input into partitions of up to partition_size number of items) + static constexpr bool may_require_streaming = + (static_cast<::cuda::std::uint64_t>(partition_size) + < static_cast<::cuda::std::uint64_t>(cuda::std::numeric_limits::max())); + + using streaming_context_t = detail::select::streaming_context_t; using ScanTileStateT = ScanTileState; @@ -542,9 +595,11 @@ struct DispatchSelectIf : SelectedPolicy constexpr auto tile_size = static_cast(block_threads * items_per_thread); // The maximum number of items for which we will ever invoke the kernel (i.e. largest partition size) + // The extra check of may_require_streaming ensures that OffsetT is larger than per_partition_offset_t to avoid + // truncation of partition_size auto const max_partition_size = - num_items > static_cast(cuda::std::numeric_limits::max()) - ? static_cast(cuda::std::numeric_limits::max()) + (may_require_streaming && num_items > static_cast(partition_size)) + ? static_cast(partition_size) : num_items; // The number of partitions required to "iterate" over the total input From 2781775512855fdee53dd25d3e5e116545b4eb53 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 17 Sep 2024 22:34:42 -0700 Subject: [PATCH 19/41] removes superfluous template parameter --- cub/cub/agent/agent_select_if.cuh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index ebfe03dc1c2..e3a450a4a83 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -517,7 +517,7 @@ struct AgentSelectIf /** * Scatter flagged items to output offsets (specialized for direct scattering). */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterSelectedDirect( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], @@ -556,7 +556,7 @@ struct AgentSelectIf * @param is_keep_rejects * Marker type indicating whether to keep rejected items in the second partition */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterSelectedTwoPhase( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], @@ -604,7 +604,7 @@ struct AgentSelectIf * @param num_selections * Total number of selections including this tile */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void Scatter( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], @@ -620,12 +620,12 @@ struct AgentSelectIf // greater than one if (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS)) { - ScatterSelectedTwoPhase( + ScatterSelectedTwoPhase( items, selection_flags, selection_indices, num_tile_selections, num_selections_prefix); } else { - ScatterSelectedDirect(items, selection_flags, selection_indices, num_selections); + ScatterSelectedDirect(items, selection_flags, selection_indices, num_selections); } } @@ -648,7 +648,7 @@ struct AgentSelectIf * @param is_keep_rejects * Marker type indicating whether to keep rejected items in the second partition */ - template + template _CCCL_DEVICE _CCCL_FORCEINLINE void Scatter( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], @@ -833,7 +833,7 @@ struct AgentSelectIf } // Scatter flagged items - Scatter( + Scatter( items, selection_flags, selection_indices, @@ -915,7 +915,7 @@ struct AgentSelectIf // previous tiles' input items, in case of in-place compaction), because this is implicitly ensured through // execution dependency: The scatter stage requires the offset from the prefix-sum and it can only know the // prefix-sum after having read that from the decoupled look-back. Scatter flagged items - Scatter( + Scatter( items, selection_flags, selection_indices, From 73736960e47acec31659e44640bbb89220feec81 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 17 Sep 2024 22:55:55 -0700 Subject: [PATCH 20/41] adds test for different offset types for partition::if & ::flagged --- .../catch2_test_device_partition_flagged.cu | 29 ++++++++++--------- cub/test/catch2_test_device_partition_if.cu | 29 ++++++++++--------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu index 7cd5e6d10bb..4e8f04bdecf 100644 --- a/cub/test/catch2_test_device_partition_flagged.cu +++ b/cub/test/catch2_test_device_partition_flagged.cu @@ -91,6 +91,9 @@ using all_types = using types = c2h::type_list>; +// List of offset types to be used for testing large number of items +using offset_types = c2h::type_list; + CUB_TEST("DevicePartition::Flagged can run with empty input", "[device][partition_flagged]", types) { using type = typename c2h::get<0, TestType>; @@ -378,24 +381,22 @@ CUB_TEST("DevicePartition::Flagged works with different output type", "[device][ REQUIRE(reference == out); } -CUB_TEST("DevicePartition::Flagged works for very large number of items", "[device][partition_flagged]") +CUB_TEST("DevicePartition::Flagged works for very large number of items", "[device][partition_flagged]", offset_types) try { using type = std::int64_t; - using offset_t = std::int64_t; - - // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary - constexpr auto max_partition_size = static_cast(::cuda::std::numeric_limits::max()); - + using offset_t = typename c2h::get<0, TestType>; + + auto num_items_max_ull = + std::min(static_cast(::cuda::std::numeric_limits::max()), + ::cuda::std::numeric_limits::max() + static_cast(2000000ULL)); + offset_t num_items_max = static_cast(num_items_max_ull); + offset_t num_items_min = + num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; offset_t num_items = GENERATE_COPY( - values({ - offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions - offset_t{2} * max_partition_size, // 2 partitions - max_partition_size + offset_t{1}, // 2 partitions - max_partition_size, // 1 partitions - max_partition_size - offset_t{1} // 1 partitions - }), - take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000}))); + values( + {num_items_max, static_cast(num_items_max - 1), static_cast(1), static_cast(3)}), + take(2, random(num_items_min, num_items_max))); // We select the first items and reject the rest const offset_t cut_off_index = num_items / 4; diff --git a/cub/test/catch2_test_device_partition_if.cu b/cub/test/catch2_test_device_partition_if.cu index 7c3c778584d..a301ee6061e 100644 --- a/cub/test/catch2_test_device_partition_if.cu +++ b/cub/test/catch2_test_device_partition_if.cu @@ -81,6 +81,9 @@ using all_types = using types = c2h:: type_list>; +// List of offset types to be used for testing large number of items +using offset_types = c2h::type_list; + CUB_TEST("DevicePartition::If can run with empty input", "[device][partition_if]", types) { using type = typename c2h::get<0, TestType>; @@ -300,24 +303,22 @@ CUB_TEST("DevicePartition::If works with a different output type", "[device][par REQUIRE(reference == out); } -CUB_TEST("DevicePartition::If works for very large number of items", "[device][partition_if]") +CUB_TEST("DevicePartition::If works for very large number of items", "[device][partition_if]", offset_types) try { using type = std::int64_t; - using offset_t = std::int64_t; - - // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary - constexpr auto max_partition_size = static_cast(::cuda::std::numeric_limits::max()); - + using offset_t = typename c2h::get<0, TestType>; + + auto num_items_max_ull = + std::min(static_cast(::cuda::std::numeric_limits::max()), + ::cuda::std::numeric_limits::max() + static_cast(2000000ULL)); + offset_t num_items_max = static_cast(num_items_max_ull); + offset_t num_items_min = + num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; offset_t num_items = GENERATE_COPY( - values({ - offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions - offset_t{2} * max_partition_size, // 2 partitions - max_partition_size + offset_t{1}, // 2 partitions - max_partition_size, // 1 partitions - max_partition_size - offset_t{1} // 1 partitions - }), - take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000}))); + values( + {num_items_max, static_cast(num_items_max - 1), static_cast(1), static_cast(3)}), + take(2, random(num_items_min, num_items_max))); auto in = thrust::make_counting_iterator(offset_t{0}); From 687fb9978f29be0faa20fdea507182f72594425d Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 18 Sep 2024 02:14:07 -0700 Subject: [PATCH 21/41] adds tests and support for streaming select::unique --- cub/cub/agent/agent_select_if.cuh | 4 +- cub/cub/device/device_select.cuh | 25 +++-- .../device/dispatch/dispatch_select_if.cuh | 10 ++ cub/test/catch2_test_device_select_unique.cu | 101 ++++++++++++++++++ 4 files changed, 132 insertions(+), 8 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index e3a450a4a83..42e4869318d 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -477,7 +477,7 @@ struct AgentSelectIf OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { - if (IS_FIRST_TILE) + if (IS_FIRST_TILE && streaming_context.is_first_partition()) { CTA_SYNC(); @@ -489,7 +489,7 @@ struct AgentSelectIf InputT tile_predecessor; if (threadIdx.x == 0) { - tile_predecessor = d_in[tile_offset - 1]; + tile_predecessor = d_in[tile_offset + streaming_context.input_offset() - 1]; } CTA_SYNC(); diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index e24526cd821..279878d048f 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -950,6 +950,9 @@ struct DeviceSelect //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -974,23 +977,32 @@ struct DeviceSelect //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Unique"); - using OffsetT = ::cuda::std::int64_t; // Signed integer type for global offsets + using ChooseOffsetT = detail::choose_signed_offset; + using OffsetT = typename ChooseOffsetT::type; // Signed integer type for global offsets using FlagIterator = NullType*; // FlagT iterator type (not used) using SelectOp = NullType; // Selection op (not used) using EqualityOp = Equality; // Default == operator + // Check if the number of items exceeds the range covered by the selected signed offset type + cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items); + if (error) + { + return error; + } + return DispatchSelectIf< InputIteratorT, FlagIterator, @@ -1012,20 +1024,21 @@ struct DeviceSelect } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - return Unique( + return Unique( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream); } #endif // DOXYGEN_SHOULD_SKIP_THIS diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 70edebbfc65..926d6e9caac 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -111,6 +111,11 @@ public: return first_partition ? TotalNumItemsT{0} : total_previous_num_items; }; + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT is_first_partition() const + { + return first_partition; + }; + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() const { return first_partition ? TotalNumItemsT{0} : *d_num_selected_in; @@ -162,6 +167,11 @@ public: return TotalNumItemsT{0}; }; + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT is_first_partition() const + { + return true; + }; + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() const { return TotalNumItemsT{0}; diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu index 51c6200c624..a5397457438 100644 --- a/cub/test/catch2_test_device_select_unique.cu +++ b/cub/test/catch2_test_device_select_unique.cu @@ -30,11 +30,14 @@ #include +#include #include #include +#include #include +#include "catch2_test_device_select_common.cuh" #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" @@ -97,6 +100,9 @@ using all_types = using types = c2h::type_list; +// List of offset types to be used for testing large number of items +using offset_types = c2h::type_list; + CUB_TEST("DeviceSelect::Unique can run with empty input", "[device][select_unique]", types) { using type = typename c2h::get<0, TestType>; @@ -264,3 +270,98 @@ CUB_TEST("DeviceSelect::Unique works with a different output type", "[device][se reference.resize(num_selected_out[0]); REQUIRE(reference == out); } + +CUB_TEST("DeviceSelect::Unique works for very large number of items", "[device][select_unique]", offset_types) +try +{ + using type = std::int64_t; + using offset_t = typename c2h::get<0, TestType>; + + auto num_items_max_ull = + std::min(static_cast(::cuda::std::numeric_limits::max()), + ::cuda::std::numeric_limits::max() + static_cast(2000000ULL)); + offset_t num_items_max = static_cast(num_items_max_ull); + offset_t num_items_min = + num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; + offset_t num_items = GENERATE_COPY( + values( + {num_items_max, static_cast(num_items_max - 1), static_cast(1), static_cast(3)}), + take(2, random(num_items_min, num_items_max))); + + // All unique + SECTION("AllUnique") + { + auto in = thrust::make_counting_iterator(offset_t{0}); + + // Prepare tabulate output iterator to verify results in a memory-efficient way: + // We use a tabulate iterator that checks whenever the algorithm writes an output whether that item + // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index. + static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t); + c2h::device_vector correctness_flags(cub::DivideAndRoundUp(num_items, bits_per_element)); + auto expected_result_it = in; + auto check_result_op = + make_checking_write_op(expected_result_it, thrust::raw_pointer_cast(correctness_flags.data())); + auto check_result_it = thrust::make_tabulate_output_iterator(check_result_op); + + // Needs to be device accessible + c2h::device_vector num_selected_out(1, 0); + offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + + // Run test + select_unique(in, check_result_it, d_first_num_selected_out, num_items); + + // Ensure that we created the correct output + REQUIRE(num_selected_out[0] == num_items); + bool all_results_correct = thrust::equal( + correctness_flags.cbegin(), + correctness_flags.cbegin() + (num_items / bits_per_element), + thrust::make_constant_iterator(0xFFFFFFFFU)); + REQUIRE(all_results_correct == true); + if (num_items % bits_per_element != 0) + { + std::uint32_t last_element_flags = (0x00000001U << (num_items % bits_per_element)) - 0x01U; + REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags); + } + } + + // All the same -> single unique + SECTION("AllSame") + { + auto in = thrust::make_constant_iterator(offset_t{0}); + constexpr offset_t expected_num_unique{1}; + + // Prepare tabulate output iterator to verify results in a memory-efficient way: + // We use a tabulate iterator that checks whenever the algorithm writes an output whether that item + // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index. + static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t); + c2h::device_vector correctness_flags(cub::DivideAndRoundUp(expected_num_unique, bits_per_element)); + auto expected_result_it = in; + auto check_result_op = + make_checking_write_op(expected_result_it, thrust::raw_pointer_cast(correctness_flags.data())); + auto check_result_it = thrust::make_tabulate_output_iterator(check_result_op); + + // Needs to be device accessible + c2h::device_vector num_selected_out(1, 0); + offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + + // Run test + select_unique(in, check_result_it, d_first_num_selected_out, num_items); + + // Ensure that we created the correct output + REQUIRE(num_selected_out[0] == expected_num_unique); + bool all_results_correct = thrust::equal( + correctness_flags.cbegin(), + correctness_flags.cbegin() + (expected_num_unique / bits_per_element), + thrust::make_constant_iterator(0xFFFFFFFFU)); + REQUIRE(all_results_correct == true); + if (expected_num_unique % bits_per_element != 0) + { + std::uint32_t last_element_flags = (0x00000001U << (expected_num_unique % bits_per_element)) - 0x01U; + REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags); + } + } +} +catch (std::bad_alloc&) +{ + // Exceeding memory is not a failure. +} From 0e3f602bcf94ca267b5740d35bbea45dfd779dff Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 18 Sep 2024 05:32:33 -0700 Subject: [PATCH 22/41] fixes msvc warning --- cub/test/catch2_test_device_select_unique.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu index a5397457438..fa015a50a27 100644 --- a/cub/test/catch2_test_device_select_unique.cu +++ b/cub/test/catch2_test_device_select_unique.cu @@ -354,11 +354,14 @@ try correctness_flags.cbegin() + (expected_num_unique / bits_per_element), thrust::make_constant_iterator(0xFFFFFFFFU)); REQUIRE(all_results_correct == true); - if (expected_num_unique % bits_per_element != 0) + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4127) /* conditional expression is constant */ + _CCCL_IF_CONSTEXPR (expected_num_unique % bits_per_element != 0) { std::uint32_t last_element_flags = (0x00000001U << (expected_num_unique % bits_per_element)) - 0x01U; REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags); } + _CCCL_DIAG_POP } } catch (std::bad_alloc&) From 305b3073bb26dc7cc6b257c35c544934f27ab45b Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 18 Sep 2024 09:05:43 -0700 Subject: [PATCH 23/41] fixes perf for partition --- cub/cub/agent/agent_select_if.cuh | 38 ++++++++++++++++++- .../device/dispatch/dispatch_select_if.cuh | 16 +++----- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 42e4869318d..5618265baa7 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -730,7 +730,41 @@ struct AgentSelectIf * iterator, where selected items are written in order from the beginning of the itereator and rejected items are * writtem from the iterators end backwards. */ - template + template ::type = 0> + _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( + int num_tile_items, + int tile_num_rejections, + OffsetT num_selections_prefix, + OffsetT num_rejected_prefix, + PartitionedOutputItT partitioned_out_it) + { + using total_offset_t = typename StreamingContextT::total_num_items_t; + + total_offset_t total_rejected_prefix = + streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected() - num_rejected_prefix; + total_offset_t total_selected_prefix = + streaming_context.num_previously_selected() + static_cast(num_selections_prefix); + +#pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; + int rejection_idx = item_idx; + int selection_idx = item_idx - tile_num_rejections; + total_offset_t scatter_rejected_index = total_rejected_prefix - rejection_idx - 1; + total_offset_t scatter_selected_index = total_selected_prefix + selection_idx; + total_offset_t scatter_offset = + (item_idx < tile_num_rejections) ? scatter_rejected_index : scatter_selected_index; + + InputT item = temp_storage.raw_exchange.Alias()[item_idx]; + if (!IS_LAST_TILE || (item_idx < num_tile_items)) + { + partitioned_out_it[scatter_offset] = item; + } + } + } + + template 1), int>::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( int num_tile_items, int tile_num_rejections, @@ -740,7 +774,7 @@ struct AgentSelectIf { using total_offset_t = typename StreamingContextT::total_num_items_t; - total_offset_t _num_rejected_prefix = streaming_context.num_total_items() - streaming_context.num_previously_rejected(); + total_offset_t _num_rejected_prefix = streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected(); _num_rejected_prefix -= static_cast(num_rejected_prefix + 1); total_offset_t _num_selections_prefix = streaming_context.num_previously_selected() + static_cast(num_selections_prefix); diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 926d6e9caac..07f8ed3049b 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -126,7 +126,8 @@ public: return first_partition ? TotalNumItemsT{0} : (total_previous_num_items - num_previously_selected()); }; - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items() const + template + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items(OffsetT) const { return total_num_items; }; @@ -149,16 +150,10 @@ public: template class streaming_context_t { -private: - TotalNumItemsT total_num_items{}; - public: using total_num_items_t = TotalNumItemsT; - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE - streaming_context_t(TotalNumItemsT*, TotalNumItemsT*, TotalNumItemsT total_num_items, bool) - : total_num_items(total_num_items) - {} + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE streaming_context_t(TotalNumItemsT*, TotalNumItemsT*, TotalNumItemsT, bool) {} _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void advance(TotalNumItemsT, bool) {}; @@ -182,9 +177,10 @@ public: return TotalNumItemsT{0}; }; - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items() const + template + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items(OffsetT num_partition_items) const { - return total_num_items; + return num_partition_items; }; template From db36ae9c5278fedfa3bb188805c2f64fac5e3f45 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 18 Sep 2024 13:32:52 -0700 Subject: [PATCH 24/41] fixes format --- cub/cub/agent/agent_select_if.cuh | 38 +++++++++++-------- cub/cub/device/device_select.cuh | 12 +++--- .../device/dispatch/dispatch_select_if.cuh | 4 +- cub/test/catch2_test_device_select_if.cu | 4 +- 4 files changed, 31 insertions(+), 27 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 5618265baa7..2c88e2a5e0a 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -730,7 +730,10 @@ struct AgentSelectIf * iterator, where selected items are written in order from the beginning of the itereator and rejected items are * writtem from the iterators end backwards. */ - template ::type = 0> + template ::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( int num_tile_items, int tile_num_rejections, @@ -764,7 +767,10 @@ struct AgentSelectIf } } - template 1), int>::type = 0> + template 1), int>::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( int num_tile_items, int tile_num_rejections, @@ -772,26 +778,26 @@ struct AgentSelectIf OffsetT num_rejected_prefix, PartitionedOutputItT partitioned_out_it) { - using total_offset_t = typename StreamingContextT::total_num_items_t; - - total_offset_t _num_rejected_prefix = streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected(); + using total_offset_t = typename StreamingContextT::total_num_items_t; + + total_offset_t _num_rejected_prefix = + streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected(); _num_rejected_prefix -= static_cast(num_rejected_prefix + 1); - total_offset_t _num_selections_prefix = streaming_context.num_previously_selected() + static_cast(num_selections_prefix); - + total_offset_t _num_selections_prefix = + streaming_context.num_previously_selected() + static_cast(num_selections_prefix); + #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; - InputT item = temp_storage.raw_exchange.Alias()[item_idx]; - - int rejection_idx = item_idx; - int selection_idx = item_idx - tile_num_rejections; + { + int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; + InputT item = temp_storage.raw_exchange.Alias()[item_idx]; + + int rejection_idx = item_idx; + int selection_idx = item_idx - tile_num_rejections; total_offset_t scatter_rejected_index = _num_rejected_prefix - static_cast(rejection_idx); total_offset_t scatter_selected_index = _num_selections_prefix + static_cast(selection_idx); total_offset_t scatter_offset = - (item_idx < tile_num_rejections) - ? scatter_rejected_index - : scatter_selected_index; + (item_idx < tile_num_rejections) ? scatter_rejected_index : scatter_selected_index; if (!IS_LAST_TILE || (item_idx < num_tile_items)) { diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 279878d048f..dea2776eb62 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -977,8 +977,7 @@ struct DeviceSelect //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique( void* d_temp_storage, size_t& temp_storage_bytes, @@ -992,9 +991,9 @@ struct DeviceSelect using ChooseOffsetT = detail::choose_signed_offset; using OffsetT = typename ChooseOffsetT::type; // Signed integer type for global offsets - using FlagIterator = NullType*; // FlagT iterator type (not used) - using SelectOp = NullType; // Selection op (not used) - using EqualityOp = Equality; // Default == operator + using FlagIterator = NullType*; // FlagT iterator type (not used) + using SelectOp = NullType; // Selection op (not used) + using EqualityOp = Equality; // Default == operator // Check if the number of items exceeds the range covered by the selected signed offset type cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items); @@ -1024,8 +1023,7 @@ struct DeviceSelect } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique( void* d_temp_storage, size_t& temp_storage_bytes, diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 07f8ed3049b..7c4ebd3a4a2 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -130,7 +130,7 @@ public: _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items(OffsetT) const { return total_num_items; - }; + } template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void @@ -181,7 +181,7 @@ public: _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_total_items(OffsetT num_partition_items) const { return num_partition_items; - }; + } template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void diff --git a/cub/test/catch2_test_device_select_if.cu b/cub/test/catch2_test_device_select_if.cu index 3f7cc402b93..6eb1219cae7 100644 --- a/cub/test/catch2_test_device_select_if.cu +++ b/cub/test/catch2_test_device_select_if.cu @@ -343,8 +343,8 @@ try offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); // Run test - constexpr offset_t match_every_nth = 1000000; - offset_t expected_num_copied = (num_items + match_every_nth - offset_t{1}) / match_every_nth; + constexpr offset_t match_every_nth = 1000000; + offset_t expected_num_copied = (num_items + match_every_nth - offset_t{1}) / match_every_nth; c2h::device_vector out(expected_num_copied); select_if( in, out.begin(), d_first_num_selected_out, num_items, mod_n{static_cast(match_every_nth)}); From 5f38f5e89e9baf351eb31c43964018ebfe50cb77 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 18 Sep 2024 20:39:18 -0700 Subject: [PATCH 25/41] fixes mixup for partition perf fix --- cub/cub/agent/agent_select_if.cuh | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 2c88e2a5e0a..61020afa1f7 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -732,14 +732,14 @@ struct AgentSelectIf */ template ::type = 0> - _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( - int num_tile_items, - int tile_num_rejections, - OffsetT num_selections_prefix, - OffsetT num_rejected_prefix, - PartitionedOutputItT partitioned_out_it) + typename _InputT = InputT, + typename ::cuda::std::enable_if < sizeof(_InputT) > 1, + int > ::type = 0 > _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( + int num_tile_items, + int tile_num_rejections, + OffsetT num_selections_prefix, + OffsetT num_rejected_prefix, + PartitionedOutputItT partitioned_out_it) { using total_offset_t = typename StreamingContextT::total_num_items_t; @@ -769,8 +769,8 @@ struct AgentSelectIf template 1), int>::type = 0> + typename _InputT = InputT, + typename ::cuda::std::enable_if<(sizeof(_InputT) == 1), int>::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( int num_tile_items, int tile_num_rejections, From 16a2a75333c2cd5f11ecd020b3b4e5da3659a5e1 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 18 Sep 2024 20:52:49 -0700 Subject: [PATCH 26/41] fixes syntax --- cub/cub/agent/agent_select_if.cuh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 61020afa1f7..4db3cf4dee5 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -732,14 +732,14 @@ struct AgentSelectIf */ template 1, - int > ::type = 0 > _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( - int num_tile_items, - int tile_num_rejections, - OffsetT num_selections_prefix, - OffsetT num_rejected_prefix, - PartitionedOutputItT partitioned_out_it) + typename _InputT = InputT, + typename ::cuda::std::enable_if<(sizeof(_InputT) > 1), int>::type = 0> + _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( + int num_tile_items, + int tile_num_rejections, + OffsetT num_selections_prefix, + OffsetT num_rejected_prefix, + PartitionedOutputItT partitioned_out_it) { using total_offset_t = typename StreamingContextT::total_num_items_t; @@ -769,7 +769,7 @@ struct AgentSelectIf template ::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( int num_tile_items, From be83ebef5f45f8d4af8f77d41668da53db38d166 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 18 Sep 2024 21:20:31 -0700 Subject: [PATCH 27/41] fixes partition:flagged perf --- cub/cub/agent/agent_select_if.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 4db3cf4dee5..8397a96ddd4 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -732,8 +732,8 @@ struct AgentSelectIf */ template 1), int>::type = 0> + typename _InputT = InputT, + typename ::cuda::std::enable_if::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( int num_tile_items, int tile_num_rejections, @@ -769,8 +769,8 @@ struct AgentSelectIf template ::type = 0> + typename _InputT = InputT, + typename ::cuda::std::enable_if<(sizeof(_InputT) == 1 && SELECT_METHOD == USE_SELECT_OP), int>::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( int num_tile_items, int tile_num_rejections, From d4816b6a62f5e451e24e8683c8e0897e10c9ed5c Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 19 Sep 2024 13:07:46 -0700 Subject: [PATCH 28/41] fixes perf for partition::flagged --- cub/cub/agent/agent_select_if.cuh | 48 +++---------------------------- 1 file changed, 4 insertions(+), 44 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 8397a96ddd4..df0d663b9e1 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -730,10 +730,7 @@ struct AgentSelectIf * iterator, where selected items are written in order from the beginning of the itereator and rejected items are * writtem from the iterators end backwards. */ - template ::type = 0> + template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( int num_tile_items, int tile_num_rejections, @@ -767,45 +764,6 @@ struct AgentSelectIf } } - template ::type = 0> - _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( - int num_tile_items, - int tile_num_rejections, - OffsetT num_selections_prefix, - OffsetT num_rejected_prefix, - PartitionedOutputItT partitioned_out_it) - { - using total_offset_t = typename StreamingContextT::total_num_items_t; - - total_offset_t _num_rejected_prefix = - streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected(); - _num_rejected_prefix -= static_cast(num_rejected_prefix + 1); - total_offset_t _num_selections_prefix = - streaming_context.num_previously_selected() + static_cast(num_selections_prefix); - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; - InputT item = temp_storage.raw_exchange.Alias()[item_idx]; - - int rejection_idx = item_idx; - int selection_idx = item_idx - tile_num_rejections; - total_offset_t scatter_rejected_index = _num_rejected_prefix - static_cast(rejection_idx); - total_offset_t scatter_selected_index = _num_selections_prefix + static_cast(selection_idx); - total_offset_t scatter_offset = - (item_idx < tile_num_rejections) ? scatter_rejected_index : scatter_selected_index; - - if (!IS_LAST_TILE || (item_idx < num_tile_items)) - { - partitioned_out_it[scatter_offset] = item; - } - } - } - //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- @@ -1025,7 +983,9 @@ struct AgentSelectIf auto tile_state_wrapper = MemoryOrderedTileStateT{tile_state}; // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = blockIdx.x; + // TODO (elstehle): replacing this term with just `blockIdx.x` degrades perf for partition. Once we get to re-tune + // the algorithm, we want to replace this term with `blockIdx.x` + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index OffsetT tile_offset = static_cast(tile_idx) * static_cast(TILE_ITEMS); if (tile_idx < num_tiles - 1) From 3c9788d13962b346faabfd3cb27c9c0cd17255a6 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 19 Sep 2024 21:42:26 -0700 Subject: [PATCH 29/41] switches unique to always use i64 offset types --- cub/cub/device/device_select.cuh | 23 ++++------------ cub/test/catch2_test_device_select_unique.cu | 29 ++++++++++---------- 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index dea2776eb62..35428013574 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -950,9 +950,6 @@ struct DeviceSelect //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! - //! @tparam NumItemsT - //! **[inferred]** Type of num_items - //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -977,30 +974,22 @@ struct DeviceSelect //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - NumItemsT num_items, + ::cuda::std::int64_t num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Unique"); - using ChooseOffsetT = detail::choose_signed_offset; - using OffsetT = typename ChooseOffsetT::type; // Signed integer type for global offsets - using FlagIterator = NullType*; // FlagT iterator type (not used) - using SelectOp = NullType; // Selection op (not used) - using EqualityOp = Equality; // Default == operator - - // Check if the number of items exceeds the range covered by the selected signed offset type - cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items); - if (error) - { - return error; - } + using OffsetT = ::cuda::std::int64_t; + using FlagIterator = NullType*; // FlagT iterator type (not used) + using SelectOp = NullType; // Selection op (not used) + using EqualityOp = Equality; // Default == operator return DispatchSelectIf< InputIteratorT, diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu index fa015a50a27..9c49ce5a10f 100644 --- a/cub/test/catch2_test_device_select_unique.cu +++ b/cub/test/catch2_test_device_select_unique.cu @@ -100,9 +100,6 @@ using all_types = using types = c2h::type_list; -// List of offset types to be used for testing large number of items -using offset_types = c2h::type_list; - CUB_TEST("DeviceSelect::Unique can run with empty input", "[device][select_unique]", types) { using type = typename c2h::get<0, TestType>; @@ -271,22 +268,24 @@ CUB_TEST("DeviceSelect::Unique works with a different output type", "[device][se REQUIRE(reference == out); } -CUB_TEST("DeviceSelect::Unique works for very large number of items", "[device][select_unique]", offset_types) +CUB_TEST("DeviceSelect::Unique works for very large number of items", "[device][select_unique]") try { using type = std::int64_t; - using offset_t = typename c2h::get<0, TestType>; - - auto num_items_max_ull = - std::min(static_cast(::cuda::std::numeric_limits::max()), - ::cuda::std::numeric_limits::max() + static_cast(2000000ULL)); - offset_t num_items_max = static_cast(num_items_max_ull); - offset_t num_items_min = - num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; + using offset_t = std::int64_t; + + // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary + constexpr auto max_partition_size = static_cast(::cuda::std::numeric_limits::max()); + offset_t num_items = GENERATE_COPY( - values( - {num_items_max, static_cast(num_items_max - 1), static_cast(1), static_cast(3)}), - take(2, random(num_items_min, num_items_max))); + values({ + offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions + offset_t{2} * max_partition_size, // 2 partitions + max_partition_size + offset_t{1}, // 2 partitions + max_partition_size, // 1 partitions + max_partition_size - offset_t{1} // 1 partitions + }), + take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000}))); // All unique SECTION("AllUnique") From 49a50f5ac86df36e7ecdb1da5de80a682de6a9b6 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 19 Sep 2024 23:58:02 -0700 Subject: [PATCH 30/41] adds benchmark for partition with distinct iterators --- cub/benchmarks/bench/partition/flagged.cu | 35 +++++++++++++--- cub/benchmarks/bench/partition/if.cu | 51 +++++++++++++++++------ 2 files changed, 68 insertions(+), 18 deletions(-) diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index d881000701f..b3562ba8408 100644 --- a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -27,6 +27,10 @@ #include +#include + +#include + #include #include @@ -77,16 +81,32 @@ struct policy_hub_t }; #endif // TUNE_BASE -template -void flagged(nvbench::state& state, nvbench::type_list) +template +void init_output_partition_buffer( + FlagsItT d_flags, OffsetT num_items, T* d_out, cub::detail::partition_distinct_output_t& d_partition_out_buffer) +{ + const auto selected_elements = thrust::count(d_flags, d_flags + num_items, true); + d_partition_out_buffer = cub::detail::partition_distinct_output_t{d_out, d_out + selected_elements}; +} + +template +void init_output_partition_buffer(FlagsItT, OffsetT, T* d_out, T*& d_partition_out_buffer) +{ + d_partition_out_buffer = d_out; +} + +template +void flagged(nvbench::state& state, nvbench::type_list) { using input_it_t = const T*; using flag_it_t = const bool*; - using output_it_t = T*; using num_selected_it_t = OffsetT*; using select_op_t = cub::NullType; using equality_op_t = cub::NullType; using offset_t = OffsetT; + constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value; + using output_it_t = typename ::cuda::std:: + conditional, T*>::type; #if !TUNE_BASE using policy_t = policy_hub_t; @@ -127,8 +147,9 @@ void flagged(nvbench::state& state, nvbench::type_list) input_it_t d_in = thrust::raw_pointer_cast(in.data()); flag_it_t d_flags = thrust::raw_pointer_cast(flags.data()); - output_it_t d_out = thrust::raw_pointer_cast(out.data()); num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data()); + output_it_t d_out{}; + init_output_partition_buffer(flags.cbegin(), elements, thrust::raw_pointer_cast(out.data()), d_out); state.add_element_count(elements); state.add_global_memory_reads(elements); @@ -158,8 +179,10 @@ void flagged(nvbench::state& state, nvbench::type_list) }); } -NVBENCH_BENCH_TYPES(flagged, NVBENCH_TYPE_AXES(fundamental_types, offset_types)) +using distinct_partitions = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>; + +NVBENCH_BENCH_TYPES(flagged, NVBENCH_TYPE_AXES(fundamental_types, offset_types, distinct_partitions)) .set_name("base") - .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .set_type_axes_names({"T{ct}", "OffsetT{ct}", "DistinctPartitions{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu index fac7c8e5c36..5fc4f82f6d9 100644 --- a/cub/benchmarks/bench/partition/if.cu +++ b/cub/benchmarks/bench/partition/if.cu @@ -27,6 +27,10 @@ #include +#include + +#include + #include #include @@ -102,16 +106,36 @@ T value_from_entropy(double percentage) return static_cast(result); } -template -void partition(nvbench::state& state, nvbench::type_list) +template +void init_output_partition_buffer( + InItT d_in, + OffsetT num_items, + T* d_out, + SelectOpT select_op, + cub::detail::partition_distinct_output_t& d_partition_out_buffer) { - using input_it_t = const T*; - using flag_it_t = cub::NullType*; - using output_it_t = T*; - using num_selected_it_t = OffsetT*; - using select_op_t = less_then_t; - using equality_op_t = cub::NullType; - using offset_t = OffsetT; + const auto selected_elements = thrust::count_if(d_in, d_in + num_items, select_op); + d_partition_out_buffer = cub::detail::partition_distinct_output_t{d_out, d_out + selected_elements}; +} + +template +void init_output_partition_buffer(InItT, OffsetT, T* d_out, SelectOpT, T*& d_partition_out_buffer) +{ + d_partition_out_buffer = d_out; +} + +template +void partition(nvbench::state& state, nvbench::type_list) +{ + using input_it_t = const T*; + using flag_it_t = cub::NullType*; + using num_selected_it_t = OffsetT*; + using select_op_t = less_then_t; + using equality_op_t = cub::NullType; + using offset_t = OffsetT; + constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value; + using output_it_t = typename ::cuda::std:: + conditional, T*>::type; #if !TUNE_BASE using policy_t = policy_hub_t; @@ -153,8 +177,9 @@ void partition(nvbench::state& state, nvbench::type_list) input_it_t d_in = thrust::raw_pointer_cast(in.data()); flag_it_t d_flags = nullptr; - output_it_t d_out = thrust::raw_pointer_cast(out.data()); num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data()); + output_it_t d_out{}; + init_output_partition_buffer(in.cbegin(), elements, thrust::raw_pointer_cast(out.data()), select_op, d_out); state.add_element_count(elements); state.add_global_memory_reads(elements); @@ -183,8 +208,10 @@ void partition(nvbench::state& state, nvbench::type_list) }); } -NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types)) +using distinct_partitions = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>; + +NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types, distinct_partitions)) .set_name("base") - .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .set_type_axes_names({"T{ct}", "OffsetT{ct}", "DistinctPartitions{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); From 5b747b9f0ee1b430491f5aec5745bd0fead12121 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 20 Sep 2024 07:47:09 -0700 Subject: [PATCH 31/41] resolves merge conflicts --- cub/test/catch2_test_device_partition_flagged.cu | 4 +++- cub/test/catch2_test_device_partition_if.cu | 4 +++- cub/test/catch2_test_device_select_unique.cu | 6 ++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu index 4e8f04bdecf..0405dc010ee 100644 --- a/cub/test/catch2_test_device_partition_flagged.cu +++ b/cub/test/catch2_test_device_partition_flagged.cu @@ -39,6 +39,8 @@ #include #include +#include + #include #include "catch2_test_device_select_common.cuh" @@ -409,7 +411,7 @@ try // We use a tabulate iterator that checks whenever the partition algorithm writes an output whether that item // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index. static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t); - c2h::device_vector correctness_flags(cub::DivideAndRoundUp(num_items, bits_per_element)); + c2h::device_vector correctness_flags(::cuda::ceil_div(num_items, bits_per_element)); auto expected_selected_it = thrust::make_counting_iterator(offset_t{0}); auto expected_rejected_it = thrust::make_reverse_iterator( thrust::make_counting_iterator(offset_t{cut_off_index}) + (num_items - cut_off_index)); diff --git a/cub/test/catch2_test_device_partition_if.cu b/cub/test/catch2_test_device_partition_if.cu index a301ee6061e..5feab9de316 100644 --- a/cub/test/catch2_test_device_partition_if.cu +++ b/cub/test/catch2_test_device_partition_if.cu @@ -39,6 +39,8 @@ #include #include +#include + #include #include "catch2_test_device_select_common.cuh" @@ -329,7 +331,7 @@ try // We use a tabulate iterator that checks whenever the partition algorithm writes an output whether that item // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index. static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t); - c2h::device_vector correctness_flags(cub::DivideAndRoundUp(num_items, bits_per_element)); + c2h::device_vector correctness_flags(::cuda::ceil_div(num_items, bits_per_element)); auto expected_selected_it = thrust::make_counting_iterator(offset_t{0}); auto expected_rejected_it = thrust::make_reverse_iterator( thrust::make_counting_iterator(offset_t{cut_off_index}) + (num_items - cut_off_index)); diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu index 9c49ce5a10f..080134f73b1 100644 --- a/cub/test/catch2_test_device_select_unique.cu +++ b/cub/test/catch2_test_device_select_unique.cu @@ -35,6 +35,8 @@ #include #include +#include + #include #include "catch2_test_device_select_common.cuh" @@ -296,7 +298,7 @@ try // We use a tabulate iterator that checks whenever the algorithm writes an output whether that item // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index. static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t); - c2h::device_vector correctness_flags(cub::DivideAndRoundUp(num_items, bits_per_element)); + c2h::device_vector correctness_flags(::cuda::ceil_div(num_items, bits_per_element)); auto expected_result_it = in; auto check_result_op = make_checking_write_op(expected_result_it, thrust::raw_pointer_cast(correctness_flags.data())); @@ -333,7 +335,7 @@ try // We use a tabulate iterator that checks whenever the algorithm writes an output whether that item // corresponds to the expected value at that index and, if correct, sets a boolean flag at that index. static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t); - c2h::device_vector correctness_flags(cub::DivideAndRoundUp(expected_num_unique, bits_per_element)); + c2h::device_vector correctness_flags(::cuda::ceil_div(expected_num_unique, bits_per_element)); auto expected_result_it = in; auto check_result_op = make_checking_write_op(expected_result_it, thrust::raw_pointer_cast(correctness_flags.data())); From 54618155afb7a35838a5dfd4b7948cc25a8af96f Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 23 Sep 2024 10:34:16 -0700 Subject: [PATCH 32/41] fixes merge conflict --- cub/cub/device/dispatch/dispatch_select_if.cuh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 7c214457685..2210bba17cf 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -612,8 +612,7 @@ struct DispatchSelectIf : SelectedPolicy auto const num_partitions = ::cuda::ceil_div(num_items, max_partition_size); // The maximum number of tiles for which we will ever invoke the kernel - auto const max_num_tiles_per_invocation = - static_cast(::cuda::ceil_div(max_partition_size, tile_size)); + auto const max_num_tiles_per_invocation = static_cast(::cuda::ceil_div(max_partition_size, tile_size)); // The amount of virtual shared memory to allocate const auto vsmem_size = max_num_tiles_per_invocation * VsmemHelperT::vsmem_per_block; @@ -888,7 +887,7 @@ struct DispatchSelectIf : SelectedPolicy num_items, stream); } -}; #endif // DOXYGEN_SHOULD_SKIP_THIS +}; CUB_NAMESPACE_END From 18cd2d7777e9298c0fe672d4b18e330c237c8dd8 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 23 Sep 2024 10:45:42 -0700 Subject: [PATCH 33/41] makes sass identical to main for i32 partition --- cub/cub/agent/agent_select_if.cuh | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index df0d663b9e1..a5e7370aa56 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -740,21 +740,19 @@ struct AgentSelectIf { using total_offset_t = typename StreamingContextT::total_num_items_t; - total_offset_t total_rejected_prefix = - streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected() - num_rejected_prefix; - total_offset_t total_selected_prefix = - streaming_context.num_previously_selected() + static_cast(num_selections_prefix); - #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { - int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; - int rejection_idx = item_idx; - int selection_idx = item_idx - tile_num_rejections; - total_offset_t scatter_rejected_index = total_rejected_prefix - rejection_idx - 1; - total_offset_t scatter_selected_index = total_selected_prefix + selection_idx; + int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; + int rejection_idx = item_idx; + int selection_idx = item_idx - tile_num_rejections; total_offset_t scatter_offset = - (item_idx < tile_num_rejections) ? scatter_rejected_index : scatter_selected_index; + (item_idx < tile_num_rejections) + ? (streaming_context.num_total_items(num_items) - streaming_context.num_previously_rejected() + - static_cast(num_rejected_prefix) - static_cast(rejection_idx) + - total_offset_t{1}) + : (streaming_context.num_previously_selected() + static_cast(num_selections_prefix) + + static_cast(selection_idx)); InputT item = temp_storage.raw_exchange.Alias()[item_idx]; if (!IS_LAST_TILE || (item_idx < num_tile_items)) From 6ccdaf9d66b430228f562bae0d3d723dc5172afa Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 1 Oct 2024 10:16:27 -0700 Subject: [PATCH 34/41] updates thrust copy_if to always use i64 offset types --- thrust/thrust/system/cuda/detail/copy_if.h | 23 ++++++++-------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/thrust/thrust/system/cuda/detail/copy_if.h b/thrust/thrust/system/cuda/detail/copy_if.h index 19dd014f59a..034e850e0e2 100644 --- a/thrust/thrust/system/cuda/detail/copy_if.h +++ b/thrust/thrust/system/cuda/detail/copy_if.h @@ -219,19 +219,16 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if( cudaError_t status = cudaSuccess; size_t temp_storage_bytes = 0; - // 32-bit offset-type dispatch - using dispatch32_t = DispatchCopyIf; - // 64-bit offset-type dispatch + // Since https://github.com/NVIDIA/cccl/pull/2400, cub::DeviceSelect is using a streaming approach that splits up + // inputs larger than INT_MAX into partitions of up to `INT_MAX` items each, repeatedly invoking the respective + // algorithm. With that approach, we can always use i64 offset types for DispatchSelectIf, because there's only very + // limited performance upside for using i32 offset types. This avoids potentially duplicate kernel compilation. using dispatch64_t = DispatchCopyIf; // Query temporary storage requirements - THRUST_INDEX_TYPE_DISPATCH2( - status, - dispatch32_t::dispatch, - dispatch64_t::dispatch, - num_items, - (policy, nullptr, temp_storage_bytes, first, stencil, output, predicate, num_items_fixed)); + status = dispatch64_t::dispatch( + policy, nullptr, temp_storage_bytes, first, stencil, output, predicate, static_cast(num_items)); cuda_cub::throw_on_error(status, "copy_if failed on 1st step"); // Allocate temporary storage. @@ -239,12 +236,8 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if( void* temp_storage = static_cast(tmp.data().get()); // Run algorithm - THRUST_INDEX_TYPE_DISPATCH2( - status, - dispatch32_t::dispatch, - dispatch64_t::dispatch, - num_items, - (policy, temp_storage, temp_storage_bytes, first, stencil, output, predicate, num_items_fixed)); + dispatch64_t::dispatch( + policy, temp_storage, temp_storage_bytes, first, stencil, output, predicate, static_cast(num_items)); cuda_cub::throw_on_error(status, "copy_if failed on 2nd step"); return output; From 95de26f23d78c4d803dbfe48c2a38b8622268ec7 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 1 Oct 2024 10:17:00 -0700 Subject: [PATCH 35/41] fixes formatting --- cub/benchmarks/bench/partition/flagged.cu | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index b3562ba8408..ab2fd83dca7 100644 --- a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -83,7 +83,10 @@ struct policy_hub_t template void init_output_partition_buffer( - FlagsItT d_flags, OffsetT num_items, T* d_out, cub::detail::partition_distinct_output_t& d_partition_out_buffer) + FlagsItT d_flags, + OffsetT num_items, + T* d_out, + cub::detail::partition_distinct_output_t& d_partition_out_buffer) { const auto selected_elements = thrust::count(d_flags, d_flags + num_items, true); d_partition_out_buffer = cub::detail::partition_distinct_output_t{d_out, d_out + selected_elements}; @@ -98,12 +101,12 @@ void init_output_partition_buffer(FlagsItT, OffsetT, T* d_out, T*& d_partition_o template void flagged(nvbench::state& state, nvbench::type_list) { - using input_it_t = const T*; - using flag_it_t = const bool*; - using num_selected_it_t = OffsetT*; - using select_op_t = cub::NullType; - using equality_op_t = cub::NullType; - using offset_t = OffsetT; + using input_it_t = const T*; + using flag_it_t = const bool*; + using num_selected_it_t = OffsetT*; + using select_op_t = cub::NullType; + using equality_op_t = cub::NullType; + using offset_t = OffsetT; constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value; using output_it_t = typename ::cuda::std:: conditional, T*>::type; From 8750240c20e16d2f118f011572e52f9df551832a Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 3 Oct 2024 07:52:40 -0700 Subject: [PATCH 36/41] minor style improvements --- cub/cub/agent/agent_select_if.cuh | 3 +++ .../device/dispatch/dispatch_select_if.cuh | 14 +++++------ .../catch2_test_device_partition_flagged.cu | 10 +------- cub/test/catch2_test_device_partition_if.cu | 10 +------- cub/test/catch2_test_device_select_common.cuh | 25 ++++++++++++++++--- cub/test/catch2_test_device_select_unique.cu | 23 ++--------------- 6 files changed, 36 insertions(+), 49 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index a5e7370aa56..e765be68a18 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -342,6 +342,9 @@ struct AgentSelectIf * * @param num_items * Total number of input items + * + * @param streaming_context + * Context for the current partition */ _CCCL_DEVICE _CCCL_FORCEINLINE AgentSelectIf( TempStorage& temp_storage, diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 2210bba17cf..bff0252d357 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -457,7 +457,7 @@ struct DispatchSelectIf : SelectedPolicy // supports streaming (i.e., splitting the input into partitions of up to partition_size number of items) static constexpr bool may_require_streaming = (static_cast<::cuda::std::uint64_t>(partition_size) - < static_cast<::cuda::std::uint64_t>(cuda::std::numeric_limits::max())); + < static_cast<::cuda::std::uint64_t>(::cuda::std::numeric_limits::max())); using streaming_context_t = detail::select::streaming_context_t; @@ -674,7 +674,7 @@ struct DispatchSelectIf : SelectedPolicy error = CubDebug(tile_status.Init(current_num_tiles, allocations[0], allocation_sizes[0])); if (cudaSuccess != error) { - break; + return error; } // Log scan_init_kernel configuration @@ -695,14 +695,14 @@ struct DispatchSelectIf : SelectedPolicy error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { - break; + return error; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { - break; + return error; } // Log select_if_kernel configuration @@ -715,7 +715,7 @@ struct DispatchSelectIf : SelectedPolicy block_threads)); if (cudaSuccess != error) { - break; + return error; } _CubLog("Invoking select_if_kernel<<<%d, %d, 0, " @@ -747,14 +747,14 @@ struct DispatchSelectIf : SelectedPolicy error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { - break; + return error; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { - break; + return error; } // Prepare streaming context for next partition (swap double buffers, advance number of processed items, etc.) diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu index 0405dc010ee..2d2c1ecaa06 100644 --- a/cub/test/catch2_test_device_partition_flagged.cu +++ b/cub/test/catch2_test_device_partition_flagged.cu @@ -431,16 +431,8 @@ try // Ensure that we created the correct output REQUIRE(num_selected_out[0] == cut_off_index); - bool all_results_correct = thrust::equal( - correctness_flags.cbegin(), - correctness_flags.cbegin() + (num_items / bits_per_element), - thrust::make_constant_iterator(0xFFFFFFFFU)); + bool all_results_correct = are_all_flags_set(correctness_flags, num_items); REQUIRE(all_results_correct == true); - if (num_items % bits_per_element != 0) - { - std::uint32_t last_element_flags = (0x00000001U << (num_items % bits_per_element)) - 0x01U; - REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags); - } } catch (std::bad_alloc&) { diff --git a/cub/test/catch2_test_device_partition_if.cu b/cub/test/catch2_test_device_partition_if.cu index 5feab9de316..1db5774da42 100644 --- a/cub/test/catch2_test_device_partition_if.cu +++ b/cub/test/catch2_test_device_partition_if.cu @@ -352,16 +352,8 @@ try // Ensure that we created the correct output REQUIRE(num_selected_out[0] == cut_off_index); - bool all_results_correct = thrust::equal( - correctness_flags.cbegin(), - correctness_flags.cbegin() + (num_items / bits_per_element), - thrust::make_constant_iterator(0xFFFFFFFFU)); + bool all_results_correct = are_all_flags_set(correctness_flags, num_items); REQUIRE(all_results_correct == true); - if (num_items % bits_per_element != 0) - { - std::uint32_t last_element_flags = (0x00000001U << (num_items % bits_per_element)) - 0x01U; - REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags); - } } catch (std::bad_alloc&) { diff --git a/cub/test/catch2_test_device_select_common.cuh b/cub/test/catch2_test_device_select_common.cuh index 3ee93ee6cae..39f82d04612 100644 --- a/cub/test/catch2_test_device_select_common.cuh +++ b/cub/test/catch2_test_device_select_common.cuh @@ -3,8 +3,12 @@ #pragma once +#include + #include +#include "catch2_test_helper.h" + template struct less_than_t { @@ -70,7 +74,7 @@ struct index_to_expected_partition_op }; template -index_to_expected_partition_op make_index_to_expected_partition_op( +static index_to_expected_partition_op make_index_to_expected_partition_op( SelectedItT expected_selected_it, RejectedItT expected_rejected_it, std::int64_t expected_num_selected) { return index_to_expected_partition_op{ @@ -98,8 +102,23 @@ struct flag_correct_writes_op }; template -flag_correct_writes_op -make_checking_write_op(ExpectedValuesItT expected_it, std::uint32_t* d_correctness_flags) +flag_correct_writes_op static make_checking_write_op( + ExpectedValuesItT expected_it, std::uint32_t* d_correctness_flags) { return flag_correct_writes_op{expected_it, d_correctness_flags}; } + +static bool are_all_flags_set(c2h::device_vector& flag_vector, std::size_t num_flags_to_check) +{ + static constexpr auto bits_per_element = 8 * sizeof(std::uint32_t); + bool all_flags_set = thrust::equal( + flag_vector.cbegin(), + flag_vector.cbegin() + (num_flags_to_check / bits_per_element), + thrust::make_constant_iterator(0xFFFFFFFFU)); + if (num_flags_to_check % bits_per_element != 0) + { + std::uint32_t last_element_flags = (0x00000001U << (num_flags_to_check % bits_per_element)) - 0x01U; + all_flags_set = all_flags_set && (flag_vector[num_flags_to_check / bits_per_element] == last_element_flags); + } + return all_flags_set; +} diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu index 080134f73b1..47afc2fc2a7 100644 --- a/cub/test/catch2_test_device_select_unique.cu +++ b/cub/test/catch2_test_device_select_unique.cu @@ -313,16 +313,8 @@ try // Ensure that we created the correct output REQUIRE(num_selected_out[0] == num_items); - bool all_results_correct = thrust::equal( - correctness_flags.cbegin(), - correctness_flags.cbegin() + (num_items / bits_per_element), - thrust::make_constant_iterator(0xFFFFFFFFU)); + bool all_results_correct = are_all_flags_set(correctness_flags, num_items); REQUIRE(all_results_correct == true); - if (num_items % bits_per_element != 0) - { - std::uint32_t last_element_flags = (0x00000001U << (num_items % bits_per_element)) - 0x01U; - REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags); - } } // All the same -> single unique @@ -350,19 +342,8 @@ try // Ensure that we created the correct output REQUIRE(num_selected_out[0] == expected_num_unique); - bool all_results_correct = thrust::equal( - correctness_flags.cbegin(), - correctness_flags.cbegin() + (expected_num_unique / bits_per_element), - thrust::make_constant_iterator(0xFFFFFFFFU)); + bool all_results_correct = are_all_flags_set(correctness_flags, expected_num_unique); REQUIRE(all_results_correct == true); - _CCCL_DIAG_PUSH - _CCCL_DIAG_SUPPRESS_MSVC(4127) /* conditional expression is constant */ - _CCCL_IF_CONSTEXPR (expected_num_unique % bits_per_element != 0) - { - std::uint32_t last_element_flags = (0x00000001U << (expected_num_unique % bits_per_element)) - 0x01U; - REQUIRE(correctness_flags[correctness_flags.size() - 1] == last_element_flags); - } - _CCCL_DIAG_POP } } catch (std::bad_alloc&) From 8abd5e900e139ed854854620df4a6679e7fb29ce Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 8 Oct 2024 02:55:31 -0700 Subject: [PATCH 37/41] addresses review comments --- cub/cub/agent/agent_select_if.cuh | 1 + cub/cub/detail/choose_offset.cuh | 10 ++++------ cub/cub/device/device_select.cuh | 6 +++--- cub/cub/device/dispatch/dispatch_select_if.cuh | 7 +++---- thrust/thrust/system/cuda/detail/copy_if.h | 2 +- 5 files changed, 12 insertions(+), 14 deletions(-) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index e765be68a18..ff09051be7e 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -169,6 +169,7 @@ struct partition_distinct_output_t * @tparam StreamingContextT * Type providing the context information for the current partition, with the following member functions: * input_offset() -> base offset for the input (and flags) iterator + * is_first_partition() -> [Select::Unique-only] whether this is the first partition * num_previously_selected() -> base offset for the output iterator for selected items * num_previously_rejected() -> base offset for the output iterator for rejected items (partition only) * num_total_items() -> total number of items across all partitions (partition only) diff --git a/cub/cub/detail/choose_offset.cuh b/cub/cub/detail/choose_offset.cuh index 3e3b51f9504..b87b02f2fcb 100644 --- a/cub/cub/detail/choose_offset.cuh +++ b/cub/cub/detail/choose_offset.cuh @@ -29,8 +29,6 @@ #include -#include "cuda/std/__type_traits/is_unsigned.h" - #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -63,7 +61,7 @@ struct choose_offset "NumItemsT must be an integral type, but not bool"); // Unsigned integer type for global offsets. - using type = typename ::cuda::std::conditional::type; + using type = typename ::cuda::std::_If::type; }; /** @@ -86,7 +84,7 @@ struct promote_small_offset "NumItemsT must be an integral type, but not bool"); // Unsigned integer type for global offsets. - using type = typename ::cuda::std::conditional::type; + using type = typename ::cuda::std::_If::type; }; /** @@ -113,10 +111,10 @@ struct choose_signed_offset // uint32 -> int64, else // LEQ 4B -> int32, else // int64 - using type = typename ::cuda::std::conditional< + using type = typename ::cuda::std::_If< (::cuda::std::is_integral::value && ::cuda::std::is_unsigned::value), ::cuda::std::int64_t, - typename ::cuda::std::conditional<(sizeof(NumItemsT) <= 4), ::cuda::std::int32_t, ::cuda::std::int64_t>::type>::type; + typename ::cuda::std::_If<(sizeof(NumItemsT) <= 4), ::cuda::std::int32_t, ::cuda::std::int64_t>::type>::type; /** * Checks if the given num_items can be covered by the selected offset type. If not, returns cudaErrorInvalidValue, diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 35428013574..22c9380ebe1 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -1012,20 +1012,20 @@ struct DeviceSelect } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, - NumItemsT num_items, + ::cuda::std::int64_t num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - return Unique( + return Unique( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream); } #endif // DOXYGEN_SHOULD_SKIP_THIS diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index bff0252d357..f8cb1bc292c 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -116,12 +116,12 @@ public: return first_partition; }; - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() const + _CCCL_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected() const { return first_partition ? TotalNumItemsT{0} : *d_num_selected_in; }; - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected() const + _CCCL_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected() const { return first_partition ? TotalNumItemsT{0} : (total_previous_num_items - num_previously_selected()); }; @@ -450,8 +450,7 @@ struct DispatchSelectIf : SelectedPolicy using num_total_items_t = OffsetT; // Type used to provide streaming information about each partition's context - static constexpr per_partition_offset_t const partition_size = - cuda::std::numeric_limits::max(); + static constexpr per_partition_offset_t partition_size = ::cuda::std::numeric_limits::max(); // If the values representable by OffsetT exceed the partition_size, we use a kernel template specialization that // supports streaming (i.e., splitting the input into partitions of up to partition_size number of items) diff --git a/thrust/thrust/system/cuda/detail/copy_if.h b/thrust/thrust/system/cuda/detail/copy_if.h index 034e850e0e2..d8676a9749a 100644 --- a/thrust/thrust/system/cuda/detail/copy_if.h +++ b/thrust/thrust/system/cuda/detail/copy_if.h @@ -236,7 +236,7 @@ THRUST_RUNTIME_FUNCTION OutputIt copy_if( void* temp_storage = static_cast(tmp.data().get()); // Run algorithm - dispatch64_t::dispatch( + status = dispatch64_t::dispatch( policy, temp_storage, temp_storage_bytes, first, stencil, output, predicate, static_cast(num_items)); cuda_cub::throw_on_error(status, "copy_if failed on 2nd step"); From f8edfb47c4c5907b6c7531078af7e250db464449 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 8 Oct 2024 03:21:53 -0700 Subject: [PATCH 38/41] fixes conditional type usage --- cub/cub/detail/choose_offset.cuh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cub/cub/detail/choose_offset.cuh b/cub/cub/detail/choose_offset.cuh index b87b02f2fcb..7dc5c6fd6fb 100644 --- a/cub/cub/detail/choose_offset.cuh +++ b/cub/cub/detail/choose_offset.cuh @@ -61,7 +61,7 @@ struct choose_offset "NumItemsT must be an integral type, but not bool"); // Unsigned integer type for global offsets. - using type = typename ::cuda::std::_If::type; + using type = ::cuda::std::_If<(sizeof(NumItemsT) <= 4), std::uint32_t, unsigned long long>; }; /** @@ -84,7 +84,7 @@ struct promote_small_offset "NumItemsT must be an integral type, but not bool"); // Unsigned integer type for global offsets. - using type = typename ::cuda::std::_If::type; + using type = ::cuda::std::_If<(sizeof(NumItemsT) < 4), std::int32_t, NumItemsT>; }; /** @@ -111,10 +111,10 @@ struct choose_signed_offset // uint32 -> int64, else // LEQ 4B -> int32, else // int64 - using type = typename ::cuda::std::_If< - (::cuda::std::is_integral::value && ::cuda::std::is_unsigned::value), - ::cuda::std::int64_t, - typename ::cuda::std::_If<(sizeof(NumItemsT) <= 4), ::cuda::std::int32_t, ::cuda::std::int64_t>::type>::type; + using type = + ::cuda::std::_If<(::cuda::std::is_integral::value && ::cuda::std::is_unsigned::value), + ::cuda::std::int64_t, + ::cuda::std::_If<(sizeof(NumItemsT) <= 4), ::cuda::std::int32_t, ::cuda::std::int64_t>>; /** * Checks if the given num_items can be covered by the selected offset type. If not, returns cudaErrorInvalidValue, From fe112dab7dc6aa17900953bf1dfd655d2673d962 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 8 Oct 2024 06:08:01 -0700 Subject: [PATCH 39/41] makes tests on empty input more robust --- cub/test/catch2_test_device_partition_flagged.cu | 2 +- cub/test/catch2_test_device_partition_if.cu | 2 +- cub/test/catch2_test_device_select_flagged.cu | 2 +- cub/test/catch2_test_device_select_if.cu | 2 +- cub/test/catch2_test_device_select_unique.cu | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu index 2d2c1ecaa06..a5dfe079fa5 100644 --- a/cub/test/catch2_test_device_partition_flagged.cu +++ b/cub/test/catch2_test_device_partition_flagged.cu @@ -106,7 +106,7 @@ CUB_TEST("DevicePartition::Flagged can run with empty input", "[device][partitio c2h::device_vector flags(num_items); // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); + c2h::device_vector num_selected_out(1, 42); int* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); partition_flagged(in.begin(), flags.begin(), out.begin(), d_num_selected_out, num_items); diff --git a/cub/test/catch2_test_device_partition_if.cu b/cub/test/catch2_test_device_partition_if.cu index 1db5774da42..b6721b3c9e9 100644 --- a/cub/test/catch2_test_device_partition_if.cu +++ b/cub/test/catch2_test_device_partition_if.cu @@ -95,7 +95,7 @@ CUB_TEST("DevicePartition::If can run with empty input", "[device][partition_if] c2h::device_vector out(num_items); // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); + c2h::device_vector num_selected_out(1, 42); int* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); partition_if(in.begin(), out.begin(), d_num_selected_out, num_items, always_true_t{}); diff --git a/cub/test/catch2_test_device_select_flagged.cu b/cub/test/catch2_test_device_select_flagged.cu index 0ff05b3f00c..84dd38f0c3e 100644 --- a/cub/test/catch2_test_device_select_flagged.cu +++ b/cub/test/catch2_test_device_select_flagged.cu @@ -96,7 +96,7 @@ CUB_TEST("DeviceSelect::Flagged can run with empty input", "[device][select_flag c2h::device_vector flags(num_items); // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); + c2h::device_vector num_selected_out(1, 42); int* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); select_flagged(in.begin(), flags.begin(), out.begin(), d_num_selected_out, num_items); diff --git a/cub/test/catch2_test_device_select_if.cu b/cub/test/catch2_test_device_select_if.cu index 6eb1219cae7..4cd6a043700 100644 --- a/cub/test/catch2_test_device_select_if.cu +++ b/cub/test/catch2_test_device_select_if.cu @@ -100,7 +100,7 @@ CUB_TEST("DeviceSelect::If can run with empty input", "[device][select_if]", typ c2h::device_vector out(num_items); // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); + c2h::device_vector num_selected_out(1, 42); int* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); select_if(in.begin(), out.begin(), d_num_selected_out, num_items, always_true_t{}); diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu index 47afc2fc2a7..ddecf4664e6 100644 --- a/cub/test/catch2_test_device_select_unique.cu +++ b/cub/test/catch2_test_device_select_unique.cu @@ -111,7 +111,7 @@ CUB_TEST("DeviceSelect::Unique can run with empty input", "[device][select_uniqu c2h::device_vector out(num_items); // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); + c2h::device_vector num_selected_out(1, 42); int* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); select_unique(in.begin(), out.begin(), d_num_selected_out, num_items); From 9475d95373c2b13c50fbff4e86ea813724048425 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 8 Oct 2024 09:05:40 -0700 Subject: [PATCH 40/41] restores empty problem behaviour --- .../device/dispatch/dispatch_select_if.cuh | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index f8cb1bc292c..0ddfe163852 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -580,19 +580,6 @@ struct DispatchSelectIf : SelectedPolicy EqualityOpT, per_partition_offset_t, streaming_context_t>; - - // Return for empty problem (also needed to avoid division by zero) - // TODO(elstehle): In this case d_num_selected_out will never be written. Maybe we want to write it despite? - if (num_items == 0) - { - // If this was just to query temporary storage requirements, return non-empty bytes - if (d_temp_storage == nullptr) - { - temp_storage_bytes = std::size_t{1}; - } - return cudaSuccess; - } - cudaError error = cudaSuccess; constexpr auto block_threads = VsmemHelperT::agent_policy_t::BLOCK_THREADS; @@ -607,8 +594,9 @@ struct DispatchSelectIf : SelectedPolicy ? static_cast(partition_size) : num_items; - // The number of partitions required to "iterate" over the total input - auto const num_partitions = ::cuda::ceil_div(num_items, max_partition_size); + // The number of partitions required to "iterate" over the total input (ternary to avoid div-by-zero) + auto const num_partitions = + (max_partition_size == 0) ? static_cast(1) : ::cuda::ceil_div(num_items, max_partition_size); // The maximum number of tiles for which we will ever invoke the kernel auto const max_num_tiles_per_invocation = static_cast(::cuda::ceil_div(max_partition_size, tile_size)); @@ -704,6 +692,13 @@ struct DispatchSelectIf : SelectedPolicy return error; } + // No more items to process (note, we do not want to return early for num_items==0, because we need to make sure + // that `scan_init_kernel` has written '0' to d_num_selected_out) + if (current_num_items == 0) + { + return cudaSuccess; + } + // Log select_if_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG { From 6f0bd0013337c2a51ffcae920d099b6fe5afc09d Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 8 Oct 2024 09:05:47 -0700 Subject: [PATCH 41/41] adds comment on const ref --- cub/cub/agent/agent_select_if.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index ff09051be7e..ea2d1c24b90 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -316,6 +316,8 @@ struct AgentSelectIf InequalityWrapper inequality_op; ///< T inequality operator SelectOpT select_op; ///< Selection operator OffsetT num_items; ///< Total number of input items + + // Note: This is a const reference because we have seen double-digit percentage perf regressions otherwise const StreamingContextT& streaming_context; ///< Context for the current partition //---------------------------------------------------------------------