Skip to content

Commit

Permalink
Adds support for large number of items in DeviceSelect and `DeviceP…
Browse files Browse the repository at this point in the history
…artition` (#2400)

* adds streaming selection and partition

* ensures policy lookup uses per-partition offset type

* mitigates perf degradation on select

* makes device interfaces use i64 num_items

* updates select::if large num_items tests

* fixes syntax

* adds tests for large num_items for select::flagged

* adds tests for large num_items for partition::if

* adds tests for large num_items for partition::flagged

* fixes redundant definition

* fixes implicit conversion

* fixes f32 select::if perf regression

* fixes perf regression for partition

* fix feature macro

* fixes feature macro

* fixes feature macro

* silences msvc constant conditional warning

* add support for streamin ctx dummy for partition with small offset types

* removes superfluous template parameter

* adds test for different offset types for partition::if & ::flagged

* adds tests and support for streaming select::unique

* fixes msvc warning

* fixes perf for partition

* fixes format

* fixes mixup for partition perf fix

* fixes syntax

* fixes partition:flagged perf

* fixes perf for partition::flagged

* switches unique to always use i64 offset types

* adds benchmark for partition with distinct iterators

* resolves merge conflicts

* fixes merge conflict

* makes sass identical to main for i32 partition

* updates thrust copy_if to always use i64 offset types

* fixes formatting

* minor style improvements

* addresses review comments

* fixes conditional type usage

* makes tests on empty input more robust

* restores empty problem behaviour

* adds comment on const ref
  • Loading branch information
elstehle authored Oct 8, 2024
1 parent ee5dd3e commit 16f9a1a
Show file tree
Hide file tree
Showing 15 changed files with 1,047 additions and 361 deletions.
50 changes: 38 additions & 12 deletions cub/benchmarks/bench/partition/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@

#include <cub/device/device_partition.cuh>

#include <thrust/count.h>

#include <cuda/std/type_traits>

#include <look_back_helper.cuh>
#include <nvbench_helper.cuh>

Expand Down Expand Up @@ -77,16 +81,35 @@ struct policy_hub_t
};
#endif // TUNE_BASE

template <typename T, typename OffsetT>
void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT>)
template <typename FlagsItT, typename T, typename OffsetT>
void init_output_partition_buffer(
FlagsItT d_flags,
OffsetT num_items,
T* d_out,
cub::detail::partition_distinct_output_t<T*, T*>& d_partition_out_buffer)
{
using input_it_t = const T*;
using flag_it_t = const bool*;
using output_it_t = T*;
using num_selected_it_t = OffsetT*;
using select_op_t = cub::NullType;
using equality_op_t = cub::NullType;
using offset_t = OffsetT;
const auto selected_elements = thrust::count(d_flags, d_flags + num_items, true);
d_partition_out_buffer = cub::detail::partition_distinct_output_t<T*, T*>{d_out, d_out + selected_elements};
}

template <typename FlagsItT, typename T, typename OffsetT>
void init_output_partition_buffer(FlagsItT, OffsetT, T* d_out, T*& d_partition_out_buffer)
{
d_partition_out_buffer = d_out;
}

template <typename T, typename OffsetT, typename UseDistinctPartitionT>
void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPartitionT>)
{
using input_it_t = const T*;
using flag_it_t = const bool*;
using num_selected_it_t = OffsetT*;
using select_op_t = cub::NullType;
using equality_op_t = cub::NullType;
using offset_t = OffsetT;
constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value;
using output_it_t = typename ::cuda::std::
conditional<use_distinct_out_partitions, cub::detail::partition_distinct_output_t<T*, T*>, T*>::type;

#if !TUNE_BASE
using policy_t = policy_hub_t<T>;
Expand Down Expand Up @@ -127,8 +150,9 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT>)

input_it_t d_in = thrust::raw_pointer_cast(in.data());
flag_it_t d_flags = thrust::raw_pointer_cast(flags.data());
output_it_t d_out = thrust::raw_pointer_cast(out.data());
num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data());
output_it_t d_out{};
init_output_partition_buffer(flags.cbegin(), elements, thrust::raw_pointer_cast(out.data()), d_out);

state.add_element_count(elements);
state.add_global_memory_reads<T>(elements);
Expand Down Expand Up @@ -158,8 +182,10 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT>)
});
}

NVBENCH_BENCH_TYPES(flagged, NVBENCH_TYPE_AXES(fundamental_types, offset_types))
using distinct_partitions = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;

NVBENCH_BENCH_TYPES(flagged, NVBENCH_TYPE_AXES(fundamental_types, offset_types, distinct_partitions))
.set_name("base")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.set_type_axes_names({"T{ct}", "OffsetT{ct}", "DistinctPartitions{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
51 changes: 39 additions & 12 deletions cub/benchmarks/bench/partition/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@

#include <cub/device/device_partition.cuh>

#include <thrust/count.h>

#include <cuda/std/type_traits>

#include <look_back_helper.cuh>
#include <nvbench_helper.cuh>

Expand Down Expand Up @@ -102,16 +106,36 @@ T value_from_entropy(double percentage)
return static_cast<T>(result);
}

template <typename T, typename OffsetT>
void partition(nvbench::state& state, nvbench::type_list<T, OffsetT>)
template <typename InItT, typename T, typename OffsetT, typename SelectOpT>
void init_output_partition_buffer(
InItT d_in,
OffsetT num_items,
T* d_out,
SelectOpT select_op,
cub::detail::partition_distinct_output_t<T*, T*>& d_partition_out_buffer)
{
using input_it_t = const T*;
using flag_it_t = cub::NullType*;
using output_it_t = T*;
using num_selected_it_t = OffsetT*;
using select_op_t = less_then_t<T>;
using equality_op_t = cub::NullType;
using offset_t = OffsetT;
const auto selected_elements = thrust::count_if(d_in, d_in + num_items, select_op);
d_partition_out_buffer = cub::detail::partition_distinct_output_t<T*, T*>{d_out, d_out + selected_elements};
}

template <typename InItT, typename T, typename OffsetT, typename SelectOpT>
void init_output_partition_buffer(InItT, OffsetT, T* d_out, SelectOpT, T*& d_partition_out_buffer)
{
d_partition_out_buffer = d_out;
}

template <typename T, typename OffsetT, typename UseDistinctPartitionT>
void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPartitionT>)
{
using input_it_t = const T*;
using flag_it_t = cub::NullType*;
using num_selected_it_t = OffsetT*;
using select_op_t = less_then_t<T>;
using equality_op_t = cub::NullType;
using offset_t = OffsetT;
constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value;
using output_it_t = typename ::cuda::std::
conditional<use_distinct_out_partitions, cub::detail::partition_distinct_output_t<T*, T*>, T*>::type;

#if !TUNE_BASE
using policy_t = policy_hub_t<T>;
Expand Down Expand Up @@ -153,8 +177,9 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT>)

input_it_t d_in = thrust::raw_pointer_cast(in.data());
flag_it_t d_flags = nullptr;
output_it_t d_out = thrust::raw_pointer_cast(out.data());
num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data());
output_it_t d_out{};
init_output_partition_buffer(in.cbegin(), elements, thrust::raw_pointer_cast(out.data()), select_op, d_out);

state.add_element_count(elements);
state.add_global_memory_reads<T>(elements);
Expand Down Expand Up @@ -183,8 +208,10 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT>)
});
}

NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types))
using distinct_partitions = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;

NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types, distinct_partitions))
.set_name("base")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.set_type_axes_names({"T{ct}", "OffsetT{ct}", "DistinctPartitions{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
Loading

0 comments on commit 16f9a1a

Please sign in to comment.