Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds support for large number of items in DeviceSelect and DevicePartition #2400

Merged
merged 44 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
97f3eec
adds streaming selection and partition
elstehle Sep 9, 2024
3085f86
ensures policy lookup uses per-partition offset type
elstehle Sep 9, 2024
7f264ec
mitigates perf degradation on select
elstehle Sep 10, 2024
412ea3c
makes device interfaces use i64 num_items
elstehle Sep 10, 2024
70d562b
updates select::if large num_items tests
elstehle Sep 11, 2024
b274e1b
fixes syntax
elstehle Sep 11, 2024
d6b21ba
adds tests for large num_items for select::flagged
elstehle Sep 11, 2024
d26eb65
adds tests for large num_items for partition::if
elstehle Sep 11, 2024
1be152c
adds tests for large num_items for partition::flagged
elstehle Sep 11, 2024
466f915
fixes redundant definition
elstehle Sep 11, 2024
d0f6e7f
fixes implicit conversion
elstehle Sep 11, 2024
cea8509
fixes f32 select::if perf regression
elstehle Sep 16, 2024
96be1e8
fixes perf regression for partition
elstehle Sep 16, 2024
a117b45
fix feature macro
elstehle Sep 16, 2024
480da39
fixes feature macro
elstehle Sep 16, 2024
aaed489
fixes feature macro
elstehle Sep 16, 2024
3333738
silences msvc constant conditional warning
elstehle Sep 17, 2024
66ca8fd
add support for streaming ctx dummy for partition with small offset types
elstehle Sep 18, 2024
2781775
removes superfluous template parameter
elstehle Sep 18, 2024
7373696
adds test for different offset types for partition::if & ::flagged
elstehle Sep 18, 2024
687fb99
adds tests and support for streaming select::unique
elstehle Sep 18, 2024
0e3f602
fixes msvc warning
elstehle Sep 18, 2024
305b307
fixes perf for partition
elstehle Sep 18, 2024
db36ae9
fixes format
elstehle Sep 18, 2024
5f38f5e
fixes mixup for partition perf fix
elstehle Sep 19, 2024
16a2a75
fixes syntax
elstehle Sep 19, 2024
be83ebe
fixes partition::flagged perf
elstehle Sep 19, 2024
d4816b6
fixes perf for partition::flagged
elstehle Sep 19, 2024
3c9788d
switches unique to always use i64 offset types
elstehle Sep 20, 2024
49a50f5
adds benchmark for partition with distinct iterators
elstehle Sep 20, 2024
4c68b46
Merge branch 'main' into enh/streaming-selection
elstehle Sep 20, 2024
5b747b9
resolves merge conflicts
elstehle Sep 20, 2024
5461815
fixes merge conflict
elstehle Sep 23, 2024
18cd2d7
makes sass identical to main for i32 partition
elstehle Sep 23, 2024
0278d54
Merge remote-tracking branch 'upstream/main' into enh/streaming-selec…
elstehle Oct 1, 2024
6ccdaf9
updates thrust copy_if to always use i64 offset types
elstehle Oct 1, 2024
95de26f
fixes formatting
elstehle Oct 1, 2024
8750240
minor style improvements
elstehle Oct 3, 2024
8abd5e9
addresses review comments
elstehle Oct 8, 2024
d12f79a
Merge remote-tracking branch 'upstream/main' into enh/streaming-selec…
elstehle Oct 8, 2024
f8edfb4
fixes conditional type usage
elstehle Oct 8, 2024
fe112da
makes tests on empty input more robust
elstehle Oct 8, 2024
9475d95
restores empty problem behaviour
elstehle Oct 8, 2024
6f0bd00
adds comment on const ref
elstehle Oct 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 38 additions & 12 deletions cub/benchmarks/bench/partition/flagged.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@

#include <cub/device/device_partition.cuh>

#include <thrust/count.h>

#include <cuda/std/type_traits>

#include <look_back_helper.cuh>
#include <nvbench_helper.cuh>

Expand Down Expand Up @@ -77,16 +81,35 @@ struct policy_hub_t
};
#endif // TUNE_BASE

template <typename T, typename OffsetT>
void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT>)
// Initializes the distinct-output buffer used by DevicePartition::Flagged benchmarks.
// Counts the selected items (flags equal to true) so that the rejected partition can be
// written to a second, distinct range starting immediately after the selected items.
//
// @param d_flags Iterator to the selection flags (must be countable by thrust::count)
// @param num_items Number of items in the input
// @param d_out Pointer to the output buffer that backs both partitions
// @param d_partition_out_buffer Out-parameter: distinct-output wrapper handed to the benchmark
template <typename FlagsItT, typename T, typename OffsetT>
void init_output_partition_buffer(
  FlagsItT d_flags,
  OffsetT num_items,
  T* d_out,
  cub::detail::partition_distinct_output_t<T*, T*>& d_partition_out_buffer)
{
  // Count selected items so the rejected partition starts right after the selected ones
  const auto selected_elements = thrust::count(d_flags, d_flags + num_items, true);
  d_partition_out_buffer = cub::detail::partition_distinct_output_t<T*, T*>{d_out, d_out + selected_elements};
}

// Overload for the contiguous (non-distinct) output layout: both partitions are written
// into a single buffer, so the "output buffer" is simply the raw output pointer itself.
template <typename FlagsItT, typename T, typename OffsetT>
void init_output_partition_buffer(FlagsItT /*d_flags*/, OffsetT /*num_items*/, T* d_out, T*& d_partition_out_buffer)
{
  // No counting required; selected and rejected items share one contiguous range
  d_partition_out_buffer = d_out;
}

template <typename T, typename OffsetT, typename UseDistinctPartitionT>
void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPartitionT>)
{
using input_it_t = const T*;
using flag_it_t = const bool*;
using num_selected_it_t = OffsetT*;
using select_op_t = cub::NullType;
using equality_op_t = cub::NullType;
using offset_t = OffsetT;
constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value;
using output_it_t = typename ::cuda::std::
conditional<use_distinct_out_partitions, cub::detail::partition_distinct_output_t<T*, T*>, T*>::type;

#if !TUNE_BASE
using policy_t = policy_hub_t<T>;
Expand Down Expand Up @@ -127,8 +150,9 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT>)

input_it_t d_in = thrust::raw_pointer_cast(in.data());
flag_it_t d_flags = thrust::raw_pointer_cast(flags.data());
output_it_t d_out = thrust::raw_pointer_cast(out.data());
num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data());
output_it_t d_out{};
init_output_partition_buffer(flags.cbegin(), elements, thrust::raw_pointer_cast(out.data()), d_out);

state.add_element_count(elements);
state.add_global_memory_reads<T>(elements);
Expand Down Expand Up @@ -158,8 +182,10 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT>)
});
}

NVBENCH_BENCH_TYPES(flagged, NVBENCH_TYPE_AXES(fundamental_types, offset_types))
using distinct_partitions = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;

NVBENCH_BENCH_TYPES(flagged, NVBENCH_TYPE_AXES(fundamental_types, offset_types, distinct_partitions))
.set_name("base")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.set_type_axes_names({"T{ct}", "OffsetT{ct}", "DistinctPartitions{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
51 changes: 39 additions & 12 deletions cub/benchmarks/bench/partition/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@

#include <cub/device/device_partition.cuh>

#include <thrust/count.h>

#include <cuda/std/type_traits>

#include <look_back_helper.cuh>
#include <nvbench_helper.cuh>

Expand Down Expand Up @@ -102,16 +106,36 @@ T value_from_entropy(double percentage)
return static_cast<T>(result);
}

template <typename T, typename OffsetT>
void partition(nvbench::state& state, nvbench::type_list<T, OffsetT>)
// Initializes the distinct-output buffer used by DevicePartition::If benchmarks.
// Counts the items satisfying the selection predicate so that the rejected partition can
// be written to a second, distinct range starting immediately after the selected items.
//
// @param d_in Iterator to the input items (must be countable by thrust::count_if)
// @param num_items Number of items in the input
// @param d_out Pointer to the output buffer that backs both partitions
// @param select_op Unary predicate identifying selected items
// @param d_partition_out_buffer Out-parameter: distinct-output wrapper handed to the benchmark
template <typename InItT, typename T, typename OffsetT, typename SelectOpT>
void init_output_partition_buffer(
  InItT d_in,
  OffsetT num_items,
  T* d_out,
  SelectOpT select_op,
  cub::detail::partition_distinct_output_t<T*, T*>& d_partition_out_buffer)
{
  // Count selected items so the rejected partition starts right after the selected ones
  const auto selected_elements = thrust::count_if(d_in, d_in + num_items, select_op);
  d_partition_out_buffer = cub::detail::partition_distinct_output_t<T*, T*>{d_out, d_out + selected_elements};
}

// Overload for the contiguous (non-distinct) output layout: both partitions are written
// into a single buffer, so the "output buffer" is simply the raw output pointer itself.
template <typename InItT, typename T, typename OffsetT, typename SelectOpT>
void init_output_partition_buffer(InItT /*d_in*/, OffsetT /*num_items*/, T* d_out, SelectOpT /*select_op*/, T*& d_partition_out_buffer)
{
  // No counting required; selected and rejected items share one contiguous range
  d_partition_out_buffer = d_out;
}

template <typename T, typename OffsetT, typename UseDistinctPartitionT>
void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPartitionT>)
{
using input_it_t = const T*;
using flag_it_t = cub::NullType*;
using num_selected_it_t = OffsetT*;
using select_op_t = less_then_t<T>;
using equality_op_t = cub::NullType;
using offset_t = OffsetT;
constexpr bool use_distinct_out_partitions = UseDistinctPartitionT::value;
using output_it_t = typename ::cuda::std::
conditional<use_distinct_out_partitions, cub::detail::partition_distinct_output_t<T*, T*>, T*>::type;

#if !TUNE_BASE
using policy_t = policy_hub_t<T>;
Expand Down Expand Up @@ -153,8 +177,9 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT>)

input_it_t d_in = thrust::raw_pointer_cast(in.data());
flag_it_t d_flags = nullptr;
output_it_t d_out = thrust::raw_pointer_cast(out.data());
num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data());
output_it_t d_out{};
init_output_partition_buffer(in.cbegin(), elements, thrust::raw_pointer_cast(out.data()), select_op, d_out);

state.add_element_count(elements);
state.add_global_memory_reads<T>(elements);
Expand Down Expand Up @@ -183,8 +208,10 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT>)
});
}

NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types))
using distinct_partitions = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;

NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types, distinct_partitions))
.set_name("base")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.set_type_axes_names({"T{ct}", "OffsetT{ct}", "DistinctPartitions{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
Loading
Loading