Skip to content

Commit

Permalink
Reduce thrust benchmarks noise (#1203)
Browse files Browse the repository at this point in the history
  • Loading branch information
gevtushenko authored Dec 12, 2023
1 parent ef35c60 commit 58a527e
Show file tree
Hide file tree
Showing 41 changed files with 440 additions and 186 deletions.
123 changes: 120 additions & 3 deletions cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
#pragma once

#include <cuda/std/complex>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

#include <cuda/std/complex>
#include <cuda/std/span>

#include <limits>
#include <map>
#include <stdexcept>

#include <nvbench/nvbench.cuh>
#include <cuda/std/span>

#if defined(_MSC_VER)
#define NVBENCH_HELPER_HAS_I128 0
Expand Down Expand Up @@ -478,3 +480,118 @@ struct max_t
return less(lhs, rhs) ? rhs : lhs;
}
};

namespace
{
struct caching_allocator_t
{
using value_type = char;

caching_allocator_t() = default;
~caching_allocator_t()
{
free_all();
}

char* allocate(std::ptrdiff_t num_bytes)
{
value_type* result{};
auto free_block = free_blocks.find(num_bytes);

if (free_block != free_blocks.end())
{
result = free_block->second;
free_blocks.erase(free_block);
}
else
{
result = do_allocate(num_bytes);
}

allocated_blocks.insert(std::make_pair(result, num_bytes));
return result;
}

void deallocate(char* ptr, size_t)
{
auto iter = allocated_blocks.find(ptr);
if (iter == allocated_blocks.end())
{
throw std::runtime_error("Memory was not allocated by this allocator");
}

std::ptrdiff_t num_bytes = iter->second;
allocated_blocks.erase(iter);
free_blocks.insert(std::make_pair(num_bytes, ptr));
}

private:
using free_blocks_type = std::multimap<std::ptrdiff_t, char*>;
using allocated_blocks_type = std::map<char*, std::ptrdiff_t>;

free_blocks_type free_blocks;
allocated_blocks_type allocated_blocks;

void free_all()
{
for (auto i : free_blocks)
{
do_deallocate(i.second);
}

for (auto i : allocated_blocks)
{
do_deallocate(i.first);
}
}

value_type* do_allocate(std::size_t num_bytes)
{
value_type* result{};
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
const cudaError_t status = cudaMalloc(&result, num_bytes);
if (cudaSuccess != status)
{
throw std::runtime_error(std::string("Failed to allocate device memory: ") + cudaGetErrorString(status));
}
#else
result = new value_type[num_bytes];
#endif
return result;
}

void do_deallocate(value_type* ptr)
{
#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
cudaFree(ptr);
#else
delete[] ptr;
#endif
}
};

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
auto policy(caching_allocator_t& alloc)
{
return thrust::cuda::par(alloc);
}
#else
auto policy(caching_allocator_t&)
{
return thrust::device;
}
#endif

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
auto policy(caching_allocator_t& alloc, nvbench::launch& launch)
{
return thrust::cuda::par(alloc).on(launch.get_stream());
}
#else
auto policy(caching_allocator_t&, nvbench::launch&)
{
return thrust::device;
}
#endif

} // namespace
7 changes: 5 additions & 2 deletions thrust/benchmarks/bench/adjacent_difference/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,11 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::adjacent_difference(input.cbegin(), input.cend(), output.begin());
caching_allocator_t alloc;
thrust::adjacent_difference(policy(alloc), input.cbegin(), input.cend(), output.begin());

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
thrust::adjacent_difference(policy(alloc, launch), input.cbegin(), input.cend(), output.begin());
});
}

Expand Down
16 changes: 11 additions & 5 deletions thrust/benchmarks/bench/adjacent_difference/custom.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ struct custom_op
};

template <typename T>
static void basic(nvbench::state &state, nvbench::type_list<T>)
{
static void basic(nvbench::state &state, nvbench::type_list<T>) {
const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

thrust::device_vector<T> input = generate(elements);
Expand All @@ -60,9 +59,16 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::adjacent_difference(input.cbegin(), input.cend(), output.begin(), custom_op<T>{42});
});
caching_allocator_t alloc;
thrust::adjacent_difference(policy(alloc), input.cbegin(), input.cend(),
output.begin(), custom_op<T>{42});

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::adjacent_difference(policy(alloc, launch),
input.cbegin(), input.cend(),
output.begin(), custom_op<T>{42});
});
}

using types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t, float, double>;
Expand Down
11 changes: 8 additions & 3 deletions thrust/benchmarks/bench/adjacent_difference/in_place.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,14 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::adjacent_difference(vec.begin(), vec.end(), vec.begin());
});
caching_allocator_t alloc;
thrust::adjacent_difference(policy(alloc), vec.begin(), vec.end(), vec.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::adjacent_difference(policy(alloc, launch), vec.begin(),
vec.end(), vec.begin());
});
}

using types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t, float, double>;
Expand Down
15 changes: 9 additions & 6 deletions thrust/benchmarks/bench/copy/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
*
******************************************************************************/

#include <nvbench/nvbench.cuh>
#include <nvbench_helper.cuh>

#include <thrust/count.h>
#include <thrust/device_vector.h>
Expand All @@ -45,11 +45,14 @@ static void basic(nvbench::state &state,
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::copy(input.cbegin(),
input.cend(),
output.begin());
});
caching_allocator_t alloc;
thrust::copy(policy(alloc), input.cbegin(), input.cend(), output.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::copy(policy(alloc, launch), input.cbegin(), input.cend(),
output.begin());
});
}

using types = nvbench::type_list<nvbench::uint8_t,
Expand Down
11 changes: 8 additions & 3 deletions thrust/benchmarks/bench/copy/if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,14 @@ static void basic(nvbench::state &state,
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(selected_elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::copy_if(input.cbegin(), input.cend(), output.begin(), select_op);
});
caching_allocator_t alloc;
thrust::copy_if(policy(alloc), input.cbegin(), input.cend(), output.begin(), select_op);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::copy_if(policy(alloc, launch), input.cbegin(),
input.cend(), output.begin(), select_op);
});
}

using types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t>;
Expand Down
10 changes: 7 additions & 3 deletions thrust/benchmarks/bench/fill/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,13 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_element_count(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::fill(output.begin(), output.end(), T{42});
});
caching_allocator_t alloc;
thrust::fill(policy(alloc), output.begin(), output.end(), T{42});

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::fill(policy(alloc, launch), output.begin(), output.end(), T{42});
});
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
Expand Down
11 changes: 8 additions & 3 deletions thrust/benchmarks/bench/inner_product/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,14 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_global_memory_reads<T>(elements * 2);
state.add_global_memory_writes<T>(1);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::inner_product(lhs.begin(), lhs.end(), rhs.begin(), T{0});
});
caching_allocator_t alloc;
thrust::inner_product(policy(alloc), lhs.begin(), lhs.end(), rhs.begin(), T{0});

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::inner_product(policy(alloc, launch), lhs.begin(),
lhs.end(), rhs.begin(), T{0});
});
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types))
Expand Down
18 changes: 11 additions & 7 deletions thrust/benchmarks/bench/merge/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,17 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::merge(in.cbegin(),
in.cbegin() + elements_in_lhs,
in.cbegin() + elements_in_lhs,
in.cend(),
out.begin());
});
caching_allocator_t alloc;
thrust::merge(policy(alloc), in.cbegin(), in.cbegin() + elements_in_lhs,
in.cbegin() + elements_in_lhs, in.cend(), out.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::merge(policy(alloc, launch), in.cbegin(),
in.cbegin() + elements_in_lhs,
in.cbegin() + elements_in_lhs, in.cend(),
out.begin());
});
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
Expand Down
20 changes: 13 additions & 7 deletions thrust/benchmarks/bench/partition/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,19 @@ static void basic(nvbench::state &state,
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::partition_copy(input.cbegin(),
input.cend(),
output.begin(),
thrust::make_reverse_iterator(output.begin() + elements),
select_op);
});
caching_allocator_t alloc;
thrust::partition_copy(
policy(alloc), input.cbegin(), input.cend(), output.begin(),
thrust::make_reverse_iterator(output.begin() + elements), select_op);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::partition_copy(
policy(alloc, launch), input.cbegin(), input.cend(),
output.begin(),
thrust::make_reverse_iterator(output.begin() + elements),
select_op);
});
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
Expand Down
7 changes: 5 additions & 2 deletions thrust/benchmarks/bench/reduce/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,11 @@ static void basic(nvbench::state &state, nvbench::type_list<T>)
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(1);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
do_not_optimize(thrust::reduce(in.begin(), in.end()));
caching_allocator_t alloc;
do_not_optimize(thrust::reduce(policy(alloc), in.begin(), in.end()));

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & launch) {
do_not_optimize(thrust::reduce(policy(alloc, launch), in.begin(), in.end()));
});
}

Expand Down
17 changes: 10 additions & 7 deletions thrust/benchmarks/bench/reduce/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,16 @@ static void basic(nvbench::state &state, nvbench::type_list<KeyT, ValueT>)
state.add_global_memory_writes<KeyT>(unique_keys);
state.add_global_memory_writes<ValueT>(unique_keys);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::reduce_by_key(in_keys.begin(),
in_keys.end(),
in_vals.begin(),
out_keys.begin(),
out_vals.begin());
});
caching_allocator_t alloc;
thrust::reduce_by_key(policy(alloc), in_keys.begin(), in_keys.end(),
in_vals.begin(), out_keys.begin(), out_vals.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::reduce_by_key(policy(alloc, launch), in_keys.begin(),
in_keys.end(), in_vals.begin(),
out_keys.begin(), out_vals.begin());
});
}

using key_types = nvbench::type_list<int8_t,
Expand Down
13 changes: 10 additions & 3 deletions thrust/benchmarks/bench/scan/exclusive/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,16 @@ static void scan(nvbench::state &state, nvbench::type_list<KeyT, ValueT>)
state.add_global_memory_reads<ValueT>(elements);
state.add_global_memory_writes<ValueT>(elements);

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) {
thrust::exclusive_scan_by_key(keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin());
});
caching_allocator_t alloc;
thrust::exclusive_scan_by_key(policy(alloc), keys.cbegin(), keys.cend(),
in_vals.cbegin(), out_vals.begin());

state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch &launch) {
thrust::exclusive_scan_by_key(
policy(alloc, launch), keys.cbegin(), keys.cend(),
in_vals.cbegin(), out_vals.begin());
});
}

using key_types = all_types;
Expand Down
Loading

0 comments on commit 58a527e

Please sign in to comment.