diff --git a/CMakeLists.txt b/CMakeLists.txt
index 59131fac4f8..483cf538856 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,7 @@ option(GINKGO_DEVEL_TOOLS "Add development tools to the build system" OFF)
 option(GINKGO_BUILD_TESTS "Generate build files for unit tests" ON)
 option(GINKGO_BUILD_EXAMPLES "Build Ginkgo's examples" ON)
 option(GINKGO_BUILD_BENCHMARKS "Build Ginkgo's benchmarks" ON)
+option(GINKGO_BUILD_MICROBENCHMARKS "Build Ginkgo's microbenchmarks (requires GINKGO_BUILD_BENCHMARKS)" OFF)
 option(GINKGO_BUILD_REFERENCE "Compile reference CPU kernels" ON)
 option(GINKGO_BUILD_OMP "Compile OpenMP kernels for CPU" ${GINKGO_HAS_OMP})
 option(GINKGO_BUILD_MPI "Compile the MPI module" ${GINKGO_HAS_MPI})
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 55ed76d1613..0a1b8f7692e 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -162,6 +162,9 @@ add_subdirectory(tools)
 if (GINKGO_BUILD_TESTS)
     add_subdirectory(test)
 endif()
+if(GINKGO_BUILD_MICROBENCHMARKS)
+    add_subdirectory(gpu-microbenchmarks)
+endif()
 
 configure_file(run_all_benchmarks.sh run_all_benchmarks.sh COPYONLY)
diff --git a/benchmark/gpu-microbenchmarks/CMakeLists.txt b/benchmark/gpu-microbenchmarks/CMakeLists.txt
new file mode 100644
index 00000000000..34767cf8a77
--- /dev/null
+++ b/benchmark/gpu-microbenchmarks/CMakeLists.txt
@@ -0,0 +1,40 @@
+if(GINKGO_BUILD_CUDA AND GINKGO_BUILD_HIP)
+    message(FATAL_ERROR "gpubench doesn't support CUDA and HIP at the same time")
+endif()
+if(NOT (GINKGO_BUILD_CUDA OR GINKGO_BUILD_HIP))
+    message(FATAL_ERROR "gpubench only supports CUDA or HIP")
+endif()
+if(GINKGO_BUILD_CUDA)
+    set(GPU_LANG CUDA)
+    set(USE_HIP OFF)
+else()
+    set(GPU_LANG HIP)
+    set(USE_HIP ON)
+endif()
+message(STATUS "Fetching external gpubench")
+include(FetchContent)
+FetchContent_Declare(
+    gpubench
+    GIT_REPOSITORY https://github.com/upsj/gpubench.git
+    GIT_TAG 0a5ebdc5aedd3fe5be6b5defeedd35bf43efd231
+)
+FetchContent_GetProperties(gpubench)
+if(NOT gpubench_POPULATED)
+    FetchContent_Populate(gpubench)
+    add_subdirectory(${gpubench_SOURCE_DIR} ${gpubench_BINARY_DIR} EXCLUDE_FROM_ALL)
+endif()
+
+function(add_benchmark name)
+    set(targetname ${name}-microbench)
+    string(TOLOWER ${GPU_LANG} GPU_LANG_LOWER)
+    add_executable(${targetname} ${name}.gpu.cpp)
+    set_source_files_properties(${name}.gpu.cpp PROPERTIES LANGUAGE ${GPU_LANG})
+    target_link_libraries(${targetname} PRIVATE nvbench::main ginkgo)
+    target_include_directories(${targetname} PRIVATE ${PROJECT_SOURCE_DIR})
+    target_compile_options(${targetname} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda> -lineinfo)
+    target_compile_definitions(${targetname} PRIVATE GKO_COMPILING_${GPU_LANG} GKO_DEVICE_NAMESPACE=${GPU_LANG_LOWER})
+endfunction(add_benchmark name)
+
+add_benchmark(memory)
+add_benchmark(sorting)
+add_benchmark(bitvector)
diff --git a/benchmark/gpu-microbenchmarks/bitvector.gpu.cpp b/benchmark/gpu-microbenchmarks/bitvector.gpu.cpp
new file mode 100644
index 00000000000..bf7988d16f2
--- /dev/null
+++ b/benchmark/gpu-microbenchmarks/bitvector.gpu.cpp
@@ -0,0 +1,247 @@
+// SPDX-FileCopyrightText: 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#ifdef USE_HIP
+#include
+#else
+#include
+#endif
+
+template <typename MaskType>
+__device__ MaskType prefix_mask(int lane)
+{
+    return (MaskType{1} << lane) - 1;
+}
+
+const auto sizes = nvbench::range(16, 28, 2);
+const auto threadblock_size = 512;
+
+using mask_types =
+    nvbench::type_list;
+using rank_types = nvbench::type_list;
+
+
+template <typename MaskType, typename RankType>
+__global__ void compute_ranks(const MaskType* __restrict__ masks,
+                              const RankType* __restrict__ ranks,
+                              RankType* __restrict__ out, int size)
+{
+    const auto i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i < size) {
+        constexpr auto block_size = CHAR_BIT * sizeof(MaskType);
+        const auto block_i = i / block_size;
+        const auto local_i = i % block_size;
+        out[i] = ranks[block_i] +
+                 gko::detail::popcount(masks[block_i] &
+                                       prefix_mask<MaskType>(local_i));
+    }
+}
+
+template <typename MaskType, typename RankType>
+void rank_operation(nvbench::state& state,
+                    nvbench::type_list<MaskType, RankType>)
+{
+    const auto size = static_cast<int>(state.get_int64("size"));
+
+    constexpr auto block_size = CHAR_BIT * sizeof(MaskType);
+    const auto block_count = gko::ceildiv(size, block_size);
+    thrust::device_vector<MaskType> masks(block_count, ~MaskType{});
+    thrust::device_vector<RankType> ranks(block_count, 0);
+    thrust::sequence(ranks.begin(), ranks.end(), RankType{});
+    thrust::for_each(ranks.begin(), ranks.end(),
+                     [] __device__(RankType & rank) { rank *= block_size; });
+    thrust::device_vector<RankType> output(size, 0);
+    const auto num_threadblocks = gko::ceildiv(size, threadblock_size);
+
+    state.add_element_count(size, "Items");
+    state.add_global_memory_reads<MaskType>(block_count, "Masks");
+    state.add_global_memory_reads<RankType>(block_count, "Ranks");
+    state.add_global_memory_writes<RankType>(size, "OutSize");
+
+    state.exec([&](nvbench::launch& launch) {
+        compute_ranks<<<num_threadblocks, threadblock_size, 0,
+                        launch.get_stream()>>>(
+            thrust::raw_pointer_cast(masks.data()),
+            thrust::raw_pointer_cast(ranks.data()),
+            thrust::raw_pointer_cast(output.data()), size);
+    });
+    // compare to reference
+    auto ref = thrust::host_vector<RankType>(size);
+    thrust::sequence(ref.begin(), ref.end(), RankType{});
+    if (ref != output) {
+        std::cout << "FAIL\n";
+    }
+}
+
+NVBENCH_BENCH_TYPES(rank_operation, NVBENCH_TYPE_AXES(mask_types, rank_types))
+    .set_type_axes_names({"mask", "rank"})
+    .add_int64_power_of_two_axis("size", sizes);
+
+//
+
+template <typename RankType>
+void binary_search_operation(nvbench::state& state,
+                             nvbench::type_list<RankType>)
+{
+    const auto size = static_cast<int>(state.get_int64("size"));
+
+    thrust::device_vector<RankType> ranks(size, 0);
+    thrust::sequence(ranks.begin(), ranks.end(), RankType{});
+    auto queries = ranks;
+    thrust::device_vector<RankType> output(size, 0);
+
+    state.add_element_count(size, "Items");
+    state.add_global_memory_reads<RankType>(size, "Ranks");
+    state.add_global_memory_reads<RankType>(size, "Queries");
+    state.add_global_memory_writes<RankType>(size, "OutSize");
+
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+#ifdef USE_HIP
+        auto policy = thrust::hip::par.on(launch.get_stream());
+#else
+        auto policy = thrust::cuda::par.on(launch.get_stream());
+#endif
+        thrust::lower_bound(policy, ranks.begin(), ranks.end(), queries.begin(),
+                            queries.end(), output.begin());
+    });
+    if (output != ranks) {
+        std::cout << "FAIL\n";
+    }
+}
+
+NVBENCH_BENCH_TYPES(binary_search_operation, NVBENCH_TYPE_AXES(rank_types))
+    .add_int64_power_of_two_axis("size", sizes);
+
+//
+
+template <typename MaskType>
+__global__ void compute_select(const MaskType* __restrict__ masks,
+                               int* __restrict__ out, int size)
+{
+    const auto i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i < size) {
+        constexpr auto block_size = CHAR_BIT * sizeof(MaskType);
+        int offset = 0;
+        const auto mask = masks[i / block_size];
+        const auto rank = threadIdx.x % block_size;
+        for (int range_size = block_size; range_size > 1; range_size /= 2) {
+            const auto mid = offset + range_size / 2;
+            const auto half_count =
+                gko::detail::popcount(mask & prefix_mask<MaskType>(mid));
+            offset = half_count <= rank ? mid : offset;
+        }
+        out[i] = offset;
+    }
+}
+
+template <typename MaskType>
+void select_operation(nvbench::state& state, nvbench::type_list<MaskType>)
+{
+    const auto size = static_cast<int>(state.get_int64("size"));
+
+    constexpr auto block_size = CHAR_BIT * sizeof(MaskType);
+    const auto block_count = gko::ceildiv(size, block_size);
+    thrust::device_vector<MaskType> masks(block_count, ~MaskType{});
+    thrust::device_vector<int> output(size, 0);
+    const auto num_threadblocks = gko::ceildiv(size, threadblock_size);
+
+    state.add_element_count(size, "Items");
+    state.add_global_memory_reads<MaskType>(block_count, "Masks");
+    state.add_global_memory_writes<int>(size, "OutSize");
+
+    state.exec([&](nvbench::launch& launch) {
+        compute_select<<<num_threadblocks, threadblock_size, 0,
+                         launch.get_stream()>>>(
+            thrust::raw_pointer_cast(masks.data()),
+            thrust::raw_pointer_cast(output.data()), size);
+    });
+    auto ref = thrust::host_vector<int>(size);
+    thrust::sequence(ref.begin(), ref.end(), 0);
+    thrust::for_each(ref.begin(), ref.end(),
+                     [](int& rank) { rank %= block_size; });
+    if (ref != output) {
+        std::cout << "FAIL\n";
+        for (int i = 0; i < 50; i++) {
+            std::cout << ref[i] << ' ' << output[i] << '\n';
+        }
+    }
+}
+
+NVBENCH_BENCH_TYPES(select_operation, NVBENCH_TYPE_AXES(mask_types))
+    .add_int64_power_of_two_axis("size", sizes);
+
+//
+
+template <typename MaskType>
+__global__ void compute_select_even(const MaskType* __restrict__ masks,
+                                    int* __restrict__ out, int size)
+{
+    const auto i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i < size) {
+        constexpr auto block_size = CHAR_BIT * sizeof(MaskType);
+        int offset = 0;
+        const auto mask = masks[i / (block_size / 2)];
+        const auto rank = threadIdx.x % (block_size / 2);
+        for (int range_size = block_size; range_size > 1; range_size /= 2) {
+            const auto mid = offset + range_size / 2;
+            const auto half_count =
+                gko::detail::popcount(mask & prefix_mask<MaskType>(mid));
+            offset = half_count <= rank ?
+                mid : offset;
+        }
+        out[i] = offset;
+    }
+}
+
+template <typename MaskType>
+void select_even_operation(nvbench::state& state, nvbench::type_list<MaskType>)
+{
+    // Allocate input data:
+    const auto size = static_cast<int>(state.get_int64("size"));
+
+    constexpr auto block_size = CHAR_BIT * sizeof(MaskType);
+    const auto block_count = gko::ceildiv(size, block_size);
+    MaskType mask{};
+    for (int i = 0; i < block_size; i += 2) {
+        mask |= MaskType{1} << i;
+    }
+    thrust::device_vector<MaskType> masks(block_count, mask);
+    thrust::device_vector<int> output(size / 2, 0);
+    const auto num_threadblocks = gko::ceildiv(size / 2, threadblock_size);
+
+    state.add_element_count(size, "Items");
+    state.add_global_memory_reads<MaskType>(block_count, "Masks");
+    state.add_global_memory_writes<int>(size / 2, "OutSize");
+
+    state.exec([&](nvbench::launch& launch) {
+        compute_select_even<<<num_threadblocks, threadblock_size, 0,
+                              launch.get_stream()>>>(
+            thrust::raw_pointer_cast(masks.data()),
+            thrust::raw_pointer_cast(output.data()), size / 2);
+    });
+    auto ref = thrust::host_vector<int>(size / 2);
+    thrust::sequence(ref.begin(), ref.end(), 0);
+    thrust::for_each(ref.begin(), ref.end(),
+                     [](int& rank) { rank = (rank % (block_size / 2)) * 2; });
+    if (ref != output) {
+        std::cout << "FAIL\n";
+        for (int i = 0; i < 50; i++) {
+            std::cout << ref[i] << ' ' << output[i] << '\n';
+        }
+    }
+}
+
+NVBENCH_BENCH_TYPES(select_even_operation, NVBENCH_TYPE_AXES(mask_types))
+    .add_int64_power_of_two_axis("size", sizes);
diff --git a/benchmark/gpu-microbenchmarks/memory.gpu.cpp b/benchmark/gpu-microbenchmarks/memory.gpu.cpp
new file mode 100644
index 00000000000..a392e586ccc
--- /dev/null
+++ b/benchmark/gpu-microbenchmarks/memory.gpu.cpp
@@ -0,0 +1,123 @@
+// SPDX-FileCopyrightText: 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "common/cuda_hip/components/memory.hpp"
+
+#include
+#include
+#include
+
+#include
+
+#ifdef USE_HIP
+#include
+#else
+#include
+#endif
+
+template <typename T>
+void copy(nvbench::state& state, nvbench::type_list<T>)
+{
+    // Allocate input data:
+    const auto size = static_cast<std::size_t>(state.get_int64("size"));
+
+    thrust::device_vector<T> data(size);
+    thrust::device_vector<T> data2(size);
+    state.add_global_memory_reads(size * sizeof(T));
+    state.add_global_memory_writes(size * sizeof(T));
+
+    state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync,
+               [&](nvbench::launch& launch, auto& timer) {
+#ifdef USE_HIP
+                   auto policy = thrust::hip::par.on(launch.get_stream());
+#else
+                   auto policy = thrust::cuda::par_nosync.on(launch.get_stream());
+#endif
+
+                   timer.start();
+                   thrust::copy(policy, data.begin(), data.end(),
+                                data2.begin());
+                   timer.stop();
+               });
+}
+using types = nvbench::type_list;
+NVBENCH_BENCH_TYPES(copy, NVBENCH_TYPE_AXES(types))
+    .add_int64_power_of_two_axis("size", nvbench::range(20, 28, 2));
+
+template <bool atomic_load, bool atomic_store, bool relaxed_atomics,
+          bool local_atomics, typename T>
+__global__ void copy_kernel(const T* __restrict__ in, T* __restrict__ out,
+                            int stride, std::size_t size)
+{
+    using namespace gko::kernels::cuda;
+    auto i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i >= size) {
+        return;
+    }
+    i = (i * stride) % size;
+    auto value =
+        atomic_load
+            ? (relaxed_atomics ? (local_atomics ? load_relaxed_local(in + i)
+                                                : load_relaxed(in + i))
+                               : (local_atomics ? load_acquire_local(in + i)
+                                                : load_acquire(in + i)))
+            : in[i];
+    atomic_store ? (relaxed_atomics
+                        ? (local_atomics ? store_relaxed_local(out + i, value)
+                                         : store_relaxed(out + i, value))
+                        : (local_atomics ?
store_release_local(out + i, value) + : store_release(out + i, value))) + : (void)(out[i] = value); +} + +template +void copy_load_dispatch(const T* in, T* out, bool dynamic_args[4], int stride, + std::size_t size, cudaStream_t stream) +{ + if constexpr (sizeof...(args) == 4) { + constexpr auto block_size = 1024; + const auto num_blocks = gko::ceildiv(size, block_size); + copy_kernel + <<>>(in, out, stride, size); + } else { + if (dynamic_args[sizeof...(args)]) { + copy_load_dispatch(in, out, dynamic_args, stride, + size, stream); + } else { + copy_load_dispatch(in, out, dynamic_args, stride, + size, stream); + } + } +} + +template +void copy_atomic(nvbench::state& state, nvbench::type_list) +{ + // Allocate input data: + const auto size = static_cast(state.get_int64("size")); + const bool atomic_load = state.get_int64("atomic_load"); + const bool atomic_store = state.get_int64("atomic_store"); + const bool relaxed_atomics = state.get_int64("relaxed_atomics"); + const bool local_atomics = state.get_int64("local_atomics"); + const auto stride = state.get_int64("stride"); + + thrust::device_vector data(size); + thrust::device_vector data2(size); + state.add_global_memory_reads(size * sizeof(T)); + state.add_global_memory_writes(size * sizeof(T)); + bool args[] = {atomic_load, atomic_store, relaxed_atomics, local_atomics}; + + state.exec([&](nvbench::launch& launch) { + copy_load_dispatch(data.data().get(), data2.data().get(), args, stride, + size, launch.get_stream()); + }); +} +using types = nvbench::type_list; +NVBENCH_BENCH_TYPES(copy_atomic, NVBENCH_TYPE_AXES(types)) + .add_int64_power_of_two_axis("size", nvbench::range(28, 28)) + .add_int64_axis("atomic_load", {0, 1}) + .add_int64_axis("atomic_store", {0, 1}) + .add_int64_axis("relaxed_atomics", {0, 1}) + .add_int64_axis("local_atomics", {0, 1}) + .add_int64_axis("stride", {0, 1, 2, 3, 9, 17, 33}); diff --git a/benchmark/gpu-microbenchmarks/sorting.gpu.cpp b/benchmark/gpu-microbenchmarks/sorting.gpu.cpp new file mode 100644 index 00000000000..045ff07234f --- /dev/null +++ b/benchmark/gpu-microbenchmarks/sorting.gpu.cpp @@ -0,0 +1,380 @@ +// SPDX-FileCopyrightText: 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "common/cuda_hip/components/sorting.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef USE_HIP +#include +#else +#include +#endif + +constexpr auto block_size = 1024; +#ifdef USE_HIP +constexpr auto warp_size = warpSize; +#else +constexpr auto warp_size = 32; +#endif + +template +__global__ void sort_local_kernel(T* data, int size) +{ + const auto i = threadIdx.x + blockDim.x * blockIdx.x; + if (i * num_elements + (num_elements - 1) >= size) { + return; + } + const auto block_base_i = blockDim.x * blockIdx.x * num_elements; + T local_data[num_elements]; + cub::LoadDirectBlocked( + threadIdx.x, data + block_base_i, local_data); + gko::kernels::cuda::detail::bitonic_local::sort(local_data, + false); + cub::StoreDirectBlocked( + threadIdx.x, data + block_base_i, local_data); +} + +template +void sort_local_dispatch(T* data, int size, int num_elements, + cudaStream_t stream) +{ + const auto num_blocks = + gko::ceildiv(gko::ceildiv(size, num_elements), block_size); + switch (num_elements) { + case 1: + return sort_local_kernel<1> + <<>>(data, size); + case 2: + return sort_local_kernel<2> + <<>>(data, size); + case 4: + return sort_local_kernel<4> + <<>>(data, size); + case 8: + return sort_local_kernel<8> + <<>>(data, 
size); + default: + return; + } +} + +template +void sort_local(nvbench::state& state, nvbench::type_list) +{ + // Allocate input data: + const auto size = static_cast(state.get_int64("size")); + const auto num_elements = state.get_int64("num_elements"); + + std::default_random_engine rng{}; + std::uniform_int_distribution dist{0, 100000}; + thrust::host_vector host_data(size); + for (auto& value : host_data) { + value = dist(rng); + } + thrust::device_vector data = host_data; + state.add_global_memory_reads(size * sizeof(T)); + state.add_global_memory_writes(size * sizeof(T)); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + sort_local_dispatch(data.data().get(), size, num_elements, + launch.get_stream()); + }); + for (int i = 0; i + num_elements - 1 < size; i += num_elements) { + std::sort(host_data.begin() + i, host_data.begin() + i + num_elements); + } + if (host_data != data) { + std::cout << "FAIL\n"; + } +} +using types = nvbench::type_list; +NVBENCH_BENCH_TYPES(sort_local, NVBENCH_TYPE_AXES(types)) + .add_int64_power_of_two_axis("size", nvbench::range(16, 28, 2)) + .add_int64_axis("num_elements", {1, 2, 4, 8}); + + +template +__global__ void sort_warp_kernel(T* data, int size) +{ + constexpr auto num_elements_warp = num_elements * num_threads; + constexpr auto warps_in_block = block_size / num_threads; + const auto i = threadIdx.x + blockDim.x * blockIdx.x; + const auto warp_i = i / num_threads; + if (warp_i * num_elements_warp + (num_elements_warp - 1) >= size) { + return; + } + const auto block_base_i = blockDim.x * blockIdx.x * num_elements; + T local_data[num_elements]; + // striped is most efficient for loading + using WarpLoadT = + cub::WarpLoad; + // direct is necessary for blocked output + using WarpStoreT = + cub::WarpStore; + const auto local_warp_id = static_cast(threadIdx.x) / num_threads; + union TempStorage { + typename WarpLoadT::TempStorage load; + typename WarpStoreT::TempStorage store; + }; + __shared__ TempStorage temp_storage[warps_in_block]; + WarpLoadT(temp_storage[local_warp_id].load) + .Load(data + block_base_i + local_warp_id * num_elements_warp, + local_data); + gko::kernels::cuda::detail::bitonic_warp::sort(local_data, + false); + WarpStoreT(temp_storage[local_warp_id].store) + .Store(data + block_base_i + local_warp_id * num_elements_warp, + local_data); +} + +template +void sort_warp_dispatch(T* data, int size, int num_elements, + cudaStream_t stream) +{ + const auto num_blocks = + gko::ceildiv(gko::ceildiv(size, num_elements), block_size); + switch (num_elements) { + case 1: + return sort_warp_kernel<1, warp_size> + <<>>(data, size); + case 2: + return sort_warp_kernel<2, warp_size> + <<>>(data, size); + case 4: + return sort_warp_kernel<4, warp_size> + <<>>(data, size); + case 8: + return sort_warp_kernel<8, warp_size> + <<>>(data, size); + default: + return; + } +} + +template +void sort_warp(nvbench::state& state, nvbench::type_list) +{ + // Allocate input data: + const auto size = static_cast(state.get_int64("size")); + const auto num_elements = state.get_int64("num_elements"); + + std::default_random_engine rng{}; + std::uniform_int_distribution dist{0, 100000}; + thrust::host_vector host_data(size); + for (auto& value : host_data) { + value = dist(rng); + } + thrust::device_vector data = host_data; + state.add_global_memory_reads(size * sizeof(T)); + state.add_global_memory_writes(size * sizeof(T)); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + sort_warp_dispatch(data.data().get(), size, 
num_elements, + launch.get_stream()); + }); + const auto num_elements_warp = num_elements * warp_size; + for (int i = 0; i + num_elements_warp - 1 < size; i += num_elements_warp) { + std::sort(host_data.begin() + i, + host_data.begin() + i + num_elements_warp); + } + if (host_data != data) { + std::cout << "FAIL\n"; + } +} +using types = nvbench::type_list; +NVBENCH_BENCH_TYPES(sort_warp, NVBENCH_TYPE_AXES(types)) + .add_int64_power_of_two_axis("size", nvbench::range(16, 28, 2)) + .add_int64_axis("num_elements", {1, 2, 4, 8}); + + +template +__global__ void sort_block_kernel(T* data, int size) +{ + constexpr auto num_elements_block = num_elements * num_threads; + const auto block_i = blockIdx.x; + if (block_i * num_elements_block + (num_elements_block - 1) >= size) { + return; + } + const auto block_base_i = blockIdx.x * num_elements_block; + T local_data[num_elements]; + // striped is most efficient for loading + using BlockLoadT = + cub::BlockLoad; + // direct is necessary for blocked output + using BlockStoreT = + cub::BlockStore; + // this type should be empty + typename BlockLoadT::TempStorage load; + // this type should be empty + typename BlockStoreT::TempStorage store; + + __shared__ T shared_storage[num_elements_block]; + BlockLoadT(load).Load(data + block_base_i, local_data); + gko::kernels::cuda::detail::bitonic_global::sort(local_data, + shared_storage, + false); + BlockStoreT(store).Store(data + block_base_i, local_data); +} + +template +void sort_block_dispatch(T* data, int size, int num_elements, + cudaStream_t stream) +{ + const auto num_blocks = + gko::ceildiv(gko::ceildiv(size, num_elements), block_size); + switch (num_elements) { + case 1: + return sort_block_kernel<1, block_size> + <<>>(data, size); + case 2: + return sort_block_kernel<2, block_size> + <<>>(data, size); + case 4: + return sort_block_kernel<4, block_size> + <<>>(data, size); + default: + return; + } +} + +template +void sort_block(nvbench::state& state, nvbench::type_list) +{ + // Allocate input data: + const auto size = static_cast(state.get_int64("size")); + const auto num_elements = state.get_int64("num_elements"); + + std::default_random_engine rng{}; + std::uniform_int_distribution dist{0, 100000}; + thrust::host_vector host_data(size); + for (auto& value : host_data) { + value = dist(rng); + } + std::iota(host_data.begin(), host_data.begin() + 1024, 0); + std::reverse(host_data.begin(), host_data.begin() + 1024); + thrust::device_vector data = host_data; + state.add_global_memory_reads(size * sizeof(T)); + state.add_global_memory_writes(size * sizeof(T)); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + sort_block_dispatch(data.data().get(), size, num_elements, + launch.get_stream()); + }); + const auto num_elements_block = num_elements * block_size; + for (int i = 0; i + num_elements_block - 1 < size; + i += num_elements_block) { + std::sort(host_data.begin() + i, + host_data.begin() + i + num_elements_block); + } + if (host_data != data) { + std::cout << "FAIL\n"; + } +} +using types = nvbench::type_list; +NVBENCH_BENCH_TYPES(sort_block, NVBENCH_TYPE_AXES(types)) + .add_int64_power_of_two_axis("size", nvbench::range(16, 28, 2)) + .add_int64_axis("num_elements", {1, 2, 4}); + + +template +__global__ void sort_block_radix_kernel(T* data, int size) +{ + constexpr auto num_elements_block = num_elements * num_threads; + const auto block_i = blockIdx.x; + if (block_i * num_elements_block + (num_elements_block - 1) >= size) { + return; + } + const auto block_base_i = blockIdx.x * 
num_elements_block; + T local_data[num_elements]; + // striped is most efficient for loading + using BlockLoadT = + cub::BlockLoad; + // direct is necessary for blocked output + using BlockStoreT = + cub::BlockStore; + using BlockRadixSortT = cub::BlockRadixSort; + // this type should be empty + typename BlockLoadT::TempStorage load; + // this type should be empty + typename BlockStoreT::TempStorage store; + + __shared__ typename BlockRadixSortT::TempStorage shared_storage; + BlockLoadT(load).Load(data + block_base_i, local_data); + BlockRadixSortT(shared_storage).Sort(local_data); + BlockStoreT(store).Store(data + block_base_i, local_data); +} + +template +void sort_block_radix_dispatch(T* data, int size, int num_elements, + cudaStream_t stream) +{ + const auto num_blocks = + gko::ceildiv(gko::ceildiv(size, num_elements), block_size); + switch (num_elements) { + case 1: + return sort_block_radix_kernel<1, block_size> + <<>>(data, size); + case 2: + return sort_block_radix_kernel<2, block_size> + <<>>(data, size); + case 4: + return sort_block_radix_kernel<4, block_size> + <<>>(data, size); + default: + return; + } +} + +template +void sort_block_radix(nvbench::state& state, nvbench::type_list) +{ + // Allocate input data: + const auto size = static_cast(state.get_int64("size")); + const auto num_elements = state.get_int64("num_elements"); + + std::default_random_engine rng{}; + std::uniform_int_distribution dist{0, 100000}; + thrust::host_vector host_data(size); + for (auto& value : host_data) { + value = dist(rng); + } + std::iota(host_data.begin(), host_data.begin() + 1024, 0); + std::reverse(host_data.begin(), host_data.begin() + 1024); + thrust::device_vector data = host_data; + state.add_global_memory_reads(size * sizeof(T)); + state.add_global_memory_writes(size * sizeof(T)); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + sort_block_radix_dispatch(data.data().get(), size, num_elements, + launch.get_stream()); + }); + const auto num_elements_block = num_elements * block_size; + for (int i = 0; i + num_elements_block - 1 < size; + i += num_elements_block) { + std::sort(host_data.begin() + i, + host_data.begin() + i + num_elements_block); + } + if (host_data != data) { + std::cout << "FAIL\n"; + } +} +using types = nvbench::type_list; +NVBENCH_BENCH_TYPES(sort_block_radix, NVBENCH_TYPE_AXES(types)) + .add_int64_power_of_two_axis("size", nvbench::range(16, 28, 2)) + .add_int64_axis("num_elements", {1, 2, 4});
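A minimal configure-and-run sketch for a CUDA build of these microbenchmarks; the target name follows the ${name}-microbench pattern from the CMake function above, the binary path assumes the usual add_subdirectory layout, and --list/--benchmark are standard nvbench CLI flags:

    cmake -S . -B build -DGINKGO_BUILD_BENCHMARKS=ON -DGINKGO_BUILD_MICROBENCHMARKS=ON -DGINKGO_BUILD_CUDA=ON
    cmake --build build --target bitvector-microbench
    ./build/benchmark/gpu-microbenchmarks/bitvector-microbench --list
    ./build/benchmark/gpu-microbenchmarks/bitvector-microbench --benchmark rank_operation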