diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index ca4b0f099d..5da2cd916b 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -32,6 +32,7 @@ function(ConfigureBench) PRIVATE raft::raft raft_internal $<$:raft::compiled> + ${RAFT_CTK_MATH_DEPENDENCIES} benchmark::benchmark Threads::Threads $ @@ -73,11 +74,14 @@ function(ConfigureBench) endfunction() if(BUILD_PRIMS_BENCH) + ConfigureBench( + NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp + ) + ConfigureBench( NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) - ConfigureBench(NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/main.cpp) ConfigureBench( NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu diff --git a/cpp/bench/prims/core/copy.cu b/cpp/bench/prims/core/copy.cu new file mode 100644 index 0000000000..31ee83b924 --- /dev/null +++ b/cpp/bench/prims/core/copy.cu @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::core { + +template +auto constexpr const default_dims = []() { + auto dims = std::array{}; + std::fill(dims.begin(), dims.end(), 2); + return dims; +}(); + +template +auto constexpr const default_dims = std::array{3000000}; + +template +auto constexpr const default_dims = std::array{1000, 3000}; + +template +auto constexpr const default_dims = std::array{20, 300, 500}; + +template > +struct bench_array_type; + +template +struct bench_array_type> { + template + auto static constexpr const extent_type = raft::dynamic_extent; + + using type = + std::conditional_t...>, LayoutPolicy>, + device_mdarray...>, LayoutPolicy>>; +}; + +template +struct params { + std::array dims = default_dims; + using src_array_type = + typename bench_array_type::type; + using dst_array_type = + typename bench_array_type::type; +}; + +template +struct CopyBench : public fixture { + using params_type = + params; + using src_array_type = typename params_type::src_array_type; + using dst_array_type = typename params_type::dst_array_type; + explicit CopyBench(const params_type& ps) + : fixture{true}, + res_{}, + params_{ps}, + src_{ + res_, + typename src_array_type::mapping_type{ + std::apply([](auto... exts) { return make_extents(exts...); }, ps.dims)}, + typename src_array_type::container_policy_type{}, + }, + dst_{ + res_, + typename dst_array_type::mapping_type{ + std::apply([](auto... 
exts) { return make_extents(exts...); }, ps.dims)}, + typename dst_array_type::container_policy_type{}, + } + { + res_.get_cublas_handle(); // initialize cublas handle + auto src_data = std::vector(src_.size()); + std::iota(src_data.begin(), src_data.end(), SrcT{}); + raft::copy(src_.data_handle(), src_data.data(), src_.size(), res_.get_stream()); + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { raft::copy(res_, dst_.view(), src_.view()); }); + } + + private: + raft::device_resources res_; + params_type params_; + src_array_type src_; + dst_array_type dst_; +}; + +template +auto static const inputs = std::vector{ParamsT{}}; + +#define COPY_REGISTER(BenchT) \ + RAFT_BENCH_REGISTER(BenchT, "BenchT", inputs) + +using copy_bench_device_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_3d_diff_dtype_same_layout = CopyBench; + +using copy_bench_host_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout_float_float = CopyBench; +using copy_bench_host_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_host_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_device_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_device_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_host_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_host_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_3d_diff_dtype_same_layout = CopyBench; + +// COPY_REGISTER(copy_bench_same_dtype_1d_host_host); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout); 
+COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout_float_float);
+COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_same_layout);
+COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_diff_layout);
+
+COPY_REGISTER(copy_bench_device_host_1d_same_dtype_same_layout);
+COPY_REGISTER(copy_bench_device_host_1d_same_dtype_diff_layout);
+COPY_REGISTER(copy_bench_device_host_1d_diff_dtype_diff_layout);
+COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout);
+COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout_cublas);
+COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_same_layout);
+COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_diff_layout);
+
+COPY_REGISTER(copy_bench_host_device_1d_same_dtype_same_layout);
+COPY_REGISTER(copy_bench_host_device_1d_same_dtype_diff_layout);
+COPY_REGISTER(copy_bench_host_device_1d_diff_dtype_diff_layout);
+COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout);
+COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout_cublas);
+COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_same_layout);
+COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_diff_layout);
+
+}  // namespace raft::bench::core
diff --git a/cpp/include/raft/core/copy.cuh b/cpp/include/raft/core/copy.cuh
new file mode 100644
index 0000000000..f256f9ea0f
--- /dev/null
+++ b/cpp/include/raft/core/copy.cuh
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <raft/core/detail/copy.hpp>
+namespace raft {
+/**
+ * @brief Copy data from one mdspan to another with the same extents
+ *
+ * This function copies data from one mdspan to another, regardless of whether
+ * or not the mdspans have the same layout, memory type (host/device/managed),
+ * or data type. So long as it is possible to convert the data type from source
+ * to destination and the extents are equal, this function should be able to
+ * perform the copy. Any necessary device operations will be stream-ordered via
+ * the CUDA stream provided by the `raft::resources` argument.
+ *
+ * This header includes a custom kernel used for copying data between
+ * completely arbitrary mdspans on device. To compile this function in a
+ * non-CUDA translation unit, `raft/core/copy.hpp` may be used instead. The
+ * pure C++ header will correctly compile even without a CUDA compiler.
+ * Depending on the specialization, this CUDA header may invoke the kernel and
+ * therefore require a CUDA compiler.
+ *
+ * Limitations: Currently this function does not support copying directly
+ * between two arbitrary mdspans on different CUDA devices. It is assumed that
+ * the caller sets the correct CUDA device. Furthermore, host-to-host copies
+ * that require a transformation of the underlying memory layout are currently
+ * not performant, although they are supported.
+ *
+ * Note that when copying to an mdspan with a non-unique layout (i.e. the same
+ * underlying memory is addressed by different element indices), the source
+ * data must contain non-unique values for every non-unique destination
+ * element. If this is not the case, the behavior is undefined. Some copies
+ * to non-unique layouts which are well-defined will nevertheless fail with an
+ * exception to avoid race conditions in the underlying copy.
+ *
+ * @tparam DstType An mdspan type for the destination container.
+ * @tparam SrcType An mdspan type for the source container.
+ * @param res raft::resources used to provide a stream for copies involving the
+ * device.
+ * @param dst The destination mdspan.
+ * @param src The source mdspan.
+ */
+template <typename DstType, typename SrcType>
+detail::mdspan_copyable_with_kernel_t<DstType, SrcType> copy(resources const& res,
+                                                             DstType&& dst,
+                                                             SrcType&& src)
+{
+  detail::copy(res, std::forward<DstType>(dst), std::forward<SrcType>(src));
+}
+
+#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED
+#define RAFT_NON_CUDA_COPY_IMPLEMENTED
+template <typename DstType, typename SrcType>
+detail::mdspan_copyable_not_with_kernel_t<DstType, SrcType> copy(resources const& res,
+                                                                 DstType&& dst,
+                                                                 SrcType&& src)
+{
+  detail::copy(res, std::forward<DstType>(dst), std::forward<SrcType>(src));
+}
+#endif
+}  // namespace raft
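For reference, a minimal usage sketch of the new API (illustrative only, not part of this diff; it assumes the existing mdarray factory helpers from raft/core/host_mdarray.hpp and raft/core/device_mdarray.hpp, compiled in a .cu translation unit since copy.cuh may require a CUDA compiler):

#include <cstdint>
#include <raft/core/copy.cuh>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdarray.hpp>

void copy_example(raft::device_resources& res)
{
  // Row-major host source and column-major device destination with equal
  // extents: raft::copy performs both the host-to-device transfer and the
  // layout change in a single stream-ordered call.
  auto src = raft::make_host_mdarray<float, std::uint32_t, raft::layout_c_contiguous>(
    res, raft::make_extents<std::uint32_t>(3, 4));
  auto dst = raft::make_device_mdarray<float, std::uint32_t, raft::layout_f_contiguous>(
    res, raft::make_extents<std::uint32_t>(3, 4));
  raft::copy(res, dst.view(), src.view());
  res.sync_stream();
}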
diff --git a/cpp/include/raft/core/copy.hpp b/cpp/include/raft/core/copy.hpp
new file mode 100644
index 0000000000..0a16b742a2
--- /dev/null
+++ b/cpp/include/raft/core/copy.hpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <raft/core/detail/copy.hpp>
+namespace raft {
+
+#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED
+#define RAFT_NON_CUDA_COPY_IMPLEMENTED
+/**
+ * @brief Copy data from one mdspan to another with the same extents
+ *
+ * This function copies data from one mdspan to another, regardless of whether
+ * or not the mdspans have the same layout, memory type (host/device/managed),
+ * or data type. So long as it is possible to convert the data type from source
+ * to destination and the extents are equal, this function should be able to
+ * perform the copy.
+ *
+ * This header does _not_ include the custom kernel used for copying data
+ * between completely arbitrary mdspans on device. For arbitrary copies of this
+ * kind, `#include <raft/core/copy.cuh>` instead. Specializations of this
+ * function that require the custom kernel will be SFINAE-omitted when this
+ * header is used instead of `copy.cuh`. This header _does_ support
+ * device-to-device copies that can be performed with cuBLAS or a
+ * straightforward cudaMemcpy. Any necessary device operations will be
+ * stream-ordered via the CUDA stream provided by the `raft::resources`
+ * argument.
+ *
+ * Limitations: Currently this function does not support copying directly
+ * between two arbitrary mdspans on different CUDA devices. It is assumed that
+ * the caller sets the correct CUDA device. Furthermore, host-to-host copies
+ * that require a transformation of the underlying memory layout are currently
+ * not performant, although they are supported.
+ *
+ * Note that when copying to an mdspan with a non-unique layout (i.e. the same
+ * underlying memory is addressed by different element indices), the source
+ * data must contain non-unique values for every non-unique destination
+ * element. If this is not the case, the behavior is undefined. Some copies
+ * to non-unique layouts which are well-defined will nevertheless fail with an
+ * exception to avoid race conditions in the underlying copy.
+ *
+ * @tparam DstType An mdspan type for the destination container.
+ * @tparam SrcType An mdspan type for the source container.
+ * @param res raft::resources used to provide a stream for copies involving the
+ * device.
+ * @param dst The destination mdspan.
+ * @param src The source mdspan.
+ */
+template <typename DstType, typename SrcType>
+detail::mdspan_copyable_not_with_kernel_t<DstType, SrcType> copy(resources const& res,
+                                                                 DstType&& dst,
+                                                                 SrcType&& src)
+{
+  detail::copy(res, std::forward<DstType>(dst), std::forward<SrcType>(src));
+}
+#endif
+
+}  // namespace raft
diff --git a/cpp/include/raft/core/cuda_support.hpp b/cpp/include/raft/core/cuda_support.hpp
new file mode 100644
index 0000000000..07fb95a921
--- /dev/null
+++ b/cpp/include/raft/core/cuda_support.hpp
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+namespace raft {
+#ifndef RAFT_DISABLE_CUDA
+auto constexpr static const CUDA_ENABLED = true;
+#else
+auto constexpr static const CUDA_ENABLED = false;
+#endif
+}  // namespace raft
diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp
new file mode 100644
index 0000000000..b23660fefe
--- /dev/null
+++ b/cpp/include/raft/core/detail/copy.hpp
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#include +#include +#ifdef __CUDACC__ +#include +#endif +#endif + +namespace raft { +namespace detail { + +template +struct mdspan_copyable : std::false_type { + auto static constexpr const custom_kernel_allowed = false; + auto static constexpr const custom_kernel_not_allowed = false; +}; + +/* + * A helper struct used to determine whether one mdspan type can be copied to + * another and if so how + */ +template +struct mdspan_copyable>>, + std::bool_constant>>>>> { + using dst_type = std::remove_reference_t; + using src_type = std::remove_reference_t; + + // Extents properties + using dst_extents_type = typename dst_type::extents_type; + using src_extents_type = typename src_type::extents_type; + using index_type = + std::conditional_t<(std::numeric_limits::max() > + std::numeric_limits::max()), + typename dst_extents_type::index_type, + typename src_extents_type::index_type>; + + // Dtype properties + using dst_value_type = typename dst_type::value_type; + using src_value_type = typename src_type::value_type; + using dst_element_type = typename dst_type::element_type; + using src_element_type = typename src_type::element_type; + auto static constexpr const same_dtype = std::is_same_v; + auto static constexpr const compatible_dtype = + std::is_assignable_v; + + auto static constexpr const dst_float = std::is_same_v; + auto static constexpr const src_float = std::is_same_v; + auto static constexpr const dst_double = std::is_same_v; + auto static constexpr const src_double = std::is_same_v; + + auto static constexpr const both_float = dst_float && src_float; + auto static constexpr const both_double = dst_double && src_double; + auto static constexpr const both_float_or_both_double = both_float || both_double; + + // Ranks + auto static constexpr const dst_rank = dst_extents_type::rank(); + auto static constexpr const src_rank = src_extents_type::rank(); + auto static constexpr const compatible_rank = (dst_rank == src_rank); + auto static constexpr const has_vector_rank = (dst_rank == 1); + auto static constexpr const has_matrix_rank = (dst_rank == 2); + + // Layout properties + using dst_layout_type = typename dst_type::layout_type; + using src_layout_type = typename src_type::layout_type; + + auto static constexpr const same_layout = std::is_same_v; + + auto static check_for_unique_dst(dst_type dst) + { + if constexpr (!dst_type::is_always_unique()) { + RAFT_EXPECTS(dst.is_unique(), "Destination mdspan must be unique for parallelized copies"); + } + } + + auto static constexpr const src_contiguous = + std::disjunction_v, + std::is_same>; + + auto static constexpr const dst_contiguous = + std::disjunction_v, + std::is_same>; + + auto static constexpr const both_contiguous = src_contiguous && dst_contiguous; + + auto static constexpr const same_underlying_layout = + std::disjunction_v, + std::bool_constant>; + // Layout for intermediate tile if copying through custom kernel + using tile_layout_type = + std::conditional_t>; + + // Accessibility + auto static constexpr const dst_device_accessible = is_device_mdspan_v; + auto static constexpr const src_device_accessible = is_device_mdspan_v; + auto static constexpr const both_device_accessible = + dst_device_accessible && src_device_accessible; + + auto static constexpr const dst_host_accessible = is_host_mdspan_v; + auto static constexpr const src_host_accessible = 
is_host_mdspan_v; + auto static constexpr const both_host_accessible = dst_host_accessible && src_host_accessible; + + // Allowed copy codepaths + auto static constexpr const can_use_host = both_host_accessible; + +#if (defined(__AVX__) || defined(__SSE__) || defined(__ARM_NEON)) + // TODO(wphicks): Following should be only necessary restrictions. Test if + // perf actually improves once fully implemented. + // auto static constexpr const can_use_simd = can_use_host && both_contiguous && + // both_float_or_both_double; + auto static constexpr const can_use_simd = + can_use_host && both_contiguous && both_float && has_matrix_rank; +#else + auto static constexpr const can_use_simd = false; +#endif + + auto static constexpr const can_use_std_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + auto static constexpr const can_use_raft_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + + // Do we need intermediate storage on device in order to perform + // non-trivial layout or dtype conversions after copying source from host or + // before copying converted results back to host? + auto static constexpr const requires_intermediate = + !both_host_accessible && !both_device_accessible && !can_use_raft_copy; + + auto static constexpr const use_intermediate_dst = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const use_intermediate_src = + std::conjunction_v, + std::bool_constant>; + auto static constexpr const can_use_device = + std::conjunction_v, + std::disjunction, + std::bool_constant, + std::bool_constant>>; + + auto static constexpr const can_use_cublas = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + + auto static constexpr const custom_kernel_allowed = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const custom_kernel_not_allowed = !custom_kernel_allowed; + auto static constexpr const custom_kernel_required = + std::conjunction_v, + std::bool_constant>; + + // Viable overload? 
+ auto static constexpr const value = + std::conjunction_v>, + std::bool_constant>, + std::bool_constant>; + using type = std::enable_if_t; +}; + +template +using mdspan_copyable_t = typename mdspan_copyable::type; +template +auto static constexpr const mdspan_copyable_v = + mdspan_copyable::value; + +template +auto static constexpr const mdspan_copyable_with_kernel_v = + mdspan_copyable::custom_kernel_allowed; +template +auto static constexpr const mdspan_copyable_not_with_kernel_v = + mdspan_copyable::custom_kernel_not_allowed; + +template +using mdspan_copyable_with_kernel_t = + std::enable_if_t, T>; + +template +using mdspan_copyable_not_with_kernel_t = + std::enable_if_t, T>; + +#ifdef __CUDACC__ +auto static constexpr const mdspan_copy_tile_dim = 32; +auto static constexpr const mdspan_copy_tile_elems = mdspan_copy_tile_dim * mdspan_copy_tile_dim; + +// Helper struct to work around lack of CUDA-native std::apply +template +struct index_sequence {}; + +template +struct make_index_sequence + : std::conditional_t, + make_index_sequence> {}; + +/* template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args, index_sequence) +{ + return lambda(args[Idx]...); +} + +template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args) +{ + return apply(std::forward(lambda), std::forward(args), +make_index_sequence{}); +} */ + +/* + * Given an mdspan and an array of indices, return a reference to the + * indicated element. + */ +template +__device__ decltype(auto) get_mdspan_elem(MdspanType md, + IdxType const* indices, + index_sequence) +{ + return md(indices[Idx]...); +} + +template +__device__ decltype(auto) get_mdspan_elem(MdspanType md, IdxType const* indices) +{ + return get_mdspan_elem( + md, indices, make_index_sequence{}); +} + +/* Advance old_indices forward by the number of mdspan elements specified + * by increment. Store the result in indices. Return true if the new + * indices are valid for the input mdspan. + */ +template +__device__ auto increment_indices(IdxType* indices, + MdspanType const& md, + IdxType const* old_indices, + IdxType const* index_strides, + IncrType increment) +{ +#pragma unroll + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + increment += index_strides[i] * old_indices[i]; + } + +#pragma unroll + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + // Iterate through dimensions in order from slowest to fastest varying for + // layout_right and layout_left. Otherwise, just iterate through dimensions + // in order. + // + // TODO(wphicks): It is possible to always iterate through dimensions in + // the slowest to fastest order. Consider this or at minimum expanding to + // padded layouts. + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return MdspanType::rank() - ind - 1; + } else { + return ind; + } + }(i); + + auto cur_index = IdxType{}; + + while (cur_index < md.extent(real_index) - 1 && increment >= index_strides[real_index]) { + increment -= index_strides[real_index]; + ++cur_index; + } + indices[real_index] = cur_index; + } + + return increment == IdxType{}; +} + +/* + * WARNING: This kernel _must_ be launched with mdspan_copy_tile_dim x + * mdspan_copy_tile_dim threads per block. This restriction allows for + * additional optimizations at the expense of generalized launch + * parameters. 
+ */ +template +__global__ mdspan_copyable_with_kernel_t mdspan_copy_kernel(DstType dst, + SrcType src) +{ + using config = mdspan_copyable; + + // An intermediate storage location for the data to be copied. + __shared__ typename config::dst_value_type tile[mdspan_copy_tile_dim][mdspan_copy_tile_dim + 1]; + + // Compute the cumulative product of extents in order from fastest to + // slowest varying extent + typename config::index_type index_strides[config::dst_rank]; + auto cur_stride = typename config::index_type{1}; +#pragma unroll + for (auto i = typename SrcType::extents_type::rank_type{}; i < config::src_rank; ++i) { + // Iterate through dimensions in order from fastest to slowest varying + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return config::src_rank - ind - 1; + } else { + return ind; + } + }(i); + + index_strides[real_index] = cur_stride; + cur_stride *= src.extent(real_index); + } + + // The index of the first element in the mdspan which will be copied via + // the current tile for this block. + typename config::index_type tile_offset[config::dst_rank] = {0}; + typename config::index_type cur_indices[config::dst_rank]; + auto valid_tile = increment_indices( + tile_offset, src, tile_offset, index_strides, blockIdx.x * mdspan_copy_tile_elems); + + while (valid_tile) { + auto tile_read_x = std::is_same_v + ? threadIdx.x + : threadIdx.y; + auto tile_read_y = std::is_same_v + ? threadIdx.y + : threadIdx.x; + + auto valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_x * mdspan_copy_tile_dim + tile_read_y); + + if constexpr (config::same_underlying_layout || !config::dst_contiguous) { + if (valid_index) { + tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); + get_mdspan_elem(dst, cur_indices) = tile[tile_read_x][tile_read_y]; + } + } else { + if (valid_index) { tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); } + __syncthreads(); + + valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_y * mdspan_copy_tile_dim + tile_read_x); + if (valid_index) { get_mdspan_elem(dst, cur_indices) = tile[tile_read_y][tile_read_x]; } + __syncthreads(); + } + valid_tile = increment_indices( + tile_offset, src, tile_offset, index_strides, blockDim.x * mdspan_copy_tile_elems); + } +} +#endif + +template +mdspan_copyable_t copy(resources const& res, DstType&& dst, SrcType&& src) +{ + using config = mdspan_copyable; + for (auto i = std::size_t{}; i < config::src_rank; ++i) { + RAFT_EXPECTS(src.extent(i) == dst.extent(i), "Must copy between mdspans of the same shape"); + } + + if constexpr (config::use_intermediate_src) { +#ifndef RAFT_DISABLE_CUDA + // Copy to intermediate source on device, then perform necessary + // changes in layout on device, directly into final destination + using mdarray_t = device_mdarray; + auto intermediate = mdarray_t(res, + typename mdarray_t::mapping_type{src.extents()}, + typename mdarray_t::container_policy_type{}); + detail::copy(res, intermediate.view(), src); + detail::copy(res, dst, intermediate.view()); +#else + // Not possible to reach this due to enable_ifs. Included for safety. 
+    throw(raft::non_cuda_build_error("Copying to device in non-CUDA build"));
+#endif
+
+  } else if constexpr (config::use_intermediate_dst) {
+#ifndef RAFT_DISABLE_CUDA
+    // Perform necessary changes in layout on device, then copy to final
+    // destination on host
+    using mdarray_t   = device_mdarray<typename config::dst_value_type,
+                                       typename config::dst_extents_type,
+                                       typename config::dst_layout_type>;
+    auto intermediate = mdarray_t(res,
+                                  typename mdarray_t::mapping_type{dst.extents()},
+                                  typename mdarray_t::container_policy_type{});
+    detail::copy(res, intermediate.view(), src);
+    detail::copy(res, dst, intermediate.view());
+#else
+    throw(raft::non_cuda_build_error("Copying from device in non-CUDA build"));
+#endif
+  } else if constexpr (config::can_use_raft_copy) {
+#ifndef RAFT_DISABLE_CUDA
+    raft::copy(dst.data_handle(), src.data_handle(), dst.size(), resource::get_cuda_stream(res));
+#else
+    // Not possible to reach this due to enable_ifs. Included for safety.
+    throw(raft::non_cuda_build_error("Copying to, from, or on device in non-CUDA build"));
+#endif
+  } else if constexpr (config::can_use_cublas) {
+#ifndef RAFT_DISABLE_CUDA
+    auto constexpr const alpha = typename std::remove_reference_t<DstType>::value_type{1};
+    auto constexpr const beta  = typename std::remove_reference_t<DstType>::value_type{0};
+    if constexpr (std::is_same_v<typename config::dst_layout_type, layout_c_contiguous>) {
+      CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res),
+                                            CUBLAS_OP_T,
+                                            CUBLAS_OP_N,
+                                            dst.extent(1),
+                                            dst.extent(0),
+                                            &alpha,
+                                            src.data_handle(),
+                                            src.extent(0),
+                                            &beta,
+                                            dst.data_handle(),
+                                            dst.extent(1),
+                                            dst.data_handle(),
+                                            dst.extent(1),
+                                            resource::get_cuda_stream(res)));
+    } else {
+      CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res),
+                                            CUBLAS_OP_T,
+                                            CUBLAS_OP_N,
+                                            dst.extent(0),
+                                            dst.extent(1),
+                                            &alpha,
+                                            src.data_handle(),
+                                            src.extent(1),
+                                            &beta,
+                                            dst.data_handle(),
+                                            dst.extent(0),
+                                            dst.data_handle(),
+                                            dst.extent(0),
+                                            resource::get_cuda_stream(res)));
+    }
+#else
+    // Not possible to reach this due to enable_ifs. Included for safety.
+    throw(raft::non_cuda_build_error("Copying to, from, or on device in non-CUDA build"));
+#endif
+  } else if constexpr (config::custom_kernel_allowed) {
+#ifdef __CUDACC__
+    config::check_for_unique_dst(dst);
+    auto const blocks = std::min(
+      // This maximum is somewhat arbitrary. Could query the device to see
+      // how many blocks we could reasonably allow, but this is probably
+      // sufficient considering that this kernel will likely overlap with
+      // real computations for most use cases.
+      typename config::index_type{32},
+      raft::ceildiv(typename config::index_type(dst.size()),
+                    typename config::index_type(mdspan_copy_tile_elems)));
+    auto constexpr const threads = dim3{mdspan_copy_tile_dim, mdspan_copy_tile_dim, 1};
+    mdspan_copy_kernel<<<blocks, threads, 0, resource::get_cuda_stream(res)>>>(dst, src);
+#else
+    // Should never actually reach this because of enable_ifs. Included for
+    // safety.
+    RAFT_FAIL(
+      "raft::copy called in a way that requires custom kernel. Please use "
+      "raft/core/copy.cuh and include the header in a .cu file");
+#endif
+  } else if constexpr (config::can_use_std_copy) {
+    std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle());
+  } else {
+    // TODO(wphicks): Make the following cache-oblivious and add SIMD support
+    auto indices = std::array<typename config::index_type, config::dst_rank>{};
+    for (auto i = std::size_t{}; i < dst.size(); ++i) {
+      if (i != 0) {
+        if constexpr (std::is_same_v<typename config::src_layout_type, layout_right>) {
+          // For layout_right/layout_c_contiguous, we iterate over the
+          // rightmost extent fastest
+          auto dim = config::src_rank - 1;
+          while ((++indices[dim]) == src.extent(dim)) {
+            indices[dim] = typename config::index_type{};
+            --dim;
+          }
+        } else {
+          // For layout_left/layout_f_contiguous (and currently all other
+          // layouts), we iterate over the leftmost extent fastest. The
+          // cache-oblivious implementation should work through dimensions in
+          // order of increasing stride.
+          auto dim = std::size_t{};
+          while ((++indices[dim]) == src.extent(dim)) {
+            indices[dim] = typename config::index_type{};
+            ++dim;
+          }
+        }
+      }
+      std::apply(dst, indices) = std::apply(src, indices);
+    }
+  }
+}
+}  // namespace detail
+}  // namespace raft
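To make the kernel's index arithmetic concrete: increment_indices above advances a multi-dimensional index by a flat element count using precomputed per-dimension strides. A host-side sketch of the same decomposition (illustrative only, not part of this diff; it assumes a row-major layout and replaces the kernel's per-dimension while loops with divide/modulo steps):

#include <array>
#include <cstddef>

// Decompose a flat element offset into per-dimension indices for a
// row-major (layout_right) array with the given extents. The rightmost
// dimension varies fastest, matching the stride setup in the kernel.
template <std::size_t Rank>
auto unflatten_row_major(std::size_t offset, std::array<std::size_t, Rank> const& extents)
{
  auto indices = std::array<std::size_t, Rank>{};
  for (auto i = Rank; i > 0; --i) {
    auto dim     = i - 1;
    indices[dim] = offset % extents[dim];
    offset /= extents[dim];
  }
  // Any remainder left in `offset` here means the original offset was out of
  // bounds, which corresponds to increment_indices returning false.
  return indices;
}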
Please use " + "raft/core/copy.cuh and include the header in a .cu file"); +#endif + } else if constexpr (config::can_use_std_copy) { + std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); + } else { + // TODO(wphicks): Make the following cache-oblivious and add SIMD support + auto indices = std::array{}; + for (auto i = std::size_t{}; i < dst.size(); ++i) { + if (i != 0) { + if constexpr (std::is_same_v) { + // For layout_right/layout_c_contiguous, we iterate over the + // rightmost extent fastest + auto dim = config::src_rank - 1; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + --dim; + } + } else { + // For layout_left/layout_f_contiguous (and currently all other + // layouts), we iterate over the leftmost extent fastest. The + // cache-oblivious implementation should work through dimensions in + // order of increasing stride. + auto dim = std::size_t{}; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + ++dim; + } + } + } + std::apply(dst, indices) = std::apply(src, indices); + } + } +} +} // namespace detail +} // namespace raft diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 84b244f4dc..9045c5c871 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -98,6 +98,16 @@ struct logic_error : public raft::exception { explicit logic_error(std::string const& message) : raft::exception(message) {} }; +/** + * @brief Exception thrown when attempting to use CUDA features from a non-CUDA + * build + * + */ +struct non_cuda_build_error : public raft::exception { + explicit non_cuda_build_error(char const* const message) : raft::exception(message) {} + explicit non_cuda_build_error(std::string const& message) : raft::exception(message) {} +}; + /** * @} */ diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index 8e331293bf..c30f2e81e8 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -39,6 +39,8 @@ enum resource_type { SUB_COMMUNICATOR, // raft sub communicator DEVICE_PROPERTIES, // cuda device properties DEVICE_ID, // cuda device id + STREAM_VIEW, // view of a cuda stream or a placeholder in + // CUDA-free builds THRUST_POLICY, // thrust execution policy WORKSPACE_RESOURCE, // rmm device memory resource diff --git a/cpp/include/raft/core/resource/stream_view.hpp b/cpp/include/raft/core/resource/stream_view.hpp new file mode 100644 index 0000000000..ccf516076f --- /dev/null +++ b/cpp/include/raft/core/resource/stream_view.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/resources.hpp>
+#include <raft/core/stream_view.hpp>
+#ifndef RAFT_DISABLE_CUDA
+#include <rmm/cuda_stream_view.hpp>
+#endif
+
+namespace raft::resource {
+struct stream_view_resource : public resource {
+  stream_view_resource(raft::stream_view view = raft::stream_view_per_thread) : stream(view) {}
+  void* get_resource() override { return &stream; }
+
+  ~stream_view_resource() override {}
+
+ private:
+  raft::stream_view stream;
+};
+
+/**
+ * Factory that knows how to construct a specific raft::resource to populate
+ * the resources instance.
+ */
+struct stream_view_resource_factory : public resource_factory {
+ public:
+  stream_view_resource_factory(raft::stream_view view = raft::stream_view_per_thread)
+    : stream(view)
+  {
+  }
+  resource_type get_resource_type() override { return resource_type::STREAM_VIEW; }
+  resource* make_resource() override { return new stream_view_resource(stream); }
+
+ private:
+  raft::stream_view stream;
+};
+
+/**
+ * @defgroup resource_stream_view stream resource functions compatible with
+ * non-CUDA builds
+ * @{
+ */
+/**
+ * Load a raft::stream_view from a resources instance (and populate it on the
+ * res if needed).
+ * @param[in] res raft resources object for managing resources
+ * @return the stream view stored on the resources instance
+ */
+inline raft::stream_view get_stream_view(resources const& res)
+{
+  if (!res.has_resource_factory(resource_type::STREAM_VIEW)) {
+    res.add_resource_factory(std::make_shared<stream_view_resource_factory>());
+  }
+  return *res.get_resource<raft::stream_view>(resource_type::STREAM_VIEW);
+};
+
+/**
+ * Store a raft::stream_view on a resources instance.
+ * @param[in] res raft resources object for managing resources
+ * @param[in] view raft stream view
+ */
+inline void set_stream_view(resources const& res, raft::stream_view view)
+{
+  res.add_resource_factory(std::make_shared<stream_view_resource_factory>(view));
+};
+
+/**
+ * @brief Synchronize a specific stream
+ *
+ * @param[in] res the raft resources object
+ * @param[in] stream stream to synchronize
+ */
+inline void sync_stream_view(const resources& res, raft::stream_view stream)
+{
+  stream.interruptible_synchronize();
+}
+
+/**
+ * @brief Synchronize the main stream on the resources instance
+ */
+inline void sync_stream_view(const resources& res) { sync_stream_view(res, get_stream_view(res)); }
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
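A short usage sketch of the accessors above (illustrative only, not part of this diff; the function name and the surrounding context are assumptions):

#include <raft/core/resource/stream_view.hpp>
#include <raft/core/resources.hpp>

void finish_pending_work(raft::resources const& res)
{
  // Fetches the stream view stored on `res`, creating a default per-thread
  // view on first access.
  auto stream = raft::resource::get_stream_view(res);
  if (!stream.is_default()) {
    // Interruptibly synchronizes the main stream of the resources instance.
    raft::resource::sync_stream_view(res);
  }
}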
diff --git a/cpp/include/raft/core/stream_view.hpp b/cpp/include/raft/core/stream_view.hpp
new file mode 100644
index 0000000000..f7e7934dbf
--- /dev/null
+++ b/cpp/include/raft/core/stream_view.hpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <raft/core/cuda_support.hpp>
+#include <raft/core/error.hpp>
+#include <raft/core/logger.hpp>
+#ifndef RAFT_DISABLE_CUDA
+#include <raft/core/interruptible.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#endif
+
+namespace raft {
+
+namespace detail {
+struct fail_stream_view {
+  constexpr fail_stream_view()                        = default;
+  constexpr fail_stream_view(fail_stream_view const&) = default;
+  constexpr fail_stream_view(fail_stream_view&&)      = default;
+  auto constexpr operator=(fail_stream_view const&) -> fail_stream_view& = default;
+  auto constexpr operator=(fail_stream_view&&) -> fail_stream_view& = default;
+  auto value() { throw non_cuda_build_error{"Attempted to access CUDA stream in non-CUDA build"}; }
+  [[nodiscard]] auto is_per_thread_default() const { return false; }
+  [[nodiscard]] auto is_default() const { return false; }
+  void synchronize() const
+  {
+    throw non_cuda_build_error{"Attempted to sync CUDA stream in non-CUDA build"};
+  }
+  void synchronize_no_throw() const
+  {
+    RAFT_LOG_ERROR("Attempted to sync CUDA stream in non-CUDA build");
+  }
+};
+}  // namespace detail
+
+/** A lightweight wrapper around rmm::cuda_stream_view that can be used in
+ * CUDA-free builds
+ *
+ * While CUDA-free builds should never actually make use of a CUDA stream at
+ * runtime, it is sometimes useful to have a symbol that can stand in place of
+ * a CUDA stream to avoid excessive ifdef directives interspersed with other
+ * logic. This struct's methods invoke the underlying rmm::cuda_stream_view in
+ * CUDA-enabled builds but throw runtime exceptions if any non-trivial method
+ * is called from a CUDA-free build. */
+struct stream_view {
+#ifndef RAFT_DISABLE_CUDA
+  using underlying_view_type = rmm::cuda_stream_view;
+#else
+  using underlying_view_type = detail::fail_stream_view;
+#endif
+
+  constexpr stream_view(
+    underlying_view_type base_view = stream_view::get_underlying_per_thread_default())
+    : base_view_{base_view}
+  {
+  }
+  constexpr stream_view(stream_view const&) = default;
+  constexpr stream_view(stream_view&&)      = default;
+  auto operator=(stream_view const&) -> stream_view& = default;
+  auto operator=(stream_view&&) -> stream_view& = default;
+  auto value() { return base_view_.value(); }
+  operator underlying_view_type() const noexcept { return base_view_; }
+  [[nodiscard]] auto is_per_thread_default() const { return base_view_.is_per_thread_default(); }
+  [[nodiscard]] auto is_default() const { return base_view_.is_default(); }
+  void synchronize() const { base_view_.synchronize(); }
+  void synchronize_no_throw() const { base_view_.synchronize_no_throw(); }
+  void interruptible_synchronize() const
+  {
+#ifndef RAFT_DISABLE_CUDA
+    interruptible::synchronize(base_view_);
+#else
+    synchronize();
+#endif
+  }
+
+  auto underlying() { return base_view_; }
+  void synchronize_if_cuda_enabled()
+  {
+    if constexpr (raft::CUDA_ENABLED) { base_view_.synchronize(); }
+  }
+
+ private:
+  underlying_view_type base_view_;
+  auto static get_underlying_per_thread_default() -> underlying_view_type
+  {
+#ifndef RAFT_DISABLE_CUDA
+    return rmm::cuda_stream_per_thread;
+#else
+    auto static constexpr const default_fail_stream = underlying_view_type{};
+    return default_fail_stream;
+#endif
+  }
+};
+
+auto static const stream_view_per_thread = stream_view{};
+
+}  // namespace raft
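One more sketch (illustrative only, not part of this diff) showing how stream_view lets shared code avoid #ifdef RAFT_DISABLE_CUDA blocks; the function name is an assumption:

#include <raft/core/stream_view.hpp>

void finish_batch(raft::stream_view stream)
{
  // In a CUDA-enabled build this synchronizes the wrapped
  // rmm::cuda_stream_view; in a CUDA-free build the branch is compiled out
  // entirely, so no exception is thrown.
  stream.synchronize_if_cuda_enabled();
}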
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 0651ccac86..8da5e6986c 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -21,7 +21,7 @@ rapids_test_init()
 
 function(ConfigureTest)
 
-  set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY)
+  set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY NOCUDA)
   set(oneValueArgs NAME GPUS PERCENT)
   set(multiValueArgs PATH TARGETS CONFIGURATIONS)
@@ -37,7 +37,11 @@ function(ConfigureTest)
     set(_RAFT_TEST_PERCENT 100)
   endif()
 
-  set(TEST_NAME ${_RAFT_TEST_NAME})
+  if(_RAFT_TEST_NOCUDA)
+    set(TEST_NAME "${_RAFT_TEST_NAME}_NOCUDA")
+  else()
+    set(TEST_NAME ${_RAFT_TEST_NAME})
+  endif()
 
   add_executable(${TEST_NAME} ${_RAFT_TEST_PATH})
   target_link_libraries(
@@ -68,6 +72,9 @@ function(ConfigureTest)
   if(_RAFT_TEST_EXPLICIT_INSTANTIATE_ONLY)
     target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY")
   endif()
+  if(_RAFT_TEST_NOCUDA)
+    target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_DISABLE_CUDA")
+  endif()
 
   target_include_directories(${TEST_NAME} PUBLIC "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/test>")
@@ -117,6 +124,8 @@ if(BUILD_TESTS)
     test/core/interruptible.cu
     test/core/nvtx.cpp
     test/core/mdarray.cu
+    test/core/mdspan_copy.cpp
+    test/core/mdspan_copy.cu
     test/core/mdspan_utils.cu
     test/core/numpy_serializer.cu
     test/core/memory_type.cpp
@@ -124,12 +133,18 @@
     test/core/sparse_matrix.cpp
     test/core/span.cpp
     test/core/span.cu
+    test/core/stream_view.cpp
     test/core/temporary_device_buffer.cu
     test/test.cpp
     LIB
     EXPLICIT_INSTANTIATE_ONLY
   )
 
+  ConfigureTest(
+    NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB
+    EXPLICIT_INSTANTIATE_ONLY NOCUDA
+  )
+
   ConfigureTest(
     NAME DISTANCE_TEST
diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/test/core/mdspan_copy.cpp
new file mode 100644
index 0000000000..2f938e3035
--- /dev/null
+++ b/cpp/test/core/mdspan_copy.cpp
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "../test_utils.h" +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#endif +#include +#include + +namespace raft { +TEST(MDSpanCopy, Mdspan1DHostHost) +{ + auto res = resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_host_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_host_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_std_copy, + "Current implementation should use std::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE(match(out_right(i), double(gen_unique_entry(i)), CompareApprox{0.0001})); + } +} + +#ifndef RAFT_DISABLE_CUDA +TEST(MDSpanCopy, Mdspan1DHostDevice) +{ + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_host_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_device_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE( + match(float(out_right(i)), float(gen_unique_entry(i)), CompareApprox{0.0001f})); + } +} + +TEST(MDSpanCopy, Mdspan1DDeviceHost) +{ + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_device_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_host_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE( + match(float(out_right(i)), float(gen_unique_entry(i)), CompareApprox{0.0001f})); + } +} +#endif + +TEST(MDSpanCopy, Mdspan3DHostHost) +{ + auto res = resources{}; + auto constexpr depth = std::uint32_t{500}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + + auto out_left = make_host_mdarray( + res, extents{}); + auto out_right = make_host_mdarray( + res, extents{}); + + static_assert(detail::mdspan_copyable::can_use_std_copy, + "Current implementation should use std::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; 
i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + copy(res, out_left.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + static_assert(detail::mdspan_copyable:: + can_use_std_copy, + "Current implementation should use std::copy for this copy"); + copy(res, out_left.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } +} + +#ifndef RAFT_DISABLE_CUDA +TEST(MDSpanCopy, Mdspan3DHostDevice) +{ + auto res = device_resources{}; + // Use smaller values here since host/device copy takes awhile. + // Non-trivial logic is tested in the other cases. + auto constexpr depth = std::uint32_t{5}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = + make_device_mdarray( + res, extents{}); + + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match(float(out_right(i, j, k)), + float(gen_unique_entry(i, j, k)), + CompareApprox{0.0001})); + } + } + } + + static_assert(detail::mdspan_copyable:: + can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_left.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match(float(out_left(i, j, k)), + float(gen_unique_entry(i, j, k)), + CompareApprox{0.0001})); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceDevice) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + 
static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + static_assert(detail::mdspan_copyable::can_use_cublas, + "Current implementation should use cuBLAS for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + static_assert(detail::mdspan_copyable::can_use_cublas, + "Current implementation should use cuBLAS for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_left(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} +#endif + +} // namespace raft diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/test/core/mdspan_copy.cu new file mode 100644 index 0000000000..95d7d3befd --- /dev/null +++ b/cpp/test/core/mdspan_copy.cu @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_device_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // Test dtype 
conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} +TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_host_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_host_mdarray( + res, extents{}); + auto out_right = make_host_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + 
+
+TEST(MDSpanCopy, Mdspan2DDeviceHostCuda)
+{
+  auto res            = device_resources{};
+  auto constexpr rows = std::uint32_t{30};
+  auto constexpr cols = std::uint32_t{20};
+  auto in_left = make_device_mdarray<float, std::uint32_t, layout_c_contiguous>(
+    res, extents<std::uint32_t, rows, cols>{});
+  auto in_right = make_device_mdarray<float, std::uint32_t, layout_f_contiguous>(
+    res, extents<std::uint32_t, rows, cols>{});
+  auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; };
+
+  for (auto i = std::uint32_t{}; i < rows; ++i) {
+    for (auto j = std::uint32_t{}; j < cols; ++j) {
+      in_left(i, j)  = gen_unique_entry(i, j);
+      in_right(i, j) = gen_unique_entry(i, j);
+    }
+  }
+
+  auto out_left = make_host_mdarray<double, std::uint32_t, layout_c_contiguous>(
+    res, extents<std::uint32_t, rows, cols>{});
+  auto out_right = make_host_mdarray<double, std::uint32_t, layout_f_contiguous>(
+    res, extents<std::uint32_t, rows, cols>{});
+
+  res.sync_stream();
+
+  // Test dtype conversion without transpose
+  static_assert(
+    detail::mdspan_copyable_with_kernel_v<decltype(out_right.view()), decltype(in_right.view())>,
+    "Current implementation should use kernel for this copy");
+  copy(res, out_right.view(), in_right.view());
+  res.sync_stream();
+  for (auto i = std::uint32_t{}; i < rows; ++i) {
+    for (auto j = std::uint32_t{}; j < cols; ++j) {
+      ASSERT_TRUE(match(
+        double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox<double>{0.0001}));
+    }
+  }
+
+  // Test dtype conversion with transpose
+  static_assert(
+    detail::mdspan_copyable_with_kernel_v<decltype(out_right.view()), decltype(in_left.view())>,
+    "Current implementation should use kernel for this copy");
+  copy(res, out_right.view(), in_left.view());
+  res.sync_stream();
+  for (auto i = std::uint32_t{}; i < rows; ++i) {
+    for (auto j = std::uint32_t{}; j < cols; ++j) {
+      ASSERT_TRUE(match(
+        double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox<double>{0.0001}));
+    }
+  }
+  static_assert(
+    detail::mdspan_copyable_with_kernel_v<decltype(out_left.view()), decltype(in_right.view())>,
+    "Current implementation should use kernel for this copy");
+  copy(res, out_left.view(), in_right.view());
+  res.sync_stream();
+  for (auto i = std::uint32_t{}; i < rows; ++i) {
+    for (auto j = std::uint32_t{}; j < cols; ++j) {
+      ASSERT_TRUE(match(
+        double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox<double>{0.0001}));
+    }
+  }
+}
+
+TEST(MDSpanCopy, Mdspan3DHostDeviceCuda)
+{
+  auto res                   = device_resources{};
+  auto constexpr const depth = std::uint32_t{50};
+  auto constexpr const rows  = std::uint32_t{30};
+  auto constexpr const cols  = std::uint32_t{20};
+  auto in_left = make_host_mdarray<float, std::uint32_t, layout_c_contiguous>(
+    res, extents<std::uint32_t, depth, rows, cols>{});
+  auto in_right = make_host_mdarray<float, std::uint32_t, layout_f_contiguous>(
+    res, extents<std::uint32_t, depth, rows, cols>{});
+  auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; };
+
+  res.sync_stream();
+  for (auto i = std::uint32_t{}; i < depth; ++i) {
+    for (auto j = std::uint32_t{}; j < rows; ++j) {
+      for (auto k = std::uint32_t{}; k < cols; ++k) {
+        in_left(i, j, k)  = gen_unique_entry(i, j, k);
+        in_right(i, j, k) = gen_unique_entry(i, j, k);
+      }
+    }
+  }
+  res.sync_stream();
+
+  // Test dtype conversion without transpose
+  auto out_long = make_device_mdarray<std::int64_t, std::uint32_t, layout_c_contiguous>(
+    res, extents<std::uint32_t, depth, rows, cols>{});
+  static_assert(
+    detail::mdspan_copyable_with_kernel_v<decltype(out_long.view()), decltype(in_left.view())>,
+    "Current implementation should use kernel for this copy");
+  copy(res, out_long.view(), in_left.view());
+  res.sync_stream();
+  for (auto i = std::uint32_t{}; i < depth; ++i) {
+    for (auto j = std::uint32_t{}; j < rows; ++j) {
+      for (auto k = std::uint32_t{}; k < cols; ++k) {
+        ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k)));
+      }
+    }
+  }
+
+  // Test transpose
+  auto out_left = make_device_mdarray<float, std::uint32_t, layout_c_contiguous>(
+    res, extents<std::uint32_t, depth, rows, cols>{});
+  auto out_right = make_device_mdarray<float, std::uint32_t, layout_f_contiguous>(
+    res, extents<std::uint32_t, depth, rows, cols>{});
+
+  static_assert(
+    detail::mdspan_copyable_with_kernel_v<decltype(out_right.view()), decltype(in_left.view())>,
+    "Current implementation should use kernel for this copy");
+  copy(res, out_right.view(), in_left.view());
+  res.sync_stream();
+  for (auto i = std::uint32_t{}; i < depth; ++i) {
+    for (auto j = std::uint32_t{}; j < rows; ++j) {
+      for (auto k = std::uint32_t{}; k < cols; ++k) {
+        ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k)));
+      }
+    }
+  }
+
+  static_assert(
+    detail::mdspan_copyable_with_kernel_v<decltype(out_left.view()), decltype(in_right.view())>,
+    "Current implementation should use kernel for this copy");
+  copy(res, out_left.view(), in_right.view());
+  res.sync_stream();
+  for (auto i = std::uint32_t{}; i < depth; ++i) {
+    for (auto j = std::uint32_t{}; j < rows; ++j) {
+      for (auto k = std::uint32_t{}; k < cols; ++k) {
+        ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k)));
+      }
+    }
+  }
+}
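
// Illustrative sketch (not part of this patch): why the layout-changing cases need a
// kernel at all. C-contiguous and F-contiguous layouts place the same logical index at
// different linear offsets, so a copy between them is a physical permutation of memory,
// never a plain memcpy. Plain C++, verifiable at compile time:
#include <cstddef>

constexpr std::size_t offset_c(std::size_t i, std::size_t j, std::size_t cols)
{
  return i * cols + j;  // row-major: rows are contiguous
}
constexpr std::size_t offset_f(std::size_t i, std::size_t j, std::size_t rows)
{
  return j * rows + i;  // column-major: columns are contiguous
}

// For a 2 x 3 extent, element (0, 1) sits at offset 1 in row-major storage but at
// offset 2 in column-major storage, so the two buffers cannot be copied bitwise.
static_assert(offset_c(0, 1, 3) == 1 && offset_f(0, 1, 2) == 2,
              "same logical element, different physical location");
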
+
+TEST(MDSpanCopy, Mdspan2DHostDeviceCuda)
+{
+  auto res            = device_resources{};
+  auto constexpr rows = std::uint32_t{30};
+  auto constexpr cols = std::uint32_t{20};
+  auto in_left = make_host_mdarray<float, std::uint32_t, layout_c_contiguous>(
+    res, extents<std::uint32_t, rows, cols>{});
+  auto in_right = make_host_mdarray<float, std::uint32_t, layout_f_contiguous>(
+    res, extents<std::uint32_t, rows, cols>{});
+  auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; };
+
+  for (auto i = std::uint32_t{}; i < rows; ++i) {
+    for (auto j = std::uint32_t{}; j < cols; ++j) {
+      in_left(i, j)  = gen_unique_entry(i, j);
+      in_right(i, j) = gen_unique_entry(i, j);
+    }
+  }
+
+  auto out_left = make_device_mdarray<double, std::uint32_t, layout_c_contiguous>(
+    res, extents<std::uint32_t, rows, cols>{});
+  auto out_right = make_device_mdarray<double, std::uint32_t, layout_f_contiguous>(
+    res, extents<std::uint32_t, rows, cols>{});
+
+  res.sync_stream();
+
+  // Test dtype conversion without transpose
+  static_assert(
+    detail::mdspan_copyable_with_kernel_v<decltype(out_right.view()), decltype(in_right.view())>,
+    "Current implementation should use kernel for this copy");
+  copy(res, out_right.view(), in_right.view());
+  res.sync_stream();
+  for (auto i = std::uint32_t{}; i < rows; ++i) {
+    for (auto j = std::uint32_t{}; j < cols; ++j) {
+      ASSERT_TRUE(match(
+        double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox<double>{0.0001}));
+    }
+  }
+
+  // Test dtype conversion with transpose
+  static_assert(
+    detail::mdspan_copyable_with_kernel_v<decltype(out_right.view()), decltype(in_left.view())>,
+    "Current implementation should use kernel for this copy");
+  copy(res, out_right.view(), in_left.view());
+  res.sync_stream();
+  for (auto i = std::uint32_t{}; i < rows; ++i) {
+    for (auto j = std::uint32_t{}; j < cols; ++j) {
+      ASSERT_TRUE(match(
+        double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox<double>{0.0001}));
+    }
+  }
+  static_assert(
+    detail::mdspan_copyable_with_kernel_v<decltype(out_left.view()), decltype(in_right.view())>,
+    "Current implementation should use kernel for this copy");
+  copy(res, out_left.view(), in_right.view());
+  res.sync_stream();
+  for (auto i = std::uint32_t{}; i < rows; ++i) {
+    for (auto j = std::uint32_t{}; j < cols; ++j) {
+      ASSERT_TRUE(match(
+        double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox<double>{0.0001}));
+    }
+  }
+}
+
+} // namespace raft
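
// Illustrative sketch (not part of this patch): together with the ..._cublas benchmark
// cases, the mdspan_copyable_with_kernel_v asserts suggest a three-way compile-time
// dispatch inside raft::copy. The trait checks and branch layout below are our
// reconstruction of that decision tree, not RAFT's actual implementation.
#include <type_traits>

template <typename DstMdspan, typename SrcMdspan>
void copy_dispatch_sketch(DstMdspan dst, SrcMdspan src)
{
  constexpr bool same_dtype =
    std::is_same_v<typename DstMdspan::value_type, typename SrcMdspan::value_type>;
  constexpr bool same_layout =
    std::is_same_v<typename DstMdspan::layout_type, typename SrcMdspan::layout_type>;
  if constexpr (same_dtype && same_layout) {
    // bit-identical contiguous data: a plain (cuda)Memcpy suffices
  } else if constexpr (same_dtype && DstMdspan::rank() == 2) {
    // same-dtype 2-D transpose of float/double: cuBLAS geam is the fast path
  } else {
    // everything else: the element-wise conversion/permutation kernel these tests pin down
  }
}
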
diff --git a/cpp/test/core/stream_view.cpp b/cpp/test/core/stream_view.cpp
new file mode 100644
index 0000000000..715c53fe21
--- /dev/null
+++ b/cpp/test/core/stream_view.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/core/cuda_support.hpp>
+#include <raft/core/stream_view.hpp>
+#ifndef RAFT_DISABLE_CUDA
+#include <rmm/cuda_stream_view.hpp>
+#endif
+namespace raft {
+TEST(StreamView, Default)
+{
+  auto stream = stream_view_per_thread;
+  ASSERT_EQ(stream.is_per_thread_default(), raft::CUDA_ENABLED);
+  ASSERT_FALSE(stream.is_default());
+  if (raft::CUDA_ENABLED) {
+    EXPECT_NO_THROW(stream.synchronize());
+    EXPECT_NO_THROW(stream.interruptible_synchronize());
+  } else {
+    EXPECT_THROW(stream.synchronize(), raft::non_cuda_build_error);
+    EXPECT_THROW(stream.interruptible_synchronize(), raft::non_cuda_build_error);
+  }
+  EXPECT_NO_THROW(stream.synchronize_no_throw());
+  EXPECT_NO_THROW(stream.synchronize_if_cuda_enabled());
+#ifndef RAFT_DISABLE_CUDA
+  static_assert(std::is_same_v<decltype(stream.underlying()), rmm::cuda_stream_view>,
+                "underlying should return rmm::cuda_stream_view");
+#endif
+}
+} // namespace raft
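
// Illustrative sketch (not part of this patch): the behavior pinned down by this test is
// what lets library code synchronize through raft::stream_view without knowing whether
// the build has CUDA. The function name is ours; only members exercised above are used.
#include <raft/core/stream_view.hpp>

inline void flush_work(raft::stream_view stream)
{
  stream.synchronize_if_cuda_enabled();  // no-op in CPU-only builds
  if (raft::CUDA_ENABLED) {
    stream.synchronize();  // throwing sync is only meaningful with a device present
  }
}
// e.g. flush_work(raft::stream_view_per_thread);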