diff --git a/cudax/include/cuda/experimental/__algorithm/copy.cuh b/cudax/include/cuda/experimental/__algorithm/copy.cuh
index 9054bf0ea5e..b01bac3c1ce 100644
--- a/cudax/include/cuda/experimental/__algorithm/copy.cuh
+++ b/cudax/include/cuda/experimental/__algorithm/copy.cuh
@@ -53,8 +53,7 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD
 
 //! @brief Launches a bytewise memory copy from source to destination into the provided stream.
 //!
-//! Both source and destination needs to either be a `contiguous_range` or implicitly
-//! implicitly/launch transform to one.
+//! Both source and destination need to either be a `contiguous_range` or implicitly/launch transform to one.
 //! Both source and destination type is required to be trivially copyable.
 //!
 //! This call might be synchronous if either source or destination is pagable host memory.
diff --git a/cudax/include/cuda/experimental/__device/arch_traits.cuh b/cudax/include/cuda/experimental/__device/arch_traits.cuh
index dd6d5c863f5..8cb3894cc4d 100644
--- a/cudax/include/cuda/experimental/__device/arch_traits.cuh
+++ b/cudax/include/cuda/experimental/__device/arch_traits.cuh
@@ -176,6 +176,28 @@ inline constexpr arch_traits_t sm_600_traits = []() constexpr {
   return __traits;
 }();
 
+inline constexpr arch_traits_t sm_610_traits = []() constexpr {
+  arch_traits_t __traits{};
+  __traits.compute_capability_major = 6;
+  __traits.compute_capability_minor = 1;
+  __traits.compute_capability = 610;
+  __traits.max_shared_memory_per_multiprocessor = 96 * 1024;
+  __traits.max_blocks_per_multiprocessor = 32;
+  __traits.max_threads_per_multiprocessor = 2048;
+  __traits.max_warps_per_multiprocessor =
+    __traits.max_threads_per_multiprocessor / detail::arch_common_traits::warp_size;
+  __traits.reserved_shared_memory_per_block = 0;
+  __traits.max_shared_memory_per_block_optin = 48 * 1024;
+
+  __traits.cluster_supported = false;
+  __traits.redux_intrinisic = false;
+  __traits.elect_intrinsic = false;
+  __traits.cp_async_supported = false;
+  __traits.tma_supported = false;
+
+  return __traits;
+}();
+
 inline constexpr arch_traits_t sm_700_traits = []() constexpr {
   arch_traits_t __traits{};
   __traits.compute_capability_major = 7;
@@ -330,6 +352,8 @@ _CCCL_HOST_DEVICE inline constexpr arch_traits_t arch_traits(unsigned int __sm_v
   {
     case 600:
       return detail::sm_600_traits;
+    case 610:
+      return detail::sm_610_traits;
     case 700:
       return detail::sm_700_traits;
     case 750: