diff --git a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h index bc7d88efd48..8b09ddd1110 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h +++ b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h @@ -32,129 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster -/* -// barrier.cluster.arrive; // PTX ISA 78, SM_90 -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_arrive(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.wait; // PTX ISA 78, SM_90 -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_wait() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.wait;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 -// .sem = { .release } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_release_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) -{ - // __sem == sem_release (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.release;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 -// .sem = { .relaxed } -// Marked volatile -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_relaxed_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) -{ - // __sem == sem_relaxed (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.relaxed;" - : - : - :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// 
barrier.cluster.wait.sem; // PTX ISA 80, SM_90 -// .sem = { .acquire } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait( - cuda::ptx::sem_acquire_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) -{ - // __sem == sem_acquire (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.wait.acquire;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h index 7acce210230..480a02a701e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h @@ -32,162 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -/* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, -SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __srcMem, - const _CUDA_VSTD::uint32_t& __size, - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast" - : - : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
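// ---------------------------------------------------------------------------
// Minimal usage sketch for the cuda::ptx::barrier_cluster_* wrappers documented
// in the removed barrier_cluster.h block above. The kernel name is illustrative,
// and the kernel is assumed to be launched with a cluster configuration
// (cudaLaunchKernelEx + cudaLaunchAttributeClusterDimension) on sm_90 with a
// PTX ISA >= 8.0 toolchain; the wrapper signatures are the ones shown in the
// removed documentation comments.
#include <cuda/ptx>

__global__ void cluster_handshake_sketch()
{
  // ... produce this block's shared-memory data ...

  // Release this block's prior writes to the rest of the cluster and mark arrival.
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release);

  // ... independent work that does not read peer shared memory ...

  // Wait until every block in the cluster has arrived, acquiring their writes.
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);
}
// ---------------------------------------------------------------------------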
PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk( - space_cluster_t, - space_shared_t, - void* __dstMem, - const void* __srcMem, - const _CUDA_VSTD::uint32_t& __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - void* dstMem, - const void* srcMem, - const uint32_t& size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. " - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], -ctaMask; // 1. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __srcMem, - const _CUDA_VSTD::uint32_t& __size, - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], " - "%4; // 1. 
" - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__as_ptr_gmem(__srcMem)), - "r"(__size), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h index f0028105350..bd97259cf19 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h @@ -32,27 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group -/* -// cp.async.bulk.commit_group; // PTX ISA 80, SM_90 -template -__device__ static inline void cp_async_bulk_commit_group(); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_commit_group() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.commit_group;" - : - : - :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index b66981e8bbb..5b9f575ce5f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -32,661 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -/* -// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1a. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];// " - "1a." 
- : - : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2]; // 3a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1b. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], " - "[%4];// 1b." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3]; // 3b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1c. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], " - "[%5];// 1c." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 3c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1d. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " - "%5}], [%6];// 1d." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 3d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1e. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " - "%6}], [%7];// 1e." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 3e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2}], [%3], %4; // 2a." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2b. 
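// ---------------------------------------------------------------------------
// Minimal usage sketch for the 2-D tile form of cp_async_bulk_tensor documented in
// the removed blocks above (global -> shared::cluster with mbarrier completion).
// The CUtensorMap is assumed to be created on the host with cuTensorMapEncodeTiled
// and passed by value as a __grid_constant__ kernel argument; the tile extent,
// coordinates and kernel name are illustrative. Requires sm_90.
#include <cuda.h> // CUtensorMap
#include <cuda/ptx>
#include <cuda/barrier>
#include <cuda/std/cstdint>
#include <cuda/std/utility>

__global__ void tile_load_sketch(const __grid_constant__ CUtensorMap tensor_map)
{
  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
  __shared__ alignas(128) float tile[32][32];
#pragma nv_diag_suppress static_var_with_dynamic_init
  __shared__ barrier_t bar;
  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x);
  }
  __syncthreads();

  barrier_t::arrival_token token;
  if (threadIdx.x == 0)
  {
    const cuda::std::int32_t coords[2] = {0, 0}; // tile origin in the tensor, illustrative
    cuda::ptx::cp_async_bulk_tensor(cuda::ptx::space_cluster, cuda::ptx::space_global,
                                    &tile, &tensor_map, coords,
                                    cuda::device::barrier_native_handle(bar));
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(tile));
  }
  else
  {
    token = bar.arrive();
  }
  bar.wait(cuda::std::move(token));
  // tile[][] now holds the 32x32 box starting at (0, 0); the shared::cta -> global
  // write-back direction uses the bulk_group overload shown above together with
  // cp_async_bulk_commit_group / cp_async_bulk_wait_group_read.
}
// ---------------------------------------------------------------------------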
PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3}], [%4], %5; // 2b." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4}], [%5], %6; // 2c." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2d. 
PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5}], [%6], %7; // 2d." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e." 
- : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h index 5dcbf8572f4..00a3700e1a9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h @@ -32,51 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group -/* -// cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 -template -__device__ static inline void cp_async_bulk_wait_group( - cuda::ptx::n32_t N); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group %0;" - : - : "n"(__n.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.wait_group.read N; // PTX ISA 80, SM_90 -template -__device__ static inline void cp_async_bulk_wait_group_read( - cuda::ptx::n32_t N); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group.read %0;" - : - : "n"(__n.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h index ee89e33c1c2..ee6d90bc4d9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h @@ -43,1679 +43,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .b32 } -// .op = { .and } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_and_op_t, - B32* dstMem, - const B32* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_and_op_t, - _B32* __dstMem, - const _B32* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .b32 } -// .op = { .or } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_or_op_t, - B32* dstMem, - const B32* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_or_op_t, - _B32* __dstMem, - const _B32* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .b32 } -// .op = { .xor } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_xor_op_t, - B32* dstMem, - const B32* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_xor_op_t, - _B32* __dstMem, - const _B32* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .inc } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_inc_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_inc_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .dec } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_dec_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_dec_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 2." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .b32, .b64 } -// .op = { .and } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_and_op_t, - Type* dstMem, - const Type* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_and_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. 
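// ---------------------------------------------------------------------------
// Illustrative sketch for the shared::cta -> shared::cluster add-reduction overload
// of cp_reduce_async_bulk documented above: every block adds its partial[] into
// block 0's accum[], with completion signalled on block 0's mbarrier. A cluster
// launch is assumed, and the cooperative_groups mapping, array sizes, kernel name
// and the omitted expect-tx/wait bookkeeping on block 0 are illustrative
// assumptions. Requires sm_90.
#include <cooperative_groups.h>
#include <cuda/ptx>
#include <cuda/barrier>
#include <cuda/std/cstdint>

namespace cg = cooperative_groups;

__global__ void cluster_reduce_sketch()
{
  __shared__ alignas(16) cuda::std::uint32_t partial[128]; // this block's contribution
  __shared__ alignas(16) cuda::std::uint32_t accum[128];   // reduced into on block 0
#pragma nv_diag_suppress static_var_with_dynamic_init
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;  // block 0's copy receives the completion
  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x);
  }
  __syncthreads();
  // ... fill partial[], zero accum[] on block 0, then cluster-wide sync ...

  cg::cluster_group cluster = cg::this_cluster();
  if (threadIdx.x == 0)
  {
    // Address block 0's accum[] and mbarrier through the cluster shared-memory window.
    cuda::std::uint32_t* dst  = cluster.map_shared_rank(accum, 0);
    cuda::std::uint64_t* rbar = cluster.map_shared_rank(cuda::device::barrier_native_handle(bar), 0);
    cuda::ptx::cp_reduce_async_bulk(cuda::ptx::space_cluster, cuda::ptx::space_shared,
                                    cuda::ptx::op_add, dst, partial, sizeof(partial), rbar);
  }
  // Block 0 would additionally account for the expected transaction bytes on its
  // barrier and wait before reading accum[]; that bookkeeping is omitted here.
}
// ---------------------------------------------------------------------------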
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .b32, .b64 } -// .op = { .or } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_or_op_t, - Type* dstMem, - const Type* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_or_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .b32, .b64 } -// .op = { .xor } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_xor_op_t, - Type* dstMem, - const Type* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_xor_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
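
// -----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): the bulk-group
// bitwise reductions (.and/.or/.xor on b32/b64 data) removed above, assuming
// the public <cuda/ptx> header and an SM_90 target.
#include <cuda/ptx>
#include <cstdint>

__device__ void or_into_global_sketch(uint32_t* gmem_dst, const uint32_t* smem_src, uint32_t size_bytes)
{
  namespace ptx = cuda::ptx;
  // The element width is chosen from sizeof(*dst): 4-byte types lower to .b32,
  // 8-byte types to .b64. op_and_op and op_xor_op are called the same way.
  ptx::cp_reduce_async_bulk(ptx::space_global, ptx::space_shared, ptx::op_or_op,
                            gmem_dst, smem_src, size_bytes);
  // This variant completes through the bulk async-group, so commit/wait on
  // that group before reading gmem_dst.
}
// -----------------------------------------------------------------------------
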
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .inc } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_inc_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_inc_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .dec } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_dec_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_dec_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
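
// -----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): the 32-bit integer
// bulk-group reductions removed above (.min/.max/.add on u32/s32, plus the
// u32-only .inc/.dec), assuming the public <cuda/ptx> header and SM_90.
#include <cuda/ptx>
#include <cstdint>

__device__ void max_into_global_sketch(int32_t* gmem_dst, const int32_t* smem_src, uint32_t size_bytes)
{
  namespace ptx = cuda::ptx;
  // Signed 32-bit max; swap op_max for op_min/op_add (s32/u32) or for
  // op_inc/op_dec (u32 only) to select the other removed overloads.
  ptx::cp_reduce_async_bulk(ptx::space_global, ptx::space_shared, ptx::op_max,
                            gmem_dst, smem_src, size_bytes);
}
// -----------------------------------------------------------------------------
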
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - float* dstMem, - const float* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_add_t, float* __dstMem, const float* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - double* dstMem, - const double* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_add_t, double* __dstMem, const double* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - +#include #ifdef _LIBCUDACXX_HAS_NVF16 -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
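
// -----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): the 64-bit and
// floating-point bulk-group reductions removed above, assuming the public
// <cuda/ptx> header and SM_90. Note that the s64 .add overload is emitted as
// add.u64 (remark "// 6." in the removed code), which is equivalent under
// two's-complement wrap-around addition.
#include <cuda/ptx>
#include <cstdint>

__device__ void add_f64_into_global_sketch(double* gmem_dst, const double* smem_src, uint32_t size_bytes)
{
  namespace ptx = cuda::ptx;
  // f64 add; the float, uint64_t and int64_t overloads are called identically.
  ptx::cp_reduce_async_bulk(ptx::space_global, ptx::space_shared, ptx::op_add,
                            gmem_dst, smem_src, size_bytes);
}
// -----------------------------------------------------------------------------
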
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f16 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_min_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f16 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_max_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f16 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_add_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 +# include #endif // _LIBCUDACXX_HAS_NVF16 - #ifdef _LIBCUDACXX_HAS_NVBF16 -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .bf16 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - __nv_bfloat16* __dstMem, - const __nv_bfloat16* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
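
// -----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): the __half
// bulk-group reductions removed above (.min/.max, and .add lowered to
// add.noftz.f16), assuming the public <cuda/ptx> header, SM_90, and a build
// where the fp16 overloads are enabled (_LIBCUDACXX_HAS_NVF16).
#include <cuda/ptx>
#include <cuda_fp16.h>
#include <cstdint>

__device__ void min_f16_into_global_sketch(__half* gmem_dst, const __half* smem_src, uint32_t size_bytes)
{
  namespace ptx = cuda::ptx;
  // f16 min; op_max and op_add follow the same pattern, as do the
  // __nv_bfloat16 overloads guarded by _LIBCUDACXX_HAS_NVBF16.
  ptx::cp_reduce_async_bulk(ptx::space_global, ptx::space_shared, ptx::op_min,
                            gmem_dst, smem_src, size_bytes);
}
// -----------------------------------------------------------------------------
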
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .bf16 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - __nv_bfloat16* __dstMem, - const __nv_bfloat16* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .bf16 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - __nv_bfloat16* __dstMem, - const __nv_bfloat16* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 +# include #endif // _LIBCUDACXX_HAS_NVBF16 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h index 4ecb108a719..a6b23a706c7 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h @@ -32,538 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.10. 
Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor -/* -// cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.2d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1b. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.3d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1c. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.4d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1d. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.5d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1e. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " - "1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/fence.h b/libcudacxx/include/cuda/__ptx/instructions/fence.h index 956f86c910e..045f09cb40e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/fence.h @@ -32,253 +32,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence -/* -// fence{.sem}.scope; // 1. 
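
// -----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): calling the
// cp_reduce_async_bulk_tensor overloads removed above, assuming the public
// <cuda/ptx> header and SM_90. The 1d..5d overloads differ only in the rank
// of the tensorCoords array; the reduction is selected by the op_t tag.
#include <cuda/ptx>
#include <cstdint>

__device__ void reduce_tile_sketch(const void* tensor_map, // tensor-map descriptor (e.g. a CUtensorMap) in memory
                                   int32_t x, int32_t y,
                                   const void* smem_tile)   // source tile in this CTA's shared memory
{
  namespace ptx = cuda::ptx;
  const int32_t tensor_coords[2] = {x, y};
  // Add-reduce the shared-memory tile into the global tensor at (x, y);
  // op_min/op_max/op_inc/op_dec/op_and_op/op_or_op/op_xor_op select the
  // other removed code paths.
  ptx::cp_reduce_async_bulk_tensor(ptx::space_global, ptx::space_shared, ptx::op_add,
                                   tensor_map, tensor_coords, smem_tile);
}
// -----------------------------------------------------------------------------
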
PTX ISA 60, SM_70 -// .sem = { .sc, .acq_rel } -// .scope = { .cta, .gpu, .sys } -template -__device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); -*/ -#if __cccl_ptx_isa >= 600 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope) -{ - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); - static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { - asm volatile("fence.sc.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { - asm volatile("fence.sc.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { - asm volatile("fence.sc.sys; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { - asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { - asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { - asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_70__();)); -} -#endif // __cccl_ptx_isa >= 600 - -/* -// fence{.sem}.scope; // 2. PTX ISA 78, SM_90 -// .sem = { .sc, .acq_rel } -// .scope = { .cluster } -template -__device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) -{ - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc) { - asm volatile("fence.sc.cluster; // 2." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { - asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 780 -/* -// fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -template -__device__ static inline void fence_mbarrier_init( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster_t) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.mbarrier_init.release.cluster; // 3." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// fence.proxy.alias; // 4. 
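
// -----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): the fence overloads
// removed above, assuming the public <cuda/ptx> header. Each sem/scope
// combination maps one-to-one onto a PTX string in the removed code.
#include <cuda/ptx>

__device__ void fence_sketch()
{
  namespace ptx = cuda::ptx;
  ptx::fence(ptx::sem_acq_rel, ptx::scope_gpu); // fence.acq_rel.gpu (PTX ISA 60, SM_70+)
  ptx::fence(ptx::sem_sc, ptx::scope_cluster);  // fence.sc.cluster  (PTX ISA 78, SM_90+)
  // Orders an mbarrier initialization with respect to cluster-scope consumers:
  ptx::fence_mbarrier_init(ptx::sem_release, ptx::scope_cluster); // PTX ISA 80, SM_90+
}
// -----------------------------------------------------------------------------
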
PTX ISA 75, SM_70 -template -__device__ static inline void fence_proxy_alias(); -*/ -#if __cccl_ptx_isa >= 750 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__(); -template -_CCCL_DEVICE static inline void fence_proxy_alias() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("fence.proxy.alias; // 4." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); -} -#endif // __cccl_ptx_isa >= 750 -/* -// fence.proxy.async; // 5. PTX ISA 80, SM_90 -template -__device__ static inline void fence_proxy_async(); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_proxy_async() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.proxy.async; // 5." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 -// .space = { .global, .shared::cluster, .shared::cta } -template -__device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) -{ - static_assert(__space == space_global || __space == space_cluster || __space == space_shared, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__space == space_global) { - asm volatile("fence.proxy.async.global; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { - asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { - asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster, .gpu, .sys } -template -__device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, scope_t<_Scope> __scope) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster, .gpu, .sys } -template -__device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void* __addr, n32_t<_N32> __size) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." 
- : - : "l"(__addr), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 830 +#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc new file mode 100644 index 00000000000..ca9238bc3ff --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc @@ -0,0 +1,123 @@ +/* +// barrier.cluster.arrive; // PTX ISA 78, SM_90 +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.arrive;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.wait; // PTX ISA 78, SM_90 +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.wait;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 +// .sem = { .release } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_release_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) +{ + // __sem == sem_release (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.arrive.release;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 +// .sem = { .relaxed } +// Marked volatile +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_relaxed_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) +{ + // __sem == sem_relaxed (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.arrive.relaxed;" + : + : + :);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.wait.sem; // PTX ISA 80, SM_90 +// .sem = { .acquire } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait( + cuda::ptx::sem_acquire_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) +{ + // __sem == sem_acquire (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.wait.acquire;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc new file mode 100644 index 00000000000..69f77053b95 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc @@ -0,0 +1,111 @@ +/* +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, +SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
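// -----------------------------------------------------------------------------
// Usage sketch (editorial illustration, not part of this patch): the split
// arrive/wait shape of barrier.cluster lets a block overlap independent work
// with the cluster-wide rendezvous. Assumes CUDA 12 compiled for sm_90 and a
// kernel launched with a cluster size > 1; the shared-memory exchange itself
// is elided and `cluster_exchange` is a hypothetical name.
#include <cuda/ptx>

__global__ void __cluster_dims__(2, 1, 1) cluster_exchange()
{
  // ... write results that other blocks of the cluster will read, e.g. through
  //     distributed shared memory ...
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release); // publish prior writes
  // ... independent work that does not touch other blocks' data ...
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);   // every block has arrived
  // ... now safe to read what the other blocks published ...
}
// -----------------------------------------------------------------------------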
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_cluster_t, + space_shared_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. 
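// -----------------------------------------------------------------------------
// Usage sketch (editorial illustration, not part of this patch): a minimal
// TMA-style 1-D copy from global into shared memory using the first overload
// above, with completion tracked through a cuda::barrier whose native handle
// is the mbarrier the instruction signals. A sketch only, assuming sm_90, a
// 16-byte aligned, 16-byte-multiple transfer, and the <cuda/barrier> and
// <cuda/ptx> headers; `bulk_copy_1d` and `gmem_src` are hypothetical names.
#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/utility>

__global__ void bulk_copy_1d(const int4* gmem_src)
{
  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
  __shared__ alignas(16) int4 smem[256];
#pragma nv_diag_suppress static_var_with_dynamic_init
  __shared__ barrier_t bar;

  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x);
    // Make the generic-proxy barrier initialization visible to the async proxy.
    cuda::ptx::fence_proxy_async(cuda::ptx::space_shared);
  }
  __syncthreads();

  barrier_t::arrival_token token;
  if (threadIdx.x == 0)
  {
    cuda::ptx::cp_async_bulk(
      cuda::ptx::space_cluster, // destination state space (.shared::cluster; own-CTA smem is valid here)
      cuda::ptx::space_global,  // source state space
      smem, gmem_src, sizeof(smem), cuda::device::barrier_native_handle(bar));
    // Arrive and post the number of bytes the copy will report as complete.
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem));
  }
  else
  {
    token = bar.arrive();
  }
  bar.wait(cuda::std::move(token));
  // smem now holds 256 int4 values copied from gmem_src.
}
// -----------------------------------------------------------------------------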
" + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc new file mode 100644 index 00000000000..24baddaea8f --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc @@ -0,0 +1,21 @@ +/* +// cp.async.bulk.commit_group; // PTX ISA 80, SM_90 +template +__device__ static inline void cp_async_bulk_commit_group(); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_commit_group() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("cp.async.bulk.commit_group;" + : + : + :);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc new file mode 100644 index 00000000000..cdd5a535eb6 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc @@ -0,0 +1,45 @@ +/* +// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], +ctaMask; // 1. PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], " + "%4; // 1. 
" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__as_ptr_gmem(__srcMem)), + "r"(__size), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc new file mode 100644 index 00000000000..547888d5b0f --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc @@ -0,0 +1,416 @@ +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1a. PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];// " + "1a." + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2]; // 3a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1b. 
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], " + "[%4];// 1b." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3]; // 3b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1c. 
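// -----------------------------------------------------------------------------
// Usage sketch (editorial illustration, not part of this patch): load one 2-D
// tile described by a tensor map into shared memory. `tensor_map` is assumed
// to point at a CUtensorMap built on the host (e.g. with
// cuTensorMapEncodeTiled) and passed to the kernel; mbarrier setup and the
// wait mirror the cp_async_bulk sketch earlier in this review and are elided.
// `load_tile_2d` is a hypothetical helper. sm_90 only.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void load_tile_2d(
  void* smem_dst,                // destination tile in shared memory (128-byte aligned)
  const void* tensor_map,        // host-created CUtensorMap
  int tile_x,
  int tile_y,
  cuda::std::uint64_t* smem_bar) // initialized mbarrier in shared memory
{
  const cuda::std::int32_t coords[2] = {tile_x, tile_y};
  cuda::ptx::cp_async_bulk_tensor(
    cuda::ptx::space_cluster, cuda::ptx::space_global, smem_dst, tensor_map, coords, smem_bar);
}
// -----------------------------------------------------------------------------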
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], " + "[%5];// 1c." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 3c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1d. 
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " + "%5}], [%6];// 1d." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 3d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1e. 
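// -----------------------------------------------------------------------------
// Usage sketch (editorial illustration, not part of this patch): store a 2-D
// tile from shared memory back through the tensor map. The .bulk_group forms
// above complete via commit/wait rather than an mbarrier, so the store is
// followed by cp_async_bulk_commit_group() and, before the shared buffer is
// reused, cp_async_bulk_wait_group_read() (whose wrapper appears later in this
// patch). sm_90 only; `store_tile_2d` is a hypothetical helper and
// `tensor_map` is a host-created CUtensorMap as before.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void store_tile_2d(const void* tensor_map, int tile_x, int tile_y, const void* smem_src)
{
  // Make prior generic-proxy writes to smem_src visible to the async proxy;
  // a block-wide __syncthreads() would normally precede this call.
  cuda::ptx::fence_proxy_async(cuda::ptx::space_shared);

  const cuda::std::int32_t coords[2] = {tile_x, tile_y};
  cuda::ptx::cp_async_bulk_tensor(
    cuda::ptx::space_global, cuda::ptx::space_shared, tensor_map, coords, smem_src);
  cuda::ptx::cp_async_bulk_commit_group();
  // Block until the committed group no longer reads from shared memory.
  cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
}
// -----------------------------------------------------------------------------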
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " + "%6}], [%7];// 1e." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 3e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc new file mode 100644 index 00000000000..020698a15b1 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc @@ -0,0 +1,239 @@ +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2a. 
PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2}], [%3], %4; // 2a." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3}], [%4], %5; // 2b." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2c. 
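// -----------------------------------------------------------------------------
// Usage sketch (editorial illustration, not part of this patch): broadcast one
// tile into the shared memory of several blocks of a cluster with a single
// multicast::cluster copy. Each set bit of `cta_mask` selects one destination
// block in the cluster; mbarrier setup, waiting, and cluster synchronization
// are elided. `multicast_tile_2d` is a hypothetical helper. Requires sm_90a.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void multicast_tile_2d(
  void* smem_dst,                 // destination tile (same shared-memory address in each selected block)
  const void* tensor_map,         // host-created CUtensorMap
  int tile_x,
  int tile_y,
  cuda::std::uint64_t* smem_bar,  // mbarrier tracking this block's copy
  cuda::std::uint16_t cta_mask)   // one bit per destination block in the cluster
{
  const cuda::std::int32_t coords[2] = {tile_x, tile_y};
  cuda::ptx::cp_async_bulk_tensor(
    cuda::ptx::space_cluster, cuda::ptx::space_global,
    smem_dst, tensor_map, coords, smem_bar, cta_mask);
}
// -----------------------------------------------------------------------------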
PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3, %4}], [%5], %6; // 2c." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3, %4, %5}], [%6], %7; // 2d." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2e. 
PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc new file mode 100644 index 00000000000..1a715a0fac6 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc @@ -0,0 +1,45 @@ +/* +// cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 +template +__device__ static inline void cp_async_bulk_wait_group( + cuda::ptx::n32_t N); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("cp.async.bulk.wait_group %0;" + : + : "n"(__n.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.wait_group.read N; // PTX ISA 80, SM_90 +template +__device__ static inline void cp_async_bulk_wait_group_read( + cuda::ptx::n32_t N); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("cp.async.bulk.wait_group.read %0;" + : + : "n"(__n.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc new file mode 100644 index 00000000000..50059ff6c5b --- /dev/null +++ 
b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc @@ -0,0 +1,1435 @@ +// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .b32 } +// .op = { .and } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_and_op_t, + B32* dstMem, + const B32* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_and_op_t, + _B32* __dstMem, + const _B32* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_b32 (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .b32 } +// .op = { .or } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_or_op_t, + B32* dstMem, + const B32* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_or_op_t, + _B32* __dstMem, + const _B32* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_b32 (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .b32 } +// .op = { .xor } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_xor_op_t, + B32* dstMem, + const B32* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_xor_op_t, + _B32* __dstMem, + const _B32* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_b32 (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_inc_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_inc_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_dec_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_dec_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " + "// 2." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .b32, .b64 } +// .op = { .and } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_and_op_t, + Type* dstMem, + const Type* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_and_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + // __op == op_and_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." 
+ : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .b32, .b64 } +// .op = { .or } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_or_op_t, + Type* dstMem, + const Type* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_or_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + // __op == op_or_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .b32, .b64 } +// .op = { .xor } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_xor_op_t, + Type* dstMem, + const Type* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_xor_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + // __op == op_xor_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." 
+ : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_inc_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_inc_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_dec_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_dec_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + float* dstMem, + const float* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_add_t, float* __dstMem, const float* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + double* dstMem, + const double* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_add_t, double* __dstMem, const double* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc new file mode 100644 index 00000000000..c657e8d1935 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc @@ -0,0 +1,127 @@ +#ifdef _LIBCUDACXX_HAS_NVBF16 +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .bf16 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + __nv_bfloat16* __dstMem, + const __nv_bfloat16* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_bf16 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .bf16 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + __nv_bfloat16* __dstMem, + const __nv_bfloat16* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_bf16 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .bf16 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + __nv_bfloat16* __dstMem, + const __nv_bfloat16* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_bf16 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 +#endif // _LIBCUDACXX_HAS_NVBF16 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc new file mode 100644 index 00000000000..3a52630db53 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc @@ -0,0 +1,110 @@ +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f16 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_min_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f16 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f16 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_max_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f16 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f16 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_add_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f16 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc new file mode 100644 index 00000000000..32008f6af5b --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc @@ -0,0 +1,532 @@ +/* +// cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. 
PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.2d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1b. 
PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
+ : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.3d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1c. PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.4d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1d. PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.5d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1e. PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " + "1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc new file mode 100644 index 00000000000..f10ec07ebb5 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc @@ -0,0 +1,67 @@ +/* +// fence{.sem}.scope; // 1. PTX ISA 60, SM_70 +// .sem = { .sc, .acq_rel } +// .scope = { .cta, .gpu, .sys } +template +__device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 600 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); +template +_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope) +{ + static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); + static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_70, + ( + _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { + asm volatile("fence.sc.cta; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { + asm volatile("fence.sc.gpu; // 1." 
: : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { + asm volatile("fence.sc.sys; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { + asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { + asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { + asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_70__();)); +} +#endif // __cccl_ptx_isa >= 600 + +/* +// fence{.sem}.scope; // 2. PTX ISA 78, SM_90 +// .sem = { .sc, .acq_rel } +// .scope = { .cluster } +template +__device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) +{ + static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); + // __scope == scope_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__sem == sem_sc) { + asm volatile("fence.sc.cluster; // 2." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { + asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 780 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc new file mode 100644 index 00000000000..0d39c222598 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc @@ -0,0 +1,27 @@ +/* +// fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +template +__device__ static inline void fence_mbarrier_init( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster_t) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("fence.mbarrier_init.release.cluster; // 3." + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc new file mode 100644 index 00000000000..98260b851ca --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc @@ -0,0 +1,21 @@ +/* +// fence.proxy.alias; // 4. 
PTX ISA 75, SM_70 +template +__device__ static inline void fence_proxy_alias(); +*/ +#if __cccl_ptx_isa >= 750 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__(); +template +_CCCL_DEVICE static inline void fence_proxy_alias() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_70, + (asm volatile("fence.proxy.alias; // 4." + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); +} +#endif // __cccl_ptx_isa >= 750 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc new file mode 100644 index 00000000000..f0a37baabdb --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc @@ -0,0 +1,50 @@ +/* +// fence.proxy.async; // 5. PTX ISA 80, SM_90 +template +__device__ static inline void fence_proxy_async(); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("fence.proxy.async; // 5." + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 +// .space = { .global, .shared::cluster, .shared::cta } +template +__device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) +{ + static_assert(__space == space_global || __space == space_cluster || __space == space_shared, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__space == space_global) { + asm volatile("fence.proxy.async.global; // 6." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { + asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { + asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc new file mode 100644 index 00000000000..3e5b2a265f4 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc @@ -0,0 +1,82 @@ +/* +// fence.proxy.tensormap::generic.release.scope; // 7. 
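//
// Illustrative usage sketch (not generated, not part of this patch): the
// fence_proxy_async() wrappers above, with and without a state-space
// qualifier. The <cuda/ptx> include and kernel name are assumptions.
//
//   #include <cuda/ptx>
//
//   __global__ void proxy_fence_example()
//   {
//     cuda::ptx::fence_proxy_async();                          // fence.proxy.async
//     cuda::ptx::fence_proxy_async(cuda::ptx::space_shared);   // fence.proxy.async.shared::cta
//     cuda::ptx::fence_proxy_async(cuda::ptx::space_global);   // fence.proxy.async.global
//   }
//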
PTX ISA 83, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, scope_t<_Scope> __scope) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void* __addr, n32_t<_N32> __size) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." 
+ : + : "l"(__addr), "n"(__size.value) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc new file mode 100644 index 00000000000..dd3079915f7 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc @@ -0,0 +1,1001 @@ +/* +// mov.u32 sreg_value, %%tid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_tid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%tid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%tid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_tid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%tid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%tid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_tid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%tid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ntid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%ntid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ntid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%ntid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ntid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%ntid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%laneid; // PTX ISA 13 +template +__device__ static inline uint32_t get_sreg_laneid(); +*/ +#if __cccl_ptx_isa >= 130 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%laneid;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 130 + +/* +// mov.u32 sreg_value, %%warpid; // PTX ISA 13 +template +__device__ static inline uint32_t get_sreg_warpid(); +*/ +#if __cccl_ptx_isa >= 130 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, 
%%warpid;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 130 + +/* +// mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_nwarpid(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%nwarpid;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ctaid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%ctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ctaid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%ctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ctaid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%ctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_nctaid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_nctaid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_nctaid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%smid; // PTX ISA 13 +template +__device__ static inline uint32_t get_sreg_smid(); +*/ +#if __cccl_ptx_isa >= 130 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%smid;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 130 + +/* +// mov.u32 sreg_value, 
%%nsmid; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_nsmid(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%nsmid;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u64 sreg_value, %%gridid; // PTX ISA 30 +template +__device__ static inline uint64_t get_sreg_gridid(); +*/ +#if __cccl_ptx_isa >= 300 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() +{ + _CUDA_VSTD::uint64_t __sreg_value; + asm("mov.u64 %0, %%gridid;" : "=l"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 300 + +/* +// mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 +template +__device__ static inline bool get_sreg_is_explicit_cluster(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mov.pred P_OUT, %%is_explicit_cluster;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__sreg_value) + : + :); + return static_cast(__sreg_value);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_clusterid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_clusterid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_clusterid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern 
"C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.z;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_nclusterid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_nclusterid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_nclusterid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.z;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctaid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 
sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctaid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctaid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.z;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctaid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctaid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctaid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.z;" + : "=r"(__sreg_value) + : + :); + return 
__sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctarank(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctarank;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctarank(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctarank;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_eq(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_eq;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_le(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_le;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_lt(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t 
get_sreg_lanemask_lt() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_lt;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_ge(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_ge;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_gt(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_gt;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%clock; // PTX ISA 10 +template +__device__ static inline uint32_t get_sreg_clock(); +*/ +#if __cccl_ptx_isa >= 100 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%clock;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 100 + +/* +// mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 +template +__device__ static inline uint32_t get_sreg_clock_hi(); +*/ +#if __cccl_ptx_isa >= 500 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%clock_hi;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 500 + +/* +// mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 +template +__device__ static inline uint64_t get_sreg_clock64(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( + "mov.u64 %0, %%clock64;" + : "=l"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker 
error with a semi-decent error message + __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 +template +__device__ static inline uint64_t get_sreg_globaltimer(); +*/ +#if __cccl_ptx_isa >= 310 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_globaltimer() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( + "mov.u64 %0, %%globaltimer;" + : "=l"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 310 + +/* +// mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 +template +__device__ static inline uint32_t get_sreg_globaltimer_lo(); +*/ +#if __cccl_ptx_isa >= 310 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%globaltimer_lo;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 310 + +/* +// mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 +template +__device__ static inline uint32_t get_sreg_globaltimer_hi(); +*/ +#if __cccl_ptx_isa >= 310 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%globaltimer_hi;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 310 + +/* +// mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 +template +__device__ static inline uint32_t get_sreg_total_smem_size(); +*/ +#if __cccl_ptx_isa >= 410 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%total_smem_size;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 410 + +/* +// mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 +template +__device__ static inline uint32_t get_sreg_aggr_smem_size(); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() +{ + NV_IF_ELSE_TARGET( + 
NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%aggr_smem_size;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 +template +__device__ static inline uint32_t get_sreg_dynamic_smem_size(); +*/ +#if __cccl_ptx_isa >= 410 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%dynamic_smem_size;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 410 + +/* +// mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 +template +__device__ static inline uint64_t get_sreg_current_graph_exec(); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_50, + (_CUDA_VSTD::uint64_t __sreg_value; + asm("mov.u64 %0, %%current_graph_exec;" + : "=l"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc new file mode 100644 index 00000000000..51bd351be87 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc @@ -0,0 +1,27 @@ +/* +// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 +// .space = { .shared::cluster } +template +__device__ static inline uint32_t getctarank( + cuda::ptx::space_cluster_t, + const void* addr); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, const void* __addr) +{ + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __dest; + asm("getctarank.shared::cluster.u32 %0, %1;" + : "=r"(__dest) + : "r"(__as_ptr_smem(__addr)) + :); + return __dest;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc new file mode 100644 index 00000000000..f3e2b860d50 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc @@ -0,0 +1,205 @@ +/* +// mbarrier.arrive.shared.b64 state, [addr]; // 1. 
PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared.b64 %0, [%1]; // 1. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 700 + +/* +// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 +template +__device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. 
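//
// Illustrative usage sketch (not generated, not part of this patch): the
// mbarrier_arrive() overloads above, called on an mbarrier in shared memory
// that is assumed to have been initialized already (e.g. with
// cuda::ptx::mbarrier_init, generated further down in this patch).
//
//   #include <cuda/ptx>
//   #include <cuda/std/cstdint>
//
//   __device__ void arrive_example(cuda::std::uint64_t* bar) // bar: shared-memory mbarrier
//   {
//     // PTX ISA 70 form: arrive with an implicit count of 1.
//     cuda::std::uint64_t state = cuda::ptx::mbarrier_arrive(bar);
//
//     // PTX ISA 80 form: explicit release semantics at CTA scope.
//     state = cuda::ptx::mbarrier_arrive(
//       cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, bar);
//     (void) state; // both forms shown for illustration; each call counts as one arrival
//   }
//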
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( + sem_release_t, + scope_t<_Scope> __scope, + space_shared_t, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __count) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_remote_dsmem(__addr)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. 
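//
// Illustrative usage sketch (not generated, not part of this patch): the
// space_cluster overload above arrives on an mbarrier that lives in a peer
// CTA of the cluster; no arrival state is returned. remote_bar is assumed to
// point into the cluster shared-memory window of that peer CTA.
//
//   #include <cuda/ptx>
//   #include <cuda/std/cstdint>
//
//   __device__ void remote_arrive_example(cuda::std::uint64_t* remote_bar)
//   {
//     cuda::ptx::mbarrier_arrive(
//       cuda::ptx::sem_release, cuda::ptx::scope_cluster, cuda::ptx::space_cluster, remote_bar);
//   }
//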
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive( + sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__count) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc new file mode 100644 index 00000000000..efb749957b1 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc @@ -0,0 +1,79 @@ +/* +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( + sem_release_t, + scope_t<_Scope> __scope, + space_shared_t, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __tx_count) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. 
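//
// Illustrative usage sketch (not generated, not part of this patch): the
// expect-tx arrive above is the producer-side handshake for bulk asynchronous
// copies; the calling thread arrives and, in the same instruction, tells the
// mbarrier how many bytes of transactions to expect. Names are assumptions.
//
//   #include <cuda/ptx>
//   #include <cuda/std/cstdint>
//
//   __device__ cuda::std::uint64_t expect_tx_example(cuda::std::uint64_t* bar,
//                                                    cuda::std::uint32_t tx_bytes)
//   {
//     return cuda::ptx::mbarrier_arrive_expect_tx(
//       cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, bar, tx_bytes);
//   }
//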
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& tx_count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( + sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc new file mode 100644 index 00000000000..879bedebdc9 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc @@ -0,0 +1,26 @@ +/* +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. 
" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc new file mode 100644 index 00000000000..3afeeacfccf --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc @@ -0,0 +1,23 @@ +/* +// mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 +template +__device__ static inline void mbarrier_init( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (asm("mbarrier.init.shared.b64 [%0], %1;" + : + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); +} +#endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc new file mode 100644 index 00000000000..e97d9ccf15c --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc @@ -0,0 +1,28 @@ +/* +// mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX +ISA 70, SM_80 template +__device__ static inline bool mbarrier_test_wait( + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline bool mbarrier_test_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); return false;)); +} +#endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc new file mode 100644 index 00000000000..604cfd92045 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc @@ -0,0 +1,75 @@ +/* +// mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. 
PTX +ISA 71, SM_80 template +__device__ static inline bool mbarrier_test_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 710 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline bool +mbarrier_test_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); return false;)); +} +#endif // __cccl_ptx_isa >= 710 + +/* +// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX +ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_test_wait_parity( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc new file mode 100644 index 00000000000..c5f2062664c --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc @@ -0,0 +1,157 @@ +/* +// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. 
+PTX ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX +ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait( + _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state, const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. +PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. 
" + "\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. +PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc new file mode 100644 index 00000000000..321bfc515da --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc @@ -0,0 +1,157 @@ +/* +// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. +PTX ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool +mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. +PTX ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( + _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity, const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. +PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
+PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc new file mode 100644 index 00000000000..3157fa1c627 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc @@ -0,0 +1,417 @@ +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void red_async( + cuda::ptx::op_inc_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_inc_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void red_async( + cuda::ptx::op_dec_t, + 
uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_dec_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_min_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_max_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + 
uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_add_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + int32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + int32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int32_t* dest, + 
const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_add_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .and } +template +__device__ static inline void red_async( + cuda::ptx::op_and_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_and_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .or } +template +__device__ static inline void red_async( + cuda::ptx::op_or_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_or_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .xor } +template +__device__ static inline void red_async( + cuda::ptx::op_xor_op_t, + 
B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint64_t* dest, + const uint64_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_add_t, _CUDA_VSTD::uint64_t* __dest, const _CUDA_VSTD::uint64_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 +intentional PTX ISA 81, SM_90 +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __value, _CUDA_VSTD::int64_t* __remote_bar) +{ + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " + "intentional" + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc new file mode 100644 index 00000000000..9dfab243ffe --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc @@ -0,0 +1,108 @@ 
+/*
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90
+// .type = { .b32, .b64 }
+template <typename Type>
+__device__ static inline void st_async(
+  Type* addr,
+  const Type& value,
+  uint64_t* remote_bar);
+*/
+#if __cccl_ptx_isa >= 810
+extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__();
+template <typename _Type>
+_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar)
+{
+  static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
+  NV_IF_ELSE_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+      _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) {
+        asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. "
+            :
+            : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar))
+            : "memory");
+      } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) {
+        asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. "
+            :
+            : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar))
+            : "memory");
+      }),
+    (
+      // Unsupported architectures will have a linker error with a semi-decent error message
+      __cuda_ptx_st_async_is_not_supported_before_SM_90__();));
+}
+#endif // __cccl_ptx_isa >= 810
+
+/*
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90
+// .type = { .b32, .b64 }
+template <typename Type>
+__device__ static inline void st_async(
+  Type* addr,
+  const Type (&value)[2],
+  uint64_t* remote_bar);
+*/
+#if __cccl_ptx_isa >= 810
+extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__();
+template <typename _Type>
+_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar)
+{
+  static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
+  NV_IF_ELSE_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+      _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) {
+        asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. "
+            :
+            : "r"(__as_ptr_remote_dsmem(__addr)),
+              "r"(__as_b32(__value[0])),
+              "r"(__as_b32(__value[1])),
+              "r"(__as_ptr_remote_dsmem(__remote_bar))
+            : "memory");
+      } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) {
+        asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. "
+            :
+            : "r"(__as_ptr_remote_dsmem(__addr)),
+              "l"(__as_b64(__value[0])),
+              "l"(__as_b64(__value[1])),
+              "r"(__as_ptr_remote_dsmem(__remote_bar))
+            : "memory");
+      }),
+    (
+      // Unsupported architectures will have a linker error with a semi-decent error message
+      __cuda_ptx_st_async_is_not_supported_before_SM_90__();));
+}
+#endif // __cccl_ptx_isa >= 810
+
+/*
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90
+template <typename B32>
+__device__ static inline void st_async(
+  B32* addr,
+  const B32 (&value)[4],
+  uint64_t* remote_bar);
+*/
+#if __cccl_ptx_isa >= 810
+extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__();
+template <typename _B32>
+_CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar)
+{
+  static_assert(sizeof(_B32) == 4, "");
+  NV_IF_ELSE_TARGET(
+    NV_PROVIDES_SM_90,
+    (asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. 
" + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_b32(__value[2])), + "r"(__as_b32(__value[3])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc new file mode 100644 index 00000000000..033d0606e7f --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc @@ -0,0 +1,54 @@ +/* +// tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA +83, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void tensormap_cp_fenceproxy( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + void* dst, + const void* src, + cuda::ptx::n32_t size); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, const void* __src, n32_t<_N32> __size) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc new file mode 100644 index 00000000000..3b1060ead38 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc @@ -0,0 +1,569 @@ +/* +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_global_t, + void* tm_addr, + B64 new_val); 
+*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_address(space_global_t, void* __tm_addr, _B64 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_shared_t, + void* tm_addr, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_address(space_shared_t, void* __tm_addr, _B64 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_global_t, + void* tm_addr, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_rank(space_global_t, void* __tm_addr, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_shared_t, + void* tm_addr, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_rank(space_shared_t, void* __tm_addr, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + 
(asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_box_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_box_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], 
ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_element_size(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_element_size(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_interleave_layout(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_interleave_layout(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), 
"n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_fill_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_fill_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h index 8982984885d..033005beb5b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h +++ 
b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h @@ -32,1007 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 10. Special Registers // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers -/* -// mov.u32 sreg_value, %%tid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_tid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%tid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%tid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_tid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%tid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%tid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_tid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%tid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ntid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%ntid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ntid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%ntid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ntid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%ntid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%laneid; // PTX ISA 13 -template -__device__ static inline uint32_t get_sreg_laneid(); -*/ -#if __cccl_ptx_isa >= 130 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%laneid;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 130 - -/* -// mov.u32 sreg_value, %%warpid; // PTX ISA 13 -template -__device__ static inline uint32_t get_sreg_warpid(); -*/ -#if __cccl_ptx_isa >= 130 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%warpid;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 130 - -/* -// mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_nwarpid(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); 
-template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%nwarpid;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ctaid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%ctaid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ctaid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%ctaid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ctaid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%ctaid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_nctaid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nctaid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_nctaid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nctaid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_nctaid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nctaid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%smid; // PTX ISA 13 -template -__device__ static inline uint32_t get_sreg_smid(); -*/ -#if __cccl_ptx_isa >= 130 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%smid;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 130 - -/* -// mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_nsmid(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - 
(_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%nsmid;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u64 sreg_value, %%gridid; // PTX ISA 30 -template -__device__ static inline uint64_t get_sreg_gridid(); -*/ -#if __cccl_ptx_isa >= 300 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() -{ - _CUDA_VSTD::uint64_t __sreg_value; - asm("mov.u64 %0, %%gridid;" : "=l"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 300 - -/* -// mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 -template -__device__ static inline bool get_sreg_is_explicit_cluster(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mov.pred P_OUT, %%is_explicit_cluster;\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__sreg_value) - : - :); - return static_cast(__sreg_value);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_clusterid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_clusterid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_clusterid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.z;" - : "=r"(__sreg_value) - : - :); - return 
__sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_nclusterid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_nclusterid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_nclusterid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_ctaid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_ctaid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t 
get_sreg_cluster_ctaid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_ctaid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctaid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctaid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctaid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 -template 
-__device__ static inline uint32_t get_sreg_cluster_ctarank(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctarank;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctarank(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctarank;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_eq(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_eq;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_le(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_le;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_lt(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_lt() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_lt;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - 
__cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_ge(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_ge;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_gt(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_gt;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%clock; // PTX ISA 10 -template -__device__ static inline uint32_t get_sreg_clock(); -*/ -#if __cccl_ptx_isa >= 100 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%clock;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 100 - -/* -// mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 -template -__device__ static inline uint32_t get_sreg_clock_hi(); -*/ -#if __cccl_ptx_isa >= 500 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%clock_hi;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 500 - -/* -// mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 -template -__device__ static inline uint64_t get_sreg_clock64(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( - "mov.u64 %0, %%clock64;" - : "=l"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 -template -__device__ static inline uint64_t get_sreg_globaltimer(); -*/ -#if 
__cccl_ptx_isa >= 310 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_globaltimer() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( - "mov.u64 %0, %%globaltimer;" - : "=l"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 310 - -/* -// mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 -template -__device__ static inline uint32_t get_sreg_globaltimer_lo(); -*/ -#if __cccl_ptx_isa >= 310 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%globaltimer_lo;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 310 - -/* -// mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 -template -__device__ static inline uint32_t get_sreg_globaltimer_hi(); -*/ -#if __cccl_ptx_isa >= 310 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%globaltimer_hi;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 310 - -/* -// mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 -template -__device__ static inline uint32_t get_sreg_total_smem_size(); -*/ -#if __cccl_ptx_isa >= 410 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%total_smem_size;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 410 - -/* -// mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 -template -__device__ static inline uint32_t get_sreg_aggr_smem_size(); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%aggr_smem_size;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - 
__cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); return 0;));
-}
-#endif // __cccl_ptx_isa >= 810
-
-/*
-// mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35
-template <typename = void>
-__device__ static inline uint32_t get_sreg_dynamic_smem_size();
-*/
-#if __cccl_ptx_isa >= 410
-extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__();
-template <typename = void>
-_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size()
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_35,
-    (_CUDA_VSTD::uint32_t __sreg_value;
-     asm("mov.u32 %0, %%dynamic_smem_size;"
-         : "=r"(__sreg_value)
-         :
-         :);
-     return __sreg_value;),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); return 0;));
-}
-#endif // __cccl_ptx_isa >= 410
-
-/*
-// mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50
-template <typename = void>
-__device__ static inline uint64_t get_sreg_current_graph_exec();
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__();
-template <typename = void>
-_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec()
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_50,
-    (_CUDA_VSTD::uint64_t __sreg_value;
-     asm("mov.u64 %0, %%current_graph_exec;"
-         : "=l"(__sreg_value)
-         :
-         :);
-     return __sreg_value;),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;));
-}
-#endif // __cccl_ptx_isa >= 800
+#include 
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
diff --git a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h
index f1a2bbbd0e9..f5ed3424d3b 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h
@@ -32,33 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.8.23.
Data Movement and Conversion Instructions: getctarank // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank -/* -// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 -// .space = { .shared::cluster } -template -__device__ static inline uint32_t getctarank( - cuda::ptx::space_cluster_t, - const void* addr); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, const void* __addr) -{ - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __dest; - asm("getctarank.shared::cluster.u32 %0, %1;" - : "=r"(__dest) - : "r"(__as_ptr_smem(__addr)) - :); - return __dest;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h index 5b423990f1c..fb1341a61d8 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h @@ -32,316 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive -/* -// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 -template -__device__ static inline uint64_t mbarrier_arrive( - uint64_t* addr); -*/ -#if __cccl_ptx_isa >= 700 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.shared.b64 %0, [%1]; // 1. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 700 - -/* -// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 -template -__device__ static inline uint64_t mbarrier_arrive( - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t -mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. 
PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster } -// .space = { .shared::cta } -template -__device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t -mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster } -// .space = { .shared::cta } -template -__device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( - sem_release_t, - scope_t<_Scope> __scope, - space_shared_t, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __count) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. 
PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -// .space = { .shared::cluster } -template -__device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " - : - : "r"(__as_ptr_remote_dsmem(__addr)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -// .space = { .shared::cluster } -template -__device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void mbarrier_arrive( - sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__count) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 -template -__device__ static inline uint64_t mbarrier_arrive_no_complete( - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 700 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t -mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 700 -/* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster } -// .space = { .shared::cta } -template -__device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& tx_count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( - sem_release_t, - scope_t<_Scope> __scope, - space_shared_t, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __tx_count) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -// .space = { .shared::cluster } -template -__device__ static inline void mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& tx_count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( - sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h index 366b1b67eec..575abda7a41 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h @@ -32,29 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.9. 
Parallel Synchronization and Communication Instructions: mbarrier.init // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init -/* -// mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 -template -__device__ static inline void mbarrier_init( - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 700 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (asm("mbarrier.init.shared.b64 [%0], %1;" - : - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); -} -#endif // __cccl_ptx_isa >= 700 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h index 837fec44b9f..2d6adb78eec 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h @@ -32,470 +32,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait -/* -// mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX -ISA 70, SM_80 template -__device__ static inline bool mbarrier_test_wait( - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 700 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline bool mbarrier_test_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); return false;)); -} -#endif // __cccl_ptx_isa >= 700 - -/* -// mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. 
PTX -ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_test_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_test_wait( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. " - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX -ISA 71, SM_80 template -__device__ static inline bool mbarrier_test_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 710 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline bool -mbarrier_test_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); return false;)); -} -#endif // __cccl_ptx_isa >= 710 - -/* -// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
PTX -ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_test_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_test_wait_parity( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. -PTX ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX -ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait( - _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state, const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. -PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. " - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. -PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait( - sem_acquire_t, - scope_t<_Scope> __scope, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint64_t& __state, - const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. -PTX ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool -mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. -PTX ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( - _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity, const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. 
-PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. -PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( - sem_acquire_t, - scope_t<_Scope> __scope, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __phaseParity, - const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/red_async.h b/libcudacxx/include/cuda/__ptx/instructions/red_async.h index 777628c67d0..a610cf2b583 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/red_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/red_async.h @@ -32,423 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .inc } -template -__device__ static inline void red_async( - cuda::ptx::op_inc_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_inc_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .dec } -template -__device__ static inline void red_async( - cuda::ptx::op_dec_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_dec_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, 
SM_90 -// .type = { .u32 } -// .op = { .min } -template -__device__ static inline void red_async( - cuda::ptx::op_min_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_min_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .max } -template -__device__ static inline void red_async( - cuda::ptx::op_max_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_max_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_add_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 
-// .type = { .s32 } -// .op = { .min } -template -__device__ static inline void red_async( - cuda::ptx::op_min_t, - int32_t* dest, - const int32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .s32 } -// .op = { .max } -template -__device__ static inline void red_async( - cuda::ptx::op_max_t, - int32_t* dest, - const int32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .s32 } -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - int32_t* dest, - const int32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_add_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .b32 } 
-// .op = { .and } -template -__device__ static inline void red_async( - cuda::ptx::op_and_op_t, - B32* dest, - const B32& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_and_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_b32 (due to parameter type constraint) - // __op == op_and_op (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .b32 } -// .op = { .or } -template -__device__ static inline void red_async( - cuda::ptx::op_or_op_t, - B32* dest, - const B32& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_or_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_b32 (due to parameter type constraint) - // __op == op_or_op (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .b32 } -// .op = { .xor } -template -__device__ static inline void red_async( - cuda::ptx::op_xor_op_t, - B32* dest, - const B32& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_b32 (due to parameter type constraint) - // __op == op_xor_op (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX 
ISA 81, SM_90 -// .type = { .u64 } -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - uint64_t* dest, - const uint64_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_add_t, _CUDA_VSTD::uint64_t* __dest, const _CUDA_VSTD::uint64_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 -intentional PTX ISA 81, SM_90 -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - int64_t* dest, - const int64_t& value, - int64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __value, _CUDA_VSTD::int64_t* __remote_bar) -{ - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " - "intentional" - : - : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/st_async.h b/libcudacxx/include/cuda/__ptx/instructions/st_async.h index e6774087802..09199b4a3ce 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/st_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/st_async.h @@ -32,114 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.12. Data Movement and Conversion Instructions: st.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async -/* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, -SM_90 -// .type = { .b32, .b64 } -template -__device__ static inline void st_async( - Type* addr, - const Type& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. 
" - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, -SM_90 -// .type = { .b32, .b64 } -template -__device__ static inline void st_async( - Type* addr, - const Type (&value)[2], - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar) -{ - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "l"(__as_b64(__value[0])), - "l"(__as_b64(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, -SM_90 template -__device__ static inline void st_async( - B32* addr, - const B32 (&value)[4], - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar) -{ - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. 
" - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_b32(__value[2])), - "r"(__as_b32(__value[3])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h index ce8b0f10991..de179f69735 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h @@ -32,60 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy -/* -// tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA -83, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster, .gpu, .sys } -template -__device__ static inline void tensormap_cp_fenceproxy( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - void* dst, - const void* src, - cuda::ptx::n32_t size); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, const void* __src, n32_t<_N32> __size) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 830 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h index b40c0cf72aa..2f81d8b4361 100644 --- 
a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h @@ -32,575 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace -/* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_global_address( - cuda::ptx::space_global_t, - void* tm_addr, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_global_address(space_global_t, void* __tm_addr, _B64 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_global_address( - cuda::ptx::space_shared_t, - void* tm_addr, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_global_address(space_shared_t, void* __tm_addr, _B64 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_rank( - cuda::ptx::space_global_t, - void* tm_addr, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_rank(space_global_t, void* __tm_addr, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// 
tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_rank( - cuda::ptx::space_shared_t, - void* tm_addr, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_rank(space_shared_t, void* __tm_addr, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_box_dim( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_box_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_box_dim( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_box_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_global_dim( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void 
__cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_global_dim( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_global_stride( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_global_stride( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, 
_B64 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_element_size( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_element_size(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_element_size( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_element_size(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_elemtype( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" - : - : 
"l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_elemtype( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_interleave_layout( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_interleave_layout(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_interleave_layout( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_interleave_layout(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static 
inline void tensormap_replace_swizzle_mode( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_swizzle_mode( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_fill_mode( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_fill_mode( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - 
(asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX