diff --git a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h index bc7d88efd48..8b09ddd1110 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h +++ b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h @@ -32,129 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster -/* -// barrier.cluster.arrive; // PTX ISA 78, SM_90 -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_arrive(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.wait; // PTX ISA 78, SM_90 -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_wait() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.wait;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 -// .sem = { .release } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_release_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) -{ - // __sem == sem_release (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.release;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 -// .sem = { .relaxed } -// Marked volatile -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_relaxed_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) -{ - // __sem == sem_relaxed (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.relaxed;" - : - : - :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// 
barrier.cluster.wait.sem; // PTX ISA 80, SM_90 -// .sem = { .acquire } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait( - cuda::ptx::sem_acquire_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) -{ - // __sem == sem_acquire (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.wait.acquire;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h index 7acce210230..480a02a701e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h @@ -32,162 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -/* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, -SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __srcMem, - const _CUDA_VSTD::uint32_t& __size, - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast" - : - : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
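// ---------------------------------------------------------------------------
// Minimal usage sketch for the cuda::ptx::barrier_cluster_* wrappers documented
// in the removed barrier_cluster.h block above. The kernel name is illustrative,
// and the kernel is assumed to be launched with a cluster configuration
// (cudaLaunchKernelEx + cudaLaunchAttributeClusterDimension) on sm_90 with a
// PTX ISA >= 8.0 toolchain; the wrapper signatures are the ones shown in the
// removed documentation comments.
#include <cuda/ptx>

__global__ void cluster_handshake_sketch()
{
  // ... produce this block's shared-memory data ...

  // Release this block's prior writes to the rest of the cluster and mark arrival.
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release);

  // ... independent work that does not read peer shared memory ...

  // Wait until every block in the cluster has arrived, acquiring their writes.
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);
}
// ---------------------------------------------------------------------------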
PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk( - space_cluster_t, - space_shared_t, - void* __dstMem, - const void* __srcMem, - const _CUDA_VSTD::uint32_t& __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - void* dstMem, - const void* srcMem, - const uint32_t& size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. " - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], -ctaMask; // 1. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __srcMem, - const _CUDA_VSTD::uint32_t& __size, - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], " - "%4; // 1. 
" - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__as_ptr_gmem(__srcMem)), - "r"(__size), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h index f0028105350..bd97259cf19 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h @@ -32,27 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group -/* -// cp.async.bulk.commit_group; // PTX ISA 80, SM_90 -template -__device__ static inline void cp_async_bulk_commit_group(); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_commit_group() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.commit_group;" - : - : - :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index b66981e8bbb..5b9f575ce5f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -32,661 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -/* -// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1a. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];// " - "1a." 
- : - : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2]; // 3a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1b. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], " - "[%4];// 1b." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3]; // 3b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1c. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], " - "[%5];// 1c." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 3c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1d. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " - "%5}], [%6];// 1d." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 3d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1e. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " - "%6}], [%7];// 1e." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 3e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2}], [%3], %4; // 2a." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2b. 
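// ---------------------------------------------------------------------------
// Minimal usage sketch for the 2-D tile form of cp_async_bulk_tensor documented in
// the removed blocks above (global -> shared::cluster with mbarrier completion).
// The CUtensorMap is assumed to be created on the host with cuTensorMapEncodeTiled
// and passed by value as a __grid_constant__ kernel argument; the tile extent,
// coordinates and kernel name are illustrative. Requires sm_90.
#include <cuda.h> // CUtensorMap
#include <cuda/ptx>
#include <cuda/barrier>
#include <cuda/std/cstdint>
#include <cuda/std/utility>

__global__ void tile_load_sketch(const __grid_constant__ CUtensorMap tensor_map)
{
  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
  __shared__ alignas(128) float tile[32][32];
#pragma nv_diag_suppress static_var_with_dynamic_init
  __shared__ barrier_t bar;
  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x);
  }
  __syncthreads();

  barrier_t::arrival_token token;
  if (threadIdx.x == 0)
  {
    const cuda::std::int32_t coords[2] = {0, 0}; // tile origin in the tensor, illustrative
    cuda::ptx::cp_async_bulk_tensor(cuda::ptx::space_cluster, cuda::ptx::space_global,
                                    &tile, &tensor_map, coords,
                                    cuda::device::barrier_native_handle(bar));
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(tile));
  }
  else
  {
    token = bar.arrive();
  }
  bar.wait(cuda::std::move(token));
  // tile[][] now holds the 32x32 box starting at (0, 0); the shared::cta -> global
  // write-back direction uses the bulk_group overload shown above together with
  // cp_async_bulk_commit_group / cp_async_bulk_wait_group_read.
}
// ---------------------------------------------------------------------------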
PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3}], [%4], %5; // 2b." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4}], [%5], %6; // 2c." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2d. 
PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5}], [%6], %7; // 2d." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e." 
- : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h index 5dcbf8572f4..00a3700e1a9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h @@ -32,51 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group -/* -// cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 -template -__device__ static inline void cp_async_bulk_wait_group( - cuda::ptx::n32_t N); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group %0;" - : - : "n"(__n.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.wait_group.read N; // PTX ISA 80, SM_90 -template -__device__ static inline void cp_async_bulk_wait_group_read( - cuda::ptx::n32_t N); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group.read %0;" - : - : "n"(__n.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h index ee89e33c1c2..ee6d90bc4d9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h @@ -43,1679 +43,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .b32 } -// .op = { .and } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_and_op_t, - B32* dstMem, - const B32* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_and_op_t, - _B32* __dstMem, - const _B32* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .b32 } -// .op = { .or } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_or_op_t, - B32* dstMem, - const B32* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_or_op_t, - _B32* __dstMem, - const _B32* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .b32 } -// .op = { .xor } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_xor_op_t, - B32* dstMem, - const B32* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_xor_op_t, - _B32* __dstMem, - const _B32* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .inc } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_inc_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_inc_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .dec } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_dec_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_dec_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 2." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .b32, .b64 } -// .op = { .and } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_and_op_t, - Type* dstMem, - const Type* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_and_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. 
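// ---------------------------------------------------------------------------
// Illustrative sketch for the shared::cta -> shared::cluster add-reduction overload
// of cp_reduce_async_bulk documented above: every block adds its partial[] into
// block 0's accum[], with completion signalled on block 0's mbarrier. A cluster
// launch is assumed, and the cooperative_groups mapping, array sizes, kernel name
// and the omitted expect-tx/wait bookkeeping on block 0 are illustrative
// assumptions. Requires sm_90.
#include <cooperative_groups.h>
#include <cuda/ptx>
#include <cuda/barrier>
#include <cuda/std/cstdint>

namespace cg = cooperative_groups;

__global__ void cluster_reduce_sketch()
{
  __shared__ alignas(16) cuda::std::uint32_t partial[128]; // this block's contribution
  __shared__ alignas(16) cuda::std::uint32_t accum[128];   // reduced into on block 0
#pragma nv_diag_suppress static_var_with_dynamic_init
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;  // block 0's copy receives the completion
  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x);
  }
  __syncthreads();
  // ... fill partial[], zero accum[] on block 0, then cluster-wide sync ...

  cg::cluster_group cluster = cg::this_cluster();
  if (threadIdx.x == 0)
  {
    // Address block 0's accum[] and mbarrier through the cluster shared-memory window.
    cuda::std::uint32_t* dst  = cluster.map_shared_rank(accum, 0);
    cuda::std::uint64_t* rbar = cluster.map_shared_rank(cuda::device::barrier_native_handle(bar), 0);
    cuda::ptx::cp_reduce_async_bulk(cuda::ptx::space_cluster, cuda::ptx::space_shared,
                                    cuda::ptx::op_add, dst, partial, sizeof(partial), rbar);
  }
  // Block 0 would additionally account for the expected transaction bytes on its
  // barrier and wait before reading accum[]; that bookkeeping is omitted here.
}
// ---------------------------------------------------------------------------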
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .b32, .b64 } -// .op = { .or } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_or_op_t, - Type* dstMem, - const Type* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_or_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .b32, .b64 } -// .op = { .xor } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_xor_op_t, - Type* dstMem, - const Type* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_xor_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
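
// -----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): the bulk-group
// bitwise reductions (.and/.or/.xor on b32/b64 data) removed above, assuming
// the public <cuda/ptx> header and an SM_90 target.
#include <cuda/ptx>
#include <cstdint>

__device__ void or_into_global_sketch(uint32_t* gmem_dst, const uint32_t* smem_src, uint32_t size_bytes)
{
  namespace ptx = cuda::ptx;
  // The element width is chosen from sizeof(*dst): 4-byte types lower to .b32,
  // 8-byte types to .b64. op_and_op and op_xor_op are called the same way.
  ptx::cp_reduce_async_bulk(ptx::space_global, ptx::space_shared, ptx::op_or_op,
                            gmem_dst, smem_src, size_bytes);
  // This variant completes through the bulk async-group, so commit/wait on
  // that group before reading gmem_dst.
}
// -----------------------------------------------------------------------------
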
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .inc } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_inc_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_inc_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .dec } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_dec_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_dec_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
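
// -----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): the 32-bit integer
// bulk-group reductions removed above (.min/.max/.add on u32/s32, plus the
// u32-only .inc/.dec), assuming the public <cuda/ptx> header and SM_90.
#include <cuda/ptx>
#include <cstdint>

__device__ void max_into_global_sketch(int32_t* gmem_dst, const int32_t* smem_src, uint32_t size_bytes)
{
  namespace ptx = cuda::ptx;
  // Signed 32-bit max; swap op_max for op_min/op_add (s32/u32) or for
  // op_inc/op_dec (u32 only) to select the other removed overloads.
  ptx::cp_reduce_async_bulk(ptx::space_global, ptx::space_shared, ptx::op_max,
                            gmem_dst, smem_src, size_bytes);
}
// -----------------------------------------------------------------------------
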
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - float* dstMem, - const float* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_add_t, float* __dstMem, const float* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - double* dstMem, - const double* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_add_t, double* __dstMem, const double* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - +#include #ifdef _LIBCUDACXX_HAS_NVF16 -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
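
// -----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): the 64-bit and
// floating-point bulk-group reductions removed above, assuming the public
// <cuda/ptx> header and SM_90. Note that the s64 .add overload is emitted as
// add.u64 (remark "// 6." in the removed code), which is equivalent under
// two's-complement wrap-around addition.
#include <cuda/ptx>
#include <cstdint>

__device__ void add_f64_into_global_sketch(double* gmem_dst, const double* smem_src, uint32_t size_bytes)
{
  namespace ptx = cuda::ptx;
  // f64 add; the float, uint64_t and int64_t overloads are called identically.
  ptx::cp_reduce_async_bulk(ptx::space_global, ptx::space_shared, ptx::op_add,
                            gmem_dst, smem_src, size_bytes);
}
// -----------------------------------------------------------------------------
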
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f16 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_min_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f16 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_max_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f16 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_add_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 +# include #endif // _LIBCUDACXX_HAS_NVF16 - #ifdef _LIBCUDACXX_HAS_NVBF16 -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .bf16 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - __nv_bfloat16* __dstMem, - const __nv_bfloat16* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
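
// -----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): the __half
// bulk-group reductions removed above (.min/.max, and .add lowered to
// add.noftz.f16), assuming the public <cuda/ptx> header, SM_90, and a build
// where the fp16 overloads are enabled (_LIBCUDACXX_HAS_NVF16).
#include <cuda/ptx>
#include <cuda_fp16.h>
#include <cstdint>

__device__ void min_f16_into_global_sketch(__half* gmem_dst, const __half* smem_src, uint32_t size_bytes)
{
  namespace ptx = cuda::ptx;
  // f16 min; op_max and op_add follow the same pattern, as do the
  // __nv_bfloat16 overloads guarded by _LIBCUDACXX_HAS_NVBF16.
  ptx::cp_reduce_async_bulk(ptx::space_global, ptx::space_shared, ptx::op_min,
                            gmem_dst, smem_src, size_bytes);
}
// -----------------------------------------------------------------------------
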
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .bf16 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - __nv_bfloat16* __dstMem, - const __nv_bfloat16* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .bf16 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - __nv_bfloat16* __dstMem, - const __nv_bfloat16* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 +# include #endif // _LIBCUDACXX_HAS_NVBF16 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h index 4ecb108a719..a6b23a706c7 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h @@ -32,538 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.10. 
Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor -/* -// cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.2d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1b. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.3d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1c. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.4d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1d. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.5d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1e. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " - "1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/fence.h b/libcudacxx/include/cuda/__ptx/instructions/fence.h index 956f86c910e..045f09cb40e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/fence.h @@ -32,253 +32,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence -/* -// fence{.sem}.scope; // 1. 
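
// -----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): calling the
// cp_reduce_async_bulk_tensor overloads removed above, assuming the public
// <cuda/ptx> header and SM_90. The 1d..5d overloads differ only in the rank
// of the tensorCoords array; the reduction is selected by the op_t tag.
#include <cuda/ptx>
#include <cstdint>

__device__ void reduce_tile_sketch(const void* tensor_map, // tensor-map descriptor (e.g. a CUtensorMap) in memory
                                   int32_t x, int32_t y,
                                   const void* smem_tile)   // source tile in this CTA's shared memory
{
  namespace ptx = cuda::ptx;
  const int32_t tensor_coords[2] = {x, y};
  // Add-reduce the shared-memory tile into the global tensor at (x, y);
  // op_min/op_max/op_inc/op_dec/op_and_op/op_or_op/op_xor_op select the
  // other removed code paths.
  ptx::cp_reduce_async_bulk_tensor(ptx::space_global, ptx::space_shared, ptx::op_add,
                                   tensor_map, tensor_coords, smem_tile);
}
// -----------------------------------------------------------------------------
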
PTX ISA 60, SM_70 -// .sem = { .sc, .acq_rel } -// .scope = { .cta, .gpu, .sys } -template -__device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); -*/ -#if __cccl_ptx_isa >= 600 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope) -{ - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); - static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { - asm volatile("fence.sc.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { - asm volatile("fence.sc.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { - asm volatile("fence.sc.sys; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { - asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { - asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { - asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_70__();)); -} -#endif // __cccl_ptx_isa >= 600 - -/* -// fence{.sem}.scope; // 2. PTX ISA 78, SM_90 -// .sem = { .sc, .acq_rel } -// .scope = { .cluster } -template -__device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) -{ - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc) { - asm volatile("fence.sc.cluster; // 2." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { - asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 780 -/* -// fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -template -__device__ static inline void fence_mbarrier_init( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster_t) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.mbarrier_init.release.cluster; // 3." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// fence.proxy.alias; // 4. 
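
// -----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): the fence overloads
// removed above, assuming the public <cuda/ptx> header. Each sem/scope
// combination maps one-to-one onto a PTX string in the removed code.
#include <cuda/ptx>

__device__ void fence_sketch()
{
  namespace ptx = cuda::ptx;
  ptx::fence(ptx::sem_acq_rel, ptx::scope_gpu); // fence.acq_rel.gpu (PTX ISA 60, SM_70+)
  ptx::fence(ptx::sem_sc, ptx::scope_cluster);  // fence.sc.cluster  (PTX ISA 78, SM_90+)
  // Orders an mbarrier initialization with respect to cluster-scope consumers:
  ptx::fence_mbarrier_init(ptx::sem_release, ptx::scope_cluster); // PTX ISA 80, SM_90+
}
// -----------------------------------------------------------------------------
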
PTX ISA 75, SM_70 -template -__device__ static inline void fence_proxy_alias(); -*/ -#if __cccl_ptx_isa >= 750 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__(); -template -_CCCL_DEVICE static inline void fence_proxy_alias() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("fence.proxy.alias; // 4." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); -} -#endif // __cccl_ptx_isa >= 750 -/* -// fence.proxy.async; // 5. PTX ISA 80, SM_90 -template -__device__ static inline void fence_proxy_async(); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_proxy_async() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.proxy.async; // 5." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 -// .space = { .global, .shared::cluster, .shared::cta } -template -__device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) -{ - static_assert(__space == space_global || __space == space_cluster || __space == space_shared, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__space == space_global) { - asm volatile("fence.proxy.async.global; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { - asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { - asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster, .gpu, .sys } -template -__device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, scope_t<_Scope> __scope) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster, .gpu, .sys } -template -__device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void* __addr, n32_t<_N32> __size) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." 
- : - : "l"(__addr), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 830 +#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc new file mode 100644 index 00000000000..ca9238bc3ff --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc @@ -0,0 +1,123 @@ +/* +// barrier.cluster.arrive; // PTX ISA 78, SM_90 +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.arrive;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.wait; // PTX ISA 78, SM_90 +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.wait;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 +// .sem = { .release } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_release_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) +{ + // __sem == sem_release (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.arrive.release;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 +// .sem = { .relaxed } +// Marked volatile +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_relaxed_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) +{ + // __sem == sem_relaxed (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.arrive.relaxed;" + : + : + :);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.wait.sem; // PTX ISA 80, SM_90 +// .sem = { .acquire } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait( + cuda::ptx::sem_acquire_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) +{ + // __sem == sem_acquire (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.wait.acquire;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc new file mode 100644 index 00000000000..69f77053b95 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc @@ -0,0 +1,111 @@ +/* +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, +SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
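// -----------------------------------------------------------------------------
// Usage sketch (editorial illustration, not part of this patch): the split
// arrive/wait shape of barrier.cluster lets a block overlap independent work
// with the cluster-wide rendezvous. Assumes CUDA 12 compiled for sm_90 and a
// kernel launched with a cluster size > 1; the shared-memory exchange itself
// is elided and `cluster_exchange` is a hypothetical name.
#include <cuda/ptx>

__global__ void __cluster_dims__(2, 1, 1) cluster_exchange()
{
  // ... write results that other blocks of the cluster will read, e.g. through
  //     distributed shared memory ...
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release); // publish prior writes
  // ... independent work that does not touch other blocks' data ...
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);   // every block has arrived
  // ... now safe to read what the other blocks published ...
}
// -----------------------------------------------------------------------------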
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_cluster_t, + space_shared_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. 
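// -----------------------------------------------------------------------------
// Usage sketch (editorial illustration, not part of this patch): a minimal
// TMA-style 1-D copy from global into shared memory using the first overload
// above, with completion tracked through a cuda::barrier whose native handle
// is the mbarrier the instruction signals. A sketch only, assuming sm_90, a
// 16-byte aligned, 16-byte-multiple transfer, and the <cuda/barrier> and
// <cuda/ptx> headers; `bulk_copy_1d` and `gmem_src` are hypothetical names.
#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/utility>

__global__ void bulk_copy_1d(const int4* gmem_src)
{
  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
  __shared__ alignas(16) int4 smem[256];
#pragma nv_diag_suppress static_var_with_dynamic_init
  __shared__ barrier_t bar;

  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x);
    // Make the generic-proxy barrier initialization visible to the async proxy.
    cuda::ptx::fence_proxy_async(cuda::ptx::space_shared);
  }
  __syncthreads();

  barrier_t::arrival_token token;
  if (threadIdx.x == 0)
  {
    cuda::ptx::cp_async_bulk(
      cuda::ptx::space_cluster, // destination state space (.shared::cluster; own-CTA smem is valid here)
      cuda::ptx::space_global,  // source state space
      smem, gmem_src, sizeof(smem), cuda::device::barrier_native_handle(bar));
    // Arrive and post the number of bytes the copy will report as complete.
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem));
  }
  else
  {
    token = bar.arrive();
  }
  bar.wait(cuda::std::move(token));
  // smem now holds 256 int4 values copied from gmem_src.
}
// -----------------------------------------------------------------------------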
" + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc new file mode 100644 index 00000000000..24baddaea8f --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc @@ -0,0 +1,21 @@ +/* +// cp.async.bulk.commit_group; // PTX ISA 80, SM_90 +template +__device__ static inline void cp_async_bulk_commit_group(); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_commit_group() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("cp.async.bulk.commit_group;" + : + : + :);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc new file mode 100644 index 00000000000..cdd5a535eb6 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc @@ -0,0 +1,45 @@ +/* +// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], +ctaMask; // 1. PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], " + "%4; // 1. 
" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__as_ptr_gmem(__srcMem)), + "r"(__size), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc new file mode 100644 index 00000000000..547888d5b0f --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc @@ -0,0 +1,416 @@ +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1a. PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];// " + "1a." + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2]; // 3a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1b. 
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], " + "[%4];// 1b." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3]; // 3b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1c. 
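// -----------------------------------------------------------------------------
// Usage sketch (editorial illustration, not part of this patch): load one 2-D
// tile described by a tensor map into shared memory. `tensor_map` is assumed
// to point at a CUtensorMap built on the host (e.g. with
// cuTensorMapEncodeTiled) and passed to the kernel; mbarrier setup and the
// wait mirror the cp_async_bulk sketch earlier in this review and are elided.
// `load_tile_2d` is a hypothetical helper. sm_90 only.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void load_tile_2d(
  void* smem_dst,                // destination tile in shared memory (128-byte aligned)
  const void* tensor_map,        // host-created CUtensorMap
  int tile_x,
  int tile_y,
  cuda::std::uint64_t* smem_bar) // initialized mbarrier in shared memory
{
  const cuda::std::int32_t coords[2] = {tile_x, tile_y};
  cuda::ptx::cp_async_bulk_tensor(
    cuda::ptx::space_cluster, cuda::ptx::space_global, smem_dst, tensor_map, coords, smem_bar);
}
// -----------------------------------------------------------------------------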
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], " + "[%5];// 1c." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 3c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1d. 
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " + "%5}], [%6];// 1d." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 3d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1e. 
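// -----------------------------------------------------------------------------
// Usage sketch (editorial illustration, not part of this patch): store a 2-D
// tile from shared memory back through the tensor map. The .bulk_group forms
// above complete via commit/wait rather than an mbarrier, so the store is
// followed by cp_async_bulk_commit_group() and, before the shared buffer is
// reused, cp_async_bulk_wait_group_read() (whose wrapper appears later in this
// patch). sm_90 only; `store_tile_2d` is a hypothetical helper and
// `tensor_map` is a host-created CUtensorMap as before.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void store_tile_2d(const void* tensor_map, int tile_x, int tile_y, const void* smem_src)
{
  // Make prior generic-proxy writes to smem_src visible to the async proxy;
  // a block-wide __syncthreads() would normally precede this call.
  cuda::ptx::fence_proxy_async(cuda::ptx::space_shared);

  const cuda::std::int32_t coords[2] = {tile_x, tile_y};
  cuda::ptx::cp_async_bulk_tensor(
    cuda::ptx::space_global, cuda::ptx::space_shared, tensor_map, coords, smem_src);
  cuda::ptx::cp_async_bulk_commit_group();
  // Block until the committed group no longer reads from shared memory.
  cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
}
// -----------------------------------------------------------------------------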
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " + "%6}], [%7];// 1e." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 3e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc new file mode 100644 index 00000000000..020698a15b1 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc @@ -0,0 +1,239 @@ +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2a. 
PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2}], [%3], %4; // 2a." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3}], [%4], %5; // 2b." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2c. 
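// -----------------------------------------------------------------------------
// Usage sketch (editorial illustration, not part of this patch): broadcast one
// tile into the shared memory of several blocks of a cluster with a single
// multicast::cluster copy. Each set bit of `cta_mask` selects one destination
// block in the cluster; mbarrier setup, waiting, and cluster synchronization
// are elided. `multicast_tile_2d` is a hypothetical helper. Requires sm_90a.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void multicast_tile_2d(
  void* smem_dst,                 // destination tile (same shared-memory address in each selected block)
  const void* tensor_map,         // host-created CUtensorMap
  int tile_x,
  int tile_y,
  cuda::std::uint64_t* smem_bar,  // mbarrier tracking this block's copy
  cuda::std::uint16_t cta_mask)   // one bit per destination block in the cluster
{
  const cuda::std::int32_t coords[2] = {tile_x, tile_y};
  cuda::ptx::cp_async_bulk_tensor(
    cuda::ptx::space_cluster, cuda::ptx::space_global,
    smem_dst, tensor_map, coords, smem_bar, cta_mask);
}
// -----------------------------------------------------------------------------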
PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3, %4}], [%5], %6; // 2c." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3, %4, %5}], [%6], %7; // 2d." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2e. 
PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc new file mode 100644 index 00000000000..1a715a0fac6 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc @@ -0,0 +1,45 @@ +/* +// cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 +template +__device__ static inline void cp_async_bulk_wait_group( + cuda::ptx::n32_t N); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("cp.async.bulk.wait_group %0;" + : + : "n"(__n.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.wait_group.read N; // PTX ISA 80, SM_90 +template +__device__ static inline void cp_async_bulk_wait_group_read( + cuda::ptx::n32_t N); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("cp.async.bulk.wait_group.read %0;" + : + : "n"(__n.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc new file mode 100644 index 00000000000..50059ff6c5b --- /dev/null +++ 
b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc @@ -0,0 +1,1435 @@ +// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .b32 } +// .op = { .and } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_and_op_t, + B32* dstMem, + const B32* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_and_op_t, + _B32* __dstMem, + const _B32* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_b32 (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .b32 } +// .op = { .or } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_or_op_t, + B32* dstMem, + const B32* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_or_op_t, + _B32* __dstMem, + const _B32* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_b32 (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .b32 } +// .op = { .xor } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_xor_op_t, + B32* dstMem, + const B32* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_xor_op_t, + _B32* __dstMem, + const _B32* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_b32 (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_inc_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_inc_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_dec_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_dec_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " + "// 2." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .b32, .b64 } +// .op = { .and } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_and_op_t, + Type* dstMem, + const Type* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_and_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + // __op == op_and_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." 
+ : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .b32, .b64 } +// .op = { .or } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_or_op_t, + Type* dstMem, + const Type* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_or_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + // __op == op_or_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .b32, .b64 } +// .op = { .xor } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_xor_op_t, + Type* dstMem, + const Type* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_xor_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + // __op == op_xor_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." 
+ : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_inc_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_inc_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_dec_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_dec_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + float* dstMem, + const float* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_add_t, float* __dstMem, const float* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + double* dstMem, + const double* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_add_t, double* __dstMem, const double* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc new file mode 100644 index 00000000000..c657e8d1935 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc @@ -0,0 +1,127 @@ +#ifdef _LIBCUDACXX_HAS_NVBF16 +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .bf16 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + __nv_bfloat16* __dstMem, + const __nv_bfloat16* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_bf16 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .bf16 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + __nv_bfloat16* __dstMem, + const __nv_bfloat16* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_bf16 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .bf16 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + __nv_bfloat16* __dstMem, + const __nv_bfloat16* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_bf16 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 +#endif // _LIBCUDACXX_HAS_NVBF16 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc new file mode 100644 index 00000000000..3a52630db53 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc @@ -0,0 +1,110 @@ +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f16 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_min_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f16 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f16 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_max_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f16 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f16 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_add_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f16 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc new file mode 100644 index 00000000000..32008f6af5b --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc @@ -0,0 +1,532 @@ +/* +// cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. 
PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.2d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1b. 
PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
+ : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.3d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1c. PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.4d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1d. PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.5d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1e. PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " + "1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc new file mode 100644 index 00000000000..f10ec07ebb5 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc @@ -0,0 +1,67 @@ +/* +// fence{.sem}.scope; // 1. PTX ISA 60, SM_70 +// .sem = { .sc, .acq_rel } +// .scope = { .cta, .gpu, .sys } +template +__device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 600 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); +template +_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope) +{ + static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); + static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_70, + ( + _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { + asm volatile("fence.sc.cta; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { + asm volatile("fence.sc.gpu; // 1." 
: : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { + asm volatile("fence.sc.sys; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { + asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { + asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { + asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_70__();)); +} +#endif // __cccl_ptx_isa >= 600 + +/* +// fence{.sem}.scope; // 2. PTX ISA 78, SM_90 +// .sem = { .sc, .acq_rel } +// .scope = { .cluster } +template +__device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) +{ + static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); + // __scope == scope_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__sem == sem_sc) { + asm volatile("fence.sc.cluster; // 2." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { + asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 780 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc new file mode 100644 index 00000000000..0d39c222598 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc @@ -0,0 +1,27 @@ +/* +// fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +template +__device__ static inline void fence_mbarrier_init( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster_t) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("fence.mbarrier_init.release.cluster; // 3." + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc new file mode 100644 index 00000000000..98260b851ca --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc @@ -0,0 +1,21 @@ +/* +// fence.proxy.alias; // 4. 
PTX ISA 75, SM_70 +template +__device__ static inline void fence_proxy_alias(); +*/ +#if __cccl_ptx_isa >= 750 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__(); +template +_CCCL_DEVICE static inline void fence_proxy_alias() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_70, + (asm volatile("fence.proxy.alias; // 4." + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); +} +#endif // __cccl_ptx_isa >= 750 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc new file mode 100644 index 00000000000..f0a37baabdb --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc @@ -0,0 +1,50 @@ +/* +// fence.proxy.async; // 5. PTX ISA 80, SM_90 +template +__device__ static inline void fence_proxy_async(); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("fence.proxy.async; // 5." + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 +// .space = { .global, .shared::cluster, .shared::cta } +template +__device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) +{ + static_assert(__space == space_global || __space == space_cluster || __space == space_shared, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__space == space_global) { + asm volatile("fence.proxy.async.global; // 6." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { + asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { + asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc new file mode 100644 index 00000000000..3e5b2a265f4 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc @@ -0,0 +1,82 @@ +/* +// fence.proxy.tensormap::generic.release.scope; // 7. 
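//
// Illustrative usage sketch (not generated, not part of this patch): the
// fence_proxy_async() wrappers above, with and without a state-space
// qualifier. The <cuda/ptx> include and kernel name are assumptions.
//
//   #include <cuda/ptx>
//
//   __global__ void proxy_fence_example()
//   {
//     cuda::ptx::fence_proxy_async();                          // fence.proxy.async
//     cuda::ptx::fence_proxy_async(cuda::ptx::space_shared);   // fence.proxy.async.shared::cta
//     cuda::ptx::fence_proxy_async(cuda::ptx::space_global);   // fence.proxy.async.global
//   }
//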
PTX ISA 83, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, scope_t<_Scope> __scope) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void* __addr, n32_t<_N32> __size) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." 
+ : + : "l"(__addr), "n"(__size.value) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc new file mode 100644 index 00000000000..dd3079915f7 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc @@ -0,0 +1,1001 @@ +/* +// mov.u32 sreg_value, %%tid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_tid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%tid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%tid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_tid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%tid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%tid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_tid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%tid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ntid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%ntid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ntid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%ntid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ntid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%ntid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%laneid; // PTX ISA 13 +template +__device__ static inline uint32_t get_sreg_laneid(); +*/ +#if __cccl_ptx_isa >= 130 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%laneid;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 130 + +/* +// mov.u32 sreg_value, %%warpid; // PTX ISA 13 +template +__device__ static inline uint32_t get_sreg_warpid(); +*/ +#if __cccl_ptx_isa >= 130 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, 
%%warpid;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 130 + +/* +// mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_nwarpid(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%nwarpid;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ctaid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%ctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ctaid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%ctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ctaid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%ctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_nctaid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_nctaid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_nctaid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%smid; // PTX ISA 13 +template +__device__ static inline uint32_t get_sreg_smid(); +*/ +#if __cccl_ptx_isa >= 130 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%smid;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 130 + +/* +// mov.u32 sreg_value, 
%%nsmid; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_nsmid(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%nsmid;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u64 sreg_value, %%gridid; // PTX ISA 30 +template +__device__ static inline uint64_t get_sreg_gridid(); +*/ +#if __cccl_ptx_isa >= 300 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() +{ + _CUDA_VSTD::uint64_t __sreg_value; + asm("mov.u64 %0, %%gridid;" : "=l"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 300 + +/* +// mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 +template +__device__ static inline bool get_sreg_is_explicit_cluster(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mov.pred P_OUT, %%is_explicit_cluster;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__sreg_value) + : + :); + return static_cast(__sreg_value);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_clusterid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_clusterid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_clusterid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern 
"C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.z;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_nclusterid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_nclusterid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_nclusterid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.z;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctaid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 
sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctaid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctaid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.z;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctaid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctaid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctaid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.z;" + : "=r"(__sreg_value) + : + :); + return 
__sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctarank(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctarank;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctarank(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctarank;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_eq(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_eq;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_le(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_le;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_lt(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t 
get_sreg_lanemask_lt() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_lt;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_ge(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_ge;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_gt(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_gt;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%clock; // PTX ISA 10 +template +__device__ static inline uint32_t get_sreg_clock(); +*/ +#if __cccl_ptx_isa >= 100 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%clock;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 100 + +/* +// mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 +template +__device__ static inline uint32_t get_sreg_clock_hi(); +*/ +#if __cccl_ptx_isa >= 500 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%clock_hi;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 500 + +/* +// mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 +template +__device__ static inline uint64_t get_sreg_clock64(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( + "mov.u64 %0, %%clock64;" + : "=l"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker 
error with a semi-decent error message + __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 +template +__device__ static inline uint64_t get_sreg_globaltimer(); +*/ +#if __cccl_ptx_isa >= 310 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_globaltimer() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( + "mov.u64 %0, %%globaltimer;" + : "=l"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 310 + +/* +// mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 +template +__device__ static inline uint32_t get_sreg_globaltimer_lo(); +*/ +#if __cccl_ptx_isa >= 310 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%globaltimer_lo;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 310 + +/* +// mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 +template +__device__ static inline uint32_t get_sreg_globaltimer_hi(); +*/ +#if __cccl_ptx_isa >= 310 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%globaltimer_hi;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 310 + +/* +// mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 +template +__device__ static inline uint32_t get_sreg_total_smem_size(); +*/ +#if __cccl_ptx_isa >= 410 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%total_smem_size;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 410 + +/* +// mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 +template +__device__ static inline uint32_t get_sreg_aggr_smem_size(); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() +{ + NV_IF_ELSE_TARGET( + 
NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%aggr_smem_size;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 +template +__device__ static inline uint32_t get_sreg_dynamic_smem_size(); +*/ +#if __cccl_ptx_isa >= 410 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%dynamic_smem_size;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 410 + +/* +// mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 +template +__device__ static inline uint64_t get_sreg_current_graph_exec(); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_50, + (_CUDA_VSTD::uint64_t __sreg_value; + asm("mov.u64 %0, %%current_graph_exec;" + : "=l"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc new file mode 100644 index 00000000000..51bd351be87 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc @@ -0,0 +1,27 @@ +/* +// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 +// .space = { .shared::cluster } +template +__device__ static inline uint32_t getctarank( + cuda::ptx::space_cluster_t, + const void* addr); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, const void* __addr) +{ + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __dest; + asm("getctarank.shared::cluster.u32 %0, %1;" + : "=r"(__dest) + : "r"(__as_ptr_smem(__addr)) + :); + return __dest;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc new file mode 100644 index 00000000000..f3e2b860d50 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc @@ -0,0 +1,205 @@ +/* +// mbarrier.arrive.shared.b64 state, [addr]; // 1. 
PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared.b64 %0, [%1]; // 1. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 700 + +/* +// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 +template +__device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. 
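//
// Illustrative usage sketch (not generated, not part of this patch): the
// mbarrier_arrive() overloads above, called on an mbarrier in shared memory
// that is assumed to have been initialized already (e.g. with
// cuda::ptx::mbarrier_init, generated further down in this patch).
//
//   #include <cuda/ptx>
//   #include <cuda/std/cstdint>
//
//   __device__ void arrive_example(cuda::std::uint64_t* bar) // bar: shared-memory mbarrier
//   {
//     // PTX ISA 70 form: arrive with an implicit count of 1.
//     cuda::std::uint64_t state = cuda::ptx::mbarrier_arrive(bar);
//
//     // PTX ISA 80 form: explicit release semantics at CTA scope.
//     state = cuda::ptx::mbarrier_arrive(
//       cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, bar);
//     (void) state; // both forms shown for illustration; each call counts as one arrival
//   }
//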
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( + sem_release_t, + scope_t<_Scope> __scope, + space_shared_t, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __count) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_remote_dsmem(__addr)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. 
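//
// Illustrative usage sketch (not generated, not part of this patch): the
// space_cluster overload above arrives on an mbarrier that lives in a peer
// CTA of the cluster; no arrival state is returned. remote_bar is assumed to
// point into the cluster shared-memory window of that peer CTA.
//
//   #include <cuda/ptx>
//   #include <cuda/std/cstdint>
//
//   __device__ void remote_arrive_example(cuda::std::uint64_t* remote_bar)
//   {
//     cuda::ptx::mbarrier_arrive(
//       cuda::ptx::sem_release, cuda::ptx::scope_cluster, cuda::ptx::space_cluster, remote_bar);
//   }
//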
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive( + sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__count) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc new file mode 100644 index 00000000000..efb749957b1 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc @@ -0,0 +1,79 @@ +/* +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( + sem_release_t, + scope_t<_Scope> __scope, + space_shared_t, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __tx_count) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. 
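//
// Illustrative usage sketch (not generated, not part of this patch): the
// expect-tx arrive above is the producer-side handshake for bulk asynchronous
// copies; the calling thread arrives and, in the same instruction, tells the
// mbarrier how many bytes of transactions to expect. Names are assumptions.
//
//   #include <cuda/ptx>
//   #include <cuda/std/cstdint>
//
//   __device__ cuda::std::uint64_t expect_tx_example(cuda::std::uint64_t* bar,
//                                                    cuda::std::uint32_t tx_bytes)
//   {
//     return cuda::ptx::mbarrier_arrive_expect_tx(
//       cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, bar, tx_bytes);
//   }
//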
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& tx_count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( + sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc new file mode 100644 index 00000000000..879bedebdc9 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc @@ -0,0 +1,26 @@ +/* +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. 
" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc new file mode 100644 index 00000000000..3afeeacfccf --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc @@ -0,0 +1,23 @@ +/* +// mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 +template +__device__ static inline void mbarrier_init( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (asm("mbarrier.init.shared.b64 [%0], %1;" + : + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); +} +#endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc new file mode 100644 index 00000000000..e97d9ccf15c --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc @@ -0,0 +1,28 @@ +/* +// mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX +ISA 70, SM_80 template +__device__ static inline bool mbarrier_test_wait( + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline bool mbarrier_test_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); return false;)); +} +#endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc new file mode 100644 index 00000000000..604cfd92045 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc @@ -0,0 +1,75 @@ +/* +// mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. 
PTX +ISA 71, SM_80 template +__device__ static inline bool mbarrier_test_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 710 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline bool +mbarrier_test_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); return false;)); +} +#endif // __cccl_ptx_isa >= 710 + +/* +// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX +ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_test_wait_parity( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc new file mode 100644 index 00000000000..c5f2062664c --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc @@ -0,0 +1,157 @@ +/* +// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. 
+PTX ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX +ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait( + _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state, const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. +PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. 
" + "\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. +PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc new file mode 100644 index 00000000000..321bfc515da --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc @@ -0,0 +1,157 @@ +/* +// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. +PTX ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool +mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. +PTX ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( + _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity, const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. +PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
+PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc new file mode 100644 index 00000000000..3157fa1c627 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc @@ -0,0 +1,417 @@ +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void red_async( + cuda::ptx::op_inc_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_inc_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void red_async( + cuda::ptx::op_dec_t, + 
uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_dec_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_min_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_max_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + 
uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_add_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + int32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + int32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int32_t* dest, + 
const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_add_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .and } +template +__device__ static inline void red_async( + cuda::ptx::op_and_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_and_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .or } +template +__device__ static inline void red_async( + cuda::ptx::op_or_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_or_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .xor } +template +__device__ static inline void red_async( + cuda::ptx::op_xor_op_t, + 
B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint64_t* dest, + const uint64_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_add_t, _CUDA_VSTD::uint64_t* __dest, const _CUDA_VSTD::uint64_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 +intentional PTX ISA 81, SM_90 +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __value, _CUDA_VSTD::int64_t* __remote_bar) +{ + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " + "intentional" + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc new file mode 100644 index 00000000000..9dfab243ffe --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc @@ -0,0 +1,108 @@ 
+/*
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90
+// .type = { .b32, .b64 }
+template <typename Type>
+__device__ static inline void st_async(
+  Type* addr,
+  const Type& value,
+  uint64_t* remote_bar);
+*/
+#if __cccl_ptx_isa >= 810
+extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__();
+template <typename _Type>
+_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar)
+{
+  static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
+  NV_IF_ELSE_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+      _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) {
+        asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. "
+            :
+            : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar))
+            : "memory");
+      } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) {
+        asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. "
+            :
+            : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar))
+            : "memory");
+      }),
+    (
+      // Unsupported architectures will have a linker error with a semi-decent error message
+      __cuda_ptx_st_async_is_not_supported_before_SM_90__();));
+}
+#endif // __cccl_ptx_isa >= 810
+
+/*
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90
+// .type = { .b32, .b64 }
+template <typename Type>
+__device__ static inline void st_async(
+  Type* addr,
+  const Type (&value)[2],
+  uint64_t* remote_bar);
+*/
+#if __cccl_ptx_isa >= 810
+extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__();
+template <typename _Type>
+_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar)
+{
+  static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
+  NV_IF_ELSE_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+      _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) {
+        asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. "
+            :
+            : "r"(__as_ptr_remote_dsmem(__addr)),
+              "r"(__as_b32(__value[0])),
+              "r"(__as_b32(__value[1])),
+              "r"(__as_ptr_remote_dsmem(__remote_bar))
+            : "memory");
+      } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) {
+        asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. "
+            :
+            : "r"(__as_ptr_remote_dsmem(__addr)),
+              "l"(__as_b64(__value[0])),
+              "l"(__as_b64(__value[1])),
+              "r"(__as_ptr_remote_dsmem(__remote_bar))
+            : "memory");
+      }),
+    (
+      // Unsupported architectures will have a linker error with a semi-decent error message
+      __cuda_ptx_st_async_is_not_supported_before_SM_90__();));
+}
+#endif // __cccl_ptx_isa >= 810
+
+/*
+// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90
+template <typename B32>
+__device__ static inline void st_async(
+  B32* addr,
+  const B32 (&value)[4],
+  uint64_t* remote_bar);
+*/
+#if __cccl_ptx_isa >= 810
+extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__();
+template <typename _B32>
+_CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar)
+{
+  static_assert(sizeof(_B32) == 4, "");
+  NV_IF_ELSE_TARGET(
+    NV_PROVIDES_SM_90,
+    (asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. 
" + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_b32(__value[2])), + "r"(__as_b32(__value[3])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc new file mode 100644 index 00000000000..033d0606e7f --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc @@ -0,0 +1,54 @@ +/* +// tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA +83, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void tensormap_cp_fenceproxy( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + void* dst, + const void* src, + cuda::ptx::n32_t size); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, const void* __src, n32_t<_N32> __size) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc new file mode 100644 index 00000000000..3b1060ead38 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc @@ -0,0 +1,569 @@ +/* +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_global_t, + void* tm_addr, + B64 new_val); 
+*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_address(space_global_t, void* __tm_addr, _B64 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_shared_t, + void* tm_addr, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_address(space_shared_t, void* __tm_addr, _B64 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_global_t, + void* tm_addr, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_rank(space_global_t, void* __tm_addr, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_shared_t, + void* tm_addr, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_rank(space_shared_t, void* __tm_addr, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + 
(asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_box_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_box_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], 
ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_element_size(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_element_size(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_interleave_layout(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_interleave_layout(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), 
"n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_fill_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_fill_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h index 8982984885d..033005beb5b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h +++ 
b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h @@ -32,1007 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 10. Special Registers // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers -/* -// mov.u32 sreg_value, %%tid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_tid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%tid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%tid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_tid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%tid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%tid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_tid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%tid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ntid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%ntid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ntid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%ntid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ntid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%ntid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%laneid; // PTX ISA 13 -template -__device__ static inline uint32_t get_sreg_laneid(); -*/ -#if __cccl_ptx_isa >= 130 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%laneid;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 130 - -/* -// mov.u32 sreg_value, %%warpid; // PTX ISA 13 -template -__device__ static inline uint32_t get_sreg_warpid(); -*/ -#if __cccl_ptx_isa >= 130 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%warpid;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 130 - -/* -// mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_nwarpid(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); 
-template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%nwarpid;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ctaid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%ctaid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ctaid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%ctaid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ctaid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%ctaid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_nctaid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nctaid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_nctaid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nctaid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_nctaid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nctaid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%smid; // PTX ISA 13 -template -__device__ static inline uint32_t get_sreg_smid(); -*/ -#if __cccl_ptx_isa >= 130 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%smid;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 130 - -/* -// mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_nsmid(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - 
(_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%nsmid;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u64 sreg_value, %%gridid; // PTX ISA 30 -template -__device__ static inline uint64_t get_sreg_gridid(); -*/ -#if __cccl_ptx_isa >= 300 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() -{ - _CUDA_VSTD::uint64_t __sreg_value; - asm("mov.u64 %0, %%gridid;" : "=l"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 300 - -/* -// mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 -template -__device__ static inline bool get_sreg_is_explicit_cluster(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mov.pred P_OUT, %%is_explicit_cluster;\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__sreg_value) - : - :); - return static_cast(__sreg_value);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_clusterid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_clusterid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_clusterid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.z;" - : "=r"(__sreg_value) - : - :); - return 
__sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_nclusterid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_nclusterid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_nclusterid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_ctaid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_ctaid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t 
get_sreg_cluster_ctaid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_ctaid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctaid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctaid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctaid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 -template 
-__device__ static inline uint32_t get_sreg_cluster_ctarank(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctarank;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctarank(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctarank;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_eq(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_eq;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_le(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_le;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_lt(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_lt() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_lt;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - 
__cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_ge(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_ge;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_gt(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_gt;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%clock; // PTX ISA 10 -template -__device__ static inline uint32_t get_sreg_clock(); -*/ -#if __cccl_ptx_isa >= 100 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%clock;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 100 - -/* -// mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 -template -__device__ static inline uint32_t get_sreg_clock_hi(); -*/ -#if __cccl_ptx_isa >= 500 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%clock_hi;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 500 - -/* -// mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 -template -__device__ static inline uint64_t get_sreg_clock64(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( - "mov.u64 %0, %%clock64;" - : "=l"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 -template -__device__ static inline uint64_t get_sreg_globaltimer(); -*/ -#if 
__cccl_ptx_isa >= 310 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_globaltimer() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( - "mov.u64 %0, %%globaltimer;" - : "=l"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 310 - -/* -// mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 -template -__device__ static inline uint32_t get_sreg_globaltimer_lo(); -*/ -#if __cccl_ptx_isa >= 310 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%globaltimer_lo;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 310 - -/* -// mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 -template -__device__ static inline uint32_t get_sreg_globaltimer_hi(); -*/ -#if __cccl_ptx_isa >= 310 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%globaltimer_hi;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 310 - -/* -// mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 -template -__device__ static inline uint32_t get_sreg_total_smem_size(); -*/ -#if __cccl_ptx_isa >= 410 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%total_smem_size;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 410 - -/* -// mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 -template -__device__ static inline uint32_t get_sreg_aggr_smem_size(); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%aggr_smem_size;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - 
__cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); return 0;));
-}
-#endif // __cccl_ptx_isa >= 810
-
-/*
-// mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35
-template <typename = void>
-__device__ static inline uint32_t get_sreg_dynamic_smem_size();
-*/
-#if __cccl_ptx_isa >= 410
-extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__();
-template <typename = void>
-_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size()
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_35,
-    (_CUDA_VSTD::uint32_t __sreg_value;
-     asm("mov.u32 %0, %%dynamic_smem_size;"
-         : "=r"(__sreg_value)
-         :
-         :);
-     return __sreg_value;),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); return 0;));
-}
-#endif // __cccl_ptx_isa >= 410
-
-/*
-// mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50
-template <typename = void>
-__device__ static inline uint64_t get_sreg_current_graph_exec();
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__();
-template <typename = void>
-_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec()
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_50,
-    (_CUDA_VSTD::uint64_t __sreg_value;
-     asm("mov.u64 %0, %%current_graph_exec;"
-         : "=l"(__sreg_value)
-         :
-         :);
-     return __sreg_value;),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;));
-}
-#endif // __cccl_ptx_isa >= 800
+#include 
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
diff --git a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h
index f1a2bbbd0e9..f5ed3424d3b 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h
@@ -32,33 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.8.23.
Data Movement and Conversion Instructions: getctarank // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank -/* -// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 -// .space = { .shared::cluster } -template -__device__ static inline uint32_t getctarank( - cuda::ptx::space_cluster_t, - const void* addr); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, const void* __addr) -{ - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __dest; - asm("getctarank.shared::cluster.u32 %0, %1;" - : "=r"(__dest) - : "r"(__as_ptr_smem(__addr)) - :); - return __dest;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h index 5b423990f1c..fb1341a61d8 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h @@ -32,316 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive -/* -// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 -template -__device__ static inline uint64_t mbarrier_arrive( - uint64_t* addr); -*/ -#if __cccl_ptx_isa >= 700 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.shared.b64 %0, [%1]; // 1. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 700 - -/* -// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 -template -__device__ static inline uint64_t mbarrier_arrive( - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t -mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. 
PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster } -// .space = { .shared::cta } -template -__device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t -mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster } -// .space = { .shared::cta } -template -__device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( - sem_release_t, - scope_t<_Scope> __scope, - space_shared_t, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __count) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. 
PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -// .space = { .shared::cluster } -template -__device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " - : - : "r"(__as_ptr_remote_dsmem(__addr)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -// .space = { .shared::cluster } -template -__device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void mbarrier_arrive( - sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__count) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 -template -__device__ static inline uint64_t mbarrier_arrive_no_complete( - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 700 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t -mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 700 -/* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster } -// .space = { .shared::cta } -template -__device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& tx_count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( - sem_release_t, - scope_t<_Scope> __scope, - space_shared_t, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __tx_count) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -// .space = { .shared::cluster } -template -__device__ static inline void mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& tx_count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( - sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h index 366b1b67eec..575abda7a41 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h @@ -32,29 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.9. 
Parallel Synchronization and Communication Instructions: mbarrier.init // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init -/* -// mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 -template -__device__ static inline void mbarrier_init( - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 700 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (asm("mbarrier.init.shared.b64 [%0], %1;" - : - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); -} -#endif // __cccl_ptx_isa >= 700 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h index 837fec44b9f..2d6adb78eec 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h @@ -32,470 +32,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait -/* -// mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX -ISA 70, SM_80 template -__device__ static inline bool mbarrier_test_wait( - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 700 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline bool mbarrier_test_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); return false;)); -} -#endif // __cccl_ptx_isa >= 700 - -/* -// mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. 
PTX -ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_test_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_test_wait( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. " - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX -ISA 71, SM_80 template -__device__ static inline bool mbarrier_test_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 710 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline bool -mbarrier_test_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); return false;)); -} -#endif // __cccl_ptx_isa >= 710 - -/* -// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
PTX -ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_test_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_test_wait_parity( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. -PTX ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX -ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait( - _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state, const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. -PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. " - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. -PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait( - sem_acquire_t, - scope_t<_Scope> __scope, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint64_t& __state, - const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. -PTX ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool -mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. -PTX ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( - _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity, const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. 
-PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. -PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( - sem_acquire_t, - scope_t<_Scope> __scope, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __phaseParity, - const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/red_async.h b/libcudacxx/include/cuda/__ptx/instructions/red_async.h index 777628c67d0..a610cf2b583 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/red_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/red_async.h @@ -32,423 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .inc } -template -__device__ static inline void red_async( - cuda::ptx::op_inc_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_inc_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .dec } -template -__device__ static inline void red_async( - cuda::ptx::op_dec_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_dec_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, 
SM_90 -// .type = { .u32 } -// .op = { .min } -template -__device__ static inline void red_async( - cuda::ptx::op_min_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_min_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .max } -template -__device__ static inline void red_async( - cuda::ptx::op_max_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_max_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_add_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 
-// .type = { .s32 } -// .op = { .min } -template -__device__ static inline void red_async( - cuda::ptx::op_min_t, - int32_t* dest, - const int32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .s32 } -// .op = { .max } -template -__device__ static inline void red_async( - cuda::ptx::op_max_t, - int32_t* dest, - const int32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .s32 } -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - int32_t* dest, - const int32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_add_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .b32 } 
-// .op = { .and } -template -__device__ static inline void red_async( - cuda::ptx::op_and_op_t, - B32* dest, - const B32& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_and_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_b32 (due to parameter type constraint) - // __op == op_and_op (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .b32 } -// .op = { .or } -template -__device__ static inline void red_async( - cuda::ptx::op_or_op_t, - B32* dest, - const B32& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_or_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_b32 (due to parameter type constraint) - // __op == op_or_op (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .b32 } -// .op = { .xor } -template -__device__ static inline void red_async( - cuda::ptx::op_xor_op_t, - B32* dest, - const B32& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_b32 (due to parameter type constraint) - // __op == op_xor_op (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX 
ISA 81, SM_90 -// .type = { .u64 } -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - uint64_t* dest, - const uint64_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_add_t, _CUDA_VSTD::uint64_t* __dest, const _CUDA_VSTD::uint64_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 -intentional PTX ISA 81, SM_90 -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - int64_t* dest, - const int64_t& value, - int64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __value, _CUDA_VSTD::int64_t* __remote_bar) -{ - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " - "intentional" - : - : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/st_async.h b/libcudacxx/include/cuda/__ptx/instructions/st_async.h index e6774087802..09199b4a3ce 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/st_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/st_async.h @@ -32,114 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.12. Data Movement and Conversion Instructions: st.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async -/* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, -SM_90 -// .type = { .b32, .b64 } -template -__device__ static inline void st_async( - Type* addr, - const Type& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. 
" - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, -SM_90 -// .type = { .b32, .b64 } -template -__device__ static inline void st_async( - Type* addr, - const Type (&value)[2], - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar) -{ - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "l"(__as_b64(__value[0])), - "l"(__as_b64(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, -SM_90 template -__device__ static inline void st_async( - B32* addr, - const B32 (&value)[4], - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar) -{ - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. 
" - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_b32(__value[2])), - "r"(__as_b32(__value[3])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h index ce8b0f10991..de179f69735 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h @@ -32,60 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy -/* -// tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA -83, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster, .gpu, .sys } -template -__device__ static inline void tensormap_cp_fenceproxy( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - void* dst, - const void* src, - cuda::ptx::n32_t size); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, const void* __src, n32_t<_N32> __size) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 830 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h index b40c0cf72aa..2f81d8b4361 100644 --- 
a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h @@ -32,575 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace -/* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_global_address( - cuda::ptx::space_global_t, - void* tm_addr, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_global_address(space_global_t, void* __tm_addr, _B64 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_global_address( - cuda::ptx::space_shared_t, - void* tm_addr, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_global_address(space_shared_t, void* __tm_addr, _B64 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_rank( - cuda::ptx::space_global_t, - void* tm_addr, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_rank(space_global_t, void* __tm_addr, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// 
tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_rank( - cuda::ptx::space_shared_t, - void* tm_addr, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_rank(space_shared_t, void* __tm_addr, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_box_dim( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_box_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_box_dim( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_box_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_global_dim( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void 
__cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_global_dim( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_global_stride( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_global_stride( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, 
_B64 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_element_size( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_element_size(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_element_size( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_element_size(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_elemtype( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" - : - : 
"l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_elemtype( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_interleave_layout( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_interleave_layout(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_interleave_layout( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_interleave_layout(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static 
inline void tensormap_replace_swizzle_mode( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_swizzle_mode( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_fill_mode( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_fill_mode( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - 
(asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX