diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc
new file mode 100644
index 00000000000..cad5510ba70
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc
@@ -0,0 +1,40 @@
+__global__ void test_barrier_cluster(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 780
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (
+                   // barrier.cluster.arrive;
+                   * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::barrier_cluster_arrive));));
+#endif // __cccl_ptx_isa >= 780
+
+#if __cccl_ptx_isa >= 780
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (
+                   // barrier.cluster.wait;
+                   * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::barrier_cluster_wait));));
+#endif // __cccl_ptx_isa >= 780
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (
+                   // barrier.cluster.arrive.release;
+                   * fn_ptr++ = reinterpret_cast(
+                     static_cast(cuda::ptx::barrier_cluster_arrive));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (
+                   // barrier.cluster.arrive.relaxed;
+                   * fn_ptr++ = reinterpret_cast(
+                     static_cast(cuda::ptx::barrier_cluster_arrive));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (
+                   // barrier.cluster.wait.acquire;
+                   * fn_ptr++ = reinterpret_cast(
+                     static_cast(cuda::ptx::barrier_cluster_wait));));
+#endif // __cccl_ptx_isa >= 800
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc
new file mode 100644
index 00000000000..cd66de989a2
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc
@@ -0,0 +1,37 @@
+__global__ void test_cp_async_bulk(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; //
+        // 1a. unicast
+        * fn_ptr++ = reinterpret_cast(
+          static_cast(
+            cuda::ptx::cp_async_bulk));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size,
+        // [rdsmem_bar]; // 2.
+        * fn_ptr++ = reinterpret_cast(
+          static_cast(
+            cuda::ptx::cp_async_bulk));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3.
+        * fn_ptr++ = reinterpret_cast(
+          static_cast(
+            cuda::ptx::cp_async_bulk));));
+#endif // __cccl_ptx_isa >= 800
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc
new file mode 100644
index 00000000000..afdf14abb8a
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc
@@ -0,0 +1,10 @@
+__global__ void test_cp_async_bulk_commit_group(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.commit_group;
+        * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_commit_group));));
+#endif // __cccl_ptx_isa >= 800
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc
new file mode 100644
index 00000000000..b2bd0d968d9
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc
@@ -0,0 +1,18 @@
+__global__ void test_cp_async_bulk_multicast(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_HAS_FEATURE_SM_90a,
+    (
+        // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem],
+        // size, [smem_bar], ctaMask; // 1.
+        * fn_ptr++ = reinterpret_cast(
+          static_cast(cuda::ptx::cp_async_bulk));));
+#endif // __cccl_ptx_isa >= 800
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc
new file mode 100644
index 00000000000..f9d0d240d28
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc
@@ -0,0 +1,117 @@
+__global__ void test_cp_async_bulk_tensor(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
+        // tensorCoords], [smem_bar];// 1a.
+        * fn_ptr++ = reinterpret_cast(
+          static_cast(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a.
+        * fn_ptr++ = reinterpret_cast(
+          static_cast(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
+        // tensorCoords], [smem_bar];// 1b.
+        * fn_ptr++ = reinterpret_cast(
+          static_cast(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b.
+        * fn_ptr++ = reinterpret_cast(
+          static_cast(
+            cuda::ptx::cp_async_bulk_tensor));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
+        // tensorCoords], [smem_bar];// 1c.
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar];// 1d. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar];// 1e. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc new file mode 100644 index 00000000000..2851aab6d7c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc @@ -0,0 +1,82 @@ +__global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc new file mode 100644 index 00000000000..0139a65f6ce --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc @@ -0,0 +1,18 @@ +__global__ void test_cp_async_bulk_wait_group(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // cp.async.bulk.wait_group N; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::cp_async_bulk_wait_group));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // cp.async.bulk.wait_group.read N; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::cp_async_bulk_wait_group_read));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc new file mode 100644 index 00000000000..5ee274bcbe8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc @@ -0,0 +1,476 @@ +__global__ void test_cp_reduce_async_bulk(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk)); + // cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [dstMem], [srcMem], size; // 3. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk)); + // cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk)); + // cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [dstMem], [srcMem], size; // 4. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [dstMem], [srcMem], size; // 6. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc new file mode 100644 index 00000000000..fe38374fe00 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc @@ -0,0 +1,44 @@ +__global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) +{ +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 + +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 + +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [dstMem], [srcMem], size; // 5. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc new file mode 100644 index 00000000000..e7e58cfcb80 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc @@ -0,0 +1,35 @@ +__global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) +{ +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 + +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 + +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [dstMem], [srcMem], size; // 5. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc new file mode 100644 index 00000000000..6f0a7d710ce --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc @@ -0,0 +1,392 @@ +__global__ void test_cp_reduce_async_bulk_tensor(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1c. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc new file mode 100644 index 00000000000..2e464580de9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc @@ -0,0 +1,38 @@ +__global__ void test_fence(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 600 + NV_IF_TARGET( + NV_PROVIDES_SM_70, + ( + // fence.sc.cta; // 1. + * fn_ptr++ = + reinterpret_cast(static_cast(cuda::ptx::fence)); + // fence.sc.gpu; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.sc.sys; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acq_rel.cta; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acq_rel.gpu; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acq_rel.sys; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 600 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.sc.cluster; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acq_rel.cluster; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 780 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc new file mode 100644 index 00000000000..f503c1d055b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc @@ -0,0 +1,11 @@ +__global__ void test_fence_mbarrier_init(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.mbarrier_init.release.cluster; // 3. + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::fence_mbarrier_init));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc new file mode 100644 index 00000000000..a8021d3f5be --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc @@ -0,0 +1,9 @@ +__global__ void test_fence_proxy_alias(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 750 + NV_IF_TARGET(NV_PROVIDES_SM_70, + ( + // fence.proxy.alias; // 4. + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_alias));)); +#endif // __cccl_ptx_isa >= 750 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc new file mode 100644 index 00000000000..e3d8e6d160a --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc @@ -0,0 +1,24 @@ +__global__ void test_fence_proxy_async(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // fence.proxy.async; // 5. 
+                   * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async));));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // fence.proxy.async.global; // 6.
+        * fn_ptr++ =
+          reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async));
+        // fence.proxy.async.shared::cluster; // 6.
+        * fn_ptr++ =
+          reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async));
+        // fence.proxy.async.shared::cta; // 6.
+        * fn_ptr++ = reinterpret_cast(
+          static_cast(cuda::ptx::fence_proxy_async));));
+#endif // __cccl_ptx_isa >= 800
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc
new file mode 100644
index 00000000000..1e0ea93a387
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc
@@ -0,0 +1,44 @@
+__global__ void test_fence_proxy_tensormap_generic(void** fn_ptr)
+{
+#if __cccl_ptx_isa >= 830
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // fence.proxy.tensormap::generic.release.cta; // 7.
+        * fn_ptr++ = reinterpret_cast(static_cast(
+          cuda::ptx::fence_proxy_tensormap_generic));
+        // fence.proxy.tensormap::generic.release.cluster; // 7.
+        * fn_ptr++ =
+          reinterpret_cast(static_cast(
+            cuda::ptx::fence_proxy_tensormap_generic));
+        // fence.proxy.tensormap::generic.release.gpu; // 7.
+        * fn_ptr++ = reinterpret_cast(static_cast(
+          cuda::ptx::fence_proxy_tensormap_generic));
+        // fence.proxy.tensormap::generic.release.sys; // 7.
+        * fn_ptr++ = reinterpret_cast(static_cast(
+          cuda::ptx::fence_proxy_tensormap_generic));));
+#endif // __cccl_ptx_isa >= 830
+
+#if __cccl_ptx_isa >= 830
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // fence.proxy.tensormap::generic.acquire.cta [addr], size; // 8.
+        * fn_ptr++ = reinterpret_cast(
+          static_cast)>(
+            cuda::ptx::fence_proxy_tensormap_generic));
+        // fence.proxy.tensormap::generic.acquire.cluster [addr], size; // 8.
+        * fn_ptr++ = reinterpret_cast(
+          static_cast)>(
+            cuda::ptx::fence_proxy_tensormap_generic));
+        // fence.proxy.tensormap::generic.acquire.gpu [addr], size; // 8.
+        * fn_ptr++ = reinterpret_cast(
+          static_cast)>(
+            cuda::ptx::fence_proxy_tensormap_generic));
+        // fence.proxy.tensormap::generic.acquire.sys [addr], size; // 8.
+ * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::fence_proxy_tensormap_generic));)); +#endif // __cccl_ptx_isa >= 830 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc new file mode 100644 index 00000000000..90842352f90 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc @@ -0,0 +1,331 @@ +__global__ void test_get_sreg(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%tid.x; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_x)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%tid.y; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_y)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%tid.z; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_z)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ntid.x; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_x)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ntid.y; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_y)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ntid.z; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_z)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 130 + // mov.u32 sreg_value, %%laneid; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_laneid)); +#endif // __cccl_ptx_isa >= 130 + +#if __cccl_ptx_isa >= 130 + // mov.u32 sreg_value, %%warpid; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_warpid)); +#endif // __cccl_ptx_isa >= 130 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET(NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%nwarpid; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nwarpid));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ctaid.x; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_x)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ctaid.y; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_y)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ctaid.z; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_z)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%nctaid.x; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_x)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%nctaid.y; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_y)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%nctaid.z; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_z)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 130 + // mov.u32 sreg_value, %%smid; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_smid)); +#endif // __cccl_ptx_isa >= 130 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET(NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%nsmid; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nsmid));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 300 + // mov.u64 sreg_value, %%gridid; + *fn_ptr++ = 
reinterpret_cast(static_cast(cuda::ptx::get_sreg_gridid)); +#endif // __cccl_ptx_isa >= 300 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.pred sreg_value, %%is_explicit_cluster; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_is_explicit_cluster));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%clusterid.x; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_x));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%clusterid.y; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_y));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%clusterid.z; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_z));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%nclusterid.x; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_x));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%nclusterid.y; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_y));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%nclusterid.z; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_z));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_ctaid.x; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_x));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_ctaid.y; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_y));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_ctaid.z; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_z));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_nctaid.x; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_x));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_nctaid.y; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_y));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_nctaid.z; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_z));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_ctarank; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctarank));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_nctarank; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctarank));)); +#endif // __cccl_ptx_isa 
>= 780 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_eq; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_eq));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_le; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_le));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_lt; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_lt));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_ge; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_ge));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_gt; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_gt));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 100 + // mov.u32 sreg_value, %%clock; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock)); +#endif // __cccl_ptx_isa >= 100 + +#if __cccl_ptx_isa >= 500 + NV_IF_TARGET(NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%clock_hi; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock_hi));)); +#endif // __cccl_ptx_isa >= 500 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET(NV_PROVIDES_SM_35, + ( + // mov.u64 sreg_value, %%clock64; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock64));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 310 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u64 sreg_value, %%globaltimer; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer));)); +#endif // __cccl_ptx_isa >= 310 + +#if __cccl_ptx_isa >= 310 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%globaltimer_lo; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer_lo));)); +#endif // __cccl_ptx_isa >= 310 + +#if __cccl_ptx_isa >= 310 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%globaltimer_hi; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer_hi));)); +#endif // __cccl_ptx_isa >= 310 + +#if __cccl_ptx_isa >= 410 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%total_smem_size; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_total_smem_size));)); +#endif // __cccl_ptx_isa >= 410 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%aggr_smem_size; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_aggr_smem_size));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 410 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%dynamic_smem_size; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_dynamic_smem_size));)); +#endif // __cccl_ptx_isa >= 410 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_50, + ( + // mov.u64 sreg_value, %%current_graph_exec; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_current_graph_exec));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc new file mode 100644 index 00000000000..28b04c9f738 --- 
/dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc @@ -0,0 +1,10 @@ +__global__ void test_getctarank(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // getctarank.shared::cluster.u32 dest, addr; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::getctarank));)); +#endif // __cccl_ptx_isa >= 780 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc new file mode 100644 index 00000000000..4a94ec51d45 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc @@ -0,0 +1,74 @@ +__global__ void test_mbarrier_arrive(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET( + NV_PROVIDES_SM_80, + ( + // mbarrier.arrive.shared.b64 state, [addr]; // 1. + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 700 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.release.cta.shared::cta.b64 state, [addr]; // 3a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive)); + // mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr]; // 3a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.release.cta.shared::cta.b64 state, [addr], count; // 3b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive)); + // mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 3b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.release.cluster.shared::cluster.b64 _, [addr]; // 4a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.release.cluster.shared::cluster.b64 _, [addr], count; // 4b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc new file mode 100644 index 00000000000..085723a452b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc @@ -0,0 +1,31 @@ +__global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 state, [addr], tx_count; // 8. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive_expect_tx)); + // mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 state, [addr], tx_count; // 8. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive_expect_tx));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [addr], tx_count; // 9. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive_expect_tx));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc new file mode 100644 index 00000000000..d1d017cd3c2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc @@ -0,0 +1,10 @@ +__global__ void test_mbarrier_arrive_no_complete(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive_no_complete));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc new file mode 100644 index 00000000000..f814161d1f9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc @@ -0,0 +1,10 @@ +__global__ void test_mbarrier_init(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.init.shared.b64 [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_init));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc new file mode 100644 index 00000000000..e9d8661a07e --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc @@ -0,0 +1,53 @@ +__global__ void test_mbarrier_try_wait(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // + // 5b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // + // 6a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait)); + // mbarrier.try_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 6a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // + // 6b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait)); + // mbarrier.try_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; + // // 6b. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc new file mode 100644 index 00000000000..f8c3875451a --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc @@ -0,0 +1,52 @@ +__global__ void test_mbarrier_try_wait_parity(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // + // 8a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity)); + // mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // + // 8b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity)); + // mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity, + // suspendTimeHint; // 8b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc new file mode 100644 index 00000000000..80129e5016c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc @@ -0,0 +1,24 @@ +__global__ void test_mbarrier_test_wait(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 700 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait)); + // mbarrier.test_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 2. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc new file mode 100644 index 00000000000..30902c58905 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc @@ -0,0 +1,24 @@ +__global__ void test_mbarrier_test_wait_parity(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 710 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 710 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity)); + // mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc new file mode 100644 index 00000000000..0d562fd31a7 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc @@ -0,0 +1,120 @@ +__global__ void test_red_async(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [dest], value, [remote_bar]; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; + // // .u64 intentional + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc new file mode 100644 index 00000000000..4efb95ef217 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc @@ -0,0 +1,35 @@ +__global__ void test_st_async(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [addr], value, [remote_bar]; // 1. + * fn_ptr++ = + reinterpret_cast(static_cast(cuda::ptx::st_async)); + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [addr], value, [remote_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::st_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [addr], value, [remote_bar]; // 2. + * fn_ptr++ = + reinterpret_cast(static_cast(cuda::ptx::st_async)); + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [addr], value, [remote_bar]; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::st_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; + // // 3. 
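+                   // (case 3 only exercises the .v4.b32 form; no .v4.b64 overload is emitted in this generated test)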
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::st_async));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc new file mode 100644 index 00000000000..9a0a8c1f615 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc @@ -0,0 +1,29 @@ +__global__ void test_tensormap_cp_fenceproxy(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [dst], [src], size; + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::tensormap_cp_fenceproxy)); + // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [dst], [src], + // size; + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::tensormap_cp_fenceproxy)); + // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [dst], [src], size; + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::tensormap_cp_fenceproxy)); + // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [dst], [src], size; + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::tensormap_cp_fenceproxy));)); +#endif // __cccl_ptx_isa >= 830 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc new file mode 100644 index 00000000000..c69f3d11964 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc @@ -0,0 +1,198 @@ +__global__ void test_tensormap_replace(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // 
tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // 
tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); +#endif // __cccl_ptx_isa >= 830 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp index e6088d2f317..c460a2e5b09 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp @@ -31,48 +31,10 @@ * */ -__global__ void test_barrier_cluster(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.wait; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::barrier_cluster_wait));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.release; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.relaxed; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.wait.acquire; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_wait));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/barrier_cluster.inc" int main(int, char**) { + // FIXME(bgruber): why no call to test_barrier_cluster? 
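+  // Note: *.compile.pass.cpp tests only have to compile; the __global__ test kernel above
+  // already forces instantiation of each cuda::ptx::barrier_cluster_* overload by taking
+  // its address, so a call from main() is not strictly required.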
return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp index b4dff69d5b7..4695221dbc5 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp @@ -31,16 +31,7 @@ * */ -__global__ void test_cp_async_bulk_commit_group(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.commit_group; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_commit_group));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_commit_group.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp index b234c35fcdc..b1811727b66 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp @@ -31,43 +31,7 @@ * */ -__global__ void test_cp_async_bulk(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // - // 1a. unicast - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, - // [rdsmem_bar]; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp index 8dbc81741d2..c040528cabc 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp @@ -33,24 +33,7 @@ * */ -__global__ void test_cp_async_bulk_multicast(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], - // size, [smem_bar], ctaMask; // 1. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_multicast.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp index 64d9b9590a3..0b69b8a8f1c 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp @@ -31,123 +31,7 @@ * */ -__global__ void test_cp_async_bulk_tensor(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1c. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1d. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1e. 
- * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_tensor.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp index 2a3457396d0..7d53d9ee0c9 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp @@ -33,88 +33,7 @@ * */ -__global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_tensor_multicast.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp index 3bfa9bbc7dd..39df53c5f9d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp @@ -31,24 +31,7 @@ * */ -__global__ void test_cp_async_bulk_wait_group(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // cp.async.bulk.wait_group N; - * fn_ptr++ = reinterpret_cast( - static_cast)>(cuda::ptx::cp_async_bulk_wait_group));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // cp.async.bulk.wait_group.read N; - * fn_ptr++ = reinterpret_cast( - static_cast)>(cuda::ptx::cp_async_bulk_wait_group_read));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_wait_group.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp index b1d06ca49c0..a186e34a809 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp @@ -31,568 +31,14 @@ * */ -__global__ void test_cp_reduce_async_bulk(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk)); - // cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk)); - // cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [dstMem], [srcMem], size; // 3. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk)); - // cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [dstMem], [srcMem], size; // 4. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [dstMem], [srcMem], size; // 6. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_reduce_async_bulk.inc" #ifdef _LIBCUDACXX_HAS_NVF16 -__global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) -{ -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 - -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 - -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [dstMem], [srcMem], size; // 5. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 -} - +# include "generated/cp_reduce_async_bulk_f16.inc" #endif // _LIBCUDACXX_HAS_NVF16 #ifdef _LIBCUDACXX_HAS_NVBF16 -__global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) -{ -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 - -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 - -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [dstMem], [srcMem], size; // 5. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 -} - +# include "generated/cp_reduce_async_bulk_bf16.inc" #endif // _LIBCUDACXX_HAS_NVBF16 int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp index 5ae7d313c36..14abc0d3ae6 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp @@ -31,398 +31,7 @@ * */ -__global__ void test_cp_reduce_async_bulk_tensor(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_reduce_async_bulk_tensor.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp index 0be4f6b32fe..641cb83f172 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp @@ -31,136 +31,11 @@ * */ -__global__ void test_fence(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 600 - NV_IF_TARGET( - NV_PROVIDES_SM_70, - ( - // fence.sc.cta; // 1. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::fence)); - // fence.sc.gpu; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.sc.sys; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.cta; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.gpu; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.sys; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence));)); -#endif // __cccl_ptx_isa >= 600 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.sc.cluster; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.cluster; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence));)); -#endif // __cccl_ptx_isa >= 780 -} - -__global__ void test_fence_mbarrier_init(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.mbarrier_init.release.cluster; // 3. - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_mbarrier_init));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_fence_proxy_alias(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 750 - NV_IF_TARGET(NV_PROVIDES_SM_70, - ( - // fence.proxy.alias; // 4. - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_alias));)); -#endif // __cccl_ptx_isa >= 750 -} - -__global__ void test_fence_proxy_async(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // fence.proxy.async; // 5. - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.proxy.async.global; // 6. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async)); - // fence.proxy.async.shared::cluster; // 6. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async)); - // fence.proxy.async.shared::cta; // 6. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence_proxy_async));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_fence_proxy_tensormap_generic(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.proxy.tensormap::generic.release.cta; // 7. - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.release.cluster; // 7. - * fn_ptr++ = - reinterpret_cast(static_cast( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.release.gpu; // 7. - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.release.sys; // 7. 
- * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_proxy_tensormap_generic));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.proxy.tensormap::generic.acquire.cta [addr], size; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.acquire.cluster [addr], size; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.acquire.gpu [addr], size; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.acquire.sys [addr], size; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::fence_proxy_tensormap_generic));)); -#endif // __cccl_ptx_isa >= 830 -} +#include "generated/fence.inc" +#include "generated/fence_mbarrier_init.inc" +#include "generated/fence_proxy_alias.inc" +#include "generated/fence_proxy_async.inc" +#include "generated/fence_proxy_tensormap_generic.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp index 0003afb2fe2..697cc00a1be 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp @@ -32,337 +32,7 @@ * */ -__global__ void test_get_sreg(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%tid.x; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_x)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%tid.y; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_y)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%tid.z; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_z)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ntid.x; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_x)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ntid.y; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_y)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ntid.z; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_z)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 130 - // mov.u32 sreg_value, %%laneid; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_laneid)); -#endif // __cccl_ptx_isa >= 130 - -#if __cccl_ptx_isa >= 130 - // mov.u32 sreg_value, %%warpid; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_warpid)); -#endif // __cccl_ptx_isa >= 130 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET(NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%nwarpid; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nwarpid));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ctaid.x; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_x)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ctaid.y; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_y)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ctaid.z; - *fn_ptr++ = 
reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_z)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%nctaid.x; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_x)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%nctaid.y; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_y)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%nctaid.z; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_z)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 130 - // mov.u32 sreg_value, %%smid; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_smid)); -#endif // __cccl_ptx_isa >= 130 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET(NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%nsmid; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nsmid));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 300 - // mov.u64 sreg_value, %%gridid; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_gridid)); -#endif // __cccl_ptx_isa >= 300 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.pred sreg_value, %%is_explicit_cluster; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_is_explicit_cluster));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%clusterid.x; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_x));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%clusterid.y; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_y));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%clusterid.z; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_z));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%nclusterid.x; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_x));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%nclusterid.y; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_y));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%nclusterid.z; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_z));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_ctaid.x; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_x));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_ctaid.y; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_y));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_ctaid.z; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_z));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, 
%%cluster_nctaid.x; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_x));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_nctaid.y; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_y));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_nctaid.z; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_z));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_ctarank; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctarank));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_nctarank; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctarank));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_eq; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_eq));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_le; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_le));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_lt; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_lt));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_ge; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_ge));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_gt; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_gt));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 100 - // mov.u32 sreg_value, %%clock; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock)); -#endif // __cccl_ptx_isa >= 100 - -#if __cccl_ptx_isa >= 500 - NV_IF_TARGET(NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%clock_hi; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock_hi));)); -#endif // __cccl_ptx_isa >= 500 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET(NV_PROVIDES_SM_35, - ( - // mov.u64 sreg_value, %%clock64; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock64));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 310 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u64 sreg_value, %%globaltimer; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer));)); -#endif // __cccl_ptx_isa >= 310 - -#if __cccl_ptx_isa >= 310 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%globaltimer_lo; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer_lo));)); -#endif // __cccl_ptx_isa >= 310 - -#if __cccl_ptx_isa >= 310 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%globaltimer_hi; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer_hi));)); -#endif // __cccl_ptx_isa >= 310 - -#if __cccl_ptx_isa >= 410 - NV_IF_TARGET( - NV_PROVIDES_SM_35, 
- ( - // mov.u32 sreg_value, %%total_smem_size; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_total_smem_size));)); -#endif // __cccl_ptx_isa >= 410 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%aggr_smem_size; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_aggr_smem_size));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 410 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%dynamic_smem_size; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_dynamic_smem_size));)); -#endif // __cccl_ptx_isa >= 410 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_50, - ( - // mov.u64 sreg_value, %%current_graph_exec; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_current_graph_exec));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/get_sreg.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp index 73112e871b0..80fc71c0998 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp @@ -31,16 +31,7 @@ * */ -__global__ void test_getctarank(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // getctarank.shared::cluster.u32 dest, addr; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::getctarank));)); -#endif // __cccl_ptx_isa >= 780 -} +#include "generated/getctarank.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp index 3a213d9bce3..2350b176630 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp @@ -31,123 +31,9 @@ * */ -__global__ void test_mbarrier_arrive(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET( - NV_PROVIDES_SM_80, - ( - // mbarrier.arrive.shared.b64 state, [addr]; // 1. - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 700 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.release.cta.shared::cta.b64 state, [addr]; // 3a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive)); - // mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr]; // 3a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.release.cta.shared::cta.b64 state, [addr], count; // 3b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive)); - // mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 3b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.release.cluster.shared::cluster.b64 _, [addr]; // 4a. 
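// --- Editorial sketch (not part of the patch) -------------------------------
// Every hunk in this patch relies on the same compile-only idiom: the kernel
// pins one specific overload of a cuda::ptx wrapper by static_cast-ing it to an
// explicit function-pointer type, then stores the pointer through fn_ptr so the
// instantiation cannot be discarded. A minimal self-contained version follows,
// using get_sreg_laneid; the uint32_t() signature is an assumption inferred
// from %%laneid being a 32-bit register, not copied from the library headers.
#include <cuda/ptx>

#include <cstdint>

__global__ void sketch_overload_pinning(void** fn_ptr)
{
#if __cccl_ptx_isa >= 130
  // mov.u32 sreg_value, %%laneid;
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<std::uint32_t (*)()>(cuda::ptx::get_sreg_laneid));
#endif // __cccl_ptx_isa >= 130
}

int main(int, char**)
{
  return 0;
}
// -----------------------------------------------------------------------------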
- * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.release.cluster.shared::cluster.b64 _, [addr], count; // 4b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_mbarrier_arrive_no_complete(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive_no_complete));)); -#endif // __cccl_ptx_isa >= 700 -} - -__global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 state, [addr], tx_count; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive_expect_tx)); - // mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 state, [addr], tx_count; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive_expect_tx));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [addr], tx_count; // 9. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive_expect_tx));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/mbarrier_arrive.inc" +#include "generated/mbarrier_arrive_expect_tx.inc" +#include "generated/mbarrier_arrive_no_complete.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp index 6aa0f87e41e..b445a61a8a9 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp @@ -31,16 +31,7 @@ * */ -__global__ void test_mbarrier_init(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.init.shared.b64 [addr], count; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_init));)); -#endif // __cccl_ptx_isa >= 700 -} +#include "generated/mbarrier_init.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp index 007ccdef29c..e9c17a2024d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp @@ -31,162 +31,10 @@ * */ -__global__ void test_mbarrier_test_wait(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_test_wait));)); -#endif // __cccl_ptx_isa >= 700 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.test_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait)); - // mbarrier.test_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 2. 
- * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_mbarrier_test_wait_parity(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 710 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_test_wait_parity));)); -#endif // __cccl_ptx_isa >= 710 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait_parity)); - // mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait_parity));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_mbarrier_try_wait(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_try_wait));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // - // 5b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_try_wait));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // - // 6a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait)); - // mbarrier.try_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 6a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // - // 6b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait)); - // mbarrier.try_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; - // // 6b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_mbarrier_try_wait_parity(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_try_wait_parity));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_try_wait_parity));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // - // 8a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait_parity)); - // mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. 
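// --- Editorial sketch (not part of the patch) -------------------------------
// The acquire.cta and acquire.cluster entries above are selected through tag
// arguments rather than distinct function names. A call site would look roughly
// like the kernel below; the exact parameter shape of mbarrier_test_wait
// (barrier pointer plus state by const reference) is an assumption, and the
// shared-memory mbarrier setup a real wait loop needs is deliberately elided.
#include <cuda/ptx>
#include <nv/target>

#include <cstdint>

__global__ void sketch_mbarrier_test_wait(std::uint64_t* bar, std::uint64_t state)
{
#if __cccl_ptx_isa >= 800
  NV_IF_TARGET(NV_PROVIDES_SM_90,
               (
                 // mbarrier.test_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state;
                 bool done = cuda::ptx::mbarrier_test_wait(cuda::ptx::sem_acquire, cuda::ptx::scope_cta, bar, state);
                 (void) done;));
#endif // __cccl_ptx_isa >= 800
}

int main(int, char**)
{
  return 0;
}
// -----------------------------------------------------------------------------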
- * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait_parity));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // - // 8b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait_parity)); - // mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity, - // suspendTimeHint; // 8b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait_parity));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/mbarrier_try_wait.inc" +#include "generated/mbarrier_try_wait_parity.inc" +#include "generated/mbarrier_wait.inc" +#include "generated/mbarrier_wait_parity.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp index 5a910b77fbd..4a380ec8396 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp @@ -31,126 +31,7 @@ * */ -__global__ void test_red_async(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - 
static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; - // // .u64 intentional - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 -} +#include "generated/red_async.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp index 1cc0c1e2d74..2c74f48e04d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp @@ -31,41 +31,7 @@ * */ -__global__ void test_st_async(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [addr], value, [remote_bar]; // 1. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::st_async)); - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [addr], value, [remote_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::st_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [addr], value, [remote_bar]; // 2. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::st_async)); - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [addr], value, [remote_bar]; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::st_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; - // // 3. 
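// --- Editorial sketch (not part of the patch) -------------------------------
// What the st.async entries above instantiate, written as a plain call. The
// scalar overload is assumed to take (destination pointer, value, remote
// mbarrier pointer) in the same order as the PTX operands; in real code both
// the destination and the mbarrier must map to shared memory of a peer CTA in
// the cluster, which this compile-only sketch does not set up.
#include <cuda/ptx>
#include <nv/target>

#include <cstdint>

__global__ void sketch_st_async(int* remote_slot, std::uint64_t* remote_bar)
{
#if __cccl_ptx_isa >= 810
  NV_IF_TARGET(NV_PROVIDES_SM_90,
               (
                 // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [addr], value, [remote_bar];
                 cuda::ptx::st_async(remote_slot, 42, remote_bar);));
#endif // __cccl_ptx_isa >= 810
}

int main(int, char**)
{
  return 0;
}
// -----------------------------------------------------------------------------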
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::st_async));)); -#endif // __cccl_ptx_isa >= 810 -} +#include "generated/st_async.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp index 9d923951f0c..d0d3a967836 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp @@ -31,35 +31,7 @@ * */ -__global__ void test_tensormap_cp_fenceproxy(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [dst], [src], size; - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::tensormap_cp_fenceproxy)); - // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [dst], [src], - // size; - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::tensormap_cp_fenceproxy)); - // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [dst], [src], size; - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::tensormap_cp_fenceproxy)); - // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [dst], [src], size; - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::tensormap_cp_fenceproxy));)); -#endif // __cccl_ptx_isa >= 830 -} +#include "generated/tensormap_cp_fenceproxy.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp index f7360eacbcd..d780ff26dca 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp @@ -31,204 +31,7 @@ * */ -__global__ void test_tensormap_replace(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::tensormap_replace_global_address));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::tensormap_replace_global_address));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::tensormap_replace_rank));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::tensormap_replace_rank));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_box_dim));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - 
( - // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_box_dim));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_global_dim));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_global_dim));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int64_t)>( - cuda::ptx::tensormap_replace_global_stride));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int64_t)>( - cuda::ptx::tensormap_replace_global_stride));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_element_size));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_element_size));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_elemtype));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_elemtype));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_interleave_layout));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_interleave_layout));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_swizzle_mode));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // 
tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_swizzle_mode));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_fill_mode));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_fill_mode));)); -#endif // __cccl_ptx_isa >= 830 -} +#include "generated/tensormap_replace.inc" int main(int, char**) {