Commit: Apply clang-format
bdice committed Apr 30, 2024
1 parent 9abba40 commit f542bf3
Showing 101 changed files with 16,159 additions and 14,659 deletions.
596 changes: 360 additions & 236 deletions libcudacxx/include/cuda/annotated_ptr

Large diffs are not rendered by default.

226 changes: 118 additions & 108 deletions libcudacxx/include/cuda/barrier
@@ -21,8 +21,8 @@
# pragma system_header
#endif // no system header

#include <cuda/std/barrier>
#include <cuda/ptx>
#include <cuda/std/barrier>

// Forward-declare CUtensorMap for use in cp_async_bulk_tensor_* PTX wrapping
// functions. These functions take a pointer to CUtensorMap, so do not need to
@@ -54,175 +54,185 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL
#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _CCCL_DEVICE
void cp_async_bulk_global_to_shared(void *__dest, const void *__src, _CUDA_VSTD::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
_LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
_LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address.");
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address.");

_CUDA_VPTX::cp_async_bulk(
_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
__dest, __src, __size,
::cuda::device::barrier_native_handle(__bar));
_LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
_LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address.");
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address.");

_CUDA_VPTX::cp_async_bulk(
_CUDA_VPTX::space_cluster,
_CUDA_VPTX::space_global,
__dest,
__src,
__size,
::cuda::device::barrier_native_handle(__bar));
}


// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _CCCL_DEVICE
void cp_async_bulk_shared_to_global(void *__dest, const void * __src, _CUDA_VSTD::uint32_t __size)
inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size)
{
_LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address.");
_LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address.");
_LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address.");
_LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address.");

_CUDA_VPTX::cp_async_bulk(
_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
__dest, __src, __size);
_CUDA_VPTX::cp_async_bulk(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __dest, __src, __size);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE
void cp_async_bulk_tensor_1d_global_to_shared(
void *__dest, const CUtensorMap *__tensor_map , int __c0, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
inline _CCCL_DEVICE void cp_async_bulk_tensor_1d_global_to_shared(
void* __dest, const CUtensorMap* __tensor_map, int __c0, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const _CUDA_VSTD::int32_t __coords[]{__c0};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
__dest, __tensor_map, __coords,
::cuda::device::barrier_native_handle(__bar));
const _CUDA_VSTD::int32_t __coords[]{__c0};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_cluster,
_CUDA_VPTX::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE
void cp_async_bulk_tensor_2d_global_to_shared(
void *__dest, const CUtensorMap *__tensor_map , int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
inline _CCCL_DEVICE void cp_async_bulk_tensor_2d_global_to_shared(
void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
__dest, __tensor_map, __coords,
::cuda::device::barrier_native_handle(__bar));
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_cluster,
_CUDA_VPTX::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE
void cp_async_bulk_tensor_3d_global_to_shared(
void *__dest, const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_global_to_shared(
void* __dest,
const CUtensorMap* __tensor_map,
int __c0,
int __c1,
int __c2,
::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
__dest, __tensor_map, __coords,
::cuda::device::barrier_native_handle(__bar));
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_cluster,
_CUDA_VPTX::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE
void cp_async_bulk_tensor_4d_global_to_shared(
void *__dest, const CUtensorMap *__tensor_map , int __c0, int __c1, int __c2, int __c3, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_global_to_shared(
void* __dest,
const CUtensorMap* __tensor_map,
int __c0,
int __c1,
int __c2,
int __c3,
::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
__dest, __tensor_map, __coords,
::cuda::device::barrier_native_handle(__bar));
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_cluster,
_CUDA_VPTX::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE
void cp_async_bulk_tensor_5d_global_to_shared(
void *__dest, const CUtensorMap *__tensor_map , int __c0, int __c1, int __c2, int __c3, int __c4, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_global_to_shared(
void* __dest,
const CUtensorMap* __tensor_map,
int __c0,
int __c1,
int __c2,
int __c3,
int __c4,
::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
__dest, __tensor_map, __coords,
::cuda::device::barrier_native_handle(__bar));
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_cluster,
_CUDA_VPTX::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE
void cp_async_bulk_tensor_1d_shared_to_global(
const CUtensorMap *__tensor_map, int __c0, const void *__src)
inline _CCCL_DEVICE void
cp_async_bulk_tensor_1d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, const void* __src)
{
const _CUDA_VSTD::int32_t __coords[]{__c0};
const _CUDA_VSTD::int32_t __coords[]{__c0};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
__tensor_map, __coords, __src);
_CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE
void cp_async_bulk_tensor_2d_shared_to_global(
const CUtensorMap *__tensor_map, int __c0, int __c1, const void *__src)
inline _CCCL_DEVICE void
cp_async_bulk_tensor_2d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, int __c1, const void* __src)
{
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
__tensor_map, __coords, __src);
_CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE
void cp_async_bulk_tensor_3d_shared_to_global(
const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, const void *__src)
inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_shared_to_global(
const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, const void* __src)
{
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
__tensor_map, __coords, __src);
_CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE
void cp_async_bulk_tensor_4d_shared_to_global(
const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, const void *__src)
inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_shared_to_global(
const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, const void* __src)
{
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
__tensor_map, __coords, __src);
_CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE
void cp_async_bulk_tensor_5d_shared_to_global(
const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void *__src)
inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_shared_to_global(
const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void* __src)
{
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};
const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

_CUDA_VPTX::cp_async_bulk_tensor(
_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
__tensor_map, __coords, __src);
_CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
inline _CCCL_DEVICE
void fence_proxy_async_shared_cta() {
_CUDA_VPTX::fence_proxy_async(_CUDA_VPTX::space_shared);
inline _CCCL_DEVICE void fence_proxy_async_shared_cta()
{
_CUDA_VPTX::fence_proxy_async(_CUDA_VPTX::space_shared);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
inline _CCCL_DEVICE
void cp_async_bulk_commit_group()
inline _CCCL_DEVICE void cp_async_bulk_commit_group()
{
_CUDA_VPTX::cp_async_bulk_commit_group();
_CUDA_VPTX::cp_async_bulk_commit_group();
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
template <int __n_prior>
inline _CCCL_DEVICE
void cp_async_bulk_wait_group_read()
inline _CCCL_DEVICE void cp_async_bulk_wait_group_read()
{
static_assert(__n_prior <= 63, "cp_async_bulk_wait_group_read: waiting for more than 63 groups is not supported.");
_CUDA_VPTX::cp_async_bulk_wait_group_read(_CUDA_VPTX::n32_t<__n_prior>{});
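For orientation, the device functions reformatted above follow the bulk-copy (TMA) pattern from the CUDA programming guide. The sketch below is illustrative only and not part of this commit: it assumes the public namespace is cuda::device::experimental, that cuda::device::barrier_arrive_tx is provided by <cuda/barrier>, and an SM_90 target; the kernel name, tile size, and buffer names are made up.

#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move

namespace cde = cuda::device::experimental;
using block_barrier = cuda::barrier<cuda::thread_scope_block>;

// Round-trips a 256-int tile: bulk-copy global -> shared, compute, bulk-copy back.
// gmem is assumed to be 16-byte aligned.
__global__ void roundtrip(int* gmem)
{
  __shared__ alignas(16) int tile[256]; // bulk-copy destination must be 16-byte aligned

#pragma nv_diag_suppress static_var_with_dynamic_init
  __shared__ block_barrier bar;

  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x);
    cde::fence_proxy_async_shared_cta(); // make the initialized barrier visible to the async proxy
  }
  __syncthreads();

  block_barrier::arrival_token token;
  if (threadIdx.x == 0)
  {
    // Size must be a multiple of 16; destination shared, source global.
    cde::cp_async_bulk_global_to_shared(tile, gmem, sizeof(tile), bar);
    // Arrive and tell the barrier how many bytes the copy will deliver.
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(tile));
  }
  else
  {
    token = bar.arrive();
  }
  bar.wait(cuda::std::move(token));

  // ... compute on tile[] ...

  cde::fence_proxy_async_shared_cta(); // publish shared-memory writes to the async proxy
  __syncthreads();

  if (threadIdx.x == 0)
  {
    cde::cp_async_bulk_shared_to_global(gmem, tile, sizeof(tile));
    cde::cp_async_bulk_commit_group();
    cde::cp_async_bulk_wait_group_read<0>(); // wait until the copy has finished reading shared memory
  }
  __syncthreads();
}

The tensor variants (cp_async_bulk_tensor_*d_global_to_shared / _shared_to_global) slot into the same pattern, taking a CUtensorMap* and per-dimension coordinates instead of a raw pointer and byte count.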
12 changes: 6 additions & 6 deletions libcudacxx/include/cuda/discard_memory
@@ -21,8 +21,8 @@
# pragma system_header
#endif // no system header

#include <cuda/std/cstdint>
#include <cuda/ptx>
#include <cuda/std/cstdint>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

@@ -37,14 +37,14 @@ inline _CCCL_HOST_DEVICE void discard_memory(volatile void* __ptr, size_t __nbyt
NV_PROVIDES_SM_80,
(if (!__isGlobal((void*) __ptr)) return;

char* __p = reinterpret_cast<char*>(const_cast<void*>(__ptr));
char* const __end_p = __p + __nbytes;
char* __p = reinterpret_cast<char*>(const_cast<void*>(__ptr));
char* const __end_p = __p + __nbytes;
static constexpr size_t _LINE_SIZE = 128;

// Trim the first block and last block if they're not 128 bytes aligned
size_t __misalignment = reinterpret_cast<uintptr_t>(__p) % _LINE_SIZE;
char* __start_aligned = __misalignment == 0 ? __p : __p + (_LINE_SIZE - __misalignment);
char* const __end_aligned = __end_p - (reinterpret_cast<uintptr_t>(__end_p) % _LINE_SIZE);
size_t __misalignment = reinterpret_cast<uintptr_t>(__p) % _LINE_SIZE;
char* __start_aligned = __misalignment == 0 ? __p : __p + (_LINE_SIZE - __misalignment);
char* const __end_aligned = __end_p - (reinterpret_cast<uintptr_t>(__end_p) % _LINE_SIZE);

while (__start_aligned < __end_aligned) {
asm volatile("discard.global.L2 [%0], 128;" ::"l"(__start_aligned) :);
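The hunk above only re-indents the existing logic: trim the unaligned head and tail of the range, then issue discard.global.L2 on each 128-byte line in between. A hypothetical call site is sketched below; the kernel name and scratch buffer are illustrative, not part of this commit.

#include <cuda/discard_memory>

// Uses a global scratch buffer, then tells the memory system its contents are dead.
__global__ void scan_with_scratch(float* scratch, size_t nbytes)
{
  // ... write and read scratch as temporary storage ...

  // Hint only: the bytes in [scratch, scratch + nbytes) become indeterminate.
  // On SM 80+ with a global pointer this issues discard.global.L2 over the
  // 128-byte-aligned interior of the range, so dirty L2 lines need not be
  // written back to DRAM; everywhere else it is a no-op.
  cuda::discard_memory(scratch, nbytes);
}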
