Skip to content

Commit

Permalink
Fixes #689: All remaining async subnamespace operations - `typed_se…
Browse files Browse the repository at this point in the history
…t()`, `set()` and `zero()` - now out of the namespace, and unified with their non-async variants
  • Loading branch information
eyalroz committed Nov 16, 2024
1 parent 2dbc238 commit ee4e618
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 94 deletions.
144 changes: 64 additions & 80 deletions src/cuda/api/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -378,9 +378,10 @@ struct deleter {
* @param start The first location to set to @p value ; must be properly aligned.
* @param value A (properly aligned) value to set T-elements to.
* @param num_elements The number of type-T elements (i.e. _not_ necessarily the number of bytes).
* @param stream A stream on which to schedule this action; may be omitted.
*/
template <typename T>
void typed_set(T* start, const T& value, size_t num_elements);
void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream = {});

/**
* Sets all bytes in a region of memory to a fixed value
Expand All @@ -391,10 +392,26 @@ void typed_set(T* start, const T& value, size_t num_elements);
* @param start starting address of the memory region to set, in a CUDA
* device's global memory
* @param num_bytes size of the memory region in bytes
* @param stream a stream on which to schedule the operation; may be omitted
*
**
* Asynchronously sets all bytes in a stretch of memory to a single value
*
* @note asynchronous version of @ref memory::set(void*, int, size_t)
*
* @param start starting address of the memory region to set,
* in a CUDA device's global memory
* @param byte_value value to set the memory region to
* @param num_bytes size of the memory region in bytes
* @param stream A stream on which to schedule this action; may be omitted.
*/
inline void set(void* start, int byte_value, size_t num_bytes)
inline void set(void* start, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream = {})
{
return typed_set<unsigned char>(static_cast<unsigned char*>(start), static_cast<unsigned char>(byte_value), num_bytes);
return typed_set<unsigned char>(
static_cast<unsigned char*>(start),
static_cast<unsigned char>(byte_value),
num_bytes,
stream);
}

/**
Expand All @@ -404,10 +421,11 @@ inline void set(void* start, int byte_value, size_t num_bytes)
*
* @param byte_value value to set the memory region to
* @param region a region to set to @p byte_value, in a CUDA device's global memory
* @param stream A stream on which to schedule this action; may be omitted.
*/
inline void set(region_t region, int byte_value)
inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
{
set(region.start(), byte_value, region.size());
set(region.start(), byte_value, region.size(), stream);
}

/**
Expand All @@ -416,50 +434,43 @@ inline void set(region_t region, int byte_value)
* @param start the beginning of a region of memory to zero-out, accessible
* within a CUDA device's global memory
* @param num_bytes the size in bytes of the region of memory to zero-out
* @param stream A stream on which to schedule this action; may be omitted.
*/
inline void zero(void* start, size_t num_bytes)
inline void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream = {})
{
set(start, 0, num_bytes);
set(start, 0, num_bytes, stream);
}

/**
* Sets all bytes in a region of memory to 0 (zero)
*
* @param region the memory region to zero-out, accessible as a part of a
* CUDA device's global memory
* @param stream A stream on which to schedule this action; may be omitted.
*/
inline void zero(region_t region)
inline void zero(region_t region, optional_ref<const stream_t> stream = {})
{
zero(region.start(), region.size());
zero(region.start(), region.size(), stream);
}


/**
* Sets all bytes of a single pointed-to value to 0
*
* @param ptr pointer to a value of a certain type, accessible within
* a CUDA device's global memory
* @param stream an existing stream on which to schedule this action; may be omitted
*/
template <typename T>
inline void zero(T* ptr)
inline void zero(T* ptr, optional_ref<const stream_t> stream = {})
{
zero(ptr, sizeof(T));
zero(ptr, sizeof(T), stream);
}

} // namespace device

/// Asynchronous memory operations
namespace detail_ {

/**
* Asynchronous versions of @ref memory::copy functions.
*
*
* @note Since we assume Compute Capability >= 2.0, all devices support the
* Unified Virtual Address Space, so the CUDA driver can determine, for each pointer,
* where the data is located, and one does not have to specify this.
*/

///@{

/**
Expand Down Expand Up @@ -717,8 +728,9 @@ inline void copy(T(&destination)[N], T* source, optional_ref<const stream_t> str
* memory, global CUDA-device-side memory or CUDA-managed memory.
* @param byte_value value to set the memory region to
* @param num_bytes The amount of memory to set to @p byte_value
* @param stream A stream on which to schedule this action; may be omitted.
*/
void set(void* ptr, int byte_value, size_t num_bytes);
void set(void* ptr, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream);

/**
* Sets all bytes in a region of memory to a fixed value
Expand All @@ -729,21 +741,23 @@ void set(void* ptr, int byte_value, size_t num_bytes);
* @param region the memory region to set; may be in host-side memory,
* global CUDA-device-side memory or CUDA-managed memory.
* @param byte_value value to set the memory region to
* @param stream A stream on which to schedule this action; may be omitted.
*/
inline void set(region_t region, int byte_value)
inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream)
{
return set(region.start(), byte_value, region.size());
return set(region.start(), byte_value, region.size(), stream);
}

/**
* Sets all bytes in a region of memory to 0 (zero)
*
* @param region the memory region to zero-out; may be in host-side memory,
* global CUDA-device-side memory or CUDA-managed memory.
* @param stream A stream on which to schedule this action; may be omitted.
*/
inline void zero(region_t region)
inline void zero(region_t region, optional_ref<const stream_t> stream)
{
return set(region, 0);
return set(region, 0, stream);
}

/**
Expand All @@ -752,18 +766,20 @@ inline void zero(region_t region)
* @param ptr the beginning of a region of memory to zero-out; may be in host-side
* memory, global CUDA-device-side memory or CUDA-managed memory.
* @param num_bytes the size in bytes of the region of memory to zero-out
* @param stream A stream on which to schedule this action; may be omitted.
*/
inline void zero(void* ptr, size_t num_bytes)
inline void zero(void* ptr, size_t num_bytes, optional_ref<const stream_t> stream)
{
return set(ptr, 0, num_bytes);
return set(ptr, 0, num_bytes, stream);
}

/**
* Sets all bytes of a single pointed-to value to 0
*
* @param ptr pointer to a single element of a certain type, which may
* be in host-side memory, global CUDA-device-side memory or CUDA-managed
* memory
* memory.
* @param stream A stream on which to schedule this action; may be omitted.
*/
template <typename T>
inline void zero(T* ptr)
Expand Down Expand Up @@ -1294,11 +1310,8 @@ inline void copy(void* destination, const_region_t source, optional_ref<const st
copy(destination, source, source.size(), stream);
}


namespace device {

namespace async {

namespace detail_ {

inline void set(void* start, int byte_value, size_t num_bytes, stream::handle_t stream_handle)
Expand Down Expand Up @@ -1348,67 +1361,27 @@ inline void typed_set(T* start, const T& value, size_t num_elements, stream::han
/**
* Sets consecutive elements of a region of memory to a fixed value of some width
*
* @note A generalization of `async::set()`, for different-size units.
* @note A generalization of `set()`, for different-size units.
*
* @tparam T An unsigned integer type of size 1, 2 or 4
* @param start The first location to set to @p value ; must be properly aligned.
* @param value A (properly aligned) value to set T-elements to.
* @param num_elements The number of type-T elements (i.e. _not_ necessarily the number of bytes).
* @param stream The stream on which to enqueue the operation.
* @param stream A stream on which to schedule this action; may be omitted.
*/
template <typename T>
void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream);

/**
* Asynchronously sets all bytes in a stretch of memory to a single value
*
* @note asynchronous version of @ref memory::set(void*, int, size_t)
*
* @param start starting address of the memory region to set,
* in a CUDA device's global memory
* @param byte_value value to set the memory region to
* @param num_bytes size of the memory region in bytes
* @param stream stream on which to schedule this action
*/
inline void set(void* start, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream)
{
return typed_set<unsigned char>(
static_cast<unsigned char*>(start),
static_cast<unsigned char>(byte_value),
num_bytes,
stream);
}

/**
* Asynchronously sets all bytes in a stretch of memory to 0.
*
* @note asynchronous version of @ref memory::zero(void*, size_t)
*
* @param start starting address of the memory region to set,
* in a CUDA device's global memory
* @param num_bytes size of the memory region in bytes
* @param stream stream on which to schedule this action
* @param start starting address of the memory region to set, in a CUDA device's global memory
* @param num_bytes size of the memory region in bytes
* @param stream stream on which to schedule this action
* @param stream A stream on which to enqueue the operation; may be omitted.
*/
void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream);

/**
* Asynchronously sets all bytes of a single pointed-to value
* to 0 (zero).
*
* @note asynchronous version of @ref memory::zero(T*)
*
* @param ptr a pointer to the value to be zeroed; must be valid in the
* CUDA context of @p stream
* @param stream stream on which to schedule this action
*/
template <typename T>
inline void zero(T* ptr, optional_ref<const stream_t> stream)
{
zero(ptr, sizeof(T), stream);
}

} // namespace async

} // namespace device

namespace inter_context {
Expand Down Expand Up @@ -1851,10 +1824,13 @@ inline void deregister(const_region_t region)
* Sets all bytes in a stretch of host-side memory to a single value
*
* @note a wrapper for @ref ::std::memset
*
* @param byte_value The value to set each byte in the memory region to.
*/
///@{

/**
* @param start starting address of the memory region to set,
* in host memory; can be either CUDA-allocated or otherwise.
* @param byte_value value to set the memory region to
* @param num_bytes size of the memory region in bytes
*/
inline void set(void* start, int byte_value, size_t num_bytes)
Expand All @@ -1863,6 +1839,14 @@ inline void set(void* start, int byte_value, size_t num_bytes)
// TODO: Error handling?
}

/**
* @param region The region of memory to set to the fixed value
*/
inline void set(region_t region, int byte_value)
{
set(region.start(), byte_value, region.size());
}

/**
* Zero-out a region of host memory
*
Expand Down
31 changes: 19 additions & 12 deletions src/cuda/api/multi_wrapper_impls/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,11 +142,6 @@ inline void free(void* ptr)

namespace async {

template <typename T>
inline void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream)
{
detail_::set(start, value, num_elements, stream.handle());
}

inline void zero(void* start, size_t num_bytes, const stream_t& stream)
{
Expand Down Expand Up @@ -411,34 +406,46 @@ inline void get_attributes(unsigned num_attributes, pointer::attribute_t* attrib
namespace device {

template <typename T>
inline void typed_set(T* start, const T& value, size_t num_elements)
inline void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream)
{
if (stream) {
detail_::set(start, value, num_elements, stream->handle());
}
context::current::detail_::scoped_existence_ensurer_t ensure_some_context{};
static_assert(::std::is_trivially_copyable<T>::value, "Non-trivially-copyable types cannot be used for setting memory");
static_assert(sizeof(T) == 1 or sizeof(T) == 2 or sizeof(T) == 4,
"Unsupported type size - only sizes 1, 2 and 4 are supported");
// TODO: Consider checking for alignment when compiling without NDEBUG
status_t result {CUDA_SUCCESS};
switch(sizeof(T)) {
case 1: result = cuMemsetD8 (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements); break;
case 2: result = cuMemsetD16(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements); break;
case 4: result = cuMemsetD32(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements); break;
case 1: result = stream ?
cuMemsetD8Async (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream->handle()) :
cuMemsetD8 (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements); break;
case 2: result = stream ?
cuMemsetD16Async(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream->handle()) :
cuMemsetD16 (address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements); break;
case 4: result = stream ?
cuMemsetD32Async(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream->handle()) :
cuMemsetD32 (address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements); break;
}
throw_if_error_lazy(result, "Setting global device memory bytes");
}

} // namespace device

inline void set(void* ptr, int byte_value, size_t num_bytes)
inline void set(void* ptr, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream)
{
switch ( type_of(ptr) ) {
case device_:
// case managed_:
case unified_:
memory::device::set(ptr, byte_value, num_bytes); break;
memory::device::set(ptr, byte_value, num_bytes, stream); break;
// case unregistered_:
case host_:
::std::memset(ptr, byte_value, num_bytes); break;
if (stream) {
throw ::std::invalid_argument("Asynchronous host-memory set's not currently supported");
} else { ::std::memset(ptr, byte_value, num_bytes); }
break;
default:
throw runtime_error(
cuda::status::invalid_value,
Expand Down
4 changes: 2 additions & 2 deletions src/cuda/api/stream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ class stream_t {
{
// Is it necessary to set the device? I wonder.
CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
memory::device::async::detail_::set(start, byte_value, num_bytes, associated_stream.handle_);
memory::device::detail_::set(start, byte_value, num_bytes, associated_stream.handle_);
}

/// @copydoc memset(void *, int, size_t) const
Expand All @@ -504,7 +504,7 @@ class stream_t {
void memzero(void *start, size_t num_bytes) const
{
CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
memory::device::async::detail_::zero(start, num_bytes, associated_stream.handle_);
memory::device::detail_::zero(start, num_bytes, associated_stream.handle_);
}

/**
Expand Down

0 comments on commit ee4e618

Please sign in to comment.