From e8d27d66f9b80f8bef29acecacc85fb5225c35e8 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Sun, 20 Oct 2024 01:21:57 +0300 Subject: [PATCH] Regards #689, Fixes #688: Unification of asynchronous & synchronous copy functions: * All copy functions now take an optional stream via an `optional_ref` parameter; * No longer using the `cuda::memory::async` subnamespace for any copy functions; they are all directly in `cuda::memory` * Fixes #688: Now supporting async copy using copy parameters structures * Explicitly including `memory.hpp` in `multi_wrapper_impls/memory.hpp` --- .../binaryPartitionCG/binaryPartitionCG.cu | 4 +- .../p2pBandwidthLatencyTest.cu | 2 +- .../simpleDrvRuntimePTX.cpp | 6 +- .../simpleStreams/simpleStreams.cu | 4 +- examples/other/array_management.cu | 8 +- src/cuda/api/memory.hpp | 999 ++++++++---------- src/cuda/api/multi_wrapper_impls/memory.hpp | 176 ++- src/cuda/api/stream.hpp | 2 +- src/cuda/api/types.hpp | 7 + 9 files changed, 534 insertions(+), 674 deletions(-) diff --git a/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu b/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu index 8444a562..6877015c 100644 --- a/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu +++ b/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu @@ -124,8 +124,8 @@ int main(int argc, const char **argv) stream.enqueue.kernel_launch(kernel, launch_config, d_inputArr.data(), d_numOfOdds.data(), d_sumOfOddEvenElems.data(), arrSize); - cuda::memory::async::copy(h_numOfOdds, d_numOfOdds, stream); - cuda::memory::async::copy(h_sumOfOddEvenElems, d_sumOfOddEvenElems, stream); + cuda::memory::copy(h_numOfOdds, d_numOfOdds, stream); + cuda::memory::copy(h_sumOfOddEvenElems, d_sumOfOddEvenElems, stream); stream.synchronize(); diff --git a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu index f7541c36..da2972fb 100644 --- a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu +++ b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu @@ -154,7 +154,7 @@ void enqueue_p2p_copy( // Since we assume Compute Capability >= 2.0, all devices support the // Unified Virtual Address Space, so we don't need to use // cudaMemcpyPeerAsync - cudaMemcpyAsync is enough. 
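// [Editor's note] A minimal usage sketch of the unified copy API this patch introduces; it is
// not part of the patch itself. It assumes `device` is a cuda::device_t and that `h_buf` and
// `d_buf` are host- and device-side buffers of `size` bytes (and that the library's usual
// umbrella header has been included), mirroring the sample code changed below:
//
//   auto stream = device.create_stream(cuda::stream::async);
//   cuda::memory::copy(d_buf, h_buf, size);          // no stream argument: synchronous copy
//   cuda::memory::copy(d_buf, h_buf, size, stream);  // stream argument: enqueued asynchronously
//   stream.synchronize();
//
// The same cuda::memory::copy overloads now serve both cases, in place of the removed
// cuda::memory::async::copy functions.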
- cuda::memory::async::copy(dest, src, stream); + cuda::memory::copy(dest, src, stream); } } } diff --git a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp index 352d037f..8f4211a8 100644 --- a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp +++ b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp @@ -152,8 +152,8 @@ int main(int argc, char** argv) auto d_C = cuda::memory::make_unique_span(device, N); - cuda::memory::async::copy(d_A, h_A.get(), size, stream); - cuda::memory::async::copy(d_B, h_B.get(), size, stream); + cuda::memory::copy(d_A, h_A.get(), size, stream); + cuda::memory::copy(d_B, h_B.get(), size, stream); auto launch_config = cuda::launch_config_builder() .overall_size(N) @@ -164,7 +164,7 @@ int main(int argc, char** argv) stream.enqueue.kernel_launch(vecAdd_kernel, launch_config, d_A.data(), d_B.data(), d_C.data(), N); - cuda::memory::async::copy(h_C.get(), d_C, size, stream); + cuda::memory::copy(h_C.get(), d_C, size, stream); stream.synchronize(); for (int i = 0; i < N; ++i) { diff --git a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu index 88230be3..d4d873e9 100644 --- a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu +++ b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu @@ -143,7 +143,7 @@ void run_simple_streams_example( // time memcpy from device start_event.record(); // record on the default stream, to ensure that all previous CUDA calls have completed - cuda::memory::async::copy(h_a.get(), d_a, streams[0]); + cuda::memory::copy(h_a.get(), d_a, streams[0]); stop_event.record(); stop_event.synchronize(); // block until the event is actually recorded auto time_memcpy = cuda::event::time_elapsed_between(start_event, stop_event); @@ -207,7 +207,7 @@ void run_simple_streams_example( // commence executing when all previous CUDA calls in stream x have completed for (int i = 0; i < nstreams; i++) { - cuda::memory::async::copy( + cuda::memory::copy( h_a.data() + i * params.n / nstreams, d_a.data() + i * params.n / nstreams, nbytes / nstreams, streams[i]); diff --git a/examples/other/array_management.cu b/examples/other/array_management.cu index a592d7e9..659381a0 100644 --- a/examples/other/array_management.cu +++ b/examples/other/array_management.cu @@ -97,8 +97,8 @@ void array_3d_example(cuda::device_t& device, size_t w, size_t h, size_t d) { // also asynchronously auto stream = device.create_stream(cuda::stream::async); - cuda::memory::async::copy(other_arr, span_out, stream); - cuda::memory::async::copy(span_in, other_arr, stream); + cuda::memory::copy(other_arr, span_out, stream); + cuda::memory::copy(span_in, other_arr, stream); device.synchronize(); check_output_is_iota("copy from (managed) global memory into a 3D array, asynchronously", span_in); } @@ -162,8 +162,8 @@ void array_2d_example(cuda::device_t& device, size_t w, size_t h) // also asynchronously auto stream = cuda::stream::create(device, cuda::stream::async); - cuda::memory::async::copy(other_arr, span_out, stream); - cuda::memory::async::copy(span_in, other_arr, stream); + cuda::memory::copy(other_arr, span_out, stream); + cuda::memory::copy(span_in, other_arr, stream); device.synchronize(); check_output_is_iota("copy from (managed) global memory into a 2D array, asynchronously", span_in); diff --git a/src/cuda/api/memory.hpp 
b/src/cuda/api/memory.hpp index cd8e9dd7..3ee85249 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -424,68 +424,183 @@ inline void zero(T* ptr) } // namespace device +/// Asynchronous memory operations +namespace detail_ { + /** - * @note Since we assume Compute Capability >= 2.0, all devices support the - * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, - * used in a copy function, where the data is located, and one does not have to specify this. + * Asynchronous versions of @ref memory::copy functions. * - * @note the sources and destinations may all be in any memory space addressable - * in the the unified virtual address space, which could be host-side memory, - * device global memory, device constant memory etc. * + * @note Since we assume Compute Capability >= 2.0, all devices support the + * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, + * where the data is located, and one does not have to specify this. */ + ///@{ /** - * Synchronously copy data between different locations in memory + * Asynchronously copies data between memory spaces or within a memory space, but + * within a single CUDA context. * - * @param source A pointer to a a memory region of size @p num_bytes. - * @param num_bytes The number of bytes to copy from @p source to @p destination - */ -void copy(void *destination, const void *source, size_t num_bytes); - -/** - * @param destination A memory region of the same size as @p source. - * @param source A region whose contents is to be copied. - */ -inline void copy(void* destination, const_region_t source) + * @param destination A pointer to a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param source A pointer to a memory region of size at least @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param num_bytes number of bytes to copy from @p source + * @param stream_handle The handle of a stream on which to schedule the copy operation +*/ +inline void copy(void* destination, const void* source, size_t num_bytes, stream::handle_t stream_handle) { - return copy(destination, source.start(), source.size()); + auto result = cuMemcpyAsync(device::address(destination), device::address(source), num_bytes, stream_handle); + + // TODO: Determine whether it was from host to device, device to host etc and + // add this information to the error string + throw_if_error_lazy(result, "Scheduling a memory copy on " + stream::detail_::identify(stream_handle)); } /** - * @param destination A region of memory to which to copy the data in @p source, of - * size at least that of @p source , either in host memory or on any CUDA - * device's global memory. 
- * @param source A region whose contents is to be copied, either in host memory - * or on any CUDA device's global memory + * @param destination a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param source a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param stream_handle The handle of a stream on which to schedule the copy operation */ -inline void copy(region_t destination, const_region_t source) +inline void copy(region_t destination, const_region_t source, stream::handle_t stream_handle) { #ifndef NDEBUG if (destination.size() < source.size()) { - throw ::std::logic_error("Can't copy a large region into a smaller one"); + throw ::std::logic_error("Source size exceeds destination size"); } #endif - return copy(destination.start(), source); + copy(destination.start(), source.start(), source.size(), stream_handle); +} +///@} + +using memory::copy_parameters_t; + +inline status_t multidim_copy_in_current_context( + ::std::integral_constant, + copy_parameters_t<2> params, + optional stream_handle) +{ + // Must be an intra-context copy, because CUDA does not support 2D inter-context copies and the copy parameters + // structure holds no information about contexts. + // + // Note: The stream handle, even if present, might be the null handle; for now + // we distinguish between using the null stream handle - the default stream's - + // and using the synchronous API + return stream_handle ? + cuMemcpy2DAsync(¶ms, *stream_handle) : + cuMemcpy2D(¶ms); +} + +inline status_t multidim_copy_in_current_context( + ::std::integral_constant, + copy_parameters_t<3> params, + optional stream_handle) +{ + if (params.srcContext == params.dstContext) { + // TODO: Should we check it's also the current context? + using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type; + auto* intra_context_params = reinterpret_cast(¶ms); + return stream_handle ? + cuMemcpy3DAsync(intra_context_params, *stream_handle) : + cuMemcpy3D(intra_context_params); + } + return stream_handle ? + cuMemcpy3DPeerAsync(¶ms, *stream_handle) : + cuMemcpy3DPeer(¶ms); +} + +template +status_t multidim_copy_in_current_context(copy_parameters_t params, optional stream_handle) { + return multidim_copy_in_current_context(::std::integral_constant{}, params, stream_handle); +} + +// Note: Assumes the stream handle is for a stream in the current context +template +status_t multidim_copy( + context::handle_t context_handle, + copy_parameters_t params, + optional stream_handle) +{ + CAW_SET_SCOPE_CONTEXT(context_handle); + return multidim_copy_in_current_context(::std::integral_constant{}, params, stream_handle); +} + +// Assumes the array and the stream share the same context, and that the destination is +// accessible from that context (e.g. allocated within it, or being managed memory, etc.) 
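// [Editor's note] A hedged sketch, not part of the patch, of how a parameters-structure copy
// ("Fixes #688" above) can now be driven with or without a stream. It assumes `arr` is a
// cuda::array_t<float, 3>, `ptr` addresses a sufficiently large region accessible in the
// array's context, and `stream` is a cuda::stream_t in that same context:
//
//   auto params = cuda::memory::copy_parameters_t<3>{};
//   params.clear_offset(cuda::memory::endpoint_t::source);
//   params.clear_offset(cuda::memory::endpoint_t::destination);
//   params.set_extent<float>(arr.dimensions());
//   params.set_endpoint(cuda::memory::endpoint_t::source, arr);
//   params.set_endpoint(cuda::memory::endpoint_t::destination, ptr, arr.dimensions());
//   params.set_default_pitches();
//   params.clear_rest();
//   cuda::memory::copy(params);          // synchronous
//   cuda::memory::copy(params, stream);  // enqueued on `stream`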
+template +void copy(T *destination, const array_t& source, optional stream_handle) +{ + using memory::endpoint_t; + auto dims = source.dimensions(); + //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); + auto params = copy_parameters_t {}; + params.clear_offset(endpoint_t::source); + params.clear_offset(endpoint_t::destination); + params.template set_extent(dims); + params.set_endpoint(endpoint_t::source, source); + params.set_endpoint(endpoint_t::destination, const_cast(destination), dims); + params.set_default_pitches(); + params.clear_rest(); + auto status = multidim_copy_in_current_context(params, stream_handle); + throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region"); +} + + +template +void copy(const array_t& destination, const T* source, optional stream_handle) +{ + using memory::endpoint_t; + auto dims = destination.dimensions(); + //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); + auto params = copy_parameters_t{}; + params.clear_offset(endpoint_t::source); + params.clear_offset(endpoint_t::destination); + params.template set_extent(dims); + params.set_endpoint(endpoint_t::source, const_cast(source), dims); + params.set_endpoint(endpoint_t::destination, destination); + params.set_default_pitches(); + params.clear_rest(); + auto status = multidim_copy_in_current_context(params, stream_handle); + throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array"); } /** - * @param destination A region of memory to which to copy the data in @p source, - * of size at least that of @p source. - * @param source A plain array whose contents is to be copied. + * Synchronously copies a single (typed) value between memory spaces or within a memory space. + * + * @note asynchronous version of @ref memory::copy_single + * + * @note assumes the source and destination are all valid in the same context as that of the + * context handle + * + * @param destination a value residing either in host memory or on any CUDA device's + * global memory + * @param source a value residing either in host memory or on any CUDA device's global + * memory + * @param stream_handle A stream on which to enqueue the copy operation */ -template -inline void copy(region_t destination, const T(&source)[N]) +template +void copy_single(T* destination, const T* source, optional stream_handle) { -#ifndef NDEBUG - if (destination.size() < N * sizeof(T)) { - throw ::std::logic_error("Source size exceeds destination size"); - } -#endif - return copy(destination.start(), source, sizeof(T) * N); + copy(destination, source, sizeof(T), stream_handle); } +} // namespace detail_ + +/** + * @note Since we assume Compute Capability >= 2.0, all devices support the + * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, + * used in a copy function, where the data is located, and one does not have to specify this. + * + * @note the sources and destinations may all be in any memory space addressable + * in the the unified virtual address space, which could be host-side memory, + * device global memory, device constant memory etc. + * + */ +///@{ + /** * Copy the contents of a C-style array into a span of same-type elements * @@ -495,7 +610,7 @@ inline void copy(region_t destination, const T(&source)[N]) * of the first element, there is no array-decay. 
*/ template -inline void copy(span destination, const T(&source)[N]) +inline void copy(span destination, c_array const& source, optional_ref stream = {}) { #ifndef NDEBUG if (destination.size() < N) { @@ -505,29 +620,6 @@ inline void copy(span destination, const T(&source)[N]) return copy(destination.data(), source, sizeof(T) * N); } -/** - * Copy the contents of memory region into a C-style array, interpreting the memory - * as a sequence of elements of the array's element type - * - * @param destination A region of memory to which to copy the data in @p source, - * of size at least that of @p source. - * @param source A region of at least `sizeof(T)*N` bytes with whose data to fill - * the @p destination array. - */ -template -inline void copy(T(&destination)[N], const_region_t source) -{ -#ifndef NDEBUG - size_t required_size = N * sizeof(T); - if (source.size() != required_size) { - throw ::std::invalid_argument( - "Attempt to copy a region of " + ::std::to_string(source.size()) + - " bytes into an array of size " + ::std::to_string(required_size) + " bytes"); - } -#endif - return copy(destination, source.start(), sizeof(T) * N); -} - /** * Copy the contents of a span into a C-style array * @@ -538,7 +630,7 @@ inline void copy(T(&destination)[N], const_region_t source) * containing the data to be copied */ template -inline void copy(T(&destination)[N], span source) +void copy(c_array& destination, span source, optional_ref stream = {}) { #ifndef NDEBUG if (source.size() > N) { @@ -547,7 +639,7 @@ inline void copy(T(&destination)[N], span source) " elements into an array of " + ::std::to_string(N) + " elements"); } #endif - return copy(destination, source.start(), sizeof(T) * N); + return copy(destination, source.start(), sizeof(T) * N, stream); } /** @@ -560,9 +652,9 @@ inline void copy(T(&destination)[N], span source) * of the first element, there is no array-decay. */ template -inline void copy(void* destination, T (&source)[N]) +inline void copy(void* destination, c_array const& source, optional_ref stream = {}) { - return copy(destination, source, sizeof(T) * N); + return copy(destination, source, sizeof(T) * N, stream); } /** @@ -572,60 +664,22 @@ inline void copy(void* destination, T (&source)[N]) * @p source,of size at least that of @p source.; as it is taken by reference * rather than by address of the first element, there is no array-decay. * @param source The starting address of a sequence of @tparam N elements to copy - */ -template -inline void copy(T(&destination)[N], T* source) -{ - return copy(destination, source, sizeof(T) * N); -} - -/** - * Copy one region of memory into another * - * @param destination A region of memory to which to copy the data in @p source, - * of size at least that of @p source. - * @param source A pointer to a a memory region of size @p num_bytes. 
- * @param num_bytes The number of bytes to copy from @p source to @p destination - */ -inline void copy(region_t destination, void* source, size_t num_bytes) -{ -#ifndef NDEBUG - if (destination.size() < num_bytes) { - throw ::std::logic_error("Number of bytes to copy exceeds destination size"); - } -#endif - return copy(destination.start(), source, num_bytes); -} - -/** - * Copy one region of memory to another location +** + * Asynchronously copies data from a memory region into a C-style array * - * @param destination The beginning of a target region of memory (of size at least - * @p num_bytes) into which to copy - * @param source A region of memory from which to copy, of size at least @p num_bytes - * @param num_bytes The number of bytes to copy from @p source to @p destination + * @param destination A fixed-size C-style array, to which to copy the data in + * @p source,of size at least that of @p source.; as it is taken by reference + * rather than by address of the first element, there is no array-decay. + * @param source The starting address of a sequence of @tparam N elements to copy + * @param stream schedule the copy operation in this CUDA stream */ -inline void copy(void* destination, const_region_t source, size_t num_bytes) +template +inline void copy(c_array& destination, T* source, optional_ref stream = {}) { -#ifndef NDEBUG - if (source.size() < num_bytes) { - throw ::std::logic_error("Number of bytes to copy exceeds source size"); - } -#endif - return copy(destination, source.start(), num_bytes); + return copy(destination, source, sizeof(T) * N, stream); } -/** - * Copy memory between memory regions - * - * @param destination A target region of memory into which to copy; enough memory will - * be copied to fill this region - * @param source The beginning of a region of memory from which to copy - */ -inline void copy(region_t destination, void* source) -{ - return copy(destination, source, destination.size()); -} ///@} /** @@ -694,38 +748,45 @@ inline void zero(T* ptr) namespace detail_ { -inline status_t multidim_copy(::std::integral_constant, copy_parameters_t<2> params) +inline status_t multidim_copy(::std::integral_constant two, copy_parameters_t<2> params, optional stream_handle) { // TODO: Move this logic into the scoped ensurer class auto context_handle = context::current::detail_::get_handle(); if (context_handle != context::detail_::none) { - return cuMemcpy2D(¶ms); + return detail_::multidim_copy_in_current_context(two, params, stream_handle); } auto current_device_id = cuda::device::current::detail_::get_id(); context_handle = cuda::device::primary_context::detail_::obtain_and_increase_refcount(current_device_id); context::current::detail_::push(context_handle); // Note this _must_ be an intra-context copy, as inter-context is not supported // and there's no indication of context in the relevant data structures - auto status = cuMemcpy2D(¶ms); + auto status = detail_::multidim_copy_in_current_context(two, params, stream_handle); context::current::detail_::pop(); cuda::device::primary_context::detail_::decrease_refcount(current_device_id); return status; } -inline status_t multidim_copy(::std::integral_constant, copy_parameters_t<3> params) +inline status_t multidim_copy(context::handle_t context_handle, ::std::integral_constant, copy_parameters_t<2> params, optional stream_handle) +{ + context::current::detail_::scoped_override_t context_for_this_scope(context_handle); + return multidim_copy(::std::integral_constant{}, params, stream_handle); +} + +inline 
status_t multidim_copy(::std::integral_constant, copy_parameters_t<3> params, optional stream_handle) { if (params.srcContext == params.dstContext) { context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{params.srcContext}; - auto *intra_context_params = reinterpret_cast::intra_context_type *>(¶ms); - return cuMemcpy3D(intra_context_params); + return detail_::multidim_copy_in_current_context(params, stream_handle); } - return cuMemcpy3DPeer(¶ms); + return stream_handle ? + cuMemcpy3DPeerAsync(¶ms, *stream_handle) : + cuMemcpy3DPeer(¶ms); } template -status_t multidim_copy(copy_parameters_t params) +status_t multidim_copy(copy_parameters_t params, stream::handle_t stream_handle) { - return multidim_copy(::std::integral_constant{}, params); + return multidim_copy(::std::integral_constant{}, params, stream_handle); } @@ -742,11 +803,7 @@ status_t multidim_copy(copy_parameters_t params) * merely pass it on to the CUDA driver */ template -void copy(copy_parameters_t params) -{ - status_t status = detail_::multidim_copy(params); - throw_if_error_lazy(status, "Copying using a general copy parameters structure"); -} +void copy(copy_parameters_t params, optional_ref stream = {}); /** * Synchronously copies data from a CUDA array into non-array memory. @@ -762,7 +819,7 @@ void copy(copy_parameters_t params) * the target array context */ template -void copy(const array_t& destination, const context_t& source_context, const T *source) +void copy(const array_t& destination, const context_t& source_context, const T *source, optional_ref stream = {}) { auto dims = destination.dimensions(); auto params = copy_parameters_t {}; @@ -771,7 +828,7 @@ void copy(const array_t& destination, const context_t& source_ params.set_endpoint(endpoint_t::source, source_context.handle(), const_cast(source), dims); params.set_endpoint(endpoint_t::destination, destination); params.clear_rest(); - copy(params); + copy(params, stream); } /** @@ -783,12 +840,17 @@ void copy(const array_t& destination, const context_t& source_ * @param destination A {@tparam NumDimensions}-dimensional CUDA array * @param source A pointer to a region of contiguous memory holding `destination.size()` values * of type @tparam T. The memory may be located either on a CUDA device or in host memory. + * + * Asynchronously copies data into a CUDA array. 
+ * + * @note asynchronous version of @ref memory::copy(array_t&, const T*) + * + * @param destination A CUDA array to copy data into + * @param source A pointer to a a memory region of size `destination.size() * sizeof(T)` + * @param stream schedule the copy operation into this CUDA stream */ -template -void copy(const array_t& destination, const T *source) -{ - copy(destination, context_of(source), source); -} +template +void copy(array_t& destination, const T* source, optional_ref stream = {}); /** * Copies a contiguous sequence of elements in memory into a CUDA array @@ -799,7 +861,7 @@ void copy(const array_t& destination, const T *source) * in the source span are ignored */ template -void copy(const array_t& destination, span source) +void copy(const array_t& destination, span source, optional_ref stream = {}) { #ifndef NDEBUG if (destination.size() < source.size()) { @@ -808,7 +870,7 @@ void copy(const array_t& destination, span source) " elements into a CUDA array of " + ::std::to_string(destination.size()) + " elements"); } #endif - copy(destination, source.data()); + copy(destination, source.data(), stream); } /** @@ -822,7 +884,7 @@ void copy(const array_t& destination, span source) * @param source A {@tparam NumDimensions}-dimensional CUDA array */ template -void copy(const context_t& context, T *destination, const array_t& source) +void copy(const context_t& context, T *destination, const array_t& source, optional_ref stream = {}) { auto dims = source.dimensions(); auto params = copy_parameters_t {}; @@ -833,7 +895,7 @@ void copy(const context_t& context, T *destination, const array_t(endpoint_t::destination, context.handle(), destination, dims); params.set_default_pitches(); params.clear_rest(); - copy(params); + copy(params, stream); } /** @@ -845,12 +907,18 @@ void copy(const context_t& context, T *destination, const array_t -void copy(T *destination, const array_t& source) -{ - copy(context_of(destination), destination, source); -} +void copy(T* destination, const array_t& source, optional_ref stream = {}); + /** * Copies the contents of a CUDA array into a sequence of contiguous elements in memory @@ -860,7 +928,7 @@ void copy(T *destination, const array_t& source) * @note The @p destination span must be at least as larger as the volume of the array. */ template -void copy(span destination, const array_t& source) +void copy(span destination, const array_t& source, optional_ref stream = {}) { #ifndef NDEBUG if (destination.size() < source.size()) { @@ -869,7 +937,7 @@ void copy(span destination, const array_t& source) " elements into a span of " + ::std::to_string(destination.size()) + " elements"); } #endif - copy(destination.data(), source); + copy(destination.data(), source, stream); } /** @@ -880,7 +948,7 @@ void copy(span destination, const array_t& source) * @note The destination array must be at least as large in each dimension as the source array. 
*/ template -void copy(const array_t& destination, const array_t& source) +void copy(const array_t& destination, const array_t& source, optional_ref stream) { auto dims = source.dimensions(); auto params = copy_parameters_t {}; @@ -890,9 +958,9 @@ void copy(const array_t& destination, const array_t(source.context_handle(), params); + detail_::multidim_copy(source.context_handle(), params, stream); throw_if_error_lazy(status, "Copying from a CUDA array into a regular memory region"); } @@ -903,14 +971,27 @@ void copy(const array_t& destination, const array_t -void copy(region_t destination, const array_t& source) +void copy(region_t destination, const array_t& source, optional_ref stream = {}) { +#ifndef NDEBUG if (destination.size() < source.size_bytes()) { - throw ::std::logic_error("Attempt to copy an array into a memory region too small to hold the copy"); + throw ::std::invalid_argument( + "Attempt to copy " + ::std::to_string(source.size_bytes()) + " bytes from an array into a " + "region of smaller size (" + ::std::to_string(destination.size()) + " bytes)"); } - copy(destination.start(), source); +#endif + copy(destination.start(), source, stream); } /** @@ -920,14 +1001,23 @@ void copy(region_t destination, const array_t& source) * * @note only as many elements as fit in the array are copied, while the source region may * be larger than what they take up. + * + * @param destination A CUDA array to copy data into + * @param source A memory region of size `destination.size() * sizeof(T)` + * @param stream schedule the copy operation into this CUDA stream (or leave empty for a + * synchronous copy) */ template -void copy(const array_t& destination, const_region_t source) +void copy(array_t& destination, const_region_t source, optional_ref stream = {}) { +#ifndef NDEBUG if (destination.size_bytes() < source.size()) { - throw ::std::logic_error("Attempt to copy into an array from a source region larger than the array's size"); + throw ::std::invalid_argument( + "Attempt to copy a region of " + ::std::to_string(source.size()) + + " bytes into an array of size " + ::std::to_string(destination.size_bytes()) + " bytes"); } - copy(destination, static_cast(source.start())); +#endif + copy(destination, static_cast(source.start()), stream); } /** @@ -937,169 +1027,17 @@ void copy(const array_t& destination, const_region_t source) * device's global memory * @param source a value residing either in host memory or on any CUDA * device's global memory - */ -template -void copy_single(T* destination, const T* source) -{ - copy(destination, source, sizeof(T)); -} - -/// Asynchronous memory operations -namespace async { - -namespace detail_ { - -/** - * Asynchronous versions of @ref memory::copy functions. - * - * - * @note Since we assume Compute Capability >= 2.0, all devices support the - * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, - * where the data is located, and one does not have to specify this. - */ - -///@{ - -/** - * Asynchronously copies data between memory spaces or within a memory space, but - * within a single CUDA context. 
- * - * @param destination A pointer to a memory region of size @p num_bytes, either in - * host memory or on any CUDA device's global memory - * @param source A pointer to a memory region of size at least @p num_bytes, either in - * host memory or on any CUDA device's global memory - * @param num_bytes number of bytes to copy from @p source - * @param stream_handle The handle of a stream on which to schedule the copy operation -*/ -inline void copy(void* destination, const void* source, size_t num_bytes, stream::handle_t stream_handle) -{ - auto result = cuMemcpyAsync(device::address(destination), device::address(source), num_bytes, stream_handle); - - // TODO: Determine whether it was from host to device, device to host etc and - // add this information to the error string - throw_if_error_lazy(result, "Scheduling a memory copy on " + stream::detail_::identify(stream_handle)); -} - -/** - * @param destination a memory region of size @p num_bytes, either in - * host memory or on any CUDA device's global memory - * @param source a memory region of size @p num_bytes, either in - * host memory or on any CUDA device's global memory - * @param stream_handle The handle of a stream on which to schedule the copy operation - */ -inline void copy(region_t destination, const_region_t source, stream::handle_t stream_handle) -{ -#ifndef NDEBUG - if (destination.size() < source.size()) { - throw ::std::logic_error("Source size exceeds destination size"); - } -#endif - copy(destination.start(), source.start(), source.size(), stream_handle); -} -///@} - -using memory::copy_parameters_t; - -inline status_t multidim_copy_in_current_context( - ::std::integral_constant, - copy_parameters_t<2> params, - stream::handle_t stream_handle) -{ - // Must be an intra-context copy, because CUDA does not support 2D inter-context copies and the copy parameters - // structure holds no information about contexts. - return cuMemcpy2DAsync(¶ms, stream_handle); -} - -inline status_t multidim_copy_in_current_context( - ::std::integral_constant, - copy_parameters_t<3> params, - stream::handle_t stream_handle) -{ - if (params.srcContext == params.dstContext) { - using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type; - auto* intra_context_params = reinterpret_cast(¶ms); - return cuMemcpy3DAsync(intra_context_params, stream_handle); - } - return cuMemcpy3DPeerAsync(¶ms, stream_handle); - -} - -template -status_t multidim_copy_in_current_context(copy_parameters_t params, stream::handle_t stream_handle) { - return multidim_copy_in_current_context(::std::integral_constant{}, params, stream_handle); -} - -// Note: Assumes the stream handle is for a stream in the current context -template -status_t multidim_copy( - context::handle_t context_handle, - copy_parameters_t params, - stream::handle_t stream_handle) -{ - CAW_SET_SCOPE_CONTEXT(context_handle); - return multidim_copy_in_current_context(::std::integral_constant{}, params, stream_handle); -} - -// Assumes the array and the stream share the same context, and that the destination is -// accessible from that context (e.g. allocated within it, or being managed memory, etc.) 
-template -void copy(T *destination, const array_t& source, stream::handle_t stream_handle) -{ - using memory::endpoint_t; - auto dims = source.dimensions(); - //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); - auto params = copy_parameters_t {}; - params.clear_offset(endpoint_t::source); - params.clear_offset(endpoint_t::destination); - params.template set_extent(dims); - params.set_endpoint(endpoint_t::source, source); - params.set_endpoint(endpoint_t::destination, const_cast(destination), dims); - params.set_default_pitches(); - params.clear_rest(); - auto status = multidim_copy_in_current_context(params, stream_handle); - throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region"); -} - - -template -void copy(const array_t& destination, const T* source, stream::handle_t stream_handle) -{ - using memory::endpoint_t; - auto dims = destination.dimensions(); - //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); - auto params = copy_parameters_t{}; - params.clear_offset(endpoint_t::source); - params.clear_offset(endpoint_t::destination); - params.template set_extent(dims); - params.set_endpoint(endpoint_t::source, const_cast(source), dims); - params.set_endpoint(endpoint_t::destination, destination); - params.set_default_pitches(); - params.clear_rest(); - auto status = multidim_copy_in_current_context(params, stream_handle); - throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array"); -} - -/** - * Synchronously copies a single (typed) value between memory spaces or within a memory space. * - * @note asynchronous version of @ref memory::copy_single + * Copy a single (typed) value between memory locations * - * @note assumes the source and destination are all valid in the same context as that of the - * context handle + * @note asynchronous version of @ref memory::copy_single(T&, const T&) * - * @param destination a value residing either in host memory or on any CUDA device's - * global memory - * @param source a value residing either in host memory or on any CUDA device's global - * memory - * @param stream_handle A stream on which to enqueue the copy operation + * @param destination a value residing either in host memory or on any CUDA device's global memory + * @param source a value residing either in host memory or on any CUDA device's global memory + * @param stream The CUDA command queue on which this copying will be enqueued */ template -void copy_single(T* destination, const T* source, stream::handle_t stream_handle) -{ - copy(destination, source, sizeof(T), stream_handle); -} - -} // namespace detail_ +void copy_single(T* destination, const T* source, optional_ref stream = {}); /** * Asynchronously copies data between memory spaces or within a memory space. 
@@ -1119,29 +1057,77 @@ void copy_single(T* destination, const T* source, stream::handle_t stream_handle * @param num_bytes The number of bytes to copy from @p source to @p destination * @param stream A stream on which to enqueue the copy operation */ -void copy(void* destination, void const* source, size_t num_bytes, const stream_t& stream); +void copy(void* destination, void const* source, size_t num_bytes, optional_ref stream = {}); + /** - * Asynchronously copies data between memory regions + * Copy the contents of memory region into a C-style array, interpreting the memory + * as a sequence of elements of the array's element type * - * @param destination The beginning of a memory region of size @p num_bytes, either in host - * memory or on any CUDA device's global memory. Must be registered with, or visible in, - * in the same context as @p stream. - * @param source A memory region of size @p num_bytes, either in host memory or on any - * CUDA device's global memory. Must be defined in the same context as the stream. - * @param num_bytes The number of bytes to copy from @p source to @p destination + * @param destination A region of memory to which to copy the data in @p source, + * of size at least that of @p source. + * @param source A region of at least `sizeof(T)*N` bytes with whose data to fill + * the @p destination array. + * +** + * Asynchronously copies data from a memory region into a C-style array + * + * @param destination A fixed-size C-style array, to which to copy the data in + * @p source,of size at least that of @p source.; as it is taken by reference + * rather than by address of the first element, there is no array-decay. + * @param source A region of at least `sizeof(T)*N` bytes with whose data to fill + * the @p destination array. + * @param stream schedule the copy operation in this CUDA stream + */ +template +inline void copy(c_array& destination, const_region_t source, optional_ref stream = {}) +{ +#ifndef NDEBUG + size_t required_size = N * sizeof(T); + if (source.size() != required_size) { + throw ::std::invalid_argument( + "Attempt to copy a region of " + ::std::to_string(source.size()) + + " bytes into an array of size " + ::std::to_string(required_size) + " bytes"); + } +#endif + return copy(&(destination[0]), source.start(), sizeof(T) * N, stream); +} + +/** + * @note Since we assume Compute Capability >= 2.0, all devices support the + * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, + * used in a copy function, where the data is located, and one does not have to specify this. + * + * @note the sources and destinations may all be in any memory space addressable + * in the the unified virtual address space, which could be host-side memory, + * device global memory, device constant memory etc. + * + * +** + * @param destination A region of memory to which to copy the data in @p source, + * of size at least that of @p source. + * @param source A plain array whose contents is to be copied. + * + * ** + * Asynchronously copies data from an array into a memory region + * + * @param destination A region of memory, either in host memory or on any CUDA device's + * global memory. Must be defined in the same context as the stream. + * @param source An array, either in host memory or on any CUDA device's global memory. 
* @param stream A stream on which to enqueue the copy operation */ -inline void copy(void* destination, const_region_t source, size_t num_bytes, const stream_t& stream) +template +inline void copy(region_t destination, c_array const& source, optional_ref stream = {}) { #ifndef NDEBUG - if (source.size() < num_bytes) { - throw ::std::logic_error("Attempt to copy more than the source region's size"); + if (destination.size() < N) { + throw ::std::logic_error("Source size exceeds destination size"); } #endif - copy(destination, source.start(), num_bytes, stream); + return copy(destination.start(), source, sizeof(T) * N, stream); } + /** * Asynchronously copies data between memory spaces or within a memory space. * @@ -1153,7 +1139,7 @@ inline void copy(void* destination, const_region_t source, size_t num_bytes, con * @param num_bytes The number of bytes to copy from @p source to @p destination * @param stream A stream on which to enqueue the copy operation */ -inline void copy(region_t destination, const_region_t source, size_t num_bytes, const stream_t& stream) +inline void copy(region_t destination, const_region_t source, size_t num_bytes, optional_ref stream = {}) { #ifndef NDEBUG if (destination.size() < num_bytes) { @@ -1163,22 +1149,15 @@ inline void copy(region_t destination, const_region_t source, size_t num_bytes, copy(destination.start(), source.start(), num_bytes, stream); } -/** - * Asynchronously copies data between memory regions - * - * @param destination Beginning of a memory region into which to copy data, either in host - * memory or on any CUDA device's global memory. The memory must be registered in, - * or visible within, the same context as {@p stream}. - * @param source A memory region of size @p num_bytes, either in host memory or on any CUDA - * device's global memory. Must be defined in the same context as the stream. - * @param stream A stream on which to enqueue the copy operation - */ -inline void copy(void* destination, const_region_t source, const stream_t& stream) -{ - copy(destination, source, source.size(), stream); -} /** + * @param destination A region of memory to which to copy the data in @p source, of + * size at least that of @p source , either in host memory or on any CUDA + * device's global memory. + * @param source A region whose contents is to be copied, either in host memory + * or on any CUDA device's global memory + * +** * Asynchronously copies data between memory regions * * @param destination A region of memory, either in host memory or on any CUDA device's @@ -1187,12 +1166,20 @@ inline void copy(void* destination, const_region_t source, const stream_t& strea * global memory. Must be defined in the same context as the stream. * @param stream A stream on which to enqueue the copy operation */ -inline void copy(region_t destination, const_region_t source, const stream_t& stream) +inline void copy(region_t destination, const_region_t source, optional_ref stream = {}) { copy(destination, source, source.size(), stream); } + /** + * Copy memory between memory regions + * + * @param destination A target region of memory into which to copy; enough memory will + * be copied to fill this region + * @param source The beginning of a region of memory from which to copy + * +** * Asynchronously copies data between memory regions * * @param destination A region of memory, either in host memory or on any CUDA device's @@ -1202,31 +1189,20 @@ inline void copy(region_t destination, const_region_t source, const stream_t& st * in the same context as the stream. 
* @param stream A stream on which to enqueue the copy operation */ -inline void copy(region_t destination, void* source, const stream_t& stream) +inline void copy(region_t destination, void* source, optional_ref stream = {}) { return copy(destination.start(), source, destination.size(), stream); } /** - * Asynchronously copies data from an array into a memory region + * Copy one region of memory into another * - * @param destination A region of memory, either in host memory or on any CUDA device's - * global memory. Must be defined in the same context as the stream. - * @param source An array, either in host memory or on any CUDA device's global memory. - * @param stream A stream on which to enqueue the copy operation - */ -template -inline void copy(region_t destination, const T(&source)[N], const stream_t& stream) -{ -#ifndef NDEBUG - if (destination.size() < N) { - throw ::std::logic_error("Source size exceeds destination size"); - } -#endif - return copy(destination.start(), source, sizeof(T) * N, stream); -} - -/** + * @param destination A region of memory to which to copy the data in @p source, + * of size at least that of @p source. + * @param source A pointer to a a memory region of size @p num_bytes. + * @param num_bytes The number of bytes to copy from @p source to @p destination + * +** * Asynchronously copies data from one region of memory to another * * @param destination A region of memory, either in host memory or on any CUDA device's @@ -1235,7 +1211,7 @@ inline void copy(region_t destination, const T(&source)[N], const stream_t& stre * @param num_bytes Amount of memory to copy * @param stream A stream on which to enqueue the copy operation */ -inline void copy(region_t destination, void* source, size_t num_bytes, const stream_t& stream) +inline void copy(region_t destination, void* source, size_t num_bytes, optional_ref stream = {}) { #ifndef NDEBUG if (destination.size() < num_bytes) { @@ -1246,127 +1222,53 @@ inline void copy(region_t destination, void* source, size_t num_bytes, const str } /** - * Asynchronously copies data into a CUDA array. - * - * @note asynchronous version of @ref memory::copy(array_t&, const T*) - * - * @param destination A CUDA array to copy data into - * @param source A pointer to a a memory region of size `destination.size() * sizeof(T)` - * @param stream schedule the copy operation into this CUDA stream - */ -template -void copy(array_t& destination, const T* source, const stream_t& stream); - -/** - * Asynchronously copies data into a CUDA array. 
- * - * @note asynchronous version of @ref memory::copy(array_t&, const T*) - * - * @param destination A CUDA array to copy data into - * @param source A memory region of size `destination.size() * sizeof(T)` - * @param stream schedule the copy operation into this CUDA stream - */ -template -void copy(array_t& destination, const_region_t source, const stream_t& stream) -{ -#ifndef NDEBUG - size_t required_size = destination.size() * sizeof(T); - if (source.size() != required_size) { - throw ::std::invalid_argument( - "Attempt to copy a region of " + ::std::to_string(source.size()) + - " bytes into an array of size " + ::std::to_string(required_size) + " bytes"); - } -#endif - copy(destination, static_cast(source.start()), stream); -} - -/** - * Asynchronously copies data from a CUDA array elsewhere - * - * @note asynchronous version of @ref memory::copy + * Copy one region of memory to another location * - * @param destination A pointer to a a memory region of size `source.size() * sizeof(T)` - * @param source A CUDA array @ref cuda::array_t - * @param stream schedule the copy operation into this CUDA stream - */ -template -void copy(T* destination, const array_t& source, const stream_t& stream); - -/** - * Asynchronously copies data from a CUDA array elsewhere + * @param destination The beginning of a target region of memory (of size at least + * @p num_bytes) into which to copy + * @param source A region of memory from which to copy, of size at least @p num_bytes + * @param num_bytes The number of bytes to copy from @p source to @p destination * - * @note asynchronous version of @ref memory::copy +** + * Asynchronously copies data between memory regions * - * @param destination A memory region of size `source.size() * sizeof(T)` - * @param source A CUDA array @ref cuda::array_t - * @param stream schedule the copy operation in this CUDA stream + * @param destination The beginning of a memory region of size @p num_bytes, either in host + * memory or on any CUDA device's global memory. Must be registered with, or visible in, + * in the same context as @p stream. + * @param source A memory region of size @p num_bytes, either in host memory or on any + * CUDA device's global memory. Must be defined in the same context as the stream. + * @param num_bytes The number of bytes to copy from @p source to @p destination + * @param stream A stream on which to enqueue the copy operation */ -template -void copy(region_t destination, const array_t& source, const stream_t& stream) +inline void copy(void* destination, const_region_t source, size_t num_bytes, optional_ref stream = {}) { #ifndef NDEBUG - size_t required_size = source.size() * sizeof(T); - if (destination.size() < required_size) { - throw ::std::invalid_argument( - "Attempt to copy " + ::std::to_string(required_size) + " bytes from an array into a " - "region of smaller size (" + ::std::to_string(destination.size()) + " bytes)"); + if (source.size() < num_bytes) { + throw ::std::logic_error("Attempt to copy more than the source region's size"); } #endif - copy(destination.start(), source, stream); + copy(destination, source.start(), num_bytes, stream); } /** - * Asynchronously copies data from a memory region into a C-style array + * @param destination A memory region of the same size as @p source. + * @param source A region whose contents is to be copied. 
* - * @param destination A fixed-size C-style array, to which to copy the data in - * @p source,of size at least that of @p source.; as it is taken by reference - * rather than by address of the first element, there is no array-decay. - * @param source The starting address of a sequence of @tparam N elements to copy - * @param stream schedule the copy operation in this CUDA stream - */ -template -inline void copy(T(&destination)[N], T* source, const stream_t& stream) -{ - return copy(destination, source, sizeof(T) * N, stream); -} - -/** - * Asynchronously copies data from a memory region into a C-style array +** + * Asynchronously copies data between memory regions * - * @param destination A fixed-size C-style array, to which to copy the data in - * @p source,of size at least that of @p source.; as it is taken by reference - * rather than by address of the first element, there is no array-decay. - * @param source A region of at least `sizeof(T)*N` bytes with whose data to fill - * the @p destination array. - * @param stream schedule the copy operation in this CUDA stream + * @param destination Beginning of a memory region into which to copy data, either in host + * memory or on any CUDA device's global memory. The memory must be registered in, + * or visible within, the same context as {@p stream}. + * @param source A memory region of size @p num_bytes, either in host memory or on any CUDA + * device's global memory. Must be defined in the same context as the stream. + * @param stream A stream on which to enqueue the copy operation */ -template -inline void copy(T(&destination)[N], const_region_t source, const stream_t& stream) +inline void copy(void* destination, const_region_t source, optional_ref stream = {}) { -#ifndef NDEBUG - size_t required_size = N * sizeof(T); - if (source.size() != required_size) { - throw ::std::invalid_argument( - "Attempt to copy a region of " + ::std::to_string(source.size()) + - " bytes into an array of size " + ::std::to_string(required_size) + " bytes"); - } -#endif - return copy(destination, source.start(), sizeof(T) * N, stream); + copy(destination, source, source.size(), stream); } -/** - * Copy a single (typed) value between memory locations - * - * @note asynchronous version of @ref memory::copy_single(T&, const T&) - * - * @param destination a value residing either in host memory or on any CUDA device's global memory - * @param source a value residing either in host memory or on any CUDA device's global memory - * @param stream The CUDA command queue on which this copying will be enqueued - */ -template -void copy_single(T* destination, const T* source, const stream_t& stream); - -} // namespace async namespace device { @@ -1430,7 +1332,7 @@ inline void typed_set(T* start, const T& value, size_t num_elements, stream::han * @param stream The stream on which to enqueue the operation. 
*/ template -void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream); +void typed_set(T* start, const T& value, size_t num_elements, optional_ref stream); /** * Asynchronously sets all bytes in a stretch of memory to a single value @@ -1443,7 +1345,7 @@ void typed_set(T* start, const T& value, size_t num_elements, const stream_t& st * @param num_bytes size of the memory region in bytes * @param stream stream on which to schedule this action */ -inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream) +inline void set(void* start, int byte_value, size_t num_bytes, optional_ref stream) { return typed_set( static_cast(start), @@ -1462,7 +1364,7 @@ inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& s * @param num_bytes size of the memory region in bytes * @param stream stream on which to schedule this action */ -void zero(void* start, size_t num_bytes, const stream_t& stream); +void zero(void* start, size_t num_bytes, optional_ref stream); /** * Asynchronously sets all bytes of a single pointed-to value @@ -1475,7 +1377,7 @@ void zero(void* start, size_t num_bytes, const stream_t& stream); * @param stream stream on which to schedule this action */ template -inline void zero(T* ptr, const stream_t& stream) +inline void zero(T* ptr, optional_ref stream) { zero(ptr, sizeof(T), stream); } @@ -1486,51 +1388,21 @@ inline void zero(T* ptr, const stream_t& stream) namespace inter_context { -namespace detail_ { - -inline void copy( - void * destination_address, - context::handle_t destination_context, - const void * source_address, - context::handle_t source_context, - size_t num_bytes) -{ - auto status = cuMemcpyPeer( - reinterpret_cast(destination_address), - destination_context, - reinterpret_cast(source_address), - source_context, num_bytes); - throw_if_error_lazy(status, - ::std::string("Failed copying data between devices: From address ") - + cuda::detail_::ptr_as_hex(source_address) + " in " - + context::detail_::identify(source_context) + " to address " - + cuda::detail_::ptr_as_hex(destination_address) + " in " - + context::detail_::identify(destination_context) ); -} - -} // namespace detail_ - void copy( - void * destination, - const context_t& destination_context, - const void * source_address, - const context_t& source_context, - size_t num_bytes); - -inline void copy( - void * destination, - const context_t& destination_context, - const_region_t source, - const context_t& source_context) -{ - copy(destination, destination_context, source.start(), source_context, source.size()); -} - + void * destination, + const context_t& destination_context, + const void * source_address, + const context_t& source_context, + size_t num_bytes, + optional_ref stream); + +/* inline void copy( region_t destination, const context_t& destination_context, const_region_t source, - const context_t& source_context) + const context_t& source_context, + optional_ref stream) { #ifndef NDEBUG if (destination.size() < destination.size()) { @@ -1539,108 +1411,111 @@ inline void copy( " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes"); } #endif - copy(destination.start(), destination_context, source, source_context); + copy(destination.start(), destination_context, source, source_context, stream); } +*/ + + +/* template inline void copy( array_t destination, - array_t source) + array_t source, + optional_ref stream) { // for arrays, a single mechanism handles both intra- and inter-context copying - 
return memory::copy(destination, source); + return memory::copy(destination, source, stream); } - -namespace async { +*/ namespace detail_ { -inline void copy( - void *destination, - context::handle_t destination_context_handle, - const void *source, - context::handle_t source_context_handle, - size_t num_bytes, - stream::handle_t stream_handle) -{ - auto result = cuMemcpyPeerAsync( - device::address(destination), - destination_context_handle, - device::address(source), - source_context_handle, - num_bytes, stream_handle); - - // TODO: Determine whether it was from host to device, device to host etc and - // add this information to the error string - throw_if_error_lazy(result, "Scheduling an inter-context memory copy from " - + context::detail_::identify(source_context_handle) + " to " - + context::detail_::identify(destination_context_handle) + " on " - + stream::detail_::identify(stream_handle)); -} - /** * @param destination a memory region of size @p num_bytes, either in * host memory or on any CUDA device's global memory * @param source a memory region of size @p num_bytes, either in * host memory or on any CUDA device's global memory * @param stream_handle The handle of a stream on which to schedule the copy operation - */ + * inline void copy( region_t destination, context::handle_t destination_context_handle, const_region_t source, context::handle_t source_context_handle, - stream::handle_t stream_handle) + optional stream_handle) { #ifndef NDEBUG if (destination.size() < source.size()) { throw ::std::logic_error("Can't copy a large region into a smaller one"); } #endif - copy(destination.start(), destination_context_handle, source.start(), source_context_handle, source.size(), - stream_handle); + copy(destination.start(), destination_context_handle, source.start(), source_context_handle, source.size(), stream_handle); } + */ } // namespace detail_ /// Asynchronously copy a region of memory defined in one context into a region defined in another void copy( - void * destination_address, - context_t destination_context, - const void * source_address, - context_t source_context, - size_t num_bytes, - const stream_t& stream); + void * destination_address, + const context_t& destination_context, + const void * source_address, + const context_t& source_context, + size_t num_bytes, + optional_ref stream); /// Asynchronously copy a region of memory defined in one context into a region defined in another -void copy( - void * destination, - context_t destination_context, - const_region_t source, - context_t source_context, - const stream_t& stream); +inline void copy( + void * destination, + const context_t& destination_context, + const_region_t source, + const context_t& source_context, + optional_ref stream) +{ + copy(destination, destination_context, source.start(), source_context, source.size(), stream); +} /// Asynchronously copy a region of memory defined in one context into a region defined in another inline void copy( - region_t destination, - context_t destination_context, - const_region_t source, - context_t source_context, - const stream_t& stream); + region_t destination, + const context_t& destination_context, + const void* source, + const context_t& source_context, + optional_ref stream) +{ + copy(destination.start(), destination_context, source, source_context, destination.size(), stream); +} + +/// Asynchronously copy a region of memory defined in one context into a region defined in another +inline void copy( + region_t destination, + const context_t& destination_context, + 
+    const_region_t source,
+    const context_t& source_context,
+    optional_ref<const stream_t> stream)
+{
+#ifndef NDEBUG
+    if (destination.size() < source.size()) {
+        throw ::std::invalid_argument(
+            "Attempt to copy a region of " + ::std::to_string(source.size()) +
+            " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
+    }
+#endif
+    copy(destination.start(), destination_context, source, source_context, stream);
+}
 
 /// Asynchronously copy a CUDA array defined in one context into a CUDA array defined in another
 template <typename T, dimensionality_t NumDimensions>
 inline void copy(
-    array_t<T, NumDimensions> destination,
-    array_t<T, NumDimensions> source,
-    const stream_t& stream)
+    array_t<T, NumDimensions> destination,
+    array_t<T, NumDimensions> source,
+    optional_ref<const stream_t> stream)
 {
     // for arrays, a single mechanism handles both intra- and inter-context copying
-    return memory::async::copy(destination, source, stream);
+    return memory::copy(destination, source, stream);
 }
 
-} // namespace async
-
 } // namespace inter_context
 
 /// Host-side (= system) memory which is "pinned", i.e. resides in
diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp
index f9b80c65..d8ab0514 100644
--- a/src/cuda/api/multi_wrapper_impls/memory.hpp
+++ b/src/cuda/api/multi_wrapper_impls/memory.hpp
@@ -13,6 +13,7 @@
 #include
+#include "../memory.hpp"
 #include "../array.hpp"
 #include "../device.hpp"
 #include "../event.hpp"
@@ -28,23 +29,13 @@ namespace cuda {
 namespace memory {
 
-namespace async {
-
-inline void copy(void *destination, const void *source, size_t num_bytes, const stream_t& stream)
-{
-    detail_::copy(destination, source, num_bytes, stream.handle());
-}
-
-// Note: Assumes the source pointer is valid in the stream's context
-template <typename T, dimensionality_t NumDimensions>
-inline void copy(array_t<T, NumDimensions>& destination, const T* source, const stream_t& stream)
-{
-    detail_::copy(destination, source, stream.handle());
-}
-
 template <typename T, dimensionality_t NumDimensions>
-inline void copy(array_t<T, NumDimensions>& destination, span<T const> source, const stream_t& stream)
+inline void copy(array_t<T, NumDimensions>& destination, span<T const> source, optional_ref<const stream_t> stream)
 {
+    if (not stream) {
+        memory::copy(destination, source);
+        return;
+    }
 #ifndef NDEBUG
     if (source.size() != destination.size()) {
         throw ::std::invalid_argument(
@@ -52,41 +43,63 @@ inline void copy(array_t<T, NumDimensions>& destination, span<T const> source, c
             " elements into an array of " + ::std::to_string(destination.size()) + " elements");
     }
 #endif
-    detail_::copy(destination, source.data(), stream.handle());
+    detail_::copy(destination, source.data(), stream->handle());
 }
 
 // Note: Assumes the destination, source and stream are all usable on the same content
 template <typename T, dimensionality_t NumDimensions>
-inline void copy(T* destination, const array_t<T, NumDimensions>& source, const stream_t& stream)
+inline void copy(T* destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream)
 {
-    if (stream.context_handle() != source.context_handle()) {
+    if (not stream) {
+        memory::copy(context_of(destination), destination, source);
+        return;
+    }
+    if (stream->context_handle() != source.context_handle()) {
         throw ::std::invalid_argument("Attempt to copy an array in"
-            + context::detail_::identify(source.context_handle()) + " via "
-            + stream::detail_::identify(stream));
+            + context::detail_::identify(source.context_handle()) + " via "
+            + stream::detail_::identify(*stream));
     }
-    detail_::copy(destination, source, stream.handle());
+    detail_::copy(destination, source, stream->handle());
 }
 
-template <typename T, dimensionality_t NumDimensions>
-inline void copy(span<T> destination, const array_t<T, NumDimensions>& source, const stream_t& stream)
+template <dimensionality_t NumDimensions>
+void copy(copy_parameters_t<NumDimensions> params, optional_ref<const stream_t> stream)
 {
-#ifndef NDEBUG
-    if (destination.size() != source.size()) {
-        throw ::std::invalid_argument(
-            "Attempt to copy " + ::std::to_string(source.size()) +
-            " elements into an array of " + ::std::to_string(destination.size()) + " elements");
-    }
-#endif
-    copy(destination.data(), source, stream);
+    stream::handle_t stream_handle = stream ? stream->handle() : nullptr;
+    status_t status = detail_::multidim_copy(params, stream_handle);
+    throw_if_error_lazy(status, "Copying using a general copy parameters structure");
 }
+
 template <typename T>
-inline void copy_single(T* destination, const T* source, const stream_t& stream)
+void copy_single(T* destination, const T* source, optional_ref<const stream_t> stream)
 {
-    detail_::copy_single(destination, source, sizeof(T), stream.handle());
+    memory::copy(destination, source, sizeof(T), stream);
 }
 
-} // namespace async
+// Note: Assumes the source pointer is valid in the stream's context
+template <typename T, dimensionality_t NumDimensions>
+inline void copy(array_t<T, NumDimensions>& destination, const T* source, optional_ref<const stream_t> stream)
+{
+    if (not stream) {
+        memory::copy(destination, context_of(source), source);
+        return;
+    }
+    detail_::copy(destination, source, stream->handle());
+}
+
+inline void copy(void *destination, const void *source, size_t num_bytes, optional_ref<const stream_t> stream)
+{
+    if (not stream) {
+        context::current::detail_::scoped_existence_ensurer_t ensure_some_context{};
+        auto result = cuMemcpy(device::address(destination), device::address(source), num_bytes);
+        // TODO: Determine whether it was from host to device, device to host etc and
+        // add this information to the error string
+        throw_if_error_lazy(result, "Synchronously copying data");
+        return;
+    }
+    detail_::copy(destination, source, num_bytes, stream->handle());
+}
 
 namespace device {
 
@@ -95,7 +108,6 @@ inline region_t allocate(const context_t& context, size_t size_in_bytes)
     return detail_::allocate(context.handle(), size_in_bytes);
 }
 
-
 inline region_t allocate(const device_t& device, size_t size_in_bytes)
 {
     auto pc = device.primary_context();
@@ -133,62 +145,39 @@ inline void zero(void* start, size_t num_bytes, const stream_t& stream)
 
 namespace inter_context {
 
 inline void copy(
-    void * destination_address,
-    context_t destination_context,
-    const void * source_address,
-    context_t source_context,
-    size_t num_bytes)
-{
-    return detail_::copy(
-        destination_address, destination_context.handle(),
-        source_address, source_context.handle(), num_bytes);
-}
-
-namespace async {
-
-inline void copy(
-    void * destination_address,
-    context_t destination_context,
-    const void * source_address,
-    context_t source_context,
+    void * destination,
+    const context_t& destination_context,
+    const void * source,
+    const context_t& source_context,
     size_t num_bytes,
-    const stream_t& stream)
-{
-    return detail_::copy(
-        destination_address, destination_context.handle(), source_address,
-        source_context.handle(), num_bytes, stream.handle());
-}
+    optional_ref<const stream_t> stream = {})
+{
+    auto status = stream ?
+        cuMemcpyPeerAsync(
+            device::address(destination),
+            destination_context.handle(),
+            device::address(source),
+            source_context.handle(),
+            num_bytes,
+            stream->handle()) :
+        cuMemcpyPeer(
+            device::address(destination),
+            destination_context.handle(),
+            device::address(source),
+            source_context.handle(),
+            num_bytes);
-inline void copy(
-    region_t destination,
-    context_t destination_context,
-    const_region_t source,
-    context_t source_context,
-    const stream_t& stream)
-{
-#ifndef NDEBUG
-    if (destination.size() < destination.size()) {
-        throw ::std::invalid_argument(
-            "Attempt to copy a region of " + ::std::to_string(source.size()) +
-            " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
-    }
-#endif
-    copy(destination.start(), destination_context, source, source_context, stream);
-}
-
-
-inline void copy(
-    void * destination,
-    context_t destination_context,
-    const_region_t source,
-    context_t source_context,
-    const stream_t& stream)
-{
-    copy(destination, destination_context, source.start(), source_context, source.size(), stream);
+    // TODO: Determine whether it was from host to device, device to host etc and
+    // add this information to the error string
+    throw_if_error_lazy(status,
+        ::std::string("Failed copying data between devices: From address ") +
+        cuda::detail_::ptr_as_hex(source) + " in " +
+        context::detail_::identify(source_context.handle()) + " to address " +
+        cuda::detail_::ptr_as_hex(destination) + " in " +
+        context::detail_::identify(destination_context.handle()) +
+        (stream ? " on " + stream::detail_::identify(*stream) : ""));
 }
-} // namespace async
-
 } // namespace inter_context
 
 namespace managed {
 
@@ -258,9 +247,7 @@ inline void prefetch(
     detail_::prefetch(region, destination.id(), stream.handle());
 }
 
-inline void prefetch_to_host(
-    const_region_t region,
-    const stream_t& stream)
+inline void prefetch_to_host(const_region_t region, const stream_t& stream)
 {
     detail_::prefetch(region, CU_DEVICE_CPU, stream.handle());
 }
@@ -406,15 +393,6 @@ inline void get_attributes(unsigned num_attributes, pointer::attribute_t* attrib
 } // namespace detail_
 } // namespace pointer
 
-inline void copy(void *destination, const void *source, size_t num_bytes)
-{
-    context::current::detail_::scoped_existence_ensurer_t ensure_some_context{};
-    auto result = cuMemcpy(device::address(destination), device::address(source), num_bytes);
-    // TODO: Determine whether it was from host to device, device to host etc and
-    // add this information to the error string
-    throw_if_error_lazy(result, "Synchronously copying data");
-}
-
 namespace device {
 
 template
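Usage note: a minimal sketch of how the unified copy interface is meant to be called after this change, assuming the library's umbrella header `cuda/api.hpp` and its `make_unique_span` and `create_stream` facilities; the element type, buffer names and the count `n` below are hypothetical, and the no-stream call assumes the optional stream parameter is defaulted when omitted.

#include <cuda/api.hpp>
#include <cstddef>
#include <vector>

void copy_roundtrip_sketch()
{
    auto device = cuda::device::current::get();
    auto stream = device.create_stream(cuda::stream::async);

    constexpr std::size_t n = 1024;              // hypothetical element count
    std::vector<float> host_buffer(n, 1.0f);     // hypothetical host-side data
    auto device_buffer = cuda::memory::make_unique_span<float>(device, n);

    // No stream argument: the copy is performed synchronously
    cuda::memory::copy(device_buffer, host_buffer.data(), n * sizeof(float));

    // With a stream argument, the same function enqueues the copy asynchronously on that stream
    cuda::memory::copy(host_buffer.data(), device_buffer, n * sizeof(float), stream);
    stream.synchronize();
}

The point of the unification is visible here: there is no longer a separate cuda::memory::async::copy; whether the operation is scheduled on a stream is decided by the presence of the trailing optional_ref argument.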
diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
index ce464aa6..cf6547e2 100644
--- a/src/cuda/api/stream.hpp
+++ b/src/cuda/api/stream.hpp
@@ -431,7 +431,7 @@ class stream_t {
         // CUDA doesn't seem to need us to be in the stream's context to enqueue the copy;
         // however, unfortunately, it does require us to be in _some_ context.
         context::current::detail_::scoped_ensurer_t ensure_we_have_a_current_scope{associated_stream.context_handle_};
-        memory::async::detail_::copy(destination, source, num_bytes, associated_stream.handle_);
+        memory::detail_::copy(destination, source, num_bytes, associated_stream.handle_);
     }
 
     /// @copybrief copy(void *, const void *, size_t) const
diff --git a/src/cuda/api/types.hpp b/src/cuda/api/types.hpp
index 6ec1ddb3..bbcac245 100644
--- a/src/cuda/api/types.hpp
+++ b/src/cuda/api/types.hpp
@@ -29,6 +29,7 @@
 #endif
 
 #include "detail/optional.hpp"
+#include "detail/optional_ref.hpp"
 #include "detail/span.hpp"
 #include "detail/region.hpp"
 #include "detail/type_traits.hpp"
@@ -59,6 +60,12 @@
 /// @brief Definitions and functionality wrapping CUDA APIs.
 namespace cuda {
 
+// This alias for plain C arrays is required due to an MSVC bug, making it fail to
+// accept straight up C array reference parameters to functions under some circumstances;
+// see: https://developercommunity.visualstudio.com/t/MSVC-rejects-syntax-of-reference-to-C-ar/10792039
+template <typename T, size_t N>
+using c_array = T[N];
+
 /**
  * Indicates either the result (success or error index) of a CUDA Runtime or Driver API call,
  * or the overall status of the API (which is typically the last triggered error).