Regards #689, Fixes #688: Unification of asynchronous & synchronous copy functions:

* All copy functions now take an optional stream, passed via an `optional_ref` parameter (see the sketch after this list);
* No longer using the `cuda::memory::async` subnamespace for any copy functions; they are all directly in `cuda::memory`
* Fixes #688: Now supporting async copy using copy parameters structures
* Explicitly including `memory.hpp` in `multi_wrapper_impls/memory.hpp`
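
For illustration, a minimal sketch of the unified call style described above — not part of this commit's diff; the function and buffer names are hypothetical, and the usage mirrors the examples changed below:

	#include <cuda/api.hpp>  // assumed to be this library's main include
	#include <vector>

	void unified_copy_sketch(size_t n)
	{
		auto device = cuda::device::current::get();
		auto stream = device.create_stream(cuda::stream::async);
		auto d_span = cuda::memory::make_unique_span<float>(device, n);
		auto h_vec = std::vector<float>(n);

		// No stream argument: the copy is synchronous
		cuda::memory::copy(d_span, h_vec.data(), n * sizeof(float));

		// Stream argument present (bound to the optional_ref parameter):
		// the copy is asynchronous, enqueued on the stream - where one
		// previously wrote cuda::memory::async::copy(...)
		cuda::memory::copy(h_vec.data(), d_span, n * sizeof(float), stream);
		stream.synchronize();
	}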
eyalroz committed Nov 17, 2024
1 parent 7556da9 commit e8d27d6
Showing 9 changed files with 534 additions and 674 deletions.
@@ -124,8 +124,8 @@ int main(int argc, const char **argv)

stream.enqueue.kernel_launch(kernel, launch_config, d_inputArr.data(), d_numOfOdds.data(), d_sumOfOddEvenElems.data(), arrSize);

- cuda::memory::async::copy(h_numOfOdds, d_numOfOdds, stream);
- cuda::memory::async::copy(h_sumOfOddEvenElems, d_sumOfOddEvenElems, stream);
+ cuda::memory::copy(h_numOfOdds, d_numOfOdds, stream);
+ cuda::memory::copy(h_sumOfOddEvenElems, d_sumOfOddEvenElems, stream);

stream.synchronize();

@@ -154,7 +154,7 @@ void enqueue_p2p_copy(
// Since we assume Compute Capability >= 2.0, all devices support the
// Unified Virtual Address Space, so we don't need to use
// cudaMemcpyPeerAsync - cudaMemcpyAsync is enough.
- cuda::memory::async::copy(dest, src, stream);
+ cuda::memory::copy(dest, src, stream);
}
}
}
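
To make the comment in that hunk concrete: under UVA, the runtime infers each pointer's device, so the same unified copy call also handles inter-device transfers. A hedged sketch, assuming two peer-accessible devices (names hypothetical, not from this commit):

	void p2p_copy_sketch(size_t n)
	{
		auto dev_0 = cuda::device::get(0);  // hypothetical device ids
		auto dev_1 = cuda::device::get(1);
		auto src = cuda::memory::make_unique_span<float>(dev_0, n);
		auto dest = cuda::memory::make_unique_span<float>(dev_1, n);
		auto stream = dev_0.create_stream(cuda::stream::async);

		// One copy call, two devices: no explicit peer-copy API is needed,
		// since UVA lets the runtime resolve which device owns each pointer
		cuda::memory::copy(dest, src, stream);
		stream.synchronize();
	}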
@@ -152,8 +152,8 @@ int main(int argc, char** argv)
auto d_C = cuda::memory::make_unique_span<float>(device, N);


- cuda::memory::async::copy(d_A, h_A.get(), size, stream);
- cuda::memory::async::copy(d_B, h_B.get(), size, stream);
+ cuda::memory::copy(d_A, h_A.get(), size, stream);
+ cuda::memory::copy(d_B, h_B.get(), size, stream);

auto launch_config = cuda::launch_config_builder()
.overall_size(N)
@@ -164,7 +164,7 @@ int main(int argc, char** argv)

stream.enqueue.kernel_launch(vecAdd_kernel, launch_config, d_A.data(), d_B.data(), d_C.data(), N);

- cuda::memory::async::copy(h_C.get(), d_C, size, stream);
+ cuda::memory::copy(h_C.get(), d_C, size, stream);
stream.synchronize();

for (int i = 0; i < N; ++i) {
4 changes: 2 additions & 2 deletions examples/modified_cuda_samples/simpleStreams/simpleStreams.cu
@@ -143,7 +143,7 @@ void run_simple_streams_example(

// time memcpy from device
start_event.record(); // record on the default stream, to ensure that all previous CUDA calls have completed
- cuda::memory::async::copy(h_a.get(), d_a, streams[0]);
+ cuda::memory::copy(h_a.get(), d_a, streams[0]);
stop_event.record();
stop_event.synchronize(); // block until the event is actually recorded
auto time_memcpy = cuda::event::time_elapsed_between(start_event, stop_event);
@@ -207,7 +207,7 @@ void run_simple_streams_example(
// commence executing when all previous CUDA calls in stream x have completed
for (int i = 0; i < nstreams; i++)
{
- cuda::memory::async::copy(
+ cuda::memory::copy(
h_a.data() + i * params.n / nstreams,
d_a.data() + i * params.n / nstreams, nbytes / nstreams,
streams[i]);
8 changes: 4 additions & 4 deletions examples/other/array_management.cu
@@ -97,8 +97,8 @@ void array_3d_example(cuda::device_t& device, size_t w, size_t h, size_t d) {

// also asynchronously
auto stream = device.create_stream(cuda::stream::async);
- cuda::memory::async::copy(other_arr, span_out, stream);
- cuda::memory::async::copy(span_in, other_arr, stream);
+ cuda::memory::copy(other_arr, span_out, stream);
+ cuda::memory::copy(span_in, other_arr, stream);
device.synchronize();
check_output_is_iota("copy from (managed) global memory into a 3D array, asynchronously", span_in);
}
@@ -162,8 +162,8 @@ void array_2d_example(cuda::device_t& device, size_t w, size_t h)

// also asynchronously
auto stream = cuda::stream::create(device, cuda::stream::async);
- cuda::memory::async::copy(other_arr, span_out, stream);
- cuda::memory::async::copy(span_in, other_arr, stream);
+ cuda::memory::copy(other_arr, span_out, stream);
+ cuda::memory::copy(span_in, other_arr, stream);
device.synchronize();

check_output_is_iota("copy from (managed) global memory into a 2D array, asynchronously", span_in);