Regards #690: We've extracted the free() calls out of the async sub-namespace + some comment tweaks and redundancy removals
eyalroz · Nov 16, 2024 · 5344e8a
1 parent 4f93c92
commit 5344e8a
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 30 deletions.
diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp
@@ -229,41 +229,55 @@ inline region_t allocate(
 	return allocate_in_current_context(size_in_bytes, stream_handle);
 }
 
-} // namespace detail_
+#if CUDA_VERSION >= 11020
+inline void free_on_stream(
+	void*              allocated_region_start,
+	stream::handle_t   stream_handle)
+{
+	auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
+	throw_if_error_lazy(status,
+		"Failed scheduling an asynchronous freeing of the global memory region starting at "
+		+ cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
+		+ stream::detail_::identify(stream_handle));
+}
+#endif // CUDA_VERSION >= 11020
 
-/// Free a region of device-side memory (regardless of how it was allocated)
-inline void free(void* ptr)
+inline void free_in_current_context(
+	context::handle_t          current_context_handle,
+	void*                      allocated_region_start)
 {
-	auto result = cuMemFree(address(ptr));
-#ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
+	auto result = cuMemFree(address(allocated_region_start));
 	if (result == status::success) { return; }
-#else
-	if (result == status::success or result == status::context_is_destroyed) { return; }
+#ifndef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
+	if (result == status::context_is_destroyed) { return; }
 #endif
-	throw runtime_error(result, "Freeing device memory at " + cuda::detail_::ptr_as_hex(ptr));
+	throw runtime_error(result, "Freeing device memory at "
+		+ cuda::detail_::ptr_as_hex(allocated_region_start)
+		+ " in " + context::detail_::identify(current_context_handle));
 }
 
-/// @copydoc free(void*)
-inline void free(region_t region) { free(region.start()); }
+} // namespace detail_
 
+/// Free a region of device-side memory (regardless of how it was allocated)
 #if CUDA_VERSION >= 11020
-namespace async {
-
-namespace detail_ {
+inline void free(void* region_start, optional_ref<const stream_t> stream = {});
+#else
+inline void free(void* ptr);
+#endif
 
-inline void free(
-	context::handle_t  context_handle,
-	stream::handle_t   stream_handle,
-	void*              allocated_region_start)
+/// @copydoc free(void*, optional_ref<const stream_t>)
+#if CUDA_VERSION >= 11020
+inline void free(region_t region, optional_ref<const stream_t> stream = {})
+#else
+inline void free(region_t region)
+#endif
 {
-	auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
-	throw_if_error_lazy(status,
-		"Failed scheduling an asynchronous freeing of the global memory region starting at "
-		+ cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
-		+ stream::detail_::identify(stream_handle, context_handle) );
+	free(region.start(), stream);
 }
 
-} // namespace detail_
+#if CUDA_VERSION >= 11020
+
+namespace async {
 
 /**
  * Schedule a de-allocation of device-side memory on a CUDA stream.

diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp
@@ -121,13 +121,25 @@ inline region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stre
 		detail_::allocate_in_current_context(size_in_bytes);
 }
 
-namespace async {
+#endif // CUDA_VERSION >= 11020
 
-inline void free(const stream_t& stream, void* region_start)
+#if CUDA_VERSION >= 11020
+inline void free(void* region_start, optional_ref<const stream_t> stream)
+#else
+inline void free(void* ptr)
+#endif // CUDA_VERSION >= 11020
 {
-	return detail_::free(stream.context().handle(), stream.handle(), region_start);
+#if CUDA_VERSION >= 11020
+	if (stream) {
+		detail_::free_on_stream(region_start, stream->handle());
+		return;
+	}
+#endif
+	context::current::detail_::scoped_existence_ensurer_t ensurer;
+	detail_::free_in_current_context(ensurer.context_handle,region_start);
 }
-#endif // CUDA_VERSION >= 11020
+
+namespace async {
 
 template <typename T>
 inline void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream)

diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
@@ -601,14 +601,14 @@ class stream_t {
 		///@{
 		void free(void* region_start) const
 		{
-			memory::device::async::free(associated_stream, region_start);
+			memory::device::free(region_start, associated_stream);
 		}
 
 		void free(memory::region_t region) const
 		{
-			memory::device::async::free(associated_stream, region);
+			memory::device::free(region, associated_stream);
 		}
-#endif
+#endif // CUDA_VERSION >= 11020
 
 		/**
 		 * Sets the attachment of a region of managed memory (i.e. in the address space visible