diff --git a/source/adapters/cuda/device.hpp b/source/adapters/cuda/device.hpp
index d9f6310103..97fdb9512d 100644
--- a/source/adapters/cuda/device.hpp
+++ b/source/adapters/cuda/device.hpp
@@ -11,6 +11,8 @@
 
 #include <ur/ur.hpp>
 
+#include <umf/providers/provider_cuda.h>
+
 #include "common.hpp"
 
 struct ur_device_handle_t_ {
@@ -23,6 +25,9 @@ struct ur_device_handle_t_ {
   std::atomic_uint32_t RefCount;
   ur_platform_handle_t Platform;
   uint32_t DeviceIndex;
+  // UMF CUDA memory providers of this device, one per USM memory type.
+  // Slot = memory type - 1, which assumes HOST..SHARED are numbered 1..3.
+  umf_memory_provider_handle_t umfCUDAprovider[UMF_MEMORY_TYPE_SHARED] = {};
 
   static constexpr uint32_t MaxWorkItemDimensions = 3u;
   size_t MaxWorkItemSizes[MaxWorkItemDimensions];
@@ -115,6 +120,20 @@ struct ur_device_handle_t_ {
 
   uint32_t getNumComputeUnits() const noexcept { return NumComputeUnits; };
 
+  // Stores the UMF provider used for allocations of kind memType on this
+  // device. memType must be HOST, DEVICE or SHARED.
+  void setUmfCUDAprovider(umf_usm_memory_type_t memType,
+                          umf_memory_provider_handle_t provider) {
+    umfCUDAprovider[static_cast<int>(memType) - 1] = provider;
+  }
+
+  // Returns the UMF provider registered for memType, or nullptr if none has
+  // been set yet. memType must be HOST, DEVICE or SHARED.
+  umf_memory_provider_handle_t
+  getUmfCUDAprovider(umf_usm_memory_type_t memType) const {
+    return umfCUDAprovider[static_cast<int>(memType) - 1];
+  }
+
   // bookkeeping for mipmappedArray leaks in Mapping external Memory
   std::map<CUarray, CUmipmappedArray> ChildCuarrayFromMipmapMap;
 };
diff --git a/source/adapters/cuda/platform.cpp b/source/adapters/cuda/platform.cpp
index 20518494f7..0bbc67f420 100644
--- a/source/adapters/cuda/platform.cpp
+++ b/source/adapters/cuda/platform.cpp
@@ -13,6 +13,7 @@
 #include "common.hpp"
 #include "context.hpp"
 #include "device.hpp"
+#include "umf_helpers.hpp"
 
 #include <cassert>
 #include <cuda.h>
@@ -115,6 +116,61 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
             Result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
             throw;
           }
+
+          // Create one UMF CUDA memory provider per device for each USM
+          // memory type (host, device, shared) and register it on the device.
+          try {
+            // Releases the UMF params object on every exit path, including
+            // when a UMF call below fails and an exception is thrown.
+            struct ParamsGuard {
+              umf_cuda_memory_provider_params_handle_t Handle = nullptr;
+              ~ParamsGuard() {
+                if (Handle != nullptr) {
+                  umfCUDAMemoryProviderParamsDestroy(Handle);
+                }
+              }
+            } Params;
+
+            // Throws the translated UR error when a UMF call fails.
+            auto checkUmf = [](umf_result_t umf_result) {
+              if (umf_result != UMF_RESULT_SUCCESS) {
+                throw umf::umf2urResult(umf_result);
+              }
+            };
+
+            checkUmf(umfCUDAMemoryProviderParamsCreate(&Params.Handle));
+
+            for (int i = 0; i < NumDevices; ++i) {
+              ur_device_handle_t_ *device_handle = Platform.Devices[i].get();
+
+              // Context and device do not depend on the memory type, so set
+              // them once per device rather than once per provider.
+              checkUmf(umfCUDAMemoryProviderParamsSetContext(
+                  Params.Handle, device_handle->getNativeContext()));
+              checkUmf(umfCUDAMemoryProviderParamsSetDevice(
+                  Params.Handle, device_handle->get()));
+
+              for (int memType = UMF_MEMORY_TYPE_HOST;
+                   memType <= UMF_MEMORY_TYPE_SHARED; memType++) {
+                const auto usmType =
+                    static_cast<umf_usm_memory_type_t>(memType);
+                checkUmf(umfCUDAMemoryProviderParamsSetMemoryType(
+                    Params.Handle, usmType));
+
+                umf_memory_provider_handle_t umfCUDAprovider = nullptr;
+                checkUmf(umfMemoryProviderCreate(umfCUDAMemoryProviderOps(),
+                                                 Params.Handle,
+                                                 &umfCUDAprovider));
+                device_handle->setUmfCUDAprovider(usmType, umfCUDAprovider);
+              }
+            }
+          } catch (ur_result_t Err) {
+            Result = Err;
+            throw Err;
+          } catch (...) {
+            Result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
+            throw;
+          }
         },
         Result);
 
diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp
index 863d90cd79..e7add7a20f 100644
--- a/source/adapters/cuda/usm.cpp
+++ b/source/adapters/cuda/usm.cpp
@@ -102,7 +102,10 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t USMFreeImpl(ur_context_handle_t, void *Pointer) {
+// Frees a USM allocation through a UMF CUDA provider of the first device in
+// hContext: the DEVICE provider for device or managed memory, the HOST
+// provider otherwise.
+ur_result_t USMFreeImpl(ur_context_handle_t hContext, void *Pointer) {
   ur_result_t Result = UR_RESULT_SUCCESS;
   try {
     unsigned int IsManaged;
@@ -114,13 +117,23 @@ ur_result_t USMFreeImpl(ur_context_handle_t, void *Pointer) {
                                           (CUdeviceptr)Pointer));
     UR_ASSERT(Type == CU_MEMORYTYPE_DEVICE || Type == CU_MEMORYTYPE_HOST,
               UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+
+    // Index the context's device list directly instead of copying the whole
+    // vector; only the first device's providers are needed here.
+    ur_device_handle_t Device0 = hContext->getDevices()[0];
+    umf_usm_memory_type_t MemType = UMF_MEMORY_TYPE_UNKNOWN;
     if (IsManaged || Type == CU_MEMORYTYPE_DEVICE) {
       // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed
       // with cuMemFree
-      UR_CHECK_ERROR(cuMemFree((CUdeviceptr)Pointer));
+      MemType = UMF_MEMORY_TYPE_DEVICE;
     } else {
       // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost
-      UR_CHECK_ERROR(cuMemFreeHost(Pointer));
+      MemType = UMF_MEMORY_TYPE_HOST;
     }
+
+    umf_result_t umf_result =
+        umfMemoryProviderFree(Device0->getUmfCUDAprovider(MemType), Pointer,
+                              0 /* size is unknown */);
+    UR_CHECK_ERROR(umf::umf2urResult(umf_result));
   } catch (ur_result_t Err) {
     Result = Err;
@@ -143,7 +156,12 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t,
                                uint32_t Alignment) {
   try {
     ScopedContext Active(Device);
-    UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ResultPtr, Size));
+    // Allocate through the device's UMF provider for DEVICE memory.
+    umf_memory_provider_handle_t umfCUDAprovider =
+        Device->getUmfCUDAprovider(UMF_MEMORY_TYPE_DEVICE);
+    umf_result_t umf_result =
+        umfMemoryProviderAlloc(umfCUDAprovider, Size, Alignment, ResultPtr);
+    UR_CHECK_ERROR(umf::umf2urResult(umf_result));
   } catch (ur_result_t Err) {
     return Err;
   }
@@ -164,8 +182,12 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t,
                                uint32_t Alignment) {
   try {
     ScopedContext Active(Device);
-    UR_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)ResultPtr, Size,
-                                     CU_MEM_ATTACH_GLOBAL));
+    // Allocate through the device's UMF provider for SHARED memory.
+    umf_memory_provider_handle_t umfCUDAprovider =
+        Device->getUmfCUDAprovider(UMF_MEMORY_TYPE_SHARED);
+    umf_result_t umf_result =
+        umfMemoryProviderAlloc(umfCUDAprovider, Size, Alignment, ResultPtr);
+    UR_CHECK_ERROR(umf::umf2urResult(umf_result));
   } catch (ur_result_t Err) {
     return Err;
   }
@@ -179,11 +201,18 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t,
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t,
+ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t hContext,
                              ur_usm_host_mem_flags_t, size_t Size,
                              uint32_t Alignment) {
   try {
-    UR_CHECK_ERROR(cuMemAllocHost(ResultPtr, Size));
+    // No device is passed to this function, so the HOST provider of the
+    // first device in the context is used.
+    ur_device_handle_t Device0 = hContext->getDevices()[0];
+    umf_memory_provider_handle_t umfCUDAprovider =
+        Device0->getUmfCUDAprovider(UMF_MEMORY_TYPE_HOST);
+    umf_result_t umf_result =
+        umfMemoryProviderAlloc(umfCUDAprovider, Size, Alignment, ResultPtr);
+    UR_CHECK_ERROR(umf::umf2urResult(umf_result));
   } catch (ur_result_t Err) {
     return Err;
   }
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index b5fb64cfc5..605d4611ff 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -64,7 +64,7 @@ else()
     set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "Build UMF examples")
     set(UMF_BUILD_SHARED_LIBRARY ${UMF_BUILD_SHARED_LIBRARY} CACHE INTERNAL "Build UMF shared library")
     set(UMF_BUILD_LIBUMF_POOL_DISJOINT ON CACHE INTERNAL "Build Disjoint Pool")
-    set(UMF_BUILD_CUDA_PROVIDER OFF CACHE INTERNAL "Build UMF CUDA provider")
+    set(UMF_BUILD_CUDA_PROVIDER ON CACHE INTERNAL "Build UMF CUDA provider")
 
     FetchContent_MakeAvailable(unified-memory-framework)
     FetchContent_GetProperties(unified-memory-framework)