Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DRAFT] POC: use UMF CUDA provider #2480

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions source/adapters/cuda/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

#include <ur/ur.hpp>

#include <umf/providers/provider_cuda.h>

#include "common.hpp"

struct ur_device_handle_t_ {
Expand All @@ -23,6 +25,7 @@ struct ur_device_handle_t_ {
std::atomic_uint32_t RefCount;
ur_platform_handle_t Platform;
uint32_t DeviceIndex;
// Per-memory-type UMF CUDA provider handles; the accessors below index this
// array with `memType - 1`. Value-initialised so that a slot which was never
// populated holds a null handle rather than an indeterminate pointer.
umf_memory_provider_handle_t umfCUDAprovider[UMF_MEMORY_TYPE_SHARED] = {};

static constexpr uint32_t MaxWorkItemDimensions = 3u;
size_t MaxWorkItemSizes[MaxWorkItemDimensions];
Expand Down Expand Up @@ -115,6 +118,16 @@ struct ur_device_handle_t_ {

/// Returns the value currently stored in NumComputeUnits.
uint32_t getNumComputeUnits() const noexcept {
  return NumComputeUnits;
}

/// Stores the UMF CUDA memory provider associated with \p memType.
///
/// Providers live in the `umfCUDAprovider` array and are addressed by
/// `memType - 1`. A memory type whose index falls outside that array is
/// rejected instead of writing past the array bounds.
///
/// \param memType           USM memory type the provider serves.
/// \param _umfCUDAprovider  Provider handle to store (not validated here).
/// \throws ur_result_t UR_RESULT_ERROR_INVALID_ARGUMENT when \p memType does
///         not map to a slot of the array.
void setUmfCUDAprovider(umf_usm_memory_type_t memType,
                        umf_memory_provider_handle_t _umfCUDAprovider) {
  const int SlotCount = static_cast<int>(sizeof(umfCUDAprovider) /
                                         sizeof(umfCUDAprovider[0]));
  const int Slot = static_cast<int>(memType) - 1;
  if (Slot < 0 || Slot >= SlotCount) {
    // Without this check an unmapped type (e.g. value 0) would index slot -1.
    throw UR_RESULT_ERROR_INVALID_ARGUMENT;
  }
  umfCUDAprovider[Slot] = _umfCUDAprovider;
}

/// Returns the UMF CUDA memory provider stored for \p memType.
///
/// The lookup uses the same `memType - 1` indexing as the setter. A memory
/// type whose index falls outside the `umfCUDAprovider` array is rejected
/// instead of reading past the array bounds. The accessor does not modify the
/// device, so it is const-qualified.
///
/// \param memType  USM memory type whose provider is requested.
/// \return The handle stored in the matching slot.
/// \throws ur_result_t UR_RESULT_ERROR_INVALID_ARGUMENT when \p memType does
///         not map to a slot of the array.
umf_memory_provider_handle_t
getUmfCUDAprovider(umf_usm_memory_type_t memType) const {
  const int SlotCount = static_cast<int>(sizeof(umfCUDAprovider) /
                                         sizeof(umfCUDAprovider[0]));
  const int Slot = static_cast<int>(memType) - 1;
  if (Slot < 0 || Slot >= SlotCount) {
    throw UR_RESULT_ERROR_INVALID_ARGUMENT;
  }
  return umfCUDAprovider[Slot];
}

// bookkeeping for mipmappedArray leaks in Mapping external Memory
std::map<CUarray, CUmipmappedArray> ChildCuarrayFromMipmapMap;
};
Expand Down
62 changes: 62 additions & 0 deletions source/adapters/cuda/platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "common.hpp"
#include "context.hpp"
#include "device.hpp"
#include "umf_helpers.hpp"

#include <cassert>
#include <cuda.h>
Expand Down Expand Up @@ -115,6 +116,67 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
Result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
throw;
}

// Create one UMF CUDA memory provider per (device, USM memory type) pair and
// register it on the device handle. A single params object is reused for all
// providers and reconfigured before each creation.
try {
  // Owns the params handle so that it is destroyed on every exit path,
  // including when one of the UMF calls below fails and we throw.
  struct ProviderParamsGuard {
    umf_cuda_memory_provider_params_handle_t Handle = nullptr;
    ProviderParamsGuard() = default;
    ProviderParamsGuard(const ProviderParamsGuard &) = delete;
    ProviderParamsGuard &operator=(const ProviderParamsGuard &) = delete;
    ~ProviderParamsGuard() {
      if (Handle != nullptr) {
        umfCUDAMemoryProviderParamsDestroy(Handle);
      }
    }
  } Params;

  // Converts a failing UMF status into a UR error, records it in Result and
  // throws it; a successful status is a no-op.
  const auto CheckUmf = [&](umf_result_t UmfResult) {
    if (UmfResult != UMF_RESULT_SUCCESS) {
      Result = umf::umf2urResult(UmfResult);
      throw Result;
    }
  };

  CheckUmf(umfCUDAMemoryProviderParamsCreate(&Params.Handle));

  for (int i = 0; i < NumDevices; ++i) {
    ur_device_handle_t_ *device_handle = Platform.Devices[i].get();
    CUdevice device = device_handle->get();
    CUcontext context = device_handle->getNativeContext();

    // Context and device do not depend on the memory type, so they are set
    // once per device rather than once per provider.
    CheckUmf(umfCUDAMemoryProviderParamsSetContext(Params.Handle, context));
    CheckUmf(umfCUDAMemoryProviderParamsSetDevice(Params.Handle, device));

    for (int memType = UMF_MEMORY_TYPE_HOST;
         memType <= UMF_MEMORY_TYPE_SHARED; memType++) {
      const auto UmfMemType = static_cast<umf_usm_memory_type_t>(memType);

      CheckUmf(
          umfCUDAMemoryProviderParamsSetMemoryType(Params.Handle, UmfMemType));

      umf_memory_provider_handle_t umfCUDAprovider = nullptr;
      CheckUmf(umfMemoryProviderCreate(umfCUDAMemoryProviderOps(),
                                       Params.Handle, &umfCUDAprovider));

      device_handle->setUmfCUDAprovider(UmfMemType, umfCUDAprovider);
    }
  }
  // Params is destroyed here on success, or during unwinding on failure.
  // NOTE(review): providers already registered on earlier devices are not
  // released when a later creation fails — confirm who owns their teardown.
} catch (ur_result_t Err) {
  Result = Err;
  throw Err;
} catch (...) {
  Result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
  throw;
}
},
Result);

Expand Down
49 changes: 41 additions & 8 deletions source/adapters/cuda/usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
return UR_RESULT_SUCCESS;
}

ur_result_t USMFreeImpl(ur_context_handle_t, void *Pointer) {
ur_result_t USMFreeImpl(ur_context_handle_t hContext, void *Pointer) {
ur_result_t Result = UR_RESULT_SUCCESS;
try {
unsigned int IsManaged;
Expand All @@ -114,13 +114,29 @@ ur_result_t USMFreeImpl(ur_context_handle_t, void *Pointer) {
(CUdeviceptr)Pointer));
UR_ASSERT(Type == CU_MEMORYTYPE_DEVICE || Type == CU_MEMORYTYPE_HOST,
UR_RESULT_ERROR_INVALID_MEM_OBJECT);

std::vector<ur_device_handle_t> Devices = hContext->getDevices();
ur_device_handle_t Device0 = Devices[0];

if (IsManaged || Type == CU_MEMORYTYPE_DEVICE) {
// Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed
// with cuMemFree
UR_CHECK_ERROR(cuMemFree((CUdeviceptr)Pointer));
// UR_CHECK_ERROR(cuMemFree((CUdeviceptr)Pointer));
umf_memory_provider_handle_t umfCUDAprovider =
Device0->getUmfCUDAprovider(
umf_usm_memory_type_t::UMF_MEMORY_TYPE_DEVICE);
umf_result_t umf_result = umfMemoryProviderFree(umfCUDAprovider, Pointer,
0 /* size is unknown */);
UR_CHECK_ERROR(umf::umf2urResult(umf_result));
} else {
// Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost
UR_CHECK_ERROR(cuMemFreeHost(Pointer));
// UR_CHECK_ERROR(cuMemFreeHost(Pointer));
umf_memory_provider_handle_t umfCUDAprovider =
Device0->getUmfCUDAprovider(
umf_usm_memory_type_t::UMF_MEMORY_TYPE_HOST);
umf_result_t umf_result = umfMemoryProviderFree(umfCUDAprovider, Pointer,
0 /* size is unknown */);
UR_CHECK_ERROR(umf::umf2urResult(umf_result));
}
} catch (ur_result_t Err) {
Result = Err;
Expand All @@ -143,7 +159,12 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t,
uint32_t Alignment) {
try {
ScopedContext Active(Device);
UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ResultPtr, Size));
// UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ResultPtr, Size));
umf_memory_provider_handle_t umfCUDAprovider = Device->getUmfCUDAprovider(
umf_usm_memory_type_t::UMF_MEMORY_TYPE_DEVICE);
umf_result_t umf_result =
umfMemoryProviderAlloc(umfCUDAprovider, Size, Alignment, ResultPtr);
UR_CHECK_ERROR(umf::umf2urResult(umf_result));
} catch (ur_result_t Err) {
return Err;
}
Expand All @@ -164,8 +185,13 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t,
uint32_t Alignment) {
try {
ScopedContext Active(Device);
UR_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)ResultPtr, Size,
CU_MEM_ATTACH_GLOBAL));
// UR_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)ResultPtr, Size,
// CU_MEM_ATTACH_GLOBAL));
umf_memory_provider_handle_t umfCUDAprovider = Device->getUmfCUDAprovider(
umf_usm_memory_type_t::UMF_MEMORY_TYPE_SHARED);
umf_result_t umf_result =
umfMemoryProviderAlloc(umfCUDAprovider, Size, Alignment, ResultPtr);
UR_CHECK_ERROR(umf::umf2urResult(umf_result));
} catch (ur_result_t Err) {
return Err;
}
Expand All @@ -179,11 +205,18 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t,
return UR_RESULT_SUCCESS;
}

ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t,
ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t hContext,
ur_usm_host_mem_flags_t, size_t Size,
uint32_t Alignment) {
try {
UR_CHECK_ERROR(cuMemAllocHost(ResultPtr, Size));
// UR_CHECK_ERROR(cuMemAllocHost(ResultPtr, Size));
std::vector<ur_device_handle_t> Devices = hContext->getDevices();
ur_device_handle_t Device0 = Devices[0];
umf_memory_provider_handle_t umfCUDAprovider = Device0->getUmfCUDAprovider(
umf_usm_memory_type_t::UMF_MEMORY_TYPE_HOST);
umf_result_t umf_result =
umfMemoryProviderAlloc(umfCUDAprovider, Size, Alignment, ResultPtr);
UR_CHECK_ERROR(umf::umf2urResult(umf_result));
} catch (ur_result_t Err) {
return Err;
}
Expand Down
2 changes: 1 addition & 1 deletion source/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ else()
set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "Build UMF examples")
set(UMF_BUILD_SHARED_LIBRARY ${UMF_BUILD_SHARED_LIBRARY} CACHE INTERNAL "Build UMF shared library")
set(UMF_BUILD_LIBUMF_POOL_DISJOINT ON CACHE INTERNAL "Build Disjoint Pool")
set(UMF_BUILD_CUDA_PROVIDER OFF CACHE INTERNAL "Build UMF CUDA provider")
set(UMF_BUILD_CUDA_PROVIDER ON CACHE INTERNAL "Build UMF CUDA provider")

FetchContent_MakeAvailable(unified-memory-framework)
FetchContent_GetProperties(unified-memory-framework)
Expand Down
Loading