Merge remote-tracking branch 'origin/main' into Issue220_Add-support-for-ODS
Wee-Free-Scot committed Jan 4, 2024
2 parents 1245943 + c311fe8 commit 4906cb5
Showing 71 changed files with 1,531 additions and 380 deletions.
21 changes: 14 additions & 7 deletions .github/workflows/e2e_nightly.yml
@@ -11,7 +11,8 @@ jobs:
strategy:
matrix:
adapter: [
{name: CUDA}
{name: CUDA, str_name: cuda, prefix: "ext_oneapi_", config: "--cuda --hip", unit: "gpu"},
{name: OPENCL, str_name: opencl, prefix: "", config: "", unit: "cpu"}
]
build_type: [Release]
compiler: [{c: clang, cxx: clang++}]
@@ -59,12 +60,18 @@ jobs:
run: LD_LIBRARY_PATH=${{github.workspace}}/dpcpp_compiler/lib
cmake --build ${{github.workspace}}/ur-repo/build -j $(nproc)

- name: Set env vars & pre setup
- name: Set prefer UR
run: echo "SYCL_PREFER_UR=1" >> $GITHUB_ENV

- name: Set CUDA env vars
if: matrix.adapter.name == 'CUDA'
run: |
echo "SYCL_PREFER_UR=1" >> $GITHUB_ENV
echo "CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
source /opt/intel/oneapi/setvars.sh
- name: Run pre setup
run: |
source /opt/intel/oneapi/setvars.sh --force
sycl-ls
- name: Configure SYCL
@@ -73,7 +80,7 @@
-t ${{matrix.build_type}}
-o ${{github.workspace}}/sycl_build
--cmake-gen "Unix Makefiles"
--ci-defaults --cuda --hip
--ci-defaults ${{matrix.adapter.config}}
--cmake-opt="-DLLVM_INSTALL_UTILS=ON"
--cmake-opt="-DSYCL_PI_TESTS=OFF"
--cmake-opt=-DCMAKE_C_COMPILER_LAUNCHER=ccache
@@ -91,7 +98,7 @@ jobs:
- name: Swap UR loader and adapters
run: |
cp ${{github.workspace}}/ur-repo/build/lib/libur_loader.so* ${{github.workspace}}/sycl_build/lib/
cp ${{github.workspace}}/ur-repo/build/lib/libur_adapter_cuda.so* ${{github.workspace}}/sycl_build/lib/
cp ${{github.workspace}}/ur-repo/build/lib/libur_adapter_${{matrix.adapter.str_name}}.so* ${{github.workspace}}/sycl_build/lib/
- name: Set additional env. vars
run: |
@@ -110,7 +117,7 @@
-GNinja
-B ${{github.workspace}}/build-e2e/
-S ${{github.workspace}}/sycl-repo/sycl/test-e2e/
-DSYCL_TEST_E2E_TARGETS="ext_oneapi_cuda:gpu"
-DSYCL_TEST_E2E_TARGETS="${{matrix.adapter.prefix}}${{matrix.adapter.str_name}}:${{matrix.adapter.unit}}"
-DCMAKE_CXX_COMPILER="$(which clang++)"
-DLLVM_LIT="${{github.workspace}}/sycl-repo/llvm/utils/lit/lit.py"
3 changes: 3 additions & 0 deletions CMakeLists.txt
@@ -47,6 +47,9 @@ option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace
set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable")
set(UR_SYCL_LIBRARY_DIR "" CACHE PATH
"Path of the SYCL runtime library directory")
option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF)

include(Assertions)

set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
5 changes: 0 additions & 5 deletions README.md
@@ -6,11 +6,6 @@
[![Coverity](https://scan.coverity.com/projects/28213/badge.svg)](https://scan.coverity.com/projects/oneapi-src-unified-runtime)
[![codecov.io](https://codecov.io/github/oneapi-src/unified-runtime/coverage.svg?branch=main)](https://codecov.io/github/oneapi-src/unified-runtime?branch=master)

## Adapters
Adapter implementations for Unified Runtime currently reside in the [SYCL repository](https://github.com/intel/llvm/tree/sycl/sycl/plugins/unified_runtime/ur). This branch contains scripts to automatically
fetch and build them directly in the UR tree. The adapters are disabled by default,
see cmake options for details.

<!-- TODO: add general description and purpose of the project -->

## Table of contents
30 changes: 30 additions & 0 deletions cmake/Assertions.cmake
@@ -0,0 +1,30 @@
# From the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# This is lifted from llvm's LLVM_ENABLE_ASSERTIONS implementation
# https://github.com/llvm/llvm-project/blob/6be0e979896f7dd610abf263f845c532f1be3762/llvm/cmake/modules/HandleLLVMOptions.cmake#L89
if(UR_ENABLE_ASSERTIONS)
# MSVC doesn't like _DEBUG on release builds
if( NOT MSVC )
add_compile_definitions(_DEBUG)
endif()
# On non-Debug builds cmake automatically defines NDEBUG, so we
# explicitly undefine it:
string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
add_compile_options($<$<OR:$<COMPILE_LANGUAGE:C>,$<COMPILE_LANGUAGE:CXX>>:-UNDEBUG>)
if (MSVC)
# Also remove /D NDEBUG to avoid MSVC warnings about conflicting defines.
foreach (flags_var_to_scrub
CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_CXX_FLAGS_MINSIZEREL
CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS_MINSIZEREL)
string (REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" " "
"${flags_var_to_scrub}" "${${flags_var_to_scrub}}")
endforeach()
endif()
endif()
endif()
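
For illustration only (not part of this commit): the -UNDEBUG handling above is what keeps standard C/C++ assertions live in non-Debug builds, since CMake defines NDEBUG for those configurations by default and assert() compiles away under NDEBUG. A minimal check:

#include <cassert>
#include <cstdio>

int main() {
  // Compiled away when NDEBUG is defined (the default for Release builds);
  // kept when the project is configured with -DUR_ENABLE_ASSERTIONS=ON.
  assert(1 + 1 == 2 && "still checked in Release builds");
#ifdef NDEBUG
  std::puts("assertions disabled");
#else
  std::puts("assertions enabled");
#endif
  return 0;
}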
3 changes: 3 additions & 0 deletions include/ur_api.h
@@ -4078,6 +4078,9 @@ urProgramCreateWithIL(
///
/// @details
/// - The application may call this function from simultaneous threads.
/// - Following a successful call to this entry point, `phProgram` will
/// contain a binary of type ::UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT or
/// ::UR_PROGRAM_BINARY_TYPE_LIBRARY for `hDevice`.
///
/// @remarks
/// _Analogues_
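
To make the new guarantee concrete, here is a sketch (not part of this commit) that creates a program from a device binary and queries the reported binary type; it assumes the urProgramCreateWithBinary, urProgramGetBuildInfo, and urProgramRelease signatures declared in this header.

#include <ur_api.h>

// Illustrative helper: after a successful urProgramCreateWithBinary call,
// the documented guarantee is that the program holds a binary of type
// UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT or UR_PROGRAM_BINARY_TYPE_LIBRARY.
ur_result_t checkBinaryType(ur_context_handle_t hContext,
                            ur_device_handle_t hDevice,
                            const uint8_t *pBinary, size_t size) {
  ur_program_handle_t hProgram = nullptr;
  ur_result_t Res = urProgramCreateWithBinary(hContext, hDevice, size,
                                              pBinary, nullptr, &hProgram);
  if (Res != UR_RESULT_SUCCESS)
    return Res;

  ur_program_binary_type_t Type;
  Res = urProgramGetBuildInfo(hProgram, hDevice,
                              UR_PROGRAM_BUILD_INFO_BINARY_TYPE, sizeof(Type),
                              &Type, nullptr);
  urProgramRelease(hProgram);
  return Res;
}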
1 change: 1 addition & 0 deletions scripts/core/program.yml
@@ -127,6 +127,7 @@ analogue:
- "**clCreateProgramWithBinary**"
details:
- "The application may call this function from simultaneous threads."
- "Following a successful call to this entry point, `phProgram` will contain a binary of type $X_PROGRAM_BINARY_TYPE_COMPILED_OBJECT or $X_PROGRAM_BINARY_TYPE_LIBRARY for `hDevice`."
params:
- type: $x_context_handle_t
name: hContext
3 changes: 3 additions & 0 deletions source/adapters/cuda/CMakeLists.txt
@@ -27,6 +27,8 @@ add_ur_adapter(${TARGET_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp
${CMAKE_CURRENT_SOURCE_DIR}/memory.hpp
${CMAKE_CURRENT_SOURCE_DIR}/memory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.hpp
${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.cpp
${CMAKE_CURRENT_SOURCE_DIR}/platform.hpp
${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp
${CMAKE_CURRENT_SOURCE_DIR}/program.hpp
@@ -38,6 +40,7 @@ add_ur_adapter(${TARGET_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/tracing.cpp
${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp
${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.hpp
)
198 changes: 198 additions & 0 deletions source/adapters/cuda/command_buffer.cpp
@@ -99,6 +99,91 @@ static void setCopyParams(const void *SrcPtr, const CUmemorytype_enum SrcType,
Params.Depth = 1;
}

// Helper function for enqueuing memory fills
static ur_result_t enqueueCommandBufferFillHelper(
ur_exp_command_buffer_handle_t CommandBuffer, void *DstDevice,
const CUmemorytype_enum DstType, const void *Pattern, size_t PatternSize,
size_t Size, uint32_t NumSyncPointsInWaitList,
const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
ur_exp_command_buffer_sync_point_t *SyncPoint) {
ur_result_t Result = UR_RESULT_SUCCESS;
std::vector<CUgraphNode> DepsList;
UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList,
SyncPointWaitList, DepsList),
Result);

try {
const size_t N = Size / PatternSize;
auto Value = *static_cast<const uint32_t *>(Pattern);
auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE
? *static_cast<CUdeviceptr *>(DstDevice)
: (CUdeviceptr)DstDevice;

if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) {
// Create a new node
CUgraphNode GraphNode;
CUDA_MEMSET_NODE_PARAMS NodeParams = {};
NodeParams.dst = DstPtr;
NodeParams.elementSize = PatternSize;
NodeParams.height = N;
NodeParams.pitch = PatternSize;
NodeParams.value = Value;
NodeParams.width = 1;

UR_CHECK_ERROR(cuGraphAddMemsetNode(
&GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
DepsList.size(), &NodeParams, CommandBuffer->Device->getContext()));

// Get sync point and register the cuNode with it.
*SyncPoint =
CommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));

} else {
// CUDA has no memset functions that allow setting values larger than 4
// bytes. The UR API lets the caller pass an arbitrary "pattern" to a
// buffer fill, which can be larger than 4 bytes. We must break the
// pattern up into 4-byte values and set the buffer using multiple
// strided calls. This means that one cuGraphAddMemsetNode call is made
// for every 4 bytes in the pattern.

size_t NumberOfSteps = PatternSize / sizeof(uint32_t);

// We walk the pattern in 4-byte steps and add one memset node for each
// 4-byte chunk of the pattern.
for (auto Step = 0u; Step < NumberOfSteps; ++Step) {
// take 4 bytes of the pattern
auto Value = *(static_cast<const uint32_t *>(Pattern) + Step);

// offset the pointer to the part of the buffer we want to write to
auto OffsetPtr = DstPtr + (Step * sizeof(uint32_t));

// Create a new node
CUgraphNode GraphNode;
// Update NodeParam
CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {};
NodeParamsStep.dst = (CUdeviceptr)OffsetPtr;
NodeParamsStep.elementSize = 4;
NodeParamsStep.height = N;
NodeParamsStep.pitch = PatternSize;
NodeParamsStep.value = Value;
NodeParamsStep.width = 1;

UR_CHECK_ERROR(cuGraphAddMemsetNode(
&GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
DepsList.size(), &NodeParamsStep,
CommandBuffer->Device->getContext()));

// Get sync point and register the cuNode with it.
*SyncPoint = CommandBuffer->AddSyncPoint(
std::make_shared<CUgraphNode>(GraphNode));
}
}
} catch (ur_result_t Err) {
Result = Err;
}
return Result;
}
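
A standalone sketch (not part of this commit) of the strided layout the loop above produces: with a hypothetical 8-byte pattern filling 64 bytes, N is 8 pattern repetitions and two memset nodes are recorded, each writing one 4-byte column whose pitch is the full pattern size.

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  const size_t Size = 64, PatternSize = 8;                     // example fill
  const size_t N = Size / PatternSize;                         // 8 repetitions
  const size_t NumberOfSteps = PatternSize / sizeof(uint32_t); // 2 nodes
  for (size_t Step = 0; Step < NumberOfSteps; ++Step) {
    // Each node writes a width-1 column of 4-byte elements: N rows spaced
    // PatternSize bytes apart, starting Step * 4 bytes into the destination.
    std::printf("node %zu: dst offset %zu, elementSize 4, height %zu, pitch %zu\n",
                Step, Step * sizeof(uint32_t), N, PatternSize);
  }
  return 0;
}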

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
@@ -525,6 +610,119 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(
return Result;
}

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp(
ur_exp_command_buffer_handle_t hCommandBuffer, const void * /* Mem */,
size_t /*Size*/, ur_usm_migration_flags_t /*Flags*/,
uint32_t numSyncPointsInWaitList,
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
ur_exp_command_buffer_sync_point_t *pSyncPoint) {
// The prefetch command is not supported by CUDA Graph.
// We implement it as an empty node to enforce dependencies.
ur_result_t Result = UR_RESULT_SUCCESS;
CUgraphNode GraphNode;

std::vector<CUgraphNode> DepsList;
UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
pSyncPointWaitList, DepsList),
Result);

try {
// Add an empty node to preserve dependencies.
UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
DepsList.data(), DepsList.size()));

// Get sync point and register the cuNode with it.
*pSyncPoint =
hCommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));

setErrorMessage("Prefetch hint ignored and replaced with empty node as "
"prefetch is not supported by CUDA Graph backend",
UR_RESULT_SUCCESS);
Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC;
} catch (ur_result_t Err) {
Result = Err;
}
return Result;
}

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp(
ur_exp_command_buffer_handle_t hCommandBuffer, const void * /* Mem */,
size_t /*Size*/, ur_usm_advice_flags_t /*Advice*/,
uint32_t numSyncPointsInWaitList,
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
ur_exp_command_buffer_sync_point_t *pSyncPoint) {
// The mem-advise command is not supported by CUDA Graph.
// We implement it as an empty node to enforce dependencies.
ur_result_t Result = UR_RESULT_SUCCESS;
CUgraphNode GraphNode;

std::vector<CUgraphNode> DepsList;
UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
pSyncPointWaitList, DepsList),
Result);

try {
// Add an empty node to preserve dependencies.
UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
DepsList.data(), DepsList.size()));

// Get sync point and register the cuNode with it.
*pSyncPoint =
hCommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));

setErrorMessage("Memory advice ignored and replaced with empty node as "
"memory advice is not supported by CUDA Graph backend",
UR_RESULT_SUCCESS);
Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC;
} catch (ur_result_t Err) {
Result = Err;
}

return Result;
}
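
Both fallbacks above store a warning string and return UR_RESULT_ERROR_ADAPTER_SPECIFIC. A sketch (not part of this commit) of how a caller could surface that message, assuming the urPlatformGetLastError entry point from the UR API:

#include <ur_api.h>
#include <cstdio>

// Illustrative only: retrieve the adapter-specific message recorded by
// setErrorMessage, e.g. the "replaced with empty node" warnings above.
void reportAdapterMessage(ur_platform_handle_t hPlatform, ur_result_t Res) {
  if (Res != UR_RESULT_ERROR_ADAPTER_SPECIFIC)
    return;
  const char *Message = nullptr;
  int32_t Error = 0;
  if (urPlatformGetLastError(hPlatform, &Message, &Error) ==
          UR_RESULT_SUCCESS &&
      Message != nullptr)
    std::fprintf(stderr, "adapter message: %s (code %d)\n", Message, Error);
}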

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp(
ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer,
const void *pPattern, size_t patternSize, size_t offset, size_t size,
uint32_t numSyncPointsInWaitList,
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
ur_exp_command_buffer_sync_point_t *pSyncPoint) {
auto ArgsAreMultiplesOfPatternSize =
(offset % patternSize == 0) && (size % patternSize == 0);

auto PatternIsValid = (pPattern != nullptr);

auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) &&
(patternSize > 0); // is a positive power of two
UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid &&
PatternSizeIsValid,
UR_RESULT_ERROR_INVALID_SIZE);

auto DstDevice = std::get<BufferMem>(hBuffer->Mem).get() + offset;

return enqueueCommandBufferFillHelper(
hCommandBuffer, &DstDevice, CU_MEMORYTYPE_DEVICE, pPattern, patternSize,
size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
}

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp(
ur_exp_command_buffer_handle_t hCommandBuffer, void *pPtr,
const void *pPattern, size_t patternSize, size_t size,
uint32_t numSyncPointsInWaitList,
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
ur_exp_command_buffer_sync_point_t *pSyncPoint) {

auto PatternIsValid = (pPattern != nullptr);

auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) &&
(patternSize > 0); // is a positive power of two

UR_ASSERT(PatternIsValid && PatternSizeIsValid, UR_RESULT_ERROR_INVALID_SIZE);
return enqueueCommandBufferFillHelper(
hCommandBuffer, pPtr, CU_MEMORYTYPE_UNIFIED, pPattern, patternSize, size,
numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
}
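
For reference (not part of this commit), the patternSize test used by both fill entry points is the usual single-set-bit trick: a power of two ANDed with itself minus one is zero.

#include <cassert>
#include <cstddef>

// True exactly for 1, 2, 4, 8, ... (one bit set).
static bool isPowerOfTwo(size_t V) { return V > 0 && (V & (V - 1)) == 0; }

int main() {
  assert(isPowerOfTwo(4));   // 0b100 & 0b011 == 0
  assert(!isPowerOfTwo(6));  // 0b110 & 0b101 == 0b100
  assert(!isPowerOfTwo(0));  // rejected by the V > 0 term
  return 0;
}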

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
(Diff truncated; the remaining changed files are not shown.)