Merge remote-tracking branch 'origin/main' into Issue220_Add-support-for-ODS
Wee-Free-Scot committed Jan 4, 2024
2 parents 1245943 + c311fe8 commit 4906cb5
Showing 71 changed files with 1,531 additions and 380 deletions.
21 changes: 14 additions & 7 deletions .github/workflows/e2e_nightly.yml
@@ -11,7 +11,8 @@ jobs:
strategy:
matrix:
adapter: [
{name: CUDA}
{name: CUDA, str_name: cuda, prefix: "ext_oneapi_", config: "--cuda --hip", unit: "gpu"},
{name: OPENCL, str_name: opencl, prefix: "", config: "", unit: "cpu"}
]
build_type: [Release]
compiler: [{c: clang, cxx: clang++}]
@@ -59,12 +60,18 @@ jobs:
run: LD_LIBRARY_PATH=${{github.workspace}}/dpcpp_compiler/lib
cmake --build ${{github.workspace}}/ur-repo/build -j $(nproc)

- name: Set env vars & pre setup
- name: Set prefer UR
run: echo "SYCL_PREFER_UR=1" >> $GITHUB_ENV

- name: Set CUDA env vars
if: matrix.adapter.name == 'CUDA'
run: |
echo "SYCL_PREFER_UR=1" >> $GITHUB_ENV
echo "CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
source /opt/intel/oneapi/setvars.sh
- name: Run pre setup
run: |
source /opt/intel/oneapi/setvars.sh --force
sycl-ls
- name: Configure SYCL
@@ -73,7 +80,7 @@
-t ${{matrix.build_type}}
-o ${{github.workspace}}/sycl_build
--cmake-gen "Unix Makefiles"
--ci-defaults --cuda --hip
--ci-defaults ${{matrix.adapter.config}}
--cmake-opt="-DLLVM_INSTALL_UTILS=ON"
--cmake-opt="-DSYCL_PI_TESTS=OFF"
--cmake-opt=-DCMAKE_C_COMPILER_LAUNCHER=ccache
@@ -91,7 +98,7 @@ jobs:
- name: Swap UR loader and adapters
run: |
cp ${{github.workspace}}/ur-repo/build/lib/libur_loader.so* ${{github.workspace}}/sycl_build/lib/
cp ${{github.workspace}}/ur-repo/build/lib/libur_adapter_cuda.so* ${{github.workspace}}/sycl_build/lib/
cp ${{github.workspace}}/ur-repo/build/lib/libur_adapter_${{matrix.adapter.str_name}}.so* ${{github.workspace}}/sycl_build/lib/
- name: Set additional env. vars
run: |
@@ -110,7 +117,7 @@
-GNinja
-B ${{github.workspace}}/build-e2e/
-S ${{github.workspace}}/sycl-repo/sycl/test-e2e/
-DSYCL_TEST_E2E_TARGETS="ext_oneapi_cuda:gpu"
-DSYCL_TEST_E2E_TARGETS="${{matrix.adapter.prefix}}${{matrix.adapter.str_name}}:${{matrix.adapter.unit}}"
-DCMAKE_CXX_COMPILER="$(which clang++)"
-DLLVM_LIT="${{github.workspace}}/sycl-repo/llvm/utils/lit/lit.py"
3 changes: 3 additions & 0 deletions CMakeLists.txt
@@ -47,6 +47,9 @@ option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace
set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable")
set(UR_SYCL_LIBRARY_DIR "" CACHE PATH
"Path of the SYCL runtime library directory")
option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF)

include(Assertions)

set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
5 changes: 0 additions & 5 deletions README.md
@@ -6,11 +6,6 @@
[![Coverity](https://scan.coverity.com/projects/28213/badge.svg)](https://scan.coverity.com/projects/oneapi-src-unified-runtime)
[![codecov.io](https://codecov.io/github/oneapi-src/unified-runtime/coverage.svg?branch=main)](https://codecov.io/github/oneapi-src/unified-runtime?branch=master)

## Adapters
Adapter implementations for Unified Runtime currently reside in the [SYCL repository](https://github.com/intel/llvm/tree/sycl/sycl/plugins/unified_runtime/ur). This branch contains scripts to automatically
fetch and build them directly in the UR tree. The adapters are disabled by default,
see cmake options for details.

<!-- TODO: add general description and purpose of the project -->

## Table of contents
30 changes: 30 additions & 0 deletions cmake/Assertions.cmake
@@ -0,0 +1,30 @@
# From the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# This is lifted from llvm's LLVM_ENABLE_ASSERTIONS implementation
# https://github.com/llvm/llvm-project/blob/6be0e979896f7dd610abf263f845c532f1be3762/llvm/cmake/modules/HandleLLVMOptions.cmake#L89
if(UR_ENABLE_ASSERTIONS)
# MSVC doesn't like _DEBUG on release builds
if( NOT MSVC )
add_compile_definitions(_DEBUG)
endif()
# On non-Debug builds cmake automatically defines NDEBUG, so we
# explicitly undefine it:
string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
add_compile_options($<$<OR:$<COMPILE_LANGUAGE:C>,$<COMPILE_LANGUAGE:CXX>>:-UNDEBUG>)
if (MSVC)
# Also remove /D NDEBUG to avoid MSVC warnings about conflicting defines.
foreach (flags_var_to_scrub
CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_CXX_FLAGS_MINSIZEREL
CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS_MINSIZEREL)
string (REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" " "
"${flags_var_to_scrub}" "${${flags_var_to_scrub}}")
endforeach()
endif()
endif()
endif()
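
For illustration only (not part of this commit): the -UNDEBUG handling above is what keeps standard C/C++ assertions live in non-Debug builds, since CMake defines NDEBUG for those configurations by default and assert() compiles away under NDEBUG. A minimal check:

#include <cassert>
#include <cstdio>

int main() {
  // Compiled away when NDEBUG is defined (the default for Release builds);
  // kept when the project is configured with -DUR_ENABLE_ASSERTIONS=ON.
  assert(1 + 1 == 2 && "still checked in Release builds");
#ifdef NDEBUG
  std::puts("assertions disabled");
#else
  std::puts("assertions enabled");
#endif
  return 0;
}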
3 changes: 3 additions & 0 deletions include/ur_api.h
@@ -4078,6 +4078,9 @@ urProgramCreateWithIL(
///
/// @details
/// - The application may call this function from simultaneous threads.
/// - Following a successful call to this entry point, `phProgram` will
/// contain a binary of type ::UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT or
/// ::UR_PROGRAM_BINARY_TYPE_LIBRARY for `hDevice`.
///
/// @remarks
/// _Analogues_
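
To make the new guarantee concrete, here is a sketch (not part of this commit) that creates a program from a device binary and queries the reported binary type; it assumes the urProgramCreateWithBinary, urProgramGetBuildInfo, and urProgramRelease signatures declared in this header.

#include <ur_api.h>

// Illustrative helper: after a successful urProgramCreateWithBinary call,
// the documented guarantee is that the program holds a binary of type
// UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT or UR_PROGRAM_BINARY_TYPE_LIBRARY.
ur_result_t checkBinaryType(ur_context_handle_t hContext,
                            ur_device_handle_t hDevice,
                            const uint8_t *pBinary, size_t size) {
  ur_program_handle_t hProgram = nullptr;
  ur_result_t Res = urProgramCreateWithBinary(hContext, hDevice, size,
                                              pBinary, nullptr, &hProgram);
  if (Res != UR_RESULT_SUCCESS)
    return Res;

  ur_program_binary_type_t Type;
  Res = urProgramGetBuildInfo(hProgram, hDevice,
                              UR_PROGRAM_BUILD_INFO_BINARY_TYPE, sizeof(Type),
                              &Type, nullptr);
  urProgramRelease(hProgram);
  return Res;
}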
1 change: 1 addition & 0 deletions scripts/core/program.yml
@@ -127,6 +127,7 @@ analogue:
- "**clCreateProgramWithBinary**"
details:
- "The application may call this function from simultaneous threads."
- "Following a successful call to this entry point, `phProgram` will contain a binary of type $X_PROGRAM_BINARY_TYPE_COMPILED_OBJECT or $X_PROGRAM_BINARY_TYPE_LIBRARY for `hDevice`."
params:
- type: $x_context_handle_t
name: hContext
3 changes: 3 additions & 0 deletions source/adapters/cuda/CMakeLists.txt
@@ -27,6 +27,8 @@ add_ur_adapter(${TARGET_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp
${CMAKE_CURRENT_SOURCE_DIR}/memory.hpp
${CMAKE_CURRENT_SOURCE_DIR}/memory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.hpp
${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.cpp
${CMAKE_CURRENT_SOURCE_DIR}/platform.hpp
${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp
${CMAKE_CURRENT_SOURCE_DIR}/program.hpp
@@ -38,6 +40,7 @@ add_ur_adapter(${TARGET_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/tracing.cpp
${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp
${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.hpp
)
198 changes: 198 additions & 0 deletions source/adapters/cuda/command_buffer.cpp
@@ -99,6 +99,91 @@ static void setCopyParams(const void *SrcPtr, const CUmemorytype_enum SrcType,
Params.Depth = 1;
}

// Helper function for enqueuing memory fills
static ur_result_t enqueueCommandBufferFillHelper(
ur_exp_command_buffer_handle_t CommandBuffer, void *DstDevice,
const CUmemorytype_enum DstType, const void *Pattern, size_t PatternSize,
size_t Size, uint32_t NumSyncPointsInWaitList,
const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
ur_exp_command_buffer_sync_point_t *SyncPoint) {
ur_result_t Result = UR_RESULT_SUCCESS;
std::vector<CUgraphNode> DepsList;
UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList,
SyncPointWaitList, DepsList),
Result);

try {
const size_t N = Size / PatternSize;
auto Value = *static_cast<const uint32_t *>(Pattern);
auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE
? *static_cast<CUdeviceptr *>(DstDevice)
: (CUdeviceptr)DstDevice;

if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) {
// Create a new node
CUgraphNode GraphNode;
CUDA_MEMSET_NODE_PARAMS NodeParams = {};
NodeParams.dst = DstPtr;
NodeParams.elementSize = PatternSize;
NodeParams.height = N;
NodeParams.pitch = PatternSize;
NodeParams.value = Value;
NodeParams.width = 1;

UR_CHECK_ERROR(cuGraphAddMemsetNode(
&GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
DepsList.size(), &NodeParams, CommandBuffer->Device->getContext()));

// Get sync point and register the cuNode with it.
*SyncPoint =
CommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));

} else {
// CUDA has no memset functions that allow setting values larger than 4
// bytes. The UR API lets the caller pass an arbitrary "pattern" to a
// buffer fill, which can be larger than 4 bytes. We must break the
// pattern up into 4-byte values and set the buffer using multiple
// strided calls. This means that one cuGraphAddMemsetNode call is made
// for every 4 bytes in the pattern.

size_t NumberOfSteps = PatternSize / sizeof(uint32_t);

// We walk the pattern in 4-byte steps and add one memset node for each
// 4-byte chunk of the pattern.
for (auto Step = 0u; Step < NumberOfSteps; ++Step) {
// take 4 bytes of the pattern
auto Value = *(static_cast<const uint32_t *>(Pattern) + Step);

// offset the pointer to the part of the buffer we want to write to
auto OffsetPtr = DstPtr + (Step * sizeof(uint32_t));

// Create a new node
CUgraphNode GraphNode;
// Update NodeParam
CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {};
NodeParamsStep.dst = (CUdeviceptr)OffsetPtr;
NodeParamsStep.elementSize = 4;
NodeParamsStep.height = N;
NodeParamsStep.pitch = PatternSize;
NodeParamsStep.value = Value;
NodeParamsStep.width = 1;

UR_CHECK_ERROR(cuGraphAddMemsetNode(
&GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
DepsList.size(), &NodeParamsStep,
CommandBuffer->Device->getContext()));

// Get sync point and register the cuNode with it.
*SyncPoint = CommandBuffer->AddSyncPoint(
std::make_shared<CUgraphNode>(GraphNode));
}
}
} catch (ur_result_t Err) {
Result = Err;
}
return Result;
}
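
A standalone sketch (not part of this commit) of the strided layout the loop above produces: with a hypothetical 8-byte pattern filling 64 bytes, N is 8 pattern repetitions and two memset nodes are recorded, each writing one 4-byte column whose pitch is the full pattern size.

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  const size_t Size = 64, PatternSize = 8;                     // example fill
  const size_t N = Size / PatternSize;                         // 8 repetitions
  const size_t NumberOfSteps = PatternSize / sizeof(uint32_t); // 2 nodes
  for (size_t Step = 0; Step < NumberOfSteps; ++Step) {
    // Each node writes a width-1 column of 4-byte elements: N rows spaced
    // PatternSize bytes apart, starting Step * 4 bytes into the destination.
    std::printf("node %zu: dst offset %zu, elementSize 4, height %zu, pitch %zu\n",
                Step, Step * sizeof(uint32_t), N, PatternSize);
  }
  return 0;
}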

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
@@ -525,6 +610,119 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(
return Result;
}

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp(
ur_exp_command_buffer_handle_t hCommandBuffer, const void * /* Mem */,
size_t /*Size*/, ur_usm_migration_flags_t /*Flags*/,
uint32_t numSyncPointsInWaitList,
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
ur_exp_command_buffer_sync_point_t *pSyncPoint) {
// The prefetch command is not supported by CUDA Graph.
// We implement it as an empty node to enforce dependencies.
ur_result_t Result = UR_RESULT_SUCCESS;
CUgraphNode GraphNode;

std::vector<CUgraphNode> DepsList;
UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
pSyncPointWaitList, DepsList),
Result);

try {
// Add an empty node to preserve dependencies.
UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
DepsList.data(), DepsList.size()));

// Get sync point and register the cuNode with it.
*pSyncPoint =
hCommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));

setErrorMessage("Prefetch hint ignored and replaced with empty node as "
"prefetch is not supported by CUDA Graph backend",
UR_RESULT_SUCCESS);
Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC;
} catch (ur_result_t Err) {
Result = Err;
}
return Result;
}

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp(
ur_exp_command_buffer_handle_t hCommandBuffer, const void * /* Mem */,
size_t /*Size*/, ur_usm_advice_flags_t /*Advice*/,
uint32_t numSyncPointsInWaitList,
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
ur_exp_command_buffer_sync_point_t *pSyncPoint) {
// The mem-advise command is not supported by CUDA Graph.
// We implement it as an empty node to enforce dependencies.
ur_result_t Result = UR_RESULT_SUCCESS;
CUgraphNode GraphNode;

std::vector<CUgraphNode> DepsList;
UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
pSyncPointWaitList, DepsList),
Result);

try {
// Add an empty node to preserve dependencies.
UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
DepsList.data(), DepsList.size()));

// Get sync point and register the cuNode with it.
*pSyncPoint =
hCommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));

setErrorMessage("Memory advice ignored and replaced with empty node as "
"memory advice is not supported by CUDA Graph backend",
UR_RESULT_SUCCESS);
Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC;
} catch (ur_result_t Err) {
Result = Err;
}

return Result;
}
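
Both fallbacks above store a warning string and return UR_RESULT_ERROR_ADAPTER_SPECIFIC. A sketch (not part of this commit) of how a caller could surface that message, assuming the urPlatformGetLastError entry point from the UR API:

#include <ur_api.h>
#include <cstdio>

// Illustrative only: retrieve the adapter-specific message recorded by
// setErrorMessage, e.g. the "replaced with empty node" warnings above.
void reportAdapterMessage(ur_platform_handle_t hPlatform, ur_result_t Res) {
  if (Res != UR_RESULT_ERROR_ADAPTER_SPECIFIC)
    return;
  const char *Message = nullptr;
  int32_t Error = 0;
  if (urPlatformGetLastError(hPlatform, &Message, &Error) ==
          UR_RESULT_SUCCESS &&
      Message != nullptr)
    std::fprintf(stderr, "adapter message: %s (code %d)\n", Message, Error);
}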

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp(
ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer,
const void *pPattern, size_t patternSize, size_t offset, size_t size,
uint32_t numSyncPointsInWaitList,
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
ur_exp_command_buffer_sync_point_t *pSyncPoint) {
auto ArgsAreMultiplesOfPatternSize =
(offset % patternSize == 0) && (size % patternSize == 0);

auto PatternIsValid = (pPattern != nullptr);

auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) &&
(patternSize > 0); // is a positive power of two
UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid &&
PatternSizeIsValid,
UR_RESULT_ERROR_INVALID_SIZE);

auto DstDevice = std::get<BufferMem>(hBuffer->Mem).get() + offset;

return enqueueCommandBufferFillHelper(
hCommandBuffer, &DstDevice, CU_MEMORYTYPE_DEVICE, pPattern, patternSize,
size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
}

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp(
ur_exp_command_buffer_handle_t hCommandBuffer, void *pPtr,
const void *pPattern, size_t patternSize, size_t size,
uint32_t numSyncPointsInWaitList,
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
ur_exp_command_buffer_sync_point_t *pSyncPoint) {

auto PatternIsValid = (pPattern != nullptr);

auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) &&
(patternSize > 0); // is a positive power of two

UR_ASSERT(PatternIsValid && PatternSizeIsValid, UR_RESULT_ERROR_INVALID_SIZE);
return enqueueCommandBufferFillHelper(
hCommandBuffer, pPtr, CU_MEMORYTYPE_UNIFIED, pPattern, patternSize, size,
numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
}
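
For reference (not part of this commit), the patternSize test used by both fill entry points is the usual single-set-bit trick: a power of two ANDed with itself minus one is zero.

#include <cassert>
#include <cstddef>

// True exactly for 1, 2, 4, 8, ... (one bit set).
static bool isPowerOfTwo(size_t V) { return V > 0 && (V & (V - 1)) == 0; }

int main() {
  assert(isPowerOfTwo(4));   // 0b100 & 0b011 == 0
  assert(!isPowerOfTwo(6));  // 0b110 & 0b101 == 0b100
  assert(!isPowerOfTwo(0));  // rejected by the V > 0 term
  return 0;
}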

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
(Diff truncated; the remaining changed files are not shown.)