From 21684e684847752948bc4cd3d122bd282ad8ba53 Mon Sep 17 00:00:00 2001
From: "Zhao, Yang2" <yang2.zhao@intel.com>
Date: Tue, 3 Dec 2024 23:48:27 -0800
Subject: [PATCH 1/4] fix metadata with assert

---
 .../loader/layers/sanitizer/asan/asan_ddi.cpp |  41 +-
 .../sanitizer/asan/asan_interceptor.cpp       | 375 +++++++++---------
 .../sanitizer/asan/asan_interceptor.hpp       |  12 +-
 3 files changed, 211 insertions(+), 217 deletions(-)
diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp
index 774ce3a61d..dca3f4bb05 100644
--- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp
+++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp
@@ -52,12 +52,6 @@ ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices,
     return UR_RESULT_SUCCESS;
 }
 
-bool isInstrumentedKernel(ur_kernel_handle_t hKernel) {
-    auto hProgram = GetProgram(hKernel);
-    auto PI = getAsanInterceptor()->getProgramInfo(hProgram);
-    return PI->isKernelInstrumented(hKernel);
-}
-
 } // namespace
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -465,12 +459,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch(
 
     getContext()->logger.debug("==== urEnqueueKernelLaunch");
 
-    if (!isInstrumentedKernel(hKernel)) {
-        return pfnKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
-                               pGlobalWorkSize, pLocalWorkSize,
-                               numEventsInWaitList, phEventWaitList, phEvent);
-    }
-
     USMLaunchInfo LaunchInfo(GetContext(hKernel), GetDevice(hQueue),
                              pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset,
                              workDim);
@@ -1362,9 +1350,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreate(
     getContext()->logger.debug("==== urKernelCreate");
 
     UR_CALL(pfnCreate(hProgram, pKernelName, phKernel));
-    if (isInstrumentedKernel(*phKernel)) {
-        UR_CALL(getAsanInterceptor()->insertKernel(*phKernel));
-    }
+    UR_CALL(getAsanInterceptor()->insertKernel(*phKernel));
 
     return UR_RESULT_SUCCESS;
 }
@@ -1385,9 +1371,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelRetain(
     UR_CALL(pfnRetain(hKernel));
 
     auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel);
-    if (KernelInfo) {
-        KernelInfo->RefCount++;
-    }
+    KernelInfo->RefCount++;
 
     return UR_RESULT_SUCCESS;
 }
@@ -1407,10 +1391,8 @@ __urdlllocal ur_result_t urKernelRelease(
     UR_CALL(pfnRelease(hKernel));
 
     auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel);
-    if (KernelInfo) {
-        if (--KernelInfo->RefCount == 0) {
-            UR_CALL(getAsanInterceptor()->eraseKernel(hKernel));
-        }
+    if (--KernelInfo->RefCount == 0) {
+        UR_CALL(getAsanInterceptor()->eraseKernel(hKernel));
     }
 
     return UR_RESULT_SUCCESS;
@@ -1439,8 +1421,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgValue(
     std::shared_ptr<KernelInfo> KernelInfo;
     if (argSize == sizeof(ur_mem_handle_t) &&
         (MemBuffer = getAsanInterceptor()->getMemBuffer(
-             *ur_cast<const ur_mem_handle_t *>(pArgValue))) &&
-        (KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel))) {
+             *ur_cast<const ur_mem_handle_t *>(pArgValue)))) {
+        auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel);
         std::scoped_lock<ur_shared_mutex> Guard(KernelInfo->Mutex);
         KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer);
     } else {
@@ -1470,8 +1452,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgMemObj(
 
     std::shared_ptr<MemBuffer> MemBuffer;
     std::shared_ptr<KernelInfo> KernelInfo;
-    if ((MemBuffer = getAsanInterceptor()->getMemBuffer(hArgValue)) &&
-        (KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel))) {
+    if ((MemBuffer = getAsanInterceptor()->getMemBuffer(hArgValue))) {
+        auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel);
         std::scoped_lock<ur_shared_mutex> Guard(KernelInfo->Mutex);
         KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer);
     } else {
@@ -1501,7 +1483,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal(
         "==== urKernelSetArgLocal (argIndex={}, argSize={})", argIndex,
         argSize);
 
-    if (auto KI = getAsanInterceptor()->getKernelInfo(hKernel)) {
+    {
+        auto KI = getAsanInterceptor()->getKernelInfo(hKernel);
         std::scoped_lock<ur_shared_mutex> Guard(KI->Mutex);
         // TODO: get local variable alignment
         auto argSizeWithRZ = GetSizeAndRedzoneSizeForLocal(
@@ -1538,8 +1521,9 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgPointer(
         pArgValue);
 
     std::shared_ptr<KernelInfo> KI;
-    if (getAsanInterceptor()->getOptions().DetectKernelArguments &&
-        (KI = getAsanInterceptor()->getKernelInfo(hKernel))) {
+    if (getAsanInterceptor()->getOptions().DetectKernelArguments) {
+        // The kernel is expected to be registered; getKernelInfo() asserts it.
+        KI = getAsanInterceptor()->getKernelInfo(hKernel);
         std::scoped_lock<ur_shared_mutex> Guard(KI->Mutex);
         KI->PointerArgs[argIndex] = {pArgValue, GetCurrentBacktrace()};
     }
diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
index 271d846990..edfd200167 100644
--- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
+++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
@@ -255,9 +255,6 @@ ur_result_t AsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel,
     auto ContextInfo = getContextInfo(Context);
     auto DeviceInfo = getDeviceInfo(Device);
     auto KernelInfo = getKernelInfo(Kernel);
-    assert(KernelInfo && "Kernel should be instrumented");
-
-    UR_CALL(LaunchInfo.updateKernelInfo(*KernelInfo.get()));
 
     ManagedQueue InternalQueue(Context, Device);
     if (!InternalQueue) {
@@ -648,7 +645,13 @@ ur_result_t AsanInterceptor::insertKernel(ur_kernel_handle_t Kernel) {
     if (m_KernelMap.find(Kernel) != m_KernelMap.end()) {
         return UR_RESULT_SUCCESS;
     }
-    m_KernelMap.emplace(Kernel, std::make_shared<KernelInfo>(Kernel));
+
+    auto hProgram = GetProgram(Kernel);
+    auto PI = getAsanInterceptor()->getProgramInfo(hProgram);
+    bool IsInstrumented = PI->isKernelInstrumented(Kernel);
+
+    m_KernelMap.emplace(Kernel,
+                        std::make_shared<KernelInfo>(Kernel, IsInstrumented));
     return UR_RESULT_SUCCESS;
 }
 
@@ -689,204 +692,210 @@ ur_result_t AsanInterceptor::prepareLaunch(
     std::shared_ptr<ContextInfo> &ContextInfo,
     std::shared_ptr<DeviceInfo> &DeviceInfo, ur_queue_handle_t Queue,
     ur_kernel_handle_t Kernel, USMLaunchInfo &LaunchInfo) {
+    auto KernelInfo = getKernelInfo(Kernel);
 
-    do {
-        auto KernelInfo = getKernelInfo(Kernel);
-        assert(KernelInfo && "Kernel should be instrumented");
-
-        // Validate pointer arguments
-        if (getOptions().DetectKernelArguments) {
-            for (const auto &[ArgIndex, PtrPair] : KernelInfo->PointerArgs) {
-                auto Ptr = PtrPair.first;
-                if (Ptr == nullptr) {
-                    continue;
-                }
-                if (auto ValidateResult = ValidateUSMPointer(
-                        ContextInfo->Handle, DeviceInfo->Handle, (uptr)Ptr)) {
-                    ReportInvalidKernelArgument(Kernel, ArgIndex, (uptr)Ptr,
-                                                ValidateResult, PtrPair.second);
-                    exitWithErrors();
-                }
+    auto ArgNums = GetKernelNumArgs(Kernel);
+    auto LocalMemoryUsage =
+        GetKernelLocalMemorySize(Kernel, DeviceInfo->Handle);
+    auto PrivateMemoryUsage =
+        GetKernelPrivateMemorySize(Kernel, DeviceInfo->Handle);
+
+    getContext()->logger.info(
+        "KernelInfo {} (Name={}, ArgNums={}, IsInstrumented={}, "
+        "LocalMemory={}, PrivateMemory={})",
+        (void *)Kernel, GetKernelName(Kernel), ArgNums,
+        KernelInfo->IsInstrumented, LocalMemoryUsage, PrivateMemoryUsage);
+
+    // Validate pointer arguments
+    if (getOptions().DetectKernelArguments) {
+        for (const auto &[ArgIndex, PtrPair] : KernelInfo->PointerArgs) {
+            auto Ptr = PtrPair.first;
+            if (Ptr == nullptr) {
+                continue;
             }
-        }
-
-        // Set membuffer arguments
-        for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) {
-            char *ArgPointer = nullptr;
-            UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer));
-            ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer(
-                Kernel, ArgIndex, nullptr, ArgPointer);
-            if (URes != UR_RESULT_SUCCESS) {
-                getContext()->logger.error(
-                    "Failed to set buffer {} as the {} arg to kernel {}: {}",
-                    ur_cast<ur_mem_handle_t>(MemBuffer.get()), ArgIndex, Kernel,
-                    URes);
+            if (auto ValidateResult = ValidateUSMPointer(
+                    ContextInfo->Handle, DeviceInfo->Handle, (uptr)Ptr)) {
+                ReportInvalidKernelArgument(Kernel, ArgIndex, (uptr)Ptr,
+                                            ValidateResult, PtrPair.second);
+                exitWithErrors();
             }
         }
+    }
 
-        // Set launch info argument
-        auto ArgNums = GetKernelNumArgs(Kernel);
-        if (ArgNums) {
-            getContext()->logger.debug(
-                "launch_info {} (numLocalArgs={}, localArgs={})",
-                (void *)LaunchInfo.Data, LaunchInfo.Data->NumLocalArgs,
-                (void *)LaunchInfo.Data->LocalArgs);
-            ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer(
-                Kernel, ArgNums - 1, nullptr, LaunchInfo.Data);
-            if (URes != UR_RESULT_SUCCESS) {
-                getContext()->logger.error("Failed to set launch info: {}",
-                                           URes);
-                return URes;
-            }
+    // Set membuffer arguments
+    for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) {
+        char *ArgPointer = nullptr;
+        UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer));
+        ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer(
+            Kernel, ArgIndex, nullptr, ArgPointer);
+        if (URes != UR_RESULT_SUCCESS) {
+            getContext()->logger.error(
+                "Failed to set buffer {} as the {} arg to kernel {}: {}",
+                ur_cast<ur_mem_handle_t>(MemBuffer.get()), ArgIndex, Kernel,
+                URes);
+            return URes;
         }
+    }
 
-        LaunchInfo.Data->GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin;
-        LaunchInfo.Data->GlobalShadowOffsetEnd = DeviceInfo->Shadow->ShadowEnd;
-        LaunchInfo.Data->DeviceTy = DeviceInfo->Type;
-        LaunchInfo.Data->Debug = getOptions().Debug ? 1 : 0;
-
-        if (LaunchInfo.LocalWorkSize.empty()) {
-            LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim);
-            auto URes =
-                getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize(
-                    Kernel, Queue, LaunchInfo.WorkDim,
-                    LaunchInfo.GlobalWorkOffset, LaunchInfo.GlobalWorkSize,
-                    LaunchInfo.LocalWorkSize.data());
-            if (URes != UR_RESULT_SUCCESS) {
-                if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
-                    return URes;
-                }
-                // If urKernelGetSuggestedLocalWorkSize is not supported by driver, we fallback
-                // to inefficient implementation
-                for (size_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) {
-                    LaunchInfo.LocalWorkSize[Dim] = 1;
-                }
-            }
-        }
+    if (!KernelInfo->IsInstrumented) {
+        return UR_RESULT_SUCCESS;
+    }
 
-        const size_t *LocalWorkSize = LaunchInfo.LocalWorkSize.data();
-        uint32_t NumWG = 1;
-        for (uint32_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) {
-            NumWG *= (LaunchInfo.GlobalWorkSize[Dim] + LocalWorkSize[Dim] - 1) /
-                     LocalWorkSize[Dim];
+    // Set launch info argument
+    {
+        assert(ArgNums >= 1 &&
+               "Sanitized Kernel should have at least one argument");
+        ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer(
+            Kernel, ArgNums - 1, nullptr, LaunchInfo.Data);
+        if (URes != UR_RESULT_SUCCESS) {
+            getContext()->logger.error("Failed to set launch info: {}", URes);
+            return URes;
         }
+    }
 
-        auto EnqueueAllocateShadowMemory = [Context = ContextInfo->Handle,
-                                            Device = DeviceInfo->Handle,
-                                            Queue](size_t Size, uptr &Ptr) {
-            void *Allocated = nullptr;
-            auto URes = getContext()->urDdiTable.USM.pfnDeviceAlloc(
-                Context, Device, nullptr, nullptr, Size, &Allocated);
-            if (URes != UR_RESULT_SUCCESS) {
+    UR_CALL(LaunchInfo.updateKernelInfo(*KernelInfo.get()));
+
+    LaunchInfo.Data->GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin;
+    LaunchInfo.Data->GlobalShadowOffsetEnd = DeviceInfo->Shadow->ShadowEnd;
+    LaunchInfo.Data->DeviceTy = DeviceInfo->Type;
+    LaunchInfo.Data->Debug = getOptions().Debug ? 1 : 0;
+
+    getContext()->logger.info(
+        "LaunchInfo {} (device={}, debug={}, numLocalArgs={}, localArgs={})",
+        (void *)LaunchInfo.Data, ToString(LaunchInfo.Data->DeviceTy),
+        LaunchInfo.Data->Debug, LaunchInfo.Data->NumLocalArgs,
+        (void *)LaunchInfo.Data->LocalArgs);
+
+    // urKernelGetSuggestedLocalWorkSize must be called after urKernelSetArgPointer
+    if (LaunchInfo.LocalWorkSize.empty()) {
+        LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim);
+        auto URes =
+            getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize(
+                Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset,
+                LaunchInfo.GlobalWorkSize, LaunchInfo.LocalWorkSize.data());
+        if (URes != UR_RESULT_SUCCESS) {
+            if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
                 return URes;
             }
-            // Initialize shadow memory
-            URes = EnqueueUSMBlockingSet(Queue, Allocated, 0, Size);
-            if (URes != UR_RESULT_SUCCESS) {
-                [[maybe_unused]] auto URes =
-                    getContext()->urDdiTable.USM.pfnFree(Context, Allocated);
-                assert(URes == UR_RESULT_SUCCESS &&
-                       "urUSMFree failed at allocating shadow memory");
-                Allocated = nullptr;
+            // If urKernelGetSuggestedLocalWorkSize is not supported by the driver,
+            // fall back to an inefficient local work size of 1 in every dimension
+            for (size_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) {
+                LaunchInfo.LocalWorkSize[Dim] = 1;
             }
-            Ptr = (uptr)Allocated;
+        }
+    }
+
+    const size_t *LocalWorkSize = LaunchInfo.LocalWorkSize.data();
+    uint32_t NumWG = 1;
+    for (uint32_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) {
+        NumWG *= (LaunchInfo.GlobalWorkSize[Dim] + LocalWorkSize[Dim] - 1) /
+                 LocalWorkSize[Dim];
+    }
+
+    auto EnqueueAllocateShadowMemory = [Context = ContextInfo->Handle,
+                                        Device = DeviceInfo->Handle,
+                                        Queue](size_t Size, uptr &Ptr) {
+        void *Allocated = nullptr;
+        auto URes = getContext()->urDdiTable.USM.pfnDeviceAlloc(
+            Context, Device, nullptr, nullptr, Size, &Allocated);
+        if (URes != UR_RESULT_SUCCESS) {
             return URes;
-        };
-
-        auto LocalMemoryUsage =
-            GetKernelLocalMemorySize(Kernel, DeviceInfo->Handle);
-        auto PrivateMemoryUsage =
-            GetKernelPrivateMemorySize(Kernel, DeviceInfo->Handle);
-
-        getContext()->logger.info(
-            "KernelInfo {} (LocalMemory={}, PrivateMemory={})", (void *)Kernel,
-            LocalMemoryUsage, PrivateMemoryUsage);
-
-        // Write shadow memory offset for local memory
-        if (getOptions().DetectLocals) {
-            // CPU needn't this
-            if (DeviceInfo->Type == DeviceType::GPU_PVC ||
-                DeviceInfo->Type == DeviceType::GPU_DG2) {
-                const size_t LocalMemorySize =
-                    GetDeviceLocalMemorySize(DeviceInfo->Handle);
-                const size_t LocalShadowMemorySize =
-                    (NumWG * LocalMemorySize) >> ASAN_SHADOW_SCALE;
-
-                getContext()->logger.debug(
-                    "LocalMemory(WorkGroup={}, LocalMemorySize={}, "
-                    "LocalShadowMemorySize={})",
-                    NumWG, LocalMemorySize, LocalShadowMemorySize);
-
-                if (EnqueueAllocateShadowMemory(
-                        LocalShadowMemorySize,
-                        LaunchInfo.Data->LocalShadowOffset) !=
-                    UR_RESULT_SUCCESS) {
-                    getContext()->logger.warning(
-                        "Failed to allocate shadow memory for local "
-                        "memory, maybe the number of workgroup ({}) is too "
-                        "large",
-                        NumWG);
-                    getContext()->logger.warning(
-                        "Skip checking local memory of kernel <{}>",
-                        GetKernelName(Kernel));
-                } else {
-                    LaunchInfo.Data->LocalShadowOffsetEnd =
-                        LaunchInfo.Data->LocalShadowOffset +
-                        LocalShadowMemorySize - 1;
-
-                    ContextInfo->Stats.UpdateShadowMalloced(
-                        LocalShadowMemorySize);
-
-                    getContext()->logger.info(
-                        "ShadowMemory(Local, {} - {})",
-                        (void *)LaunchInfo.Data->LocalShadowOffset,
-                        (void *)LaunchInfo.Data->LocalShadowOffsetEnd);
-                }
+        }
+        // Initialize shadow memory
+        URes = EnqueueUSMBlockingSet(Queue, Allocated, 0, Size);
+        if (URes != UR_RESULT_SUCCESS) {
+            [[maybe_unused]] auto URes =
+                getContext()->urDdiTable.USM.pfnFree(Context, Allocated);
+            assert(URes == UR_RESULT_SUCCESS &&
+                   "urUSMFree failed at allocating shadow memory");
+            Allocated = nullptr;
+        }
+        Ptr = (uptr)Allocated;
+        return URes;
+    };
+
+    // Write shadow memory offset for local memory
+    if (getOptions().DetectLocals) {
+        // Not needed on CPU devices
+        if (DeviceInfo->Type == DeviceType::GPU_PVC ||
+            DeviceInfo->Type == DeviceType::GPU_DG2) {
+            const size_t LocalMemorySize =
+                GetDeviceLocalMemorySize(DeviceInfo->Handle);
+            const size_t LocalShadowMemorySize =
+                (NumWG * LocalMemorySize) >> ASAN_SHADOW_SCALE;
+
+            getContext()->logger.debug(
+                "LocalMemory(WorkGroup={}, LocalMemorySize={}, "
+                "LocalShadowMemorySize={})",
+                NumWG, LocalMemorySize, LocalShadowMemorySize);
+
+            if (EnqueueAllocateShadowMemory(
+                    LocalShadowMemorySize,
+                    LaunchInfo.Data->LocalShadowOffset) != UR_RESULT_SUCCESS) {
+                getContext()->logger.warning(
+                    "Failed to allocate shadow memory for local "
+                    "memory, maybe the number of workgroup ({}) is too "
+                    "large",
+                    NumWG);
+                getContext()->logger.warning(
+                    "Skip checking local memory of kernel <{}>",
+                    GetKernelName(Kernel));
+            } else {
+                LaunchInfo.Data->LocalShadowOffsetEnd =
+                    LaunchInfo.Data->LocalShadowOffset + LocalShadowMemorySize -
+                    1;
+
+                ContextInfo->Stats.UpdateShadowMalloced(LocalShadowMemorySize);
+
+                getContext()->logger.info(
+                    "ShadowMemory(Local, {} - {})",
+                    (void *)LaunchInfo.Data->LocalShadowOffset,
+                    (void *)LaunchInfo.Data->LocalShadowOffsetEnd);
             }
         }
+    }
 
-        // Write shadow memory offset for private memory
-        if (getOptions().DetectPrivates) {
-            if (DeviceInfo->Type == DeviceType::CPU) {
-                LaunchInfo.Data->PrivateShadowOffset =
-                    DeviceInfo->Shadow->ShadowBegin;
-            } else if (DeviceInfo->Type == DeviceType::GPU_PVC ||
-                       DeviceInfo->Type == DeviceType::GPU_DG2) {
-                const size_t PrivateShadowMemorySize =
-                    (NumWG * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE;
-
-                getContext()->logger.debug("PrivateMemory(WorkGroup={}, "
-                                           "PrivateShadowMemorySize={})",
-                                           NumWG, PrivateShadowMemorySize);
-
-                if (EnqueueAllocateShadowMemory(
-                        PrivateShadowMemorySize,
-                        LaunchInfo.Data->PrivateShadowOffset) !=
-                    UR_RESULT_SUCCESS) {
-                    getContext()->logger.warning(
-                        "Failed to allocate shadow memory for private "
-                        "memory, maybe the number of workgroup ({}) is too "
-                        "large",
-                        NumWG);
-                    getContext()->logger.warning(
-                        "Skip checking private memory of kernel <{}>",
-                        GetKernelName(Kernel));
-                } else {
-                    LaunchInfo.Data->PrivateShadowOffsetEnd =
-                        LaunchInfo.Data->PrivateShadowOffset +
-                        PrivateShadowMemorySize - 1;
-
-                    ContextInfo->Stats.UpdateShadowMalloced(
-                        PrivateShadowMemorySize);
-
-                    getContext()->logger.info(
-                        "ShadowMemory(Private, {} - {})",
-                        (void *)LaunchInfo.Data->PrivateShadowOffset,
-                        (void *)LaunchInfo.Data->PrivateShadowOffsetEnd);
-                }
+    // Write shadow memory offset for private memory
+    if (getOptions().DetectPrivates) {
+        if (DeviceInfo->Type == DeviceType::CPU) {
+            LaunchInfo.Data->PrivateShadowOffset =
+                DeviceInfo->Shadow->ShadowBegin;
+        } else if (DeviceInfo->Type == DeviceType::GPU_PVC ||
+                   DeviceInfo->Type == DeviceType::GPU_DG2) {
+            const size_t PrivateShadowMemorySize =
+                (NumWG * ASAN_PRIVATE_SIZE) >> ASAN_SHADOW_SCALE;
+
+            getContext()->logger.debug("PrivateMemory(WorkGroup={}, "
+                                       "PrivateShadowMemorySize={})",
+                                       NumWG, PrivateShadowMemorySize);
+
+            if (EnqueueAllocateShadowMemory(
+                    PrivateShadowMemorySize,
+                    LaunchInfo.Data->PrivateShadowOffset) !=
+                UR_RESULT_SUCCESS) {
+                getContext()->logger.warning(
+                    "Failed to allocate shadow memory for private "
+                    "memory, maybe the number of workgroup ({}) is too "
+                    "large",
+                    NumWG);
+                getContext()->logger.warning(
+                    "Skip checking private memory of kernel <{}>",
+                    GetKernelName(Kernel));
+            } else {
+                LaunchInfo.Data->PrivateShadowOffsetEnd =
+                    LaunchInfo.Data->PrivateShadowOffset +
+                    PrivateShadowMemorySize - 1;
+
+                ContextInfo->Stats.UpdateShadowMalloced(
+                    PrivateShadowMemorySize);
+
+                getContext()->logger.info(
+                    "ShadowMemory(Private, {} - {})",
+                    (void *)LaunchInfo.Data->PrivateShadowOffset,
+                    (void *)LaunchInfo.Data->PrivateShadowOffsetEnd);
             }
         }
-    } while (false);
+    }
 
     return UR_RESULT_SUCCESS;
 }
diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp
index 926be1388e..d24be1e1f2 100644
--- a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp
+++ b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp
@@ -85,6 +85,9 @@ struct KernelInfo {
     ur_kernel_handle_t Handle;
     std::atomic<int32_t> RefCount = 1;
 
+    // True if this kernel is instrumented by the sanitizer
+    bool IsInstrumented = false;
+
     // lock this mutex if following fields are accessed
     ur_shared_mutex Mutex;
     std::unordered_map<uint32_t, std::shared_ptr<MemBuffer>> BufferArgs;
@@ -94,7 +97,8 @@ struct KernelInfo {
     // Need preserve the order of local arguments
     std::map<uint32_t, LocalArgsInfo> LocalArgs;
 
-    explicit KernelInfo(ur_kernel_handle_t Kernel) : Handle(Kernel) {
+    explicit KernelInfo(ur_kernel_handle_t Kernel, bool IsInstrumented)
+        : Handle(Kernel), IsInstrumented(IsInstrumented) {
         [[maybe_unused]] auto Result =
             getContext()->urDdiTable.Kernel.pfnRetain(Kernel);
         assert(Result == UR_RESULT_SUCCESS);
@@ -272,10 +276,12 @@ class AsanInterceptor {
 
     std::shared_ptr<KernelInfo> getKernelInfo(ur_kernel_handle_t Kernel) {
         std::shared_lock<ur_shared_mutex> Guard(m_KernelMapMutex);
-        if (m_KernelMap.find(Kernel) != m_KernelMap.end()) {
-            return m_KernelMap[Kernel];
-        }
-        return nullptr;
+        // Use find() instead of operator[]: operator[] would insert an empty
+        // entry for an unknown kernel, mutating the map under a shared lock.
+        auto It = m_KernelMap.find(Kernel);
+        assert(It != m_KernelMap.end() && "Kernel is not registered");
+        // With asserts compiled out, report an unknown kernel as nullptr.
+        return It != m_KernelMap.end() ? It->second : nullptr;
     }
 
     const AsanOptions &getOptions() { return m_Options; }

From 75a82aa9250d6609dacb18c0b887f19f3f76b5bc Mon Sep 17 00:00:00 2001
From: "Zhao, Yang2" <yang2.zhao@intel.com>
Date: Fri, 13 Dec 2024 08:09:54 +0100
Subject: [PATCH 2/4] fix build

---
 source/loader/layers/sanitizer/asan/asan_interceptor.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
index ad3beda007..fb17b0a7f5 100644
--- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
+++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
@@ -836,8 +836,9 @@ ur_result_t AsanInterceptor::prepareLaunch(
 
     getContext()->logger.info(
         "LaunchInfo {} (device={}, debug={}, numLocalArgs={}, localArgs={})",
-        (void *)LaunchInfo.Data.getDevicePtr(), LaunchInfo.Data.Host.DeviceTy,
-        LaunchInfo.Data.Host.Debug, LaunchInfo.Data.Host.NumLocalArgs,
+        (void *)LaunchInfo.Data.getDevicePtr(),
+        ToString(LaunchInfo.Data.Host.DeviceTy), LaunchInfo.Data.Host.Debug,
+        LaunchInfo.Data.Host.NumLocalArgs,
         (void *)LaunchInfo.Data.Host.LocalArgs);
 
     return UR_RESULT_SUCCESS;

From d05b5d5b5bf887c0eae98301e965ab55ca158531 Mon Sep 17 00:00:00 2001
From: "Zhao, Yang2" <yang2.zhao@intel.com>
Date: Mon, 16 Dec 2024 09:55:18 +0100
Subject: [PATCH 3/4] init LaunchInfo

---
 source/loader/CMakeLists.txt                           | 1 +
 source/loader/layers/sanitizer/asan/asan_ddi.cpp       | 1 +
 source/loader/layers/sanitizer/asan/asan_libdevice.hpp | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt
index d8f6056ae9..a10e99f422 100644
--- a/source/loader/CMakeLists.txt
+++ b/source/loader/CMakeLists.txt
@@ -136,6 +136,7 @@ if(UR_ENABLE_SANITIZER)
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_buffer.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_buffer.hpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_ddi.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_ddi.hpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_interceptor.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_interceptor.hpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_libdevice.hpp
diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp
index 380b51a0da..9378544d65 100644
--- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp
+++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp
@@ -464,6 +464,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch(
     LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue),
                           pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset,
                           workDim);
+    UR_CALL(LaunchInfo.Data.syncToDevice(hQueue));
 
     UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo));
 
diff --git a/source/loader/layers/sanitizer/asan/asan_libdevice.hpp b/source/loader/layers/sanitizer/asan/asan_libdevice.hpp
index a2d5ecd6be..4c6aaaeac8 100644
--- a/source/loader/layers/sanitizer/asan/asan_libdevice.hpp
+++ b/source/loader/layers/sanitizer/asan/asan_libdevice.hpp
@@ -66,7 +66,7 @@ struct AsanRuntimeData {
     uint32_t Debug = 0;
 
     int ReportFlag = 0;
-    AsanErrorReport Report[ASAN_MAX_NUM_REPORTS];
+    AsanErrorReport Report[ASAN_MAX_NUM_REPORTS] = {};
 };
 
 constexpr unsigned ASAN_SHADOW_SCALE = 4;

From 390d0b556d67cb31abb87854016df4ca86d18ea8 Mon Sep 17 00:00:00 2001
From: "Zhao, Yang2" <yang2.zhao@intel.com>
Date: Mon, 16 Dec 2024 09:59:01 +0100
Subject: [PATCH 4/4] remove unused var

---
 source/loader/layers/sanitizer/asan/asan_ddi.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp
index 9378544d65..bf4dff157a 100644
--- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp
+++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp
@@ -1420,7 +1420,6 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgValue(
     getContext()->logger.debug("==== urKernelSetArgValue");
 
     std::shared_ptr<MemBuffer> MemBuffer;
-    std::shared_ptr<KernelInfo> KernelInfo;
     if (argSize == sizeof(ur_mem_handle_t) &&
         (MemBuffer = getAsanInterceptor()->getMemBuffer(
              *ur_cast<const ur_mem_handle_t *>(pArgValue)))) {
@@ -1453,7 +1452,6 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgMemObj(
     getContext()->logger.debug("==== urKernelSetArgMemObj");
 
     std::shared_ptr<MemBuffer> MemBuffer;
-    std::shared_ptr<KernelInfo> KernelInfo;
     if ((MemBuffer = getAsanInterceptor()->getMemBuffer(hArgValue))) {
         auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel);
         std::scoped_lock<ur_shared_mutex> Guard(KernelInfo->Mutex);