From a87ac06524c106becfa867965197f192c3e17228 Mon Sep 17 00:00:00 2001 From: Dmitry Sidorov Date: Thu, 16 Jan 2025 22:29:44 +0100 Subject: [PATCH 01/30] [NFC][SYCL] Fix self-contained matrix headers build (#16665) Signed-off-by: Sidorov, Dmitry --- .../oneapi/matrix/matrix-unified-utils.hpp | 1 + .../test/self-contained-headers/lit.local.cfg | 21 ------------------- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp index ec14cf6da1931..865735617bbd8 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp @@ -11,6 +11,7 @@ #include // std::optional #include // std::string_view #include // __spv namespace +#include // bfloat16 #include // std::pair namespace sycl { diff --git a/sycl/test/self-contained-headers/lit.local.cfg b/sycl/test/self-contained-headers/lit.local.cfg index 0f0e9e15b0e53..d61551742dd85 100644 --- a/sycl/test/self-contained-headers/lit.local.cfg +++ b/sycl/test/self-contained-headers/lit.local.cfg @@ -13,25 +13,4 @@ config.sycl_headers_xfail = [ os.path.join( "sycl", "ext", "intel", "esimd", "detail", "types_elementary.hpp" ), - os.path.join( - "sycl", "ext", "oneapi", "matrix", "matrix-hip.hpp" - ), - os.path.join( - "sycl", "ext", "oneapi", "matrix", "matrix-intel.hpp" - ), - os.path.join( - "sycl", "ext", "oneapi", "matrix", "matrix-tensorcores.hpp" - ), - os.path.join( - "sycl", "ext", "oneapi", "matrix", "matrix-unified-utils.hpp" - ), - os.path.join( - "sycl", "ext", "oneapi", "matrix", "matrix-unified.hpp" - ), - os.path.join( - "sycl", "ext", "oneapi", "matrix", "matrix.hpp" - ), - os.path.join( - "sycl", "ext", "oneapi", "matrix", "static-query-use.hpp" - ), ] From d70ed197bf0649c077b89160bb0ad13862f79dd6 Mon Sep 17 00:00:00 2001 From: przemektmalon Date: Fri, 17 Jan 2025 07:39:21 +0000 Subject: [PATCH 02/30] [SYCL][Bindless][E2E] Enable 3-channel image test for Intel GPUs (#16537) This patch enables the 3-channel image E2E test on Intel GPUs. --- .../bindless_images/3_channel_format.cpp | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/sycl/test-e2e/bindless_images/3_channel_format.cpp b/sycl/test-e2e/bindless_images/3_channel_format.cpp index a3668f4f31973..2cf5df5b98990 100644 --- a/sycl/test-e2e/bindless_images/3_channel_format.cpp +++ b/sycl/test-e2e/bindless_images/3_channel_format.cpp @@ -1,7 +1,7 @@ -// REQUIRES: cuda +// REQUIRES: aspect-ext_oneapi_bindless_images // RUN: %{build} -o %t.out -// RUN: %{run} %t.out +// RUN: %{run-unfiltered-devices} env NEOReadDebugKeys=1 UseBindlessMode=1 UseExternalAllocatorForSshAndDsh=1 %t.out #include #include @@ -21,19 +21,22 @@ int main() { auto ctxt = q.get_context(); constexpr size_t width = 512; - std::vector out(width); - std::vector expected(width); - std::vector dataIn(width); - float exp = 512; - for (int i = 0; i < width; i++) { + std::vector out(width); + std::vector expected(width); + std::vector dataIn(width * 3); + unsigned short exp = 512; + for (unsigned int i = 0; i < width; i++) { expected[i] = exp; - dataIn[i] = sycl::float3(exp, width, i); + dataIn[(i * 3)] = exp; + dataIn[(i * 3) + 1] = static_cast(width); + dataIn[(i * 3) + 2] = static_cast(i); } try { // Main point of this test is to check creating an image // with a 3-channel format - syclexp::image_descriptor desc({width}, 3, sycl::image_channel_type::fp32); + syclexp::image_descriptor desc({width}, 3, + sycl::image_channel_type::unsigned_int16); syclexp::image_mem imgMem(desc, dev, ctxt); @@ -46,7 +49,7 @@ int main() { syclexp::unsampled_image_handle imgHandle = sycl::ext::oneapi::experimental::create_image(imgMem, desc, dev, ctxt); - sycl::buffer buf(out.data(), width); + sycl::buffer buf(out.data(), width); q.submit([&](sycl::handler &cgh) { sycl::accessor outAcc{buf, cgh}; @@ -55,9 +58,9 @@ int main() { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) // This shouldn't be hit anyway since CUDA doesn't support // 3-channel formats, but we need to ensure the kernel can compile - using pixel_t = sycl::float4; + using pixel_t = sycl::ushort4; #else - using pixel_t = sycl::float3; + using pixel_t = sycl::ushort3; #endif auto pixel = syclexp::fetch_image(imgHandle, int(id[0])); outAcc[id] = pixel[0]; @@ -83,7 +86,7 @@ int main() { } bool validated = true; - for (int i = 0; i < width; i++) { + for (unsigned int i = 0; i < width; i++) { bool mismatch = false; if (out[i] != expected[i]) { mismatch = true; From a73541ce8ab5f80f7ad325014e797400abdf4a4d Mon Sep 17 00:00:00 2001 From: Nikita Kornev Date: Fri, 17 Jan 2025 12:10:23 +0100 Subject: [PATCH 03/30] [SYCL][E2E][NFC] Update tracker (#16657) --- sycl/test-e2e/DeviceLib/exp/exp-std-complex-edge-cases.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/DeviceLib/exp/exp-std-complex-edge-cases.hpp b/sycl/test-e2e/DeviceLib/exp/exp-std-complex-edge-cases.hpp index 992cd6147b535..438cb1ba459a6 100644 --- a/sycl/test-e2e/DeviceLib/exp/exp-std-complex-edge-cases.hpp +++ b/sycl/test-e2e/DeviceLib/exp/exp-std-complex-edge-cases.hpp @@ -335,7 +335,7 @@ template bool test() { CHECK(!std::signbit(r.real()), passed, i); #ifdef _WIN32 // This check fails on win, temporary skipping: - // CMPLRLLVM-61834 + // CMPLRLLVM-64900 // TODO: Delete this macro block when fixed if (std::is_same_v) continue; From 9219ede8dba4021d83c8a7c4fece99f00a5f93b1 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Fri, 17 Jan 2025 14:05:49 +0100 Subject: [PATCH 04/30] [SYCL][NFC] Stop using deprecated `InsertionPoint` constructor (#16658) See llvm/llvm-project#102608 --- .../ESIMD/ESIMDOptimizeVecArgCallConv.cpp | 3 +-- llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp | 4 ++-- llvm/lib/SYCLLowerIR/GlobalOffset.cpp | 2 +- .../LocalAccessorToSharedMemory.cpp | 2 +- llvm/lib/SYCLLowerIR/LowerInvokeSimd.cpp | 4 ++-- llvm/lib/SYCLLowerIR/LowerWGScope.cpp | 2 +- llvm/lib/SYCLLowerIR/SpecConstants.cpp | 24 +++++++++---------- .../PrepareSYCLNativeCPU.cpp | 2 +- 8 files changed, 21 insertions(+), 22 deletions(-) diff --git a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDOptimizeVecArgCallConv.cpp b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDOptimizeVecArgCallConv.cpp index 806c0c20cb01b..4c20a879a5b64 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDOptimizeVecArgCallConv.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDOptimizeVecArgCallConv.cpp @@ -354,9 +354,8 @@ optimizeFunction(Function *OldF, // preserve data flow equality to the original. unsigned OldArgNo = PI.getFormalParam().getArgNo(); unsigned NewArgNo = oldArgNo2NewArgNo(OldArgNo, SretInd); - Instruction *At = nullptr; Value *Val = NewF->getArg(NewArgNo); - StoreInst *St = new StoreInst(Val, Alloca, false, Al, At); + StoreInst *St = new StoreInst(Val, Alloca, false, Al); NewInsts.push_back(St); } } diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp index 09101f0df7207..0ec014c702628 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp @@ -1239,7 +1239,7 @@ static Instruction *addCastInstIfNeeded(Instruction *OldI, Instruction *NewI, if (OITy != NITy) { auto CastOpcode = CastInst::getCastOpcode(NewI, false, OITy, false); NewI = CastInst::Create(CastOpcode, NewI, OITy, - NewI->getName() + ".cast.ty", OldI); + NewI->getName() + ".cast.ty", OldI->getIterator()); NewI->setDebugLoc(OldI->getDebugLoc()); } return NewI; @@ -1565,7 +1565,7 @@ static void translateESIMDIntrinsicCall(CallInst &CI) { CallInst *NewCI = IntrinsicInst::Create( NewFDecl, GenXArgs, NewFDecl->getReturnType()->isVoidTy() ? "" : CI.getName() + ".esimd", - &CI); + CI.getIterator()); NewCI->setDebugLoc(CI.getDebugLoc()); if (DoesFunctionReturnStructure) { IRBuilder<> Builder(&CI); diff --git a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp index 3873b2f8837e5..67c75b49a4881 100644 --- a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp +++ b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp @@ -250,7 +250,7 @@ void GlobalOffsetPass::addImplicitParameterToCallers( /* Func= */ CalleeWithImplicitParam, /* Args= */ ImplicitOffsets, /* NameStr= */ Twine(), - /* InsertBefore= */ CallToOld); + /* InsertBefore= */ CallToOld->getIterator()); NewCallInst->setTailCallKind(CallToOld->getTailCallKind()); NewCallInst->copyMetadata(*CallToOld); CallToOld->replaceAllUsesWith(NewCallInst); diff --git a/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp b/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp index 220e04de41157..0d30d2725b43e 100644 --- a/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp @@ -177,7 +177,7 @@ Function *LocalAccessorToSharedMemoryPass::processKernel(Module &M, ConstantInt::get(Type::getInt32Ty(M.getContext()), 0, false), NFA, }, - /* NameStr= */ Twine{NFA->getName()}, InsertBefore); + /* NameStr= */ Twine{NFA->getName()}, InsertBefore->getIterator()); // Then create a bitcast to make sure the new pointer is the same type // as the old one. This will only ever be a `i8 addrspace(3)*` to `i32 // addrspace(3)*` type of cast. diff --git a/llvm/lib/SYCLLowerIR/LowerInvokeSimd.cpp b/llvm/lib/SYCLLowerIR/LowerInvokeSimd.cpp index 8d72f8466335f..e118ecb0152b6 100644 --- a/llvm/lib/SYCLLowerIR/LowerInvokeSimd.cpp +++ b/llvm/lib/SYCLLowerIR/LowerInvokeSimd.cpp @@ -461,8 +461,8 @@ bool processInvokeSimdCall(CallInst *InvokeSimd, NewInvokeSimdArgs.push_back(NewHelper); auto ThirdArg = std::next(InvokeSimd->arg_begin(), 2); NewInvokeSimdArgs.append(ThirdArg, InvokeSimd->arg_end()); - CallInst *NewInvokeSimd = - CallInst::Create(NewInvokeSimdF, NewInvokeSimdArgs, "", InvokeSimd); + CallInst *NewInvokeSimd = CallInst::Create( + NewInvokeSimdF, NewInvokeSimdArgs, "", InvokeSimd->getIterator()); // - transfer flags, attributes (with shrinking), calling convention: NewInvokeSimd->copyIRFlags(InvokeSimd); NewInvokeSimd->setCallingConv(InvokeSimd->getCallingConv()); diff --git a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp index b087221e5f37a..815f5cfd248a6 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp @@ -981,7 +981,7 @@ Value *spirv::genPseudoLocalID(Instruction &Before, const Triple &TT) { Align Alignment = M.getDataLayout().getPreferredAlign(G); G->setAlignment(MaybeAlign(Alignment)); } - Value *Res = new LoadInst(G->getValueType(), G, "", &Before); + Value *Res = new LoadInst(G->getValueType(), G, "", Before.getIterator()); return Res; } } diff --git a/llvm/lib/SYCLLowerIR/SpecConstants.cpp b/llvm/lib/SYCLLowerIR/SpecConstants.cpp index c00fe479ad365..08be340512f9b 100644 --- a/llvm/lib/SYCLLowerIR/SpecConstants.cpp +++ b/llvm/lib/SYCLLowerIR/SpecConstants.cpp @@ -514,15 +514,15 @@ Instruction *emitCall(Type *RetTy, StringRef BaseFunctionName, auto *NewFT = FunctionType::get(NewRetTy, ArgTys, false /*isVarArg*/); auto NewFC = M->getOrInsertFunction(FunctionName, NewFT); - auto *Call = - CallInst::Create(NewFT, NewFC.getCallee(), Args, "", InsertBefore); + auto *Call = CallInst::Create(NewFT, NewFC.getCallee(), Args, "", + InsertBefore->getIterator()); if (IsSPIROrSPIRV) { cast(NewFC.getCallee()) ->setCallingConv(CallingConv::SPIR_FUNC); Call->setCallingConv(CallingConv::SPIR_FUNC); } return CastInst::CreateTruncOrBitCast(Call, RetTy, "tobool", - InsertBefore); + InsertBefore->getIterator()); } } @@ -711,7 +711,7 @@ Value *createLoadFromBuffer(CallInst *InsertBefore, Value *Buffer, Type *Int32Ty = Type::getInt32Ty(C); GetElementPtrInst *GEP = GetElementPtrInst::Create( Int8Ty, Buffer, {ConstantInt::get(Int32Ty, Offset, false)}, "gep", - InsertBefore); + InsertBefore->getIterator()); Instruction *BitCast = nullptr; if (SCType->isIntegerTy(1)) // No bitcast to i1 before load @@ -719,14 +719,14 @@ Value *createLoadFromBuffer(CallInst *InsertBefore, Value *Buffer, else BitCast = new BitCastInst(GEP, PointerType::get(SCType, GEP->getAddressSpace()), - "bc", InsertBefore); + "bc", InsertBefore->getIterator()); // When we encounter i1 spec constant, we still load the whole byte Value *Load = new LoadInst(SCType->isIntegerTy(1) ? Int8Ty : SCType, BitCast, - "load", InsertBefore); + "load", InsertBefore->getIterator()); if (SCType->isIntegerTy(1)) // trunc back to i1 if necessary Load = CastInst::CreateIntegerCast(Load, SCType, /* IsSigned */ false, - "tobool", InsertBefore); + "tobool", InsertBefore->getIterator()); return Load; } @@ -993,8 +993,8 @@ PreservedAnalyses SpecConstantsPass::run(Module &M, if (SCTy->isIntegerTy(1)) { assert(DefaultValue->getType()->isIntegerTy(8) && "For bool spec constant default value is expected to be i8"); - Replacement = - new TruncInst(DefaultValue, Type::getInt1Ty(Ctx), "bool", CI); + Replacement = new TruncInst(DefaultValue, Type::getInt1Ty(Ctx), + "bool", CI->getIterator()); } else Replacement = DefaultValue; } @@ -1021,9 +1021,9 @@ PreservedAnalyses SpecConstantsPass::run(Module &M, Value *ArraySize = Mode == HandlingMode::emulation ? DefaultValue : Replacement; assert(ArraySize->getType()->isIntegerTy() && "Expecting integer type"); - Replacement = - new AllocaInst(Intr->getAllocatedType(), Intr->getAddressSpace(), - ArraySize, Intr->getAlign(), "alloca", CI); + Replacement = new AllocaInst( + Intr->getAllocatedType(), Intr->getAddressSpace(), ArraySize, + Intr->getAlign(), "alloca", CI->getIterator()); } if (HasSretParameter) diff --git a/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp index 340df9d4e7264..2316aae2da6e4 100644 --- a/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp @@ -426,7 +426,7 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, if (nullptr == ReplaceFunc) ReplaceFunc = getReplaceFunc(M, Entry.second, Use, Args); auto *NewI = CallInst::Create(ReplaceFunc->getFunctionType(), ReplaceFunc, - Args, "", I); + Args, "", I->getIterator()); // If the parent function has debug info, we need to make sure that the // CallInstructions in it have debug info, otherwise we end up with // invalid IR after inlining. From b9a755831a77550ec77e13e97b87eabd256d2f8e Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Fri, 17 Jan 2025 16:43:47 +0000 Subject: [PATCH 05/30] [SYCL] Update UR tag for L0 synchronize fix (#16629) - Removes extra fence synchronization when updating graphs on L0 --------- Co-authored-by: Kenneth Benzie (Benie) --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 891fe4ea0c308..9c18fb006a799 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit eaea885d5477c8936209175a5b00062ca44f5765 -# Merge: af4ab49c 2a03334c +# commit 222e4b1d51536bb38e03e2000a79679af0a44a6d +# Merge: 30d183a0 28108a7e # Author: Kenneth Benzie (Benie) -# Date: Thu Jan 16 14:30:47 2025 +0000 -# Merge pull request #2569 from zhaomaosu/asan-only-warn-host-ptr -# [DevASAN] Only report warning if passing host ptr to kernel -set(UNIFIED_RUNTIME_TAG eaea885d5477c8936209175a5b00062ca44f5765) +# Date: Fri Jan 17 10:28:34 2025 +0000 +# Merge pull request #2561 from Bensuo/ben/cmd-buffer-l0-fence +# [L0][CMDBUF] Optimize fence/event waits during update +set(UNIFIED_RUNTIME_TAG 222e4b1d51536bb38e03e2000a79679af0a44a6d) From 98c0d5d05547ccc124a39b2598ad3094dbc98569 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Fri, 17 Jan 2025 10:02:48 -0800 Subject: [PATCH 06/30] [CI] Remove always empty `matrix.extra_cmake_args` (#16674) Unused after https://github.com/intel/llvm/pull/16071. --- .github/workflows/sycl-linux-precommit.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index 6d578580a13db..79beaaeb274be 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -242,7 +242,6 @@ jobs: env: '{"LIT_FILTER":"PerformanceTests/"}' extra_lit_opts: -a -j 1 --param enable-perf-tests=True - extra_cmake_args: ${{ matrix.extra_cmake_args }} ref: ${{ github.sha }} merge_ref: '' From a5e8209764845e088844bc9ce5cc5710955b449d Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Fri, 17 Jan 2025 10:05:59 -0800 Subject: [PATCH 07/30] [SYCL] Remove `IsDeprecatedDeviceCopyable` (#16615) This was discussed in https://github.com/intel/llvm/pull/15342#discussion_r1751868640 and the consensus seemed to be that we should drop it right away in a separate PR, do it here. Technically, it is a breaking change that could also be considered a bugfix. An example of a class failing the updated check is ``` struct Kernel { Kernel(int); Kernel(const Kernel&) = default; Kernel& operator=(const Kernel&) { return *this; } // non-trivial }; ``` An additional minor reason (other than not being SYCL-conformant) to drop it right away is to save a tiny bit of compile time that is currently used to support something violating the spec. This required some fixes in the reductions implementation to make sure the kernel we submit internally are actually device copyable. --- .../sycl/detail/is_device_copyable.hpp | 28 ++++++------------- sycl/include/sycl/reduction.hpp | 6 +++- sycl/test/basic_tests/is_device_copyable.cpp | 10 ------- 3 files changed, 13 insertions(+), 31 deletions(-) diff --git a/sycl/include/sycl/detail/is_device_copyable.hpp b/sycl/include/sycl/detail/is_device_copyable.hpp index 388029e6a16a3..a5dcd9d0235f1 100644 --- a/sycl/include/sycl/detail/is_device_copyable.hpp +++ b/sycl/include/sycl/detail/is_device_copyable.hpp @@ -31,6 +31,8 @@ inline namespace _V1 { template struct is_device_copyable; namespace detail { +template struct tuple; + template struct is_device_copyable_impl : std::is_trivially_copyable {}; @@ -70,6 +72,10 @@ template struct is_device_copyable> : std::bool_constant<(... && is_device_copyable::value)> {}; +template +struct is_device_copyable> + : std::bool_constant<(... && is_device_copyable::value)> {}; + // std::variant is implicitly device copyable type if each type T of // Ts... is device copyable. template @@ -83,22 +89,6 @@ struct is_device_copyable : is_device_copyable {}; template inline constexpr bool is_device_copyable_v = is_device_copyable::value; namespace detail { -template -struct IsDeprecatedDeviceCopyable : std::false_type {}; - -// TODO: using C++ attribute [[deprecated]] or the macro __SYCL2020_DEPRECATED -// does not produce expected warning message for the type 'T'. -template -struct __SYCL2020_DEPRECATED("This type isn't device copyable in SYCL 2020") - IsDeprecatedDeviceCopyable< - T, std::enable_if_t && - std::is_trivially_destructible_v && - !is_device_copyable_v>> : std::true_type {}; - -template -struct __SYCL2020_DEPRECATED("This type isn't device copyable in SYCL 2020") - IsDeprecatedDeviceCopyable : IsDeprecatedDeviceCopyable {}; - #ifdef __SYCL_DEVICE_ONLY__ // Checks that the fields of the type T with indices 0 to (NumFieldsToCheck - // 1) are device copyable. @@ -106,8 +96,7 @@ template struct CheckFieldsAreDeviceCopyable : CheckFieldsAreDeviceCopyable { using FieldT = decltype(__builtin_field_type(T, NumFieldsToCheck - 1)); - static_assert(is_device_copyable_v || - detail::IsDeprecatedDeviceCopyable::value, + static_assert(is_device_copyable_v, "The specified type is not device copyable"); }; @@ -119,8 +108,7 @@ template struct CheckBasesAreDeviceCopyable : CheckBasesAreDeviceCopyable { using BaseT = decltype(__builtin_base_type(T, NumBasesToCheck - 1)); - static_assert(is_device_copyable_v || - detail::IsDeprecatedDeviceCopyable::value, + static_assert(is_device_copyable_v, "The specified type is not device copyable"); }; diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index 576dac72a3ce2..7f006648b6cad 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -389,7 +389,7 @@ class ReductionIdentityContainer< static constexpr bool has_identity = true; ReductionIdentityContainer(const T &) {} - ReductionIdentityContainer() {} + ReductionIdentityContainer() = default; /// Returns the statically known identity value. static constexpr T getIdentity() { @@ -407,6 +407,10 @@ class ReductionIdentityContainer< ReductionIdentityContainer(const T &Identity) : MIdentity(Identity) {} + // Make it trivially copyable (need at least on of the special member + // functions): + ReductionIdentityContainer(const ReductionIdentityContainer &) = default; + /// Returns the identity value given by user. T getIdentity() const { return MIdentity; } diff --git a/sycl/test/basic_tests/is_device_copyable.cpp b/sycl/test/basic_tests/is_device_copyable.cpp index 1c4199e954530..3e48bd5d77857 100644 --- a/sycl/test/basic_tests/is_device_copyable.cpp +++ b/sycl/test/basic_tests/is_device_copyable.cpp @@ -25,14 +25,6 @@ struct BCopyable { BCopyable(const BCopyable &x) : i(x.i) {} }; -// Not trivially copyable, but trivially copy constructible/destructible. -// Such types are passed to kernels to stay compatible with deprecated -// sycl 1.2.1 rules. -struct C : A { - const A C2; - C() : A{0}, C2{2} {} -}; - // Not copyable type, but it will be declared as device copyable. struct DCopyable { int i; @@ -67,7 +59,6 @@ void test() { A IamGood; IamGood.i = 0; BCopyable IamBadButCopyable(1); - C IamAlsoGood; DCopyable IamAlsoBadButCopyable{0}; marray MarrayForCopyableIsCopyable(0); range<2> Range{1,2}; @@ -78,7 +69,6 @@ void test() { int A = IamGood.i; int B = IamBadButCopyable.i; int C = IamAlsoBadButCopyable.i; - int D = IamAlsoGood.i; int E = MarrayForCopyableIsCopyable[0]; int F = Range[1]; int G = Id[2]; From 5fb402c19d1b91f52f0e46eca416c855fa0bc014 Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Fri, 17 Jan 2025 14:19:23 -0800 Subject: [PATCH 08/30] [SYCL] fix incorrect application of binary AND in tests (#16610) &= with 0 is always 0. For tests that are returning 0 as success, using binary AND will mask errors, leading the test to incorrectly pass when it should fail. I wasted an hour last week being mislead by this problem. I audited all our tests and fortunately the ones making this mistake were relatively few. --- sycl/test-e2e/Basic/fpga_tests/fpga_pipes.cpp | 28 +++++++++---------- .../launch_queries/max_num_work_groups.cpp | 8 +++--- sycl/test-e2e/Scheduler/MultipleDevices.cpp | 10 +++---- sycl/test-e2e/XPTI/buffer/in_cycle.cpp | 4 +-- sycl/test-e2e/XPTI/buffer/recursion.cpp | 2 +- sycl/test/fpga_tests/fpga_io_pipes.cpp | 6 ++-- 6 files changed, 29 insertions(+), 29 deletions(-) diff --git a/sycl/test-e2e/Basic/fpga_tests/fpga_pipes.cpp b/sycl/test-e2e/Basic/fpga_tests/fpga_pipes.cpp index 382dbf525098c..0eb58c2fcf66c 100644 --- a/sycl/test-e2e/Basic/fpga_tests/fpga_pipes.cpp +++ b/sycl/test-e2e/Basic/fpga_tests/fpga_pipes.cpp @@ -329,26 +329,26 @@ int main() { } // Non-blocking pipes - int Result = test_simple_nb_pipe(Queue); - Result &= test_simple_nb_pipe(Queue); + int Error = test_simple_nb_pipe(Queue); + Error |= test_simple_nb_pipe(Queue); class forward_nb_pipe; - Result &= test_simple_nb_pipe(Queue); - Result &= test_simple_nb_pipe, /*test number*/ 4>(Queue); - Result &= test_multiple_nb_pipe(Queue); + Error |= test_simple_nb_pipe(Queue); + Error |= test_simple_nb_pipe, /*test number*/ 4>(Queue); + Error |= test_multiple_nb_pipe(Queue); // Blocking pipes - Result &= test_simple_bl_pipe(Queue); - Result &= test_simple_bl_pipe(Queue); + Error |= test_simple_bl_pipe(Queue); + Error |= test_simple_bl_pipe(Queue); class forward_bl_pipe; - Result &= test_simple_bl_pipe(Queue); - Result &= test_simple_bl_pipe, /*test number*/ 9>(Queue); - Result &= test_multiple_bl_pipe(Queue); + Error |= test_simple_bl_pipe(Queue); + Error |= test_simple_bl_pipe, /*test number*/ 9>(Queue); + Error |= test_multiple_bl_pipe(Queue); // Test for an array data passing through a pipe - Result &= test_array_th_nb_pipe(Queue); - Result &= test_array_th_bl_pipe(Queue); + Error |= test_array_th_nb_pipe(Queue); + Error |= test_array_th_bl_pipe(Queue); // TODO Remove when #14308 is closed - std::cerr << "DEBUG: Finished with result " << Result << std::endl; - return Result; + std::cerr << "DEBUG: Finished with result " << Error << std::endl; + return Error; } diff --git a/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp b/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp index 30b1240099111..d85279aeb2bc1 100644 --- a/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp +++ b/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp @@ -206,8 +206,8 @@ int main() { using namespace kernels; - int ret{0}; - ret &= test_max_num_work_groups(q, dev); - ret &= test_max_num_work_groups(q, dev); - return ret; + int Error{0}; + Error |= test_max_num_work_groups(q, dev); + Error |= test_max_num_work_groups(q, dev); + return Error; } diff --git a/sycl/test-e2e/Scheduler/MultipleDevices.cpp b/sycl/test-e2e/Scheduler/MultipleDevices.cpp index 3976512e2d6e7..f848ce79ab9a4 100644 --- a/sycl/test-e2e/Scheduler/MultipleDevices.cpp +++ b/sycl/test-e2e/Scheduler/MultipleDevices.cpp @@ -92,11 +92,11 @@ int multidevice_test(queue MyQueue1, queue MyQueue2) { int main() { - int Result = -1; + int Error = 0; try { queue MyQueue1(cpu_selector_v); queue MyQueue2(cpu_selector_v); - Result &= multidevice_test(MyQueue1, MyQueue2); + Error |= multidevice_test(MyQueue1, MyQueue2); } catch (sycl::exception &) { std::cout << "Skipping CPU and CPU" << std::endl; } @@ -104,7 +104,7 @@ int main() { try { queue MyQueue1(cpu_selector_v); queue MyQueue2(gpu_selector_v); - Result &= multidevice_test(MyQueue1, MyQueue2); + Error |= multidevice_test(MyQueue1, MyQueue2); } catch (sycl::exception &) { std::cout << "Skipping CPU and GPU" << std::endl; } @@ -112,10 +112,10 @@ int main() { try { queue MyQueue1(gpu_selector_v); queue MyQueue2(gpu_selector_v); - Result &= multidevice_test(MyQueue1, MyQueue2); + Error |= multidevice_test(MyQueue1, MyQueue2); } catch (sycl::exception &) { std::cout << "Skipping GPU and GPU" << std::endl; } - return Result; + return Error; } diff --git a/sycl/test-e2e/XPTI/buffer/in_cycle.cpp b/sycl/test-e2e/XPTI/buffer/in_cycle.cpp index 7dbd84d336289..18075e22a7ca8 100644 --- a/sycl/test-e2e/XPTI/buffer/in_cycle.cpp +++ b/sycl/test-e2e/XPTI/buffer/in_cycle.cpp @@ -45,7 +45,7 @@ bool func(sycl::queue &Queue, int depth = 0) { } if (depth > 0) - MismatchFound &= func(Queue, depth - 1); + MismatchFound |= func(Queue, depth - 1); return MismatchFound; } int main() { @@ -66,7 +66,7 @@ int main() { // CHECK:{{[0-9]+}}|Release buffer|[[USERID3]]|[[BEID3]] // CHECK:{{[0-9]+}}|Destruct buffer|[[USERID3]] for (int i = 0; i < 3; i++) - MismatchFound &= func(Queue); + MismatchFound |= func(Queue); return MismatchFound; } diff --git a/sycl/test-e2e/XPTI/buffer/recursion.cpp b/sycl/test-e2e/XPTI/buffer/recursion.cpp index 00d90390311a1..9a2044c0e8d97 100644 --- a/sycl/test-e2e/XPTI/buffer/recursion.cpp +++ b/sycl/test-e2e/XPTI/buffer/recursion.cpp @@ -65,7 +65,7 @@ int main() { // CHECK:{{[0-9]+}}|Destruct buffer|[[USERID2]] // CHECK:{{[0-9]+}}|Release buffer|[[USERID1]]|[[BEID1]] // CHECK:{{[0-9]+}}|Destruct buffer|[[USERID1]] - MismatchFound &= func(Queue, 2); + MismatchFound |= func(Queue, 2); return MismatchFound; } diff --git a/sycl/test/fpga_tests/fpga_io_pipes.cpp b/sycl/test/fpga_tests/fpga_io_pipes.cpp index 0513386f97dd1..89876fc963fb9 100644 --- a/sycl/test/fpga_tests/fpga_io_pipes.cpp +++ b/sycl/test/fpga_tests/fpga_io_pipes.cpp @@ -123,10 +123,10 @@ int main() { } // Non-blocking pipes - int Result = test_io_nb_pipe(Queue); + int Error = test_io_nb_pipe(Queue); // 0 if successful // Blocking pipes - Result &= test_io_bl_pipe(Queue); + Error |= test_io_bl_pipe(Queue); - return Result; + return Error; } From 2fcddee30815de911966ffea61ac035008292c62 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Fri, 17 Jan 2025 14:35:35 -0800 Subject: [PATCH 09/30] [CI] Simplify nightly docker images (#16680) Have a single docker image created with all the dependencies pre-installed so that it could be used for all internal CI needs for every target. Other workflows should be changed after this is merged and new image is uploaded to the registry. --- .github/workflows/sycl-nightly.yml | 38 +++---------------- ...nstalled.Dockerfile => nightly.Dockerfile} | 4 +- sycl/doc/developer/DockerBKMs.md | 9 ++--- 3 files changed, 11 insertions(+), 40 deletions(-) rename devops/containers/{ubuntu2204_preinstalled.Dockerfile => nightly.Dockerfile} (79%) diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index 0c3ff68b27efd..01831b428088c 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -242,7 +242,7 @@ jobs: body: "Daily build ${{ steps.tag.outputs.TAG }}" target_commitish: ${{ github.sha }} - ubuntu2204_docker_build_push: + docker_build_push: if: github.repository == 'intel/llvm' runs-on: [Linux, build] permissions: @@ -254,42 +254,16 @@ jobs: with: name: sycl_linux_default path: devops/ - - name: Build and Push Container (with drivers) + - name: Build and Push Container uses: ./devops/actions/build_container with: push: ${{ github.ref_name == 'sycl' }} - file: ubuntu2204_preinstalled + file: nightly username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} build-args: | base_image=ghcr.io/intel/llvm/ubuntu2404_intel_drivers - base_tag=latest + base_tag=alldeps tags: | - ghcr.io/${{ github.repository }}/sycl_ubuntu2204_nightly:${{ github.sha }} - ghcr.io/${{ github.repository }}/sycl_ubuntu2204_nightly:latest - - name: Build and Push Container (no drivers) - uses: ./devops/actions/build_container - with: - push: ${{ github.ref_name == 'sycl' }} - file: ubuntu2204_preinstalled - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - build-args: | - base_image=ghcr.io/intel/llvm/ubuntu2204_base - base_tag=latest - tags: | - ghcr.io/${{ github.repository }}/sycl_ubuntu2204_nightly:no-drivers-${{ github.sha }} - ghcr.io/${{ github.repository }}/sycl_ubuntu2204_nightly:no-drivers - - name: Build and Push Container (Build image) - uses: ./devops/actions/build_container - with: - push: ${{ github.ref_name == 'sycl' }} - file: ubuntu2204_preinstalled - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - build-args: | - base_image=ghcr.io/intel/llvm/ubuntu2204_build - base_tag=latest - tags: | - ghcr.io/${{ github.repository }}/sycl_ubuntu2204_nightly:build-${{ github.sha }} - ghcr.io/${{ github.repository }}/sycl_ubuntu2204_nightly:build + ghcr.io/${{ github.repository }}/sycl_ubuntu2404_nightly:${{ github.sha }} + ghcr.io/${{ github.repository }}/sycl_ubuntu2404_nightly:latest diff --git a/devops/containers/ubuntu2204_preinstalled.Dockerfile b/devops/containers/nightly.Dockerfile similarity index 79% rename from devops/containers/ubuntu2204_preinstalled.Dockerfile rename to devops/containers/nightly.Dockerfile index 18ec7de01acb3..4c62f2668b2b8 100644 --- a/devops/containers/ubuntu2204_preinstalled.Dockerfile +++ b/devops/containers/nightly.Dockerfile @@ -1,5 +1,5 @@ -ARG base_tag=latest -ARG base_image=ghcr.io/intel/llvm/ubuntu2204_intel_drivers +ARG base_tag=alldeps +ARG base_image=ghcr.io/intel/llvm/ubuntu2404_intel_drivers FROM $base_image:$base_tag diff --git a/sycl/doc/developer/DockerBKMs.md b/sycl/doc/developer/DockerBKMs.md index 4b761c4075a2b..6f1db70e37ffa 100644 --- a/sycl/doc/developer/DockerBKMs.md +++ b/sycl/doc/developer/DockerBKMs.md @@ -53,10 +53,6 @@ development containers: NVidia/AMD and can be used for building DPC++ compiler from source with all backends enabled or for end-to-end testing with HIP/CUDA on machines with corresponding GPUs available. - - `devops/containers/sycl_ubuntu2204_nightly`: contains the latest successfully - built nightly build of DPC++ compiler. The Dockerfile comes in three flavors: - with pre-installed Intel drivers (`latest`), without them (`no-drivers`) and - with development kits installed (`build`). ### Ubuntu 24.04-based Dockerfiles @@ -78,7 +74,8 @@ development containers: NVidia/AMD and can be used for building DPC++ compiler from source with all backends enabled or for end-to-end testing with HIP/CUDA on machines with corresponding GPUs available. - + - `devops/containers/nightly`: contains the latest successfully + built nightly build of DPC++ compiler. ## Running Docker container interactively @@ -199,7 +196,7 @@ Docker containers can be built with the following command: docker build -f path/to/devops/containers/file.Dockerfile path/to/devops/ ``` -The `ubuntu2204_preinstalled.Dockerfile` script expects `llvm_sycl.tar.xz` file +The `nightly.Dockerfile` script expects `llvm_sycl.tar.xz` file to be present in `devops/` directory. Containers other than base provide several configurable arguments, the most From 1537ca268d82f6a315f7d6dbf5031c7066423e63 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Sun, 19 Jan 2025 23:46:38 -0800 Subject: [PATCH 10/30] [NFC] Use C++17 fold expressions in `detail::CheckDeviceCopyable` (#16679) First, that's what idiomatic C++17 is (folds over recursion), second compile of folds scales much better than recursion when the number of fields/bases grow. --- .../sycl/detail/is_device_copyable.hpp | 41 +++++++++---------- .../basic_tests/is_device_copyable_neg.cpp | 21 ++-------- 2 files changed, 23 insertions(+), 39 deletions(-) diff --git a/sycl/include/sycl/detail/is_device_copyable.hpp b/sycl/include/sycl/detail/is_device_copyable.hpp index a5dcd9d0235f1..bac24f4df3a11 100644 --- a/sycl/include/sycl/detail/is_device_copyable.hpp +++ b/sycl/include/sycl/detail/is_device_copyable.hpp @@ -90,30 +90,25 @@ template inline constexpr bool is_device_copyable_v = is_device_copyable::value; namespace detail { #ifdef __SYCL_DEVICE_ONLY__ -// Checks that the fields of the type T with indices 0 to (NumFieldsToCheck - -// 1) are device copyable. -template -struct CheckFieldsAreDeviceCopyable - : CheckFieldsAreDeviceCopyable { - using FieldT = decltype(__builtin_field_type(T, NumFieldsToCheck - 1)); - static_assert(is_device_copyable_v, - "The specified type is not device copyable"); +template struct CheckFieldsAreDeviceCopyable; +template struct CheckBasesAreDeviceCopyable; + +template +struct CheckFieldsAreDeviceCopyable> { + static_assert( + ((is_device_copyable_v && + ...)), + "The specified type is not device copyable"); }; -template struct CheckFieldsAreDeviceCopyable {}; - -// Checks that the base classes of the type T with indices 0 to -// (NumFieldsToCheck - 1) are device copyable. -template -struct CheckBasesAreDeviceCopyable - : CheckBasesAreDeviceCopyable { - using BaseT = decltype(__builtin_base_type(T, NumBasesToCheck - 1)); - static_assert(is_device_copyable_v, - "The specified type is not device copyable"); +template +struct CheckBasesAreDeviceCopyable> { + static_assert( + ((is_device_copyable_v && + ...)), + "The specified type is not device copyable"); }; -template struct CheckBasesAreDeviceCopyable {}; - // All the captures of a lambda or functor of type FuncT passed to a kernel // must be is_device_copyable, which extends to bases and fields of FuncT. // Fields are captures of lambda/functors and bases are possible base classes @@ -127,8 +122,10 @@ template struct CheckBasesAreDeviceCopyable {}; // is currently/temporarily supported only to not break older SYCL programs. template struct CheckDeviceCopyable - : CheckFieldsAreDeviceCopyable, - CheckBasesAreDeviceCopyable {}; + : CheckFieldsAreDeviceCopyable< + FuncT, std::make_index_sequence<__builtin_num_fields(FuncT)>>, + CheckBasesAreDeviceCopyable< + FuncT, std::make_index_sequence<__builtin_num_bases(FuncT)>> {}; template class RoundedRangeKernel; diff --git a/sycl/test/basic_tests/is_device_copyable_neg.cpp b/sycl/test/basic_tests/is_device_copyable_neg.cpp index 2e1ca89b3d25a..c9007685ae693 100644 --- a/sycl/test/basic_tests/is_device_copyable_neg.cpp +++ b/sycl/test/basic_tests/is_device_copyable_neg.cpp @@ -1,5 +1,4 @@ -// RUN: not %clangxx -fsycl -fsycl-device-only -fsyntax-only \ -// RUN: %s -I %sycl_include 2>&1 | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-device-only -fsyntax-only -Xclang -verify -Xclang -verify-ignore-unexpected=warning,note %s // This test checks if compiler reports compilation error on an attempt to pass // a struct with type that is not device copyable as SYCL kernel parameter. @@ -57,6 +56,7 @@ void test() { B IamAlsoBad{0}; marray MarrayForNotCopyable; queue Q; + // expected-error@*:* {{static assertion failed due to requirement 'is_device_copyable_v': The specified type is not device copyable}} Q.single_task([=] { int A = IamBad.i; int B = IamAlsoBad.i; @@ -64,23 +64,10 @@ void test() { }); FunctorA FA; + // expected-error@*:* {{static assertion failed due to requirement 'is_device_copyable_v': The specified type is not device copyable}} Q.single_task(FA); FunctorB FB; + // expected-error@*:* {{static assertion failed due to requirement 'is_device_copyable_v': The specified type is not device copyable}} Q.single_task(FB); } - -// CHECK: static assertion failed due to requirement 'is_device_copyable_v -// CHECK: is_device_copyable_neg.cpp:60:5: note: in instantiation of function - -// CHECK: static assertion failed due to requirement 'is_device_copyable_v -// CHECK: is_device_copyable_neg.cpp:60:5: note: in instantiation of function - -// CHECK: static assertion failed due to requirement 'is_device_copyable_v> -// CHECK: is_device_copyable_neg.cpp:60:5: note: in instantiation of function - -// CHECK: static assertion failed due to requirement 'is_device_copyable_v -// CHECK: is_device_copyable_neg.cpp:67:5: note: in instantiation of function - -// CHECK: static assertion failed due to requirement 'is_device_copyable_v -// CHECK: is_device_copyable_neg.cpp:70:5: note: in instantiation of function From 160509bed98939c292b39823c0c85dcdec403334 Mon Sep 17 00:00:00 2001 From: tomflinda Date: Mon, 20 Jan 2025 17:16:01 +0800 Subject: [PATCH 11/30] [SYCL][COMPAT] Add helper function ternary_logic_op() to perform bitwise logical operations on three input values based on the specified 8-bit truth table (#16509) Signed-off-by: chenwei.sun --------- Signed-off-by: chenwei.sun --- sycl/doc/syclcompat/README.md | 6 + sycl/include/syclcompat/util.hpp | 104 ++++ .../util/util_ternary_logic_op_test.cpp | 585 ++++++++++++++++++ 3 files changed, 695 insertions(+) create mode 100644 sycl/test-e2e/syclcompat/util/util_ternary_logic_op_test.cpp diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md index a518ea04e3e74..699e904092b32 100644 --- a/sycl/doc/syclcompat/README.md +++ b/sycl/doc/syclcompat/README.md @@ -1389,6 +1389,10 @@ with bytes selected according to a third unsigned integer argument. `match_all_over_sub_group` and `match_any_over_sub_group` allows comparison of values across work-items within a sub-group. +The function `ternary_logic_op`performs bitwise logical operations on three input values of +`a`, `b` and `c` based on the specified 8-bit truth table `lut` and return the +result. + The functions `select_from_sub_group`, `shift_sub_group_left`, `shift_sub_group_right` and `permute_sub_group_by_xor` provide equivalent functionality to `sycl::select_from_group`, `sycl::shift_group_left`, @@ -1419,6 +1423,8 @@ inline double cast_ints_to_double(int high32, int low32); inline unsigned int byte_level_permute(unsigned int a, unsigned int b, unsigned int s); +inline uint32_t lop3(uint32_t a, uint32_t b, uint32_t c, uint8_t lut) + template inline int ffs(ValueT a); template diff --git a/sycl/include/syclcompat/util.hpp b/sycl/include/syclcompat/util.hpp index 2fb085509cf6a..8907f32d5e7e6 100644 --- a/sycl/include/syclcompat/util.hpp +++ b/sycl/include/syclcompat/util.hpp @@ -199,6 +199,110 @@ inline unsigned int byte_level_permute(unsigned int a, unsigned int b, return ret; } +/// \brief The function performs bitwise logical operations on three input +/// values of \p a, \p b and \p c based on the specified 8-bit truth table \p +/// lut and return the result +/// +/// \param [in] a Input value +/// \param [in] b Input value +/// \param [in] c Input value +/// \param [in] lut truth table for looking up +/// \returns The result +inline uint32_t ternary_logic_op(uint32_t a, uint32_t b, uint32_t c, + uint8_t lut) { + uint32_t result = 0; +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + asm volatile("lop3.b32 %0, %1, %2, %3, %4;" + : "=r"(result) + : "r"(a), "r"(b), "r"(c), "n"(lut)); +#else + switch (lut) { + case 0x0: + result = 0; + break; + case 0x1: + result = ~a & ~b & ~c; + break; + case 0x2: + result = ~a & ~b & c; + case 0x4: + result = ~a & b & ~c; + break; + case 0x8: + result = ~a & b & c; + break; + case 0x10: + result = a & ~b & ~c; + break; + case 0x20: + result = a & ~b & c; + break; + case 0x40: + result = a & b & ~c; + break; + case 0x80: + result = a & b & c; + break; + case 0x1a: + result = (a & b | c) ^ a; + break; + case 0x1e: + result = a ^ (b | c); + break; + case 0x2d: + result = ~a ^ (~b & c); + break; + case 0x78: + result = a ^ (b & c); + break; + case 0x96: + result = a ^ b ^ c; + break; + case 0xb4: + result = a ^ (b & ~c); + break; + case 0xb8: + result = a ^ (b & (c ^ a)); + break; + case 0xd2: + result = a ^ (~b & c); + break; + case 0xe8: + result = a & (b | c) | (b & c); + break; + case 0xea: + result = a & b | c; + break; + case 0xfe: + result = a | b | c; + break; + case 0xff: + result = -1; + break; + default: { + if (lut & 0x01) + result |= ~a & ~b & ~c; + if (lut & 0x02) + result |= ~a & ~b & c; + if (lut & 0x04) + result |= ~a & b & ~c; + if (lut & 0x08) + result |= ~a & b & c; + if (lut & 0x10) + result |= a & ~b & ~c; + if (lut & 0x20) + result |= a & ~b & c; + if (lut & 0x40) + result |= a & b & ~c; + if (lut & 0x80) + result |= a & b & c; + break; + } + } +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + return result; +} + /// Find position of first least significant set bit in an integer. /// ffs(0) returns 0. /// diff --git a/sycl/test-e2e/syclcompat/util/util_ternary_logic_op_test.cpp b/sycl/test-e2e/syclcompat/util/util_ternary_logic_op_test.cpp new file mode 100644 index 0000000000000..4e252c5df08a5 --- /dev/null +++ b/sycl/test-e2e/syclcompat/util/util_ternary_logic_op_test.cpp @@ -0,0 +1,585 @@ +// ====------ util_ternary_logic_op_test.cpp ------------ *- C/C++ -* ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===--------------------------------------------------------------------===// + +// This file is modified from the code migrated by SYCLomatic. + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include +#include +#include +#include +#include +#include + +// clang-format off +void reference_of_ternary_logic_op(uint32_t &R, uint32_t A, uint32_t B, uint32_t C, uint32_t D) { + switch (D) { + case 0: R = 0; break; + case 1: R = (~A & ~B & ~C); break; + case 2: R = (~A & ~B & C); break; + case 3: R = (~A & ~B & ~C) | (~A & ~B & C); break; + case 4: R = (~A & B & ~C); break; + case 5: R = (~A & ~B & ~C) | (~A & B & ~C); break; + case 6: R = (~A & ~B & C) | (~A & B & ~C); break; + case 7: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C); break; + case 8: R = (~A & B & C); break; + case 9: R = (~A & ~B & ~C) | (~A & B & C); break; + case 10: R = (~A & ~B & C) | (~A & B & C); break; + case 11: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C); break; + case 12: R = (~A & B & ~C) | (~A & B & C); break; + case 13: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C); break; + case 14: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C); break; + case 15: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C); break; + case 16: R = (A & ~B & ~C); break; + case 17: R = (~A & ~B & ~C) | (A & ~B & ~C); break; + case 18: R = (~A & ~B & C) | (A & ~B & ~C); break; + case 19: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C); break; + case 20: R = (~A & B & ~C) | (A & ~B & ~C); break; + case 21: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C); break; + case 22: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C); break; + case 23: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C); break; + case 24: R = (~A & B & C) | (A & ~B & ~C); break; + case 25: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C); break; + case 26: R = (A & B | C) ^ A; break; + case 27: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C); break; + case 28: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C); break; + case 29: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C); break; + case 30: R = A ^ (B | C); break; + case 31: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C); break; + case 32: R = (A & ~B & C); break; + case 33: R = (~A & ~B & ~C) | (A & ~B & C); break; + case 34: R = (~A & ~B & C) | (A & ~B & C); break; + case 35: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & C); break; + case 36: R = (~A & B & ~C) | (A & ~B & C); break; + case 37: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & C); break; + case 38: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C); break; + case 39: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C); break; + case 40: R = (~A & B & C) | (A & ~B & C); break; + case 41: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & C); break; + case 42: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & C); break; + case 43: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & C); break; + case 44: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & C); break; + case 45: R = ~A ^ (~B & C); break; + case 46: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C); break; + case 47: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C); break; + case 48: R = (A & ~B & ~C) | (A & ~B & C); break; + case 49: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & ~B & C); break; + case 50: R = (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C); break; + case 51: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C); break; + case 52: R = (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C); break; + case 53: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C); break; + case 54: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C); break; + case 55: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C); break; + case 56: R = (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break; + case 57: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break; + case 58: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break; + case 59: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break; + case 60: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break; + case 61: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break; + case 62: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break; + case 63: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break; + case 64: R = A & B & ~C; break; + case 65: R = (~A & ~B & ~C) | (A & B & ~C); break; + case 66: R = (~A & ~B & C) | (A & B & ~C); break; + case 67: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & B & ~C); break; + case 68: R = (~A & B & ~C) | (A & B & ~C); break; + case 69: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & B & ~C); break; + case 70: R = (~A & ~B & C) | (~A & B & ~C) | (A & B & ~C); break; + case 71: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & B & ~C); break; + case 72: R = (~A & B & C) | (A & B & ~C); break; + case 73: R = (~A & ~B & ~C) | (~A & B & C) | (A & B & ~C); break; + case 74: R = (~A & ~B & C) | (~A & B & C) | (A & B & ~C); break; + case 75: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & B & ~C); break; + case 76: R = (~A & B & ~C) | (~A & B & C) | (A & B & ~C); break; + case 77: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & B & ~C); break; + case 78: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & B & ~C); break; + case 79: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & B & ~C); break; + case 80: R = (A & ~B & ~C) | (A & B & ~C); break; + case 81: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & B & ~C); break; + case 82: R = (~A & ~B & C) | (A & ~B & ~C) | (A & B & ~C); break; + case 83: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & B & ~C); break; + case 84: R = (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C); break; + case 85: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C); break; + case 86: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C); break; + case 87: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C); break; + case 88: R = (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break; + case 89: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break; + case 90: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break; + case 91: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break; + case 92: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break; + case 93: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break; + case 94: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break; + case 95: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break; + case 96: R = (A & ~B & C) | (A & B & ~C); break; + case 97: R = (~A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 98: R = (~A & ~B & C) | (A & ~B & C) | (A & B & ~C); break; + case 99: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & C) | (A & B & ~C); break; + case 100: R = (~A & B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 101: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 102: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 103: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 104: R = (~A & B & C) | (A & ~B & C) | (A & B & ~C); break; + case 105: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break; + case 106: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break; + case 107: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break; + case 108: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break; + case 109: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break; + case 110: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break; + case 111: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break; + case 112: R = (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 113: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 114: R = (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 115: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 116: R = (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 117: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 118: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 119: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 120: R = A ^ (B & C); break; + case 121: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 122: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 123: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 124: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 125: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 126: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 127: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break; + case 128: R = A & B & C; break; + case 129: R = (~A & ~B & ~C) | (A & B & C); break; + case 130: R = (~A & ~B & C) | (A & B & C); break; + case 131: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & B & C); break; + case 132: R = (~A & B & ~C) | (A & B & C); break; + case 133: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & B & C); break; + case 134: R = (~A & ~B & C) | (~A & B & ~C) | (A & B & C); break; + case 135: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & B & C); break; + case 136: R = (~A & B & C) | (A & B & C); break; + case 137: R = (~A & ~B & ~C) | (~A & B & C) | (A & B & C); break; + case 138: R = (~A & ~B & C) | (~A & B & C) | (A & B & C); break; + case 139: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & B & C); break; + case 140: R = (~A & B & ~C) | (~A & B & C) | (A & B & C); break; + case 141: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & B & C); break; + case 142: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & B & C); break; + case 143: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & B & C); break; + case 144: R = (A & ~B & ~C) | (A & B & C); break; + case 145: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & B & C); break; + case 146: R = (~A & ~B & C) | (A & ~B & ~C) | (A & B & C); break; + case 147: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & B & C); break; + case 148: R = (~A & B & ~C) | (A & ~B & ~C) | (A & B & C); break; + case 149: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & C); break; + case 150: R = A ^ B ^ C; break; + case 151: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & C); break; + case 152: R = (~A & B & C) | (A & ~B & ~C) | (A & B & C); break; + case 153: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break; + case 154: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break; + case 155: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break; + case 156: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break; + case 157: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break; + case 158: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break; + case 159: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break; + case 160: R = (A & ~B & C) | (A & B & C); break; + case 161: R = (~A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 162: R = (~A & ~B & C) | (A & ~B & C) | (A & B & C); break; + case 163: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & C) | (A & B & C); break; + case 164: R = (~A & B & ~C) | (A & ~B & C) | (A & B & C); break; + case 165: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & C) | (A & B & C); break; + case 166: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C) | (A & B & C); break; + case 167: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C) | (A & B & C); break; + case 168: R = (~A & B & C) | (A & ~B & C) | (A & B & C); break; + case 169: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break; + case 170: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break; + case 171: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break; + case 172: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break; + case 173: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break; + case 174: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break; + case 175: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break; + case 176: R = (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 177: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 178: R = (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 179: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 180: R = A ^ (B & ~C); break; + case 181: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 182: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 183: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 184: R = (A ^ (B & (C ^ A))); break; + case 185: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 186: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 187: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 188: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 189: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 190: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 191: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break; + case 192: R = (A & B & ~C) | (A & B & C); break; + case 193: R = (~A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 194: R = (~A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 195: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 196: R = (~A & B & ~C) | (A & B & ~C) | (A & B & C); break; + case 197: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & B & ~C) | (A & B & C); break; + case 198: R = (~A & ~B & C) | (~A & B & ~C) | (A & B & ~C) | (A & B & C); break; + case 199: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & B & ~C) | (A & B & C); break; + case 200: R = (~A & B & C) | (A & B & ~C) | (A & B & C); break; + case 201: R = (~A & ~B & ~C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break; + case 202: R = (~A & ~B & C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break; + case 203: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break; + case 204: R = (~A & B & ~C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break; + case 205: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break; + case 206: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break; + case 207: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break; + case 208: R = (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 209: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 210: R = A ^ (~B & C); break; + case 211: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 212: R = (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 213: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 214: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 215: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 216: R = (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 217: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 218: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 219: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 220: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 221: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 222: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 223: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break; + case 224: R = (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 225: R = (~A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 226: R = (~A & ~B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 227: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 228: R = (~A & B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 229: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 230: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 231: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 232: R = ((A & (B | C)) | (B & C)); break; + case 233: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 234: R = (A & B) | C; break; + case 235: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 236: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 237: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 238: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 239: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 240: R = (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 241: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 242: R = (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 243: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 244: R = (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 245: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 246: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 247: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 248: R = (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 249: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 250: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 251: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 252: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 253: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break; + case 254: R = A | B | C; break; + case 255: R = uint32_t(-1); break; + default: break; + } +} + +void asm_ternary_logic_op(uint32_t &R, uint32_t A, uint32_t B, uint32_t C, uint32_t D) { + switch (D) { + case 0: R = syclcompat::ternary_logic_op(A, B, C, 0x0); break; + case 1: R = syclcompat::ternary_logic_op(A, B, C, 0x1); break; + case 2: R = syclcompat::ternary_logic_op(A, B, C, 0x2); break; + case 3: R = syclcompat::ternary_logic_op(A, B, C, 0x3); break; + case 4: R = syclcompat::ternary_logic_op(A, B, C, 0x4); break; + case 5: R = syclcompat::ternary_logic_op(A, B, C, 0x5); break; + case 6: R = syclcompat::ternary_logic_op(A, B, C, 0x6); break; + case 7: R = syclcompat::ternary_logic_op(A, B, C, 0x7); break; + case 8: R = syclcompat::ternary_logic_op(A, B, C, 0x8); break; + case 9: R = syclcompat::ternary_logic_op(A, B, C, 0x9); break; + case 10: R = syclcompat::ternary_logic_op(A, B, C, 0xA); break; + case 11: R = syclcompat::ternary_logic_op(A, B, C, 0xB); break; + case 12: R = syclcompat::ternary_logic_op(A, B, C, 0xC); break; + case 13: R = syclcompat::ternary_logic_op(A, B, C, 0xD); break; + case 14: R = syclcompat::ternary_logic_op(A, B, C, 0xE); break; + case 15: R = syclcompat::ternary_logic_op(A, B, C, 0xF); break; + case 16: R = syclcompat::ternary_logic_op(A, B, C, 0x10); break; + case 17: R = syclcompat::ternary_logic_op(A, B, C, 0x11); break; + case 18: R = syclcompat::ternary_logic_op(A, B, C, 0x12); break; + case 19: R = syclcompat::ternary_logic_op(A, B, C, 0x13); break; + case 20: R = syclcompat::ternary_logic_op(A, B, C, 0x14); break; + case 21: R = syclcompat::ternary_logic_op(A, B, C, 0x15); break; + case 22: R = syclcompat::ternary_logic_op(A, B, C, 0x16); break; + case 23: R = syclcompat::ternary_logic_op(A, B, C, 0x17); break; + case 24: R = syclcompat::ternary_logic_op(A, B, C, 0x18); break; + case 25: R = syclcompat::ternary_logic_op(A, B, C, 0x19); break; + case 26: R = syclcompat::ternary_logic_op(A, B, C, 0x1A); break; + case 27: R = syclcompat::ternary_logic_op(A, B, C, 0x1B); break; + case 28: R = syclcompat::ternary_logic_op(A, B, C, 0x1C); break; + case 29: R = syclcompat::ternary_logic_op(A, B, C, 0x1D); break; + case 30: R = syclcompat::ternary_logic_op(A, B, C, 0x1E); break; + case 31: R = syclcompat::ternary_logic_op(A, B, C, 0x1F); break; + case 32: R = syclcompat::ternary_logic_op(A, B, C, 0x20); break; + case 33: R = syclcompat::ternary_logic_op(A, B, C, 0x21); break; + case 34: R = syclcompat::ternary_logic_op(A, B, C, 0x22); break; + case 35: R = syclcompat::ternary_logic_op(A, B, C, 0x23); break; + case 36: R = syclcompat::ternary_logic_op(A, B, C, 0x24); break; + case 37: R = syclcompat::ternary_logic_op(A, B, C, 0x25); break; + case 38: R = syclcompat::ternary_logic_op(A, B, C, 0x26); break; + case 39: R = syclcompat::ternary_logic_op(A, B, C, 0x27); break; + case 40: R = syclcompat::ternary_logic_op(A, B, C, 0x28); break; + case 41: R = syclcompat::ternary_logic_op(A, B, C, 0x29); break; + case 42: R = syclcompat::ternary_logic_op(A, B, C, 0x2A); break; + case 43: R = syclcompat::ternary_logic_op(A, B, C, 0x2B); break; + case 44: R = syclcompat::ternary_logic_op(A, B, C, 0x2C); break; + case 45: R = syclcompat::ternary_logic_op(A, B, C, 0x2D); break; + case 46: R = syclcompat::ternary_logic_op(A, B, C, 0x2E); break; + case 47: R = syclcompat::ternary_logic_op(A, B, C, 0x2F); break; + case 48: R = syclcompat::ternary_logic_op(A, B, C, 0x30); break; + case 49: R = syclcompat::ternary_logic_op(A, B, C, 0x31); break; + case 50: R = syclcompat::ternary_logic_op(A, B, C, 0x32); break; + case 51: R = syclcompat::ternary_logic_op(A, B, C, 0x33); break; + case 52: R = syclcompat::ternary_logic_op(A, B, C, 0x34); break; + case 53: R = syclcompat::ternary_logic_op(A, B, C, 0x35); break; + case 54: R = syclcompat::ternary_logic_op(A, B, C, 0x36); break; + case 55: R = syclcompat::ternary_logic_op(A, B, C, 0x37); break; + case 56: R = syclcompat::ternary_logic_op(A, B, C, 0x38); break; + case 57: R = syclcompat::ternary_logic_op(A, B, C, 0x39); break; + case 58: R = syclcompat::ternary_logic_op(A, B, C, 0x3A); break; + case 59: R = syclcompat::ternary_logic_op(A, B, C, 0x3B); break; + case 60: R = syclcompat::ternary_logic_op(A, B, C, 0x3C); break; + case 61: R = syclcompat::ternary_logic_op(A, B, C, 0x3D); break; + case 62: R = syclcompat::ternary_logic_op(A, B, C, 0x3E); break; + case 63: R = syclcompat::ternary_logic_op(A, B, C, 0x3F); break; + case 64: R = syclcompat::ternary_logic_op(A, B, C, 0x40); break; + case 65: R = syclcompat::ternary_logic_op(A, B, C, 0x41); break; + case 66: R = syclcompat::ternary_logic_op(A, B, C, 0x42); break; + case 67: R = syclcompat::ternary_logic_op(A, B, C, 0x43); break; + case 68: R = syclcompat::ternary_logic_op(A, B, C, 0x44); break; + case 69: R = syclcompat::ternary_logic_op(A, B, C, 0x45); break; + case 70: R = syclcompat::ternary_logic_op(A, B, C, 0x46); break; + case 71: R = syclcompat::ternary_logic_op(A, B, C, 0x47); break; + case 72: R = syclcompat::ternary_logic_op(A, B, C, 0x48); break; + case 73: R = syclcompat::ternary_logic_op(A, B, C, 0x49); break; + case 74: R = syclcompat::ternary_logic_op(A, B, C, 0x4A); break; + case 75: R = syclcompat::ternary_logic_op(A, B, C, 0x4B); break; + case 76: R = syclcompat::ternary_logic_op(A, B, C, 0x4C); break; + case 77: R = syclcompat::ternary_logic_op(A, B, C, 0x4D); break; + case 78: R = syclcompat::ternary_logic_op(A, B, C, 0x4E); break; + case 79: R = syclcompat::ternary_logic_op(A, B, C, 0x4F); break; + case 80: R = syclcompat::ternary_logic_op(A, B, C, 0x50); break; + case 81: R = syclcompat::ternary_logic_op(A, B, C, 0x51); break; + case 82: R = syclcompat::ternary_logic_op(A, B, C, 0x52); break; + case 83: R = syclcompat::ternary_logic_op(A, B, C, 0x53); break; + case 84: R = syclcompat::ternary_logic_op(A, B, C, 0x54); break; + case 85: R = syclcompat::ternary_logic_op(A, B, C, 0x55); break; + case 86: R = syclcompat::ternary_logic_op(A, B, C, 0x56); break; + case 87: R = syclcompat::ternary_logic_op(A, B, C, 0x57); break; + case 88: R = syclcompat::ternary_logic_op(A, B, C, 0x58); break; + case 89: R = syclcompat::ternary_logic_op(A, B, C, 0x59); break; + case 90: R = syclcompat::ternary_logic_op(A, B, C, 0x5A); break; + case 91: R = syclcompat::ternary_logic_op(A, B, C, 0x5B); break; + case 92: R = syclcompat::ternary_logic_op(A, B, C, 0x5C); break; + case 93: R = syclcompat::ternary_logic_op(A, B, C, 0x5D); break; + case 94: R = syclcompat::ternary_logic_op(A, B, C, 0x5E); break; + case 95: R = syclcompat::ternary_logic_op(A, B, C, 0x5F); break; + case 96: R = syclcompat::ternary_logic_op(A, B, C, 0x60); break; + case 97: R = syclcompat::ternary_logic_op(A, B, C, 0x61); break; + case 98: R = syclcompat::ternary_logic_op(A, B, C, 0x62); break; + case 99: R = syclcompat::ternary_logic_op(A, B, C, 0x63); break; + case 100: R = syclcompat::ternary_logic_op(A, B, C, 0x64); break; + case 101: R = syclcompat::ternary_logic_op(A, B, C, 0x65); break; + case 102: R = syclcompat::ternary_logic_op(A, B, C, 0x66); break; + case 103: R = syclcompat::ternary_logic_op(A, B, C, 0x67); break; + case 104: R = syclcompat::ternary_logic_op(A, B, C, 0x68); break; + case 105: R = syclcompat::ternary_logic_op(A, B, C, 0x69); break; + case 106: R = syclcompat::ternary_logic_op(A, B, C, 0x6A); break; + case 107: R = syclcompat::ternary_logic_op(A, B, C, 0x6B); break; + case 108: R = syclcompat::ternary_logic_op(A, B, C, 0x6C); break; + case 109: R = syclcompat::ternary_logic_op(A, B, C, 0x6D); break; + case 110: R = syclcompat::ternary_logic_op(A, B, C, 0x6E); break; + case 111: R = syclcompat::ternary_logic_op(A, B, C, 0x6F); break; + case 112: R = syclcompat::ternary_logic_op(A, B, C, 0x70); break; + case 113: R = syclcompat::ternary_logic_op(A, B, C, 0x71); break; + case 114: R = syclcompat::ternary_logic_op(A, B, C, 0x72); break; + case 115: R = syclcompat::ternary_logic_op(A, B, C, 0x73); break; + case 116: R = syclcompat::ternary_logic_op(A, B, C, 0x74); break; + case 117: R = syclcompat::ternary_logic_op(A, B, C, 0x75); break; + case 118: R = syclcompat::ternary_logic_op(A, B, C, 0x76); break; + case 119: R = syclcompat::ternary_logic_op(A, B, C, 0x77); break; + case 120: R = syclcompat::ternary_logic_op(A, B, C, 0x78); break; + case 121: R = syclcompat::ternary_logic_op(A, B, C, 0x79); break; + case 122: R = syclcompat::ternary_logic_op(A, B, C, 0x7A); break; + case 123: R = syclcompat::ternary_logic_op(A, B, C, 0x7B); break; + case 124: R = syclcompat::ternary_logic_op(A, B, C, 0x7C); break; + case 125: R = syclcompat::ternary_logic_op(A, B, C, 0x7D); break; + case 126: R = syclcompat::ternary_logic_op(A, B, C, 0x7E); break; + case 127: R = syclcompat::ternary_logic_op(A, B, C, 0x7F); break; + case 128: R = syclcompat::ternary_logic_op(A, B, C, 0x80); break; + case 129: R = syclcompat::ternary_logic_op(A, B, C, 0x81); break; + case 130: R = syclcompat::ternary_logic_op(A, B, C, 0x82); break; + case 131: R = syclcompat::ternary_logic_op(A, B, C, 0x83); break; + case 132: R = syclcompat::ternary_logic_op(A, B, C, 0x84); break; + case 133: R = syclcompat::ternary_logic_op(A, B, C, 0x85); break; + case 134: R = syclcompat::ternary_logic_op(A, B, C, 0x86); break; + case 135: R = syclcompat::ternary_logic_op(A, B, C, 0x87); break; + case 136: R = syclcompat::ternary_logic_op(A, B, C, 0x88); break; + case 137: R = syclcompat::ternary_logic_op(A, B, C, 0x89); break; + case 138: R = syclcompat::ternary_logic_op(A, B, C, 0x8A); break; + case 139: R = syclcompat::ternary_logic_op(A, B, C, 0x8B); break; + case 140: R = syclcompat::ternary_logic_op(A, B, C, 0x8C); break; + case 141: R = syclcompat::ternary_logic_op(A, B, C, 0x8D); break; + case 142: R = syclcompat::ternary_logic_op(A, B, C, 0x8E); break; + case 143: R = syclcompat::ternary_logic_op(A, B, C, 0x8F); break; + case 144: R = syclcompat::ternary_logic_op(A, B, C, 0x90); break; + case 145: R = syclcompat::ternary_logic_op(A, B, C, 0x91); break; + case 146: R = syclcompat::ternary_logic_op(A, B, C, 0x92); break; + case 147: R = syclcompat::ternary_logic_op(A, B, C, 0x93); break; + case 148: R = syclcompat::ternary_logic_op(A, B, C, 0x94); break; + case 149: R = syclcompat::ternary_logic_op(A, B, C, 0x95); break; + case 150: R = syclcompat::ternary_logic_op(A, B, C, 0x96); break; + case 151: R = syclcompat::ternary_logic_op(A, B, C, 0x97); break; + case 152: R = syclcompat::ternary_logic_op(A, B, C, 0x98); break; + case 153: R = syclcompat::ternary_logic_op(A, B, C, 0x99); break; + case 154: R = syclcompat::ternary_logic_op(A, B, C, 0x9A); break; + case 155: R = syclcompat::ternary_logic_op(A, B, C, 0x9B); break; + case 156: R = syclcompat::ternary_logic_op(A, B, C, 0x9C); break; + case 157: R = syclcompat::ternary_logic_op(A, B, C, 0x9D); break; + case 158: R = syclcompat::ternary_logic_op(A, B, C, 0x9E); break; + case 159: R = syclcompat::ternary_logic_op(A, B, C, 0x9F); break; + case 160: R = syclcompat::ternary_logic_op(A, B, C, 0xA0); break; + case 161: R = syclcompat::ternary_logic_op(A, B, C, 0xA1); break; + case 162: R = syclcompat::ternary_logic_op(A, B, C, 0xA2); break; + case 163: R = syclcompat::ternary_logic_op(A, B, C, 0xA3); break; + case 164: R = syclcompat::ternary_logic_op(A, B, C, 0xA4); break; + case 165: R = syclcompat::ternary_logic_op(A, B, C, 0xA5); break; + case 166: R = syclcompat::ternary_logic_op(A, B, C, 0xA6); break; + case 167: R = syclcompat::ternary_logic_op(A, B, C, 0xA7); break; + case 168: R = syclcompat::ternary_logic_op(A, B, C, 0xA8); break; + case 169: R = syclcompat::ternary_logic_op(A, B, C, 0xA9); break; + case 170: R = syclcompat::ternary_logic_op(A, B, C, 0xAA); break; + case 171: R = syclcompat::ternary_logic_op(A, B, C, 0xAB); break; + case 172: R = syclcompat::ternary_logic_op(A, B, C, 0xAC); break; + case 173: R = syclcompat::ternary_logic_op(A, B, C, 0xAD); break; + case 174: R = syclcompat::ternary_logic_op(A, B, C, 0xAE); break; + case 175: R = syclcompat::ternary_logic_op(A, B, C, 0xAF); break; + case 176: R = syclcompat::ternary_logic_op(A, B, C, 0xB0); break; + case 177: R = syclcompat::ternary_logic_op(A, B, C, 0xB1); break; + case 178: R = syclcompat::ternary_logic_op(A, B, C, 0xB2); break; + case 179: R = syclcompat::ternary_logic_op(A, B, C, 0xB3); break; + case 180: R = syclcompat::ternary_logic_op(A, B, C, 0xB4); break; + case 181: R = syclcompat::ternary_logic_op(A, B, C, 0xB5); break; + case 182: R = syclcompat::ternary_logic_op(A, B, C, 0xB6); break; + case 183: R = syclcompat::ternary_logic_op(A, B, C, 0xB7); break; + case 184: R = syclcompat::ternary_logic_op(A, B, C, 0xB8); break; + case 185: R = syclcompat::ternary_logic_op(A, B, C, 0xB9); break; + case 186: R = syclcompat::ternary_logic_op(A, B, C, 0xBA); break; + case 187: R = syclcompat::ternary_logic_op(A, B, C, 0xBB); break; + case 188: R = syclcompat::ternary_logic_op(A, B, C, 0xBC); break; + case 189: R = syclcompat::ternary_logic_op(A, B, C, 0xBD); break; + case 190: R = syclcompat::ternary_logic_op(A, B, C, 0xBE); break; + case 191: R = syclcompat::ternary_logic_op(A, B, C, 0xBF); break; + case 192: R = syclcompat::ternary_logic_op(A, B, C, 0xC0); break; + case 193: R = syclcompat::ternary_logic_op(A, B, C, 0xC1); break; + case 194: R = syclcompat::ternary_logic_op(A, B, C, 0xC2); break; + case 195: R = syclcompat::ternary_logic_op(A, B, C, 0xC3); break; + case 196: R = syclcompat::ternary_logic_op(A, B, C, 0xC4); break; + case 197: R = syclcompat::ternary_logic_op(A, B, C, 0xC5); break; + case 198: R = syclcompat::ternary_logic_op(A, B, C, 0xC6); break; + case 199: R = syclcompat::ternary_logic_op(A, B, C, 0xC7); break; + case 200: R = syclcompat::ternary_logic_op(A, B, C, 0xC8); break; + case 201: R = syclcompat::ternary_logic_op(A, B, C, 0xC9); break; + case 202: R = syclcompat::ternary_logic_op(A, B, C, 0xCA); break; + case 203: R = syclcompat::ternary_logic_op(A, B, C, 0xCB); break; + case 204: R = syclcompat::ternary_logic_op(A, B, C, 0xCC); break; + case 205: R = syclcompat::ternary_logic_op(A, B, C, 0xCD); break; + case 206: R = syclcompat::ternary_logic_op(A, B, C, 0xCE); break; + case 207: R = syclcompat::ternary_logic_op(A, B, C, 0xCF); break; + case 208: R = syclcompat::ternary_logic_op(A, B, C, 0xD0); break; + case 209: R = syclcompat::ternary_logic_op(A, B, C, 0xD1); break; + case 210: R = syclcompat::ternary_logic_op(A, B, C, 0xD2); break; + case 211: R = syclcompat::ternary_logic_op(A, B, C, 0xD3); break; + case 212: R = syclcompat::ternary_logic_op(A, B, C, 0xD4); break; + case 213: R = syclcompat::ternary_logic_op(A, B, C, 0xD5); break; + case 214: R = syclcompat::ternary_logic_op(A, B, C, 0xD6); break; + case 215: R = syclcompat::ternary_logic_op(A, B, C, 0xD7); break; + case 216: R = syclcompat::ternary_logic_op(A, B, C, 0xD8); break; + case 217: R = syclcompat::ternary_logic_op(A, B, C, 0xD9); break; + case 218: R = syclcompat::ternary_logic_op(A, B, C, 0xDA); break; + case 219: R = syclcompat::ternary_logic_op(A, B, C, 0xDB); break; + case 220: R = syclcompat::ternary_logic_op(A, B, C, 0xDC); break; + case 221: R = syclcompat::ternary_logic_op(A, B, C, 0xDD); break; + case 222: R = syclcompat::ternary_logic_op(A, B, C, 0xDE); break; + case 223: R = syclcompat::ternary_logic_op(A, B, C, 0xDF); break; + case 224: R = syclcompat::ternary_logic_op(A, B, C, 0xE0); break; + case 225: R = syclcompat::ternary_logic_op(A, B, C, 0xE1); break; + case 226: R = syclcompat::ternary_logic_op(A, B, C, 0xE2); break; + case 227: R = syclcompat::ternary_logic_op(A, B, C, 0xE3); break; + case 228: R = syclcompat::ternary_logic_op(A, B, C, 0xE4); break; + case 229: R = syclcompat::ternary_logic_op(A, B, C, 0xE5); break; + case 230: R = syclcompat::ternary_logic_op(A, B, C, 0xE6); break; + case 231: R = syclcompat::ternary_logic_op(A, B, C, 0xE7); break; + case 232: R = syclcompat::ternary_logic_op(A, B, C, 0xE8); break; + case 233: R = syclcompat::ternary_logic_op(A, B, C, 0xE9); break; + case 234: R = syclcompat::ternary_logic_op(A, B, C, 0xEA); break; + case 235: R = syclcompat::ternary_logic_op(A, B, C, 0xEB); break; + case 236: R = syclcompat::ternary_logic_op(A, B, C, 0xEC); break; + case 237: R = syclcompat::ternary_logic_op(A, B, C, 0xED); break; + case 238: R = syclcompat::ternary_logic_op(A, B, C, 0xEE); break; + case 239: R = syclcompat::ternary_logic_op(A, B, C, 0xEF); break; + case 240: R = syclcompat::ternary_logic_op(A, B, C, 0xF0); break; + case 241: R = syclcompat::ternary_logic_op(A, B, C, 0xF1); break; + case 242: R = syclcompat::ternary_logic_op(A, B, C, 0xF2); break; + case 243: R = syclcompat::ternary_logic_op(A, B, C, 0xF3); break; + case 244: R = syclcompat::ternary_logic_op(A, B, C, 0xF4); break; + case 245: R = syclcompat::ternary_logic_op(A, B, C, 0xF5); break; + case 246: R = syclcompat::ternary_logic_op(A, B, C, 0xF6); break; + case 247: R = syclcompat::ternary_logic_op(A, B, C, 0xF7); break; + case 248: R = syclcompat::ternary_logic_op(A, B, C, 0xF8); break; + case 249: R = syclcompat::ternary_logic_op(A, B, C, 0xF9); break; + case 250: R = syclcompat::ternary_logic_op(A, B, C, 0xFA); break; + case 251: R = syclcompat::ternary_logic_op(A, B, C, 0xFB); break; + case 252: R = syclcompat::ternary_logic_op(A, B, C, 0xFC); break; + case 253: R = syclcompat::ternary_logic_op(A, B, C, 0xFD); break; + case 254: R = syclcompat::ternary_logic_op(A, B, C, 0xFE); break; + case 255: R = syclcompat::ternary_logic_op(A, B, C, 0xFF); break; + } +} + +// clang-format on + +void ternary_logic_op(int *ec) { + uint32_t X, Y, A = 1, B = 2, C = 3, D; + for (D = 0; D < 256; ++D) { + reference_of_ternary_logic_op(X, A, B, C, D); + asm_ternary_logic_op(Y, A, B, C, D); + if (X != Y) { + *ec = D; + return; + } + } + *ec = 0; +} + +int main() { + syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device(); + sycl::queue &q_ct1 = *dev_ct1.default_queue(); + int ret = 0; + int *d_ec = nullptr; + d_ec = sycl::malloc_device(1, q_ct1); + + auto wait_and_check = [&](const char *case_name) { + syclcompat::get_current_device().queues_wait_and_throw(); + int ec = 0; + syclcompat::get_default_queue().memcpy(&ec, d_ec, sizeof(int)).wait(); + if (ec != 0) + printf("Test %s failed: return code = %d\n", case_name, ec); + ret = ret || ec; + }; + + q_ct1.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { ternary_logic_op(d_ec); }); + wait_and_check("ternary_logic_op"); + + syclcompat::wait_and_free(d_ec, q_ct1); + + return ret; +} From b56f941d02f1951c1f028b7d0e764bdd6ef14ed2 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Mon, 20 Jan 2025 09:16:20 +0000 Subject: [PATCH 12/30] [SYCL] [Graph] Add E2E tests for Graphs using sycl_ext_oneapi_work_group_static extension (#16644) Two e2e tests were added: - `Inputs/work_group_static_memory.cpp` checks if using the extension in a static graph works, - `Update/work_group_static_memory_with_dyn_cgf_and_dyn_params.cpp` tests the work_group extension against graph with dynamic CGF which at the same time, uses one dynamic parameter --- .../Explicit/work_group_static_memory.cpp | 14 +++ .../Graph/Inputs/work_group_static_memory.cpp | 50 ++++++++++ .../RecordReplay/work_group_static_memory.cpp | 14 +++ ...tic_memory_with_dyn_cgf_and_dyn_params.cpp | 98 +++++++++++++++++++ 4 files changed, 176 insertions(+) create mode 100644 sycl/test-e2e/Graph/Explicit/work_group_static_memory.cpp create mode 100644 sycl/test-e2e/Graph/Inputs/work_group_static_memory.cpp create mode 100644 sycl/test-e2e/Graph/RecordReplay/work_group_static_memory.cpp create mode 100644 sycl/test-e2e/Graph/Update/work_group_static_memory_with_dyn_cgf_and_dyn_params.cpp diff --git a/sycl/test-e2e/Graph/Explicit/work_group_static_memory.cpp b/sycl/test-e2e/Graph/Explicit/work_group_static_memory.cpp new file mode 100644 index 0000000000000..c3d9d4f950dd8 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/work_group_static_memory.cpp @@ -0,0 +1,14 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: hip +// UNSUPPORTED-INTENDED: sycl_ext_oneapi_work_group_static is not supported on +// AMD + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/work_group_static_memory.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/work_group_static_memory.cpp b/sycl/test-e2e/Graph/Inputs/work_group_static_memory.cpp new file mode 100644 index 0000000000000..9ccd9f5a21deb --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/work_group_static_memory.cpp @@ -0,0 +1,50 @@ +// Tests using sycl_ext_oneapi_work_group_static in a graph node + +#include "../graph_common.hpp" +#include + +constexpr size_t WgSize = 32; + +// Local mem used in kernel +sycl::ext::oneapi::experimental::work_group_static LocalIDBuff; + +int main() { + queue Queue; + exp_ext::command_graph Graph{Queue}; + + std::vector HostData(Size, 0); + + int *Ptr = malloc_device(Size, Queue); + Queue.copy(HostData.data(), Ptr, Size).wait(); + + auto node = add_node(Graph, Queue, [&](handler &CGH) { + CGH.parallel_for(nd_range({Size}, {WgSize}), [=](nd_item<1> Item) { + LocalIDBuff[Item.get_local_linear_id()] = Item.get_local_linear_id(); + + Item.barrier(); + + // Check that the memory is accessible from other work-items + size_t LocalIdx = Item.get_local_linear_id() ^ 1; + size_t GlobalIdx = Item.get_global_linear_id() ^ 1; + Ptr[GlobalIdx] = LocalIDBuff[LocalIdx]; + }); + }); + + auto GraphExec = Graph.finalize(); + + for (unsigned N = 0; N < Iterations; N++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + Queue.wait_and_throw(); + + Queue.copy(Ptr, HostData.data(), Size); + Queue.wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + int Ref = i % WgSize; + assert(check_value(i, Ref, HostData[i], "Ptr")); + } + + free(Ptr, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/work_group_static_memory.cpp b/sycl/test-e2e/Graph/RecordReplay/work_group_static_memory.cpp new file mode 100644 index 0000000000000..98d4f045df50b --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/work_group_static_memory.cpp @@ -0,0 +1,14 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: hip +// UNSUPPORTED-INTENDED: sycl_ext_oneapi_work_group_static is not supported on +// AMD + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/work_group_static_memory.cpp" diff --git a/sycl/test-e2e/Graph/Update/work_group_static_memory_with_dyn_cgf_and_dyn_params.cpp b/sycl/test-e2e/Graph/Update/work_group_static_memory_with_dyn_cgf_and_dyn_params.cpp new file mode 100644 index 0000000000000..0d4f935d4c479 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/work_group_static_memory_with_dyn_cgf_and_dyn_params.cpp @@ -0,0 +1,98 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: hip +// UNSUPPORTED-INTENDED: sycl_ext_oneapi_work_group_static is not supported on +// AMD + +// Tests using sycl_ext_oneapi_work_group_static in a graph node with dynamic +// cgf and dynamic parameter + +#include "../graph_common.hpp" +#include + +constexpr size_t WgSize = 32; + +// Local mem used in kernel +sycl::ext::oneapi::experimental::work_group_static LocalIDBuff; + +int main() { + queue Queue; + exp_ext::command_graph Graph{Queue}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + + exp_ext::dynamic_parameter DynParam(Graph, PtrA); + + auto CGFA = [&](handler &CGH) { + CGH.set_arg(0, DynParam); + CGH.parallel_for(nd_range({Size}, {WgSize}), [=](nd_item<1> Item) { + LocalIDBuff[Item.get_local_linear_id()] = Item.get_local_linear_id(); + + Item.barrier(); + + // Check that the memory is accessible from other work-items + size_t LocalIdx = Item.get_local_linear_id() ^ 1; + size_t GlobalIdx = Item.get_global_linear_id() ^ 1; + PtrA[GlobalIdx] = LocalIDBuff[LocalIdx]; + }); + }; + + auto CGFB = [&](handler &CGH) { + CGH.set_arg(0, DynParam); + CGH.parallel_for(nd_range({Size}, {WgSize}), [=](nd_item<1> Item) { + LocalIDBuff[Item.get_local_linear_id()] = Item.get_local_linear_id(); + + Item.barrier(); + + // Check that the memory is accessible from other work-items + size_t LocalIdx = Item.get_local_linear_id() ^ 1; + size_t GlobalIdx = Item.get_global_linear_id() ^ 1; + PtrA[GlobalIdx] = LocalIDBuff[LocalIdx] - 1; + }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + auto ExecuteGraphAndVerifyResults = [&](bool A, bool B, bool nextCGF) { + Queue.memset(PtrA, 0, Size * sizeof(int)); + Queue.memset(PtrB, 0, Size * sizeof(int)); + Queue.wait(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size); + Queue.copy(PtrB, HostDataB.data(), Size); + Queue.wait(); + + for (size_t i = 0; i < Size; i++) { + int Ref = nextCGF ? (i % WgSize) - 1 : i % WgSize; + assert(HostDataA[i] == (A ? Ref : 0)); + assert(HostDataB[i] == (B ? Ref : 0)); + } + }; + + ExecuteGraphAndVerifyResults(true, false, false); + + DynParam.update(PtrB); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, true, false); + + DynamicCG.set_active_index(1); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, true, true); + + free(PtrA, Queue); + free(PtrB, Queue); + return 0; +} From 8998b9b54f85095d10541c50a16492600a72f696 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Mon, 20 Jan 2025 10:45:50 +0000 Subject: [PATCH 13/30] [UR] Unified clang format (#16672) https://github.com/oneapi-src/unified-runtime/pull/1536 Align the UR clang-format version with intel/llvm. --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 9c18fb006a799..c643d1708353e 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 222e4b1d51536bb38e03e2000a79679af0a44a6d -# Merge: 30d183a0 28108a7e +# commit 029a977bc76d1216783c69bfdb18d0db465ea399 +# Merge: 222e4b1d 041179a3 # Author: Kenneth Benzie (Benie) -# Date: Fri Jan 17 10:28:34 2025 +0000 -# Merge pull request #2561 from Bensuo/ben/cmd-buffer-l0-fence -# [L0][CMDBUF] Optimize fence/event waits during update -set(UNIFIED_RUNTIME_TAG 222e4b1d51536bb38e03e2000a79679af0a44a6d) +# Date: Fri Jan 17 16:36:13 2025 +0000 +# Merge pull request #1536 from ldrumm/unified-clang-format +# Unified clang format +set(UNIFIED_RUNTIME_TAG 029a977bc76d1216783c69bfdb18d0db465ea399) From 004d6d905bd8dbee7dadb9280b2959002405334c Mon Sep 17 00:00:00 2001 From: uwedolinsky Date: Mon, 20 Jan 2025 11:56:12 +0000 Subject: [PATCH 14/30] [SYCL][NATIVECPU] added __spir cast builtins to NativeCPU (#16676) Added more support for __spir cast builtins. Needed for e2e etc --- libdevice/nativecpu_utils.cpp | 28 +++++++++------ .../native_cpu/device_builtins.cpp | 36 +++++++++++++++++++ 2 files changed, 54 insertions(+), 10 deletions(-) create mode 100644 sycl/test/check_device_code/native_cpu/device_builtins.cpp diff --git a/libdevice/nativecpu_utils.cpp b/libdevice/nativecpu_utils.cpp index c3e8bb61657a7..eb5c3ff2ebdf4 100644 --- a/libdevice/nativecpu_utils.cpp +++ b/libdevice/nativecpu_utils.cpp @@ -31,16 +31,7 @@ using __nativecpu_state = native_cpu::state; #define OCL_LOCAL __attribute__((opencl_local)) #define OCL_GLOBAL __attribute__((opencl_global)) - -DEVICE_EXTERNAL OCL_LOCAL void * -__spirv_GenericCastToPtrExplicit_ToLocal(void *p, int) { - return (OCL_LOCAL void *)p; -} - -DEVICE_EXTERNAL OCL_GLOBAL void * -__spirv_GenericCastToPtrExplicit_ToGlobal(void *p, int) { - return (OCL_GLOBAL void *)p; -} +#define OCL_PRIVATE __attribute__((opencl_private)) DEVICE_EXTERN_C void __mux_work_group_barrier(uint32_t id, uint32_t scope, uint32_t semantics); @@ -61,6 +52,23 @@ __spirv_MemoryBarrier(uint32_t Memory, uint32_t Semantics) { // Turning clang format off here because it reorders macro invocations // making the following code very difficult to read. // clang-format off + +#define DefGenericCastToPtrExplImpl(sfx, asp, cv)\ +DEVICE_EXTERNAL cv asp void *\ +__spirv_GenericCastToPtrExplicit_##sfx(cv void *p ,int) {\ + return (cv asp void *)p;\ +} + +#define DefGenericCastToPtrExpl(sfx, asp)\ + DefGenericCastToPtrExplImpl(sfx, asp, )\ + DefGenericCastToPtrExplImpl(sfx, asp, const)\ + DefGenericCastToPtrExplImpl(sfx, asp, volatile)\ + DefGenericCastToPtrExplImpl(sfx, asp, const volatile) + +DefGenericCastToPtrExpl(ToPrivate, OCL_PRIVATE) +DefGenericCastToPtrExpl(ToLocal, OCL_LOCAL) +DefGenericCastToPtrExpl(ToGlobal, OCL_GLOBAL) + #define DefSubgroupBlockINTEL1(Type, PType) \ template <> \ __SYCL_CONVERGENT__ DEVICE_EXTERNAL Type \ diff --git a/sycl/test/check_device_code/native_cpu/device_builtins.cpp b/sycl/test/check_device_code/native_cpu/device_builtins.cpp new file mode 100644 index 0000000000000..cd2a392ea4b5e --- /dev/null +++ b/sycl/test/check_device_code/native_cpu/device_builtins.cpp @@ -0,0 +1,36 @@ +// REQUIRES: native_cpu_ock +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -O0 -mllvm -sycl-native-cpu-no-vecz -mllvm -sycl-native-dump-device-ir %s | FileCheck %s + +// RUN: %clangxx -fsycl-device-only -fsycl-targets=native_cpu -fno-inline -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-DEV + +// check that builtins are defined + +// CHECK-NOT: {{.*}}__spirv_GenericCastToPtrExplicit +// CHECK-DEV: {{.*}}__spirv_GenericCastToPtrExplicit + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental; + +#define DefTestCast(FName, Space, PType) \ + SYCL_EXTERNAL auto FName(PType p) { return dynamic_address_cast(p); } + +// Turning clang format off here because it would change the indentations of +// the macro invocations making the following code difficult to read. +// clang-format off + +#define DefTestCastForSpace(PType)\ + DefTestCast(to_local, access::address_space::local_space, PType)\ + DefTestCast(to_global, access::address_space::global_space, PType)\ + DefTestCast(to_private, access::address_space::private_space, PType)\ + DefTestCast(to_generic, access::address_space::generic_space, PType) + +DefTestCastForSpace(int*) +DefTestCastForSpace(const int*) +DefTestCastForSpace(volatile int*) +DefTestCastForSpace(const volatile int*) + +int main(){} +// check that the generated module has the is-native-cpu module flag set +// CHECK: !{{[0-9]*}} = !{i32 1, !"is-native-cpu", i32 1} From e257292972289e08c913e3b12044b78c2632e5d2 Mon Sep 17 00:00:00 2001 From: Dmitry Sidorov Date: Mon, 20 Jan 2025 13:02:24 +0100 Subject: [PATCH 15/30] [SYCL][Matrix] Propagate constexpr matrix layout even with O0 (#16628) Per SPIR-V specification Layout of a matrix must be a constant instruction aka a constexpr or specialization constant. Meanwhile in SYCL headers layout is passed as a parameter to joint_matrix_load function, so even if that layout is a constant expression in the user's code - it's not possible to prove that to the compiler, so constant propagation will happen only after inlining, not in AST. That means, that with O0 layout would remain to be a runtime variable in LLVM IR. SYCL matrix layout is being mapped on SPIR-V matrix layout by joint_matrix_layout_to_spv function. This patch adds routine that finds calls to this function and replaces them with the found constant. To help this routine always_inline attribute was removed from joint_matrix_layout_to_spv function. --------- Signed-off-by: Sidorov, Dmitry --- .../SYCLLowerIR/SYCLJointMatrixTransform.cpp | 87 +++++++++++++++++- .../JointMatrixTransform/constexpr-layout.ll | 90 +++++++++++++++++++ .../oneapi/matrix/matrix-unified-utils.hpp | 4 +- 3 files changed, 178 insertions(+), 3 deletions(-) create mode 100644 llvm/test/SYCLLowerIR/JointMatrixTransform/constexpr-layout.ll diff --git a/llvm/lib/SYCLLowerIR/SYCLJointMatrixTransform.cpp b/llvm/lib/SYCLLowerIR/SYCLJointMatrixTransform.cpp index c5c03b2ae1c16..1a39c994b1ede 100644 --- a/llvm/lib/SYCLLowerIR/SYCLJointMatrixTransform.cpp +++ b/llvm/lib/SYCLLowerIR/SYCLJointMatrixTransform.cpp @@ -21,6 +21,7 @@ namespace { static constexpr char ACCESS_CHAIN[] = "_Z19__spirv_AccessChain"; static constexpr char MATRIX_TYPE[] = "spirv.CooperativeMatrixKHR"; +static constexpr char MATRIX_LAYOUT[] = "joint_matrix_layout_to_spv"; Type *getInnermostType(Type *Ty) { while (auto *ArrayTy = dyn_cast(Ty)) @@ -184,17 +185,99 @@ bool transformAccessChain(Function *F) { } return ModuleChanged; } + +StoreInst *findLastStoreBeforeLoad(Value *Ptr, Instruction *Load) { + BasicBlock::iterator It(Load); + while (It != Load->getParent()->begin()) { + --It; + if (auto *Store = dyn_cast(&*It)) + if (Store->getPointerOperand() == Ptr) + return Store; + } + return nullptr; +} + +// Per SPIR-V specification Layout of a matrix must be a constant instruction +// aka a constexpr or specialization constant. Meanwhile in SYCL headers +// layout is passed as a parameter to joint_matrix_load function, so even if +// that layout is a constant expression in the user's code - it's not possible +// to prove that to the compiler, so constant propagation will happen only +// after inlining, not in AST. That means, that with O0 layout would remain +// to be a runtime variable in LLVM IR. +// SYCL matrix layout is being mapped on SPIR-V matrix layout by +// joint_matrix_layout_to_spv function. The following routine finds calls to +// this function and replaces them with the found constant. +// This function also cleans up code, that becomes dead. Pattern of the dead +// code is stable, as user's code doesn't affect it. +bool propagateConstexprLayout(Function *F) { + llvm::SmallVector ToErase; + for (auto I = F->user_begin(), E = F->user_end(); I != E;) { + User *U = *I++; + auto *CI = dyn_cast(U); + if (!CI) + continue; + auto *Op = dyn_cast(CI->getArgOperand(0)); + if (!Op || !isa(Op)) + continue; + auto *Ptr = dyn_cast(cast(Op)->getPointerOperand()); + if (!Ptr) + continue; + + ConstantInt *ConstLayout = nullptr; + StoreInst *SI = findLastStoreBeforeLoad(Ptr, Op); + if (!SI) + continue; + ConstLayout = dyn_cast(SI->getValueOperand()); + if (ConstLayout) { + CI->replaceAllUsesWith(ConstLayout); + ToErase.push_back(CI); + ToErase.push_back(SI); + ToErase.push_back(Op); + ToErase.push_back(Ptr); + if (auto *Cast = dyn_cast(Ptr)) { + auto *OrigPtr = Cast->getPointerOperand(); + if (auto *AI = dyn_cast(OrigPtr)) + ToErase.push_back(AI); + } + } + } + + // There are possible cases, when a single instruction result is used multiple + // times. For this case we have to use a vector to store such instructions + // and keep track if we have removed them before to avoid double free(). + SmallPtrSet Erased; + for (Instruction *II : ToErase) { + if (!II->use_empty()) + continue; + if (Erased.contains(II)) + continue; + II->dropAllReferences(); + II->eraseFromParent(); + Erased.insert(II); + } + return !ToErase.empty(); +} } // namespace PreservedAnalyses SYCLJointMatrixTransformPass::run(Module &M, ModuleAnalysisManager &MAM) { bool ModuleChanged = false; + llvm::SmallVector ToErase; for (Function &F : M) { - if (!F.isDeclaration()) - continue; + if (!F.isDeclaration()) { + if (F.getName() == MATRIX_LAYOUT) { + ModuleChanged |= propagateConstexprLayout(&F); + ToErase.push_back(&F); + } else + continue; + } if (F.getName().starts_with(ACCESS_CHAIN)) ModuleChanged |= transformAccessChain(&F); } + for (auto *F : ToErase) + if (F->users().empty()) + F->eraseFromParent(); + return ModuleChanged ? PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvm/test/SYCLLowerIR/JointMatrixTransform/constexpr-layout.ll b/llvm/test/SYCLLowerIR/JointMatrixTransform/constexpr-layout.ll new file mode 100644 index 0000000000000..b2d352a809be9 --- /dev/null +++ b/llvm/test/SYCLLowerIR/JointMatrixTransform/constexpr-layout.ll @@ -0,0 +1,90 @@ +; The test checks, that users of the call to joint_matrix_layout_to_spv matrix +; are replaced with the layout constant. + +; RUN: opt -passes=sycl-joint-matrix-transform < %s -S | FileCheck %s + +; ModuleID = 'test.bc' +source_filename = "test.cpp" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spir64-unknown-unknown" + +$joint_matrix_layout_to_spv = comdat any + +; CHECK: define weak_odr dso_local spir_kernel void @test +; CHECK-NEXT: entry: +; CHECK-NEXT: %{{.*}} = call spir_func noundef target("spirv.CooperativeMatrixKHR", float, 3, 16, 16, 2) @_Z32__spirv_CooperativeMatrixLoadKHR{{.*}}(ptr addrspace(1){{.*}}, i32 noundef 0, i64 noundef{{.*}} +; CHECK-NEXT: %{{.*}} = call spir_func noundef target("spirv.CooperativeMatrixKHR", float, 3, 16, 16, 2) @_Z32__spirv_CooperativeMatrixLoadKHR{{.*}}(ptr addrspace(1){{.*}}, i32 noundef 1, i64 noundef{{.*}} +; CHECK-NEXT: %{{.*}} = call spir_func noundef target("spirv.CooperativeMatrixKHR", float, 3, 16, 16, 2) @_Z32__spirv_CooperativeMatrixLoadKHR{{.*}}(ptr addrspace(1){{.*}}, i32 noundef 1, i64 noundef{{.*}} +; CHECK-NEXT: %{{.*}} = call spir_func noundef target("spirv.CooperativeMatrixKHR", float, 3, 16, 16, 2) @_Z32__spirv_CooperativeMatrixLoadKHR{{.*}}(ptr addrspace(1){{.*}}, i32 noundef 2, i64 noundef{{.*}} +; CHECK-NEXT: ret void + +; CHECK-NOT: joint_matrix_layout_to_spv + +define weak_odr dso_local spir_kernel void @test(ptr addrspace(1) %matrix.1, ptr addrspace(1) %matrix.2, i64 noundef %stride) { +entry: + %layout.1 = alloca i32, align 4 + %layout.2 = alloca i32, align 4 + %layout.ascast.1 = addrspacecast ptr %layout.1 to ptr addrspace(4) + %layout.ascast.2 = addrspacecast ptr %layout.2 to ptr addrspace(4) + store i32 0, ptr addrspace(4) %layout.ascast.1, align 4 + store i32 1, ptr addrspace(4) %layout.ascast.2, align 4 + + %layout.val.1 = load i32, ptr addrspace(4) %layout.ascast.1, align 4 + %layout.spv.1 = call spir_func noundef i32 @joint_matrix_layout_to_spv(i32 noundef %layout.val.1) + %mload.1 = call spir_func noundef target("spirv.CooperativeMatrixKHR", float, 3, 16, 16, 2) @_Z32__spirv_CooperativeMatrixLoadKHRIU3AS1ffLm16ELm16ELN5__spv9MatrixUseE2ELNS1_12MatrixLayoutE3ELNS1_5Scope4FlagE3EEPNS1_28__spirv_CooperativeMatrixKHRIT0_XT5_EXT1_EXT2_EXT3_EEEPT_S3_mi(ptr addrspace(1) noundef %matrix.1, i32 noundef %layout.spv.1, i64 noundef %stride, i32 noundef 0) + + %layout.val.2 = load i32, ptr addrspace(4) %layout.ascast.2, align 4 + %layout.spv.2 = call spir_func noundef i32 @joint_matrix_layout_to_spv(i32 noundef %layout.val.2) + %mload.2 = call spir_func noundef target("spirv.CooperativeMatrixKHR", float, 3, 16, 16, 2) @_Z32__spirv_CooperativeMatrixLoadKHRIU3AS1ffLm16ELm16ELN5__spv9MatrixUseE2ELNS1_12MatrixLayoutE3ELNS1_5Scope4FlagE3EEPNS1_28__spirv_CooperativeMatrixKHRIT0_XT5_EXT1_EXT2_EXT3_EEEPT_S3_mi(ptr addrspace(1) noundef %matrix.2, i32 noundef %layout.spv.2, i64 noundef %stride, i32 noundef 0) + + %layout.spv.3 = call spir_func noundef i32 @joint_matrix_layout_to_spv(i32 noundef %layout.val.2) + %mload.3 = call spir_func noundef target("spirv.CooperativeMatrixKHR", float, 3, 16, 16, 2) @_Z32__spirv_CooperativeMatrixLoadKHRIU3AS1ffLm16ELm16ELN5__spv9MatrixUseE2ELNS1_12MatrixLayoutE3ELNS1_5Scope4FlagE3EEPNS1_28__spirv_CooperativeMatrixKHRIT0_XT5_EXT1_EXT2_EXT3_EEEPT_S3_mi(ptr addrspace(1) noundef %matrix.2, i32 noundef %layout.spv.3, i64 noundef %stride, i32 noundef 0) + + store i32 2, ptr addrspace(4) %layout.ascast.2, align 4 + %layout.val.4 = load i32, ptr addrspace(4) %layout.ascast.2, align 4 + %layout.spv.4 = call spir_func noundef i32 @joint_matrix_layout_to_spv(i32 noundef %layout.val.4) + %mload.4 = call spir_func noundef target("spirv.CooperativeMatrixKHR", float, 3, 16, 16, 2) @_Z32__spirv_CooperativeMatrixLoadKHRIU3AS1ffLm16ELm16ELN5__spv9MatrixUseE2ELNS1_12MatrixLayoutE3ELNS1_5Scope4FlagE3EEPNS1_28__spirv_CooperativeMatrixKHRIT0_XT5_EXT1_EXT2_EXT3_EEEPT_S3_mi(ptr addrspace(1) noundef %matrix.2, i32 noundef %layout.spv.4, i64 noundef %stride, i32 noundef 0) + ret void +} + +declare dso_local spir_func noundef target("spirv.CooperativeMatrixKHR", float, 3, 16, 16, 2) @_Z32__spirv_CooperativeMatrixLoadKHRIU3AS1ffLm16ELm16ELN5__spv9MatrixUseE2ELNS1_12MatrixLayoutE3ELNS1_5Scope4FlagE3EEPNS1_28__spirv_CooperativeMatrixKHRIT0_XT5_EXT1_EXT2_EXT3_EEEPT_S3_mi(ptr addrspace(1) noundef, i32 noundef, i64 noundef, i32 noundef) + +define linkonce_odr dso_local spir_func noundef i32 @joint_matrix_layout_to_spv(i32 noundef %Layout) comdat { +entry: + %retval = alloca i32, align 4 + %Layout.addr = alloca i32, align 4 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + %Layout.addr.ascast = addrspacecast ptr %Layout.addr to ptr addrspace(4) + store i32 %Layout, ptr addrspace(4) %Layout.addr.ascast, align 4 + %0 = load i32, ptr addrspace(4) %Layout.addr.ascast, align 4 + switch i32 %0, label %sw.epilog [ + i32 0, label %sw.bb + i32 1, label %sw.bb1 + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: ; preds = %entry + store i32 0, ptr addrspace(4) %retval.ascast, align 4 + br label %return + +sw.bb1: ; preds = %entry + store i32 1, ptr addrspace(4) %retval.ascast, align 4 + br label %return + +sw.bb2: ; preds = %entry + store i32 2, ptr addrspace(4) %retval.ascast, align 4 + br label %return + +sw.bb3: ; preds = %entry + store i32 3, ptr addrspace(4) %retval.ascast, align 4 + br label %return + +sw.epilog: ; preds = %entry + call void @llvm.trap() + unreachable + +return: ; preds = %sw.bb3, %sw.bb2, %sw.bb1, %sw.bb + %1 = load i32, ptr addrspace(4) %retval.ascast, align 4 + ret i32 %1 +} diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp index 865735617bbd8..349acae157ae7 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp @@ -69,7 +69,9 @@ convertMatrixUseStringToEnum(const char *UseString) { return std::nullopt; } -inline __SYCL_ALWAYS_INLINE __spv::MatrixLayout joint_matrix_layout_to_spv( +// propagateConstexprLayout uses the exact name of the function, so we use +// extern "C" here. +extern "C" constexpr __spv::MatrixLayout joint_matrix_layout_to_spv( sycl::ext::oneapi::experimental::matrix::layout Layout) { switch (Layout) { case sycl::ext::oneapi::experimental::matrix::layout::row_major: From 291eeeed16e133d0f3f38458a7b0fbeb38056593 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Mon, 20 Jan 2025 12:05:12 +0000 Subject: [PATCH 16/30] [SYCLCompat] Optimize/(fix?) permute_sub_group_by_xor if `logical_sub_group_size == 32` (#16646) `syclcompat::permute_sub_group_by_xor` was reported to flakily fail on L0. Closer inspection revealed that the implementation of `permute_sub_group_by_xor` is incorrect for cases where `logical_sub_group_size != 32`, which is one of the test cases. This implies that the test itself is wrong. In this PR we first optimize the part of the implementation that is valid assuming that Intel spirv builtins are correct (which is also the only case realistically a user will program): case `logical_sub_group_size == 32`, in order to: - Ensure the only useful case is working via the correct optimized route. - Check that this improvement doesn't break the suspicious test. A follow on PR can fix the other cases where `logical_sub_group_size != 32`: this is better to do later, since - the only use case I know of for this is to implement non-uniform group algorithms that we already have implemented (e.g. see https://github.com/intel/llvm/pull/9671) and any user is advised to use such algorithms instead of reimplementing them themselves. - This must I think require a complete reworking of the test and would otherwise delay the more important change here. --------- Signed-off-by: JackAKirk --- sycl/include/syclcompat/util.hpp | 3 +++ .../util/util_permute_sub_group_by_xor.cpp | 19 +++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/sycl/include/syclcompat/util.hpp b/sycl/include/syclcompat/util.hpp index 8907f32d5e7e6..df03599ea6ad0 100644 --- a/sycl/include/syclcompat/util.hpp +++ b/sycl/include/syclcompat/util.hpp @@ -410,6 +410,9 @@ T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta, template T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size = 32) { + if (logical_sub_group_size == 32) { + return permute_group_by_xor(g, x, mask); + } unsigned int id = g.get_local_linear_id(); unsigned int start_index = id / logical_sub_group_size * logical_sub_group_size; diff --git a/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp b/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp index 7b877d826f18b..6b0b478b1e367 100644 --- a/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp +++ b/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp @@ -86,11 +86,9 @@ void test_permute_sub_group_by_xor() { syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device(); sycl::queue *q_ct1 = dev_ct1.default_queue(); bool Result = true; - int *dev_data = nullptr; unsigned int *dev_data_u = nullptr; sycl::range<3> GridSize(1, 1, 1); sycl::range<3> BlockSize(1, 1, 1); - dev_data = sycl::malloc_device(DATA_NUM, *q_ct1); dev_data_u = sycl::malloc_device(DATA_NUM, *q_ct1); GridSize = sycl::range<3>(1, 1, 2); @@ -120,6 +118,19 @@ void test_permute_sub_group_by_xor() { q_ct1->memcpy(host_dev_data_u, dev_data_u, DATA_NUM * sizeof(unsigned int)) .wait(); verify_data(host_dev_data_u, expect1, DATA_NUM); + sycl::free(dev_data_u, *q_ct1); +} + +void test_permute_sub_group_by_xor_extra_arg() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device(); + sycl::queue *q_ct1 = dev_ct1.default_queue(); + bool Result = true; + unsigned int *dev_data_u = nullptr; + sycl::range<3> GridSize(1, 1, 1); + sycl::range<3> BlockSize(1, 1, 1); + dev_data_u = sycl::malloc_device(DATA_NUM, *q_ct1); GridSize = sycl::range<3>(1, 1, 2); BlockSize = sycl::range<3>(1, 2, 32); @@ -133,6 +144,7 @@ void test_permute_sub_group_by_xor() { 91, 90, 93, 92, 95, 94, 97, 96, 99, 98, 101, 100, 103, 102, 105, 104, 107, 106, 109, 108, 111, 110, 113, 112, 115, 114, 117, 116, 119, 118, 121, 120, 123, 122, 125, 124, 127, 126}; + unsigned int host_dev_data_u[DATA_NUM]; init_data(host_dev_data_u, DATA_NUM); q_ct1->memcpy(dev_data_u, host_dev_data_u, DATA_NUM * sizeof(unsigned int)) @@ -147,13 +159,12 @@ void test_permute_sub_group_by_xor() { q_ct1->memcpy(host_dev_data_u, dev_data_u, DATA_NUM * sizeof(unsigned int)) .wait(); verify_data(host_dev_data_u, expect2, DATA_NUM); - - sycl::free(dev_data, *q_ct1); sycl::free(dev_data_u, *q_ct1); } int main() { test_permute_sub_group_by_xor(); + test_permute_sub_group_by_xor_extra_arg(); return 0; } From 16ca790241fe7a1c566076395db9db4c2be2bd57 Mon Sep 17 00:00:00 2001 From: Martin Grant Date: Mon, 20 Jan 2025 12:05:29 +0000 Subject: [PATCH 17/30] [UR] Update tag to 8b7a9957 for https://github.com/oneapi-src/unified-runtime/pull/2582 (#16689) https://github.com/oneapi-src/unified-runtime/pull/2582 --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index c643d1708353e..ee0c29ea62a65 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 029a977bc76d1216783c69bfdb18d0db465ea399 -# Merge: 222e4b1d 041179a3 -# Author: Kenneth Benzie (Benie) -# Date: Fri Jan 17 16:36:13 2025 +0000 -# Merge pull request #1536 from ldrumm/unified-clang-format -# Unified clang format -set(UNIFIED_RUNTIME_TAG 029a977bc76d1216783c69bfdb18d0db465ea399) +# commit 8b7a99578966eb691a961d9620ea38d235196b2f +# Merge: ed095412 7b0e3b19 +# Author: Martin Grant +# Date: Mon Jan 20 09:27:22 2025 +0000 +# Merge pull request #2582 from przemektmalon/przemek/intel-host-usm-support +# Enable Host USM backed images on Level Zero +set(UNIFIED_RUNTIME_TAG 8b7a99578966eb691a961d9620ea38d235196b2f) From 2fb0cb3c72116bc59dc456bd896e044fb3746920 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 20 Jan 2025 13:52:18 +0000 Subject: [PATCH 18/30] [SYCL][Doc] Reformat launch queries extension (#16014) Adopt the longer Constraints/Effects/Returns format from ISO C++, which clarifies how the different overloads are intended to work. --------- Signed-off-by: John Pennycook --- .../sycl_ext_oneapi_launch_queries.asciidoc | 297 +++++++++++------- 1 file changed, 188 insertions(+), 109 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc index ee52d75b8fd21..16f5a27357d6d 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc @@ -10,6 +10,7 @@ :encoding: utf-8 :lang: en :dpcpp: pass:[DPC++] +:endnote: —{nbsp}end{nbsp}note // Set the default source code type in this document to C++, // for syntax highlighting purposes. This is needed because @@ -106,14 +107,12 @@ If the `sycl::nd_range` parameter used to launch a kernel is incompatible with the results of a kernel's launch queries, an implementation must throw a synchronous exception with the `errc::nd_range` error code. -[NOTE] -==== -The values returned by `ext_oneapi_get_info` account for all properties -attached to a kernel (via the mechanisms defined in the +[_Note_: The values returned by `ext_oneapi_get_info` account for all +properties attached to a kernel (via the mechanisms defined in the sycl_ext_oneapi_kernel_properties extension), as well as the usage of features -like group algorithms and work-group local memory. Developers should assume -that the values will differ across kernels. -==== +like group algorithms and work-group local memory. +Developers should assume that the values will differ across +kernels._{endnote}_] [source,c++] ---- @@ -121,46 +120,61 @@ namespace sycl { class kernel { public: - template - /*return-type*/ ext_oneapi_get_info(T... args) const; -}; -} ----- + // Only available if Param is max_work_item_sizes<1> + template + id<1> ext_oneapi_get_info(sycl::queue q) const; -[source,c++] ----- -template -/*return-type*/ ext_oneapi_get_info(T... args) const; ----- -_Constraints_: Available only when the types `+T...+` described by the parameter -pack match the types defined in the table below. + // Only available if Param is max_work_item_sizes<2> + template + id<2> ext_oneapi_get_info(sycl::queue q) const; -_Preconditions_: `Param` must be one of the `info::kernel` descriptors defined -in this extension. + // Only available if Param is max_work_item_sizes<3> + template + id<3> ext_oneapi_get_info(sycl::queue q) const; -_Returns_: Information about the kernel that applies when the kernel is -submitted with the configuration described by the parameter pack `+T...+`. -The return type is defined in the table below. + // Only available if Param is max_work_group_size + template + size_t ext_oneapi_get_info(sycl::queue q) const; -This extension adds several new queries to this interface, many of which -already have equivalents in the `kernel_device_specific` or `device` -namespaces. + // Only available if Param is max_num_work_groups + template + uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<1> r, size_t bytes = 0) const; -NOTE: These queries are queue- and not device-specific because it is -anticipated that implementations will introduce finer-grained queue -controls that impact the scheduling of kernels. + // Only available if Param is max_num_work_groups + template + uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<2> r, size_t bytes = 0) const; -NOTE: Allowing devices to return a value of 1 for these queries maximizes the -chances that code written to use certain extension remains portable. However, -the performance of kernels using only one work-group, sub-group or work-item -may be limited on some (highly parallel) devices. If certain properties (e.g. -forward progress guarantees, cross-work-group synchronization) are being used -as part of a performance optimization, developers should check that the values -returned by these queries is not 1. + // Only available if Param is max_num_work_groups + template + uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<3> r, size_t bytes = 0) const; + + // Only available if Param is max_sub_group_size + template + uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<1> r) const; + + // Only available if Param is max_sub_group_size + template + uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<2> r) const; + + // Only available if Param is max_sub_group_size + template + uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<3> r) const; + + // Only available if Param is num_sub_groups + template + uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<1> r) const; + + // Only available if Param is num_sub_groups + template + uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<2> r) const; + + // Only available if Param is num_sub_groups + template + uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<3> r) const; + +}; -[source, c++] ----- namespace ext::oneapi::experimental::info::kernel { template @@ -169,91 +183,156 @@ struct max_work_item_sizes; struct max_work_group_size; struct max_num_work_groups; -} +struct max_sub_group_size; +struct num_sub_groups; + +} // namespace ext::oneapi::experimental::info::kernel + +} // namespace sycl ---- -[%header,cols="1,5,5,5"] -|=== -|Kernel Descriptor -|Argument Types -|Return Type -|Description +==== Querying valid launch configurations + +This extension adds several new queries for reasoning about the set of valid +launch configurations for a given kernel, many of which already have +equivalents in the `kernel_device_specific` or `device` namespaces. + +[_Note_: These queries are queue- and not device-specific because it is +anticipated that implementations will introduce finer-grained queue +controls that impact the scheduling of kernels._{endnote}_] + +[_Note_: Allowing devices to return a value of 1 for these queries maximizes +the chances that code written to use certain extension remains portable. +However, the performance of kernels using only one work-group, sub-group or +work-item may be limited on some (highly parallel) devices. +If certain properties (e.g. forward progress guarantees, cross-work-group +synchronization) are being used as part of a performance optimization, +developers should check that the values returned by these queries is not +1._{endnote}_] + +''' + +[source,c++] +---- +template +id<1> ext_oneapi_get_info(sycl::queue q) const; // (1) + +template +id<2> ext_oneapi_get_info(sycl::queue q) const; // (2) + +template +id<3> ext_oneapi_get_info(sycl::queue q) const; // (3) +---- +_Constraints (1)_: `Param` is `max_work_item_sizes<1>`. + +_Constraints (2)_: `Param` is `max_work_item_sizes<2>`. + +_Constraints (3)_: `Param` is `max_work_item_sizes<3>`. + +_Returns_: The maximum number of work-items that are permitted in each +dimension of a work-group, when the kernel is submitted to the specified queue, +accounting for any kernel properties or features. +If the kernel can be submitted to the specified queue without an error, the +minimum value returned by this query is 1, otherwise it is 0. -|`template - max_work_item_sizes` -|`sycl::queue` -|`id` -|Returns the maximum number of work-items that are permitted in each dimension - of a work-group, when the kernel is submitted to the specified queue, - accounting for any kernel properties or features. If the kernel can be - submitted to the specified queue without an error, the minimum value returned - by this query is 1, otherwise it is 0. - -|`max_work_group_size` -|`sycl::queue` -|`size_t` -|Returns the maximum number of work-items that are permitted in a work-group, +''' + +[source,c++] +---- +template +size_t ext_oneapi_get_info(sycl::queue q) const; +---- +_Constraints_: `Param` is `max_work_group_size`. + +_Returns_: The maximum number of work-items that are permitted in a work-group, when the kernel is submitted to the specified queue, accounting for any -kernel properties or features. If the kernel can be submitted to the specified -queue without an error, the minimum value returned by this query is 1, -otherwise it is 0. - -|`max_num_work_groups` -|`sycl::queue`, `sycl::range`, `size_t` -|`size_t` -|Returns the maximum number of work-groups, when the kernel is submitted to the -specified queue with the specified work-group size and the specified amount of -dynamic work-group local memory (in bytes), accounting for any kernel -properties or features. If the specified work-group size is 0, which is -invalid, then the implementation will throw a synchronous exception with the -`errc::invalid` error code. If the kernel can be submitted to the specified -queue without an error, the minimum value returned by this query is 1, -otherwise it is 0. +kernel properties or features. +If the kernel can be submitted to the specified queue without an error, the +minimum value returned by this query is 1, otherwise it is 0. -|=== +''' -A separate set of launch queries can be used to reason about how an -implementation will launch a kernel on the specified queue. The values of these -queries should also be checked if a kernel is expected to be launched in a -specific way (e.g., if the kernel requires two sub-groups for correctness). +[source,c++] +---- +template +size_t ext_oneapi_get_info(sycl::queue q, sycl::range<1> r, size_t bytes = 0) const; -[source, c++] +template +size_t ext_oneapi_get_info(sycl::queue q, sycl::range<2> r, size_t bytes = 0) const; + +template +size_t ext_oneapi_get_info(sycl::queue q, sycl::range<3> r, size_t bytes = 0) const; ---- -namespace ext::oneapi::experimental::info::kernel { +_Constraints_: `Param` is `max_num_work_groups`. -struct max_sub_group_size; -struct num_sub_groups; +_Returns_: The maximum number of work-groups, when the kernel is submitted to +the specified queue with the specified work-group size and the specified amount +of dynamic work-group local memory (in bytes), accounting for any kernel +properties or features. +If the kernel can be submitted to the specified queue without an +error, the minimum value returned by this query is 1, otherwise it is 0. + +_Throws_: A synchronous `exception` with the error code `errc::invalid` if the +work-group size `r` is 0. + + +==== Querying launch behavior + +A separate set of launch queries can be used to reason about how an +implementation will launch a kernel on the specified queue. +The values of these queries should also be checked if a kernel is expected to +be launched in a specific way (e.g., if the kernel requires two sub-groups for +correctness). + +''' -} +[source,c++] ---- +template +uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<1> r) const; -[%header,cols="1,5,5,5"] -|=== -|Kernel Descriptor -|Argument Types -|Return Type -|Description +template +uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<2> r) const; -|`max_sub_group_size` -|`sycl::queue`, `sycl::range` -|`uint32_t` -|Returns the maximum sub-group size, when the kernel is submitted to the +template +uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<3> r) const; +---- +_Constraints_: `Param` is `max_sub_group_size`. + +_Returns_: The maximum sub-group size, when the kernel is submitted to the specified queue with the specified work-group size, accounting for any kernel -properties or features. The return value of this query must match the value -returned by `sub_group::get_max_local_range()` inside the kernel. If the kernel -can be submitted to the specified queue without an error, the minimum value -returned by this query is 1, otherwise it is 0. - -|`num_sub_groups` -|`sycl::queue`, `sycl::range` -|`uint32_t` -|Returns the number of sub-groups per work-group, when the kernel is submitted -to the specified queue with the specified work-group size, accounting for any -kernel properties or features. If the kernel can be submitted to the specified -queue without an error, the minimum value returned by this query is 1, -otherwise it is 0. +properties or features. +The return value of this query must match the value returned by +`sub_group::get_max_local_range()` inside the kernel. +If the kernel can be submitted to the specified queue without an error, the +minimum value returned by this query is 1, otherwise it is 0. -|=== +_Throws_: A synchronous `exception` with the error code `errc::invalid` if the +work-group size `r` is 0. + +''' + +[source,c++] +---- +template +uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<1> r) const; + +template +uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<2> r) const; + +template +uint32_t ext_oneapi_get_info(sycl::queue q, sycl::range<3> r) const; +---- +_Constraints_: `Param` is `num_sub_groups`. + +_Returns_: The number of sub-groups per work-group, when the kernel is +submitted to the specified queue with the specified work-group size, accounting +for any kernel properties or features. +If the kernel can be submitted to the specified queue without an error, the +minimum value returned by this query is 1, otherwise it is 0. + +_Throws_: A synchronous `exception` with the error code `errc::invalid` if the +work-group size `r` is 0. == Issues From 38e6e1bf599c49d0ec4d827a90506ff8ba071c2f Mon Sep 17 00:00:00 2001 From: Nikita Kornev Date: Mon, 20 Jan 2025 16:49:06 +0100 Subject: [PATCH 19/30] [CI][CTS] Add filter for build-only mode (#16671) In nightly we build and run CTS separately, so the build contains the full set of tests. Adding a filter to exclude categories that are not supported at all (e.g. compfail). --- .github/workflows/sycl-linux-run-tests.yml | 4 +++- devops/cts_exclude_filter_compfails | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 devops/cts_exclude_filter_compfails diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml index 3e4066fdc1904..269507943c573 100644 --- a/.github/workflows/sycl-linux-run-tests.yml +++ b/.github/workflows/sycl-linux-run-tests.yml @@ -361,7 +361,9 @@ jobs: cts_exclude_filter="" # If CTS_TESTS_TO_BUILD is null - use filter if [ -z "$CTS_TESTS_TO_BUILD" ]; then - if [ "${{ contains(inputs.target_devices, 'opencl:cpu') }}" = "true" ]; then + if [ "${{ contains(inputs.cts_testing_mode, 'build-only') }}" = "true" ]; then + cts_exclude_filter=$PWD/devops/cts_exclude_filter_compfails + elif [ "${{ contains(inputs.target_devices, 'opencl:cpu') }}" = "true" ]; then cts_exclude_filter=$PWD/devops/cts_exclude_filter_OCL_CPU elif [ "${{ contains(inputs.target_devices, 'level_zero:gpu') }}" = "true" ]; then cts_exclude_filter=$PWD/devops/cts_exclude_filter_L0_GPU diff --git a/devops/cts_exclude_filter_compfails b/devops/cts_exclude_filter_compfails new file mode 100644 index 0000000000000..44d3870b88048 --- /dev/null +++ b/devops/cts_exclude_filter_compfails @@ -0,0 +1,2 @@ +# Please use "#" to add comments here. +# Do not delete the file even if it's empty. From 1758eafc753867aa219fc59f9f4392c1f8364165 Mon Sep 17 00:00:00 2001 From: Marcos Maronas Date: Tue, 21 Jan 2025 07:31:02 +0000 Subject: [PATCH 20/30] [SYCL] Drop undesired fp64 aspect requirement. (#16690) By using `4.0` instead of `4.0f`, we're using a `double`, implicitly requiring support for `aspect::fp64`. We don't really require it. --- .../multi_device_bundle/device_libs_and_caching.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/ProgramManager/multi_device_bundle/device_libs_and_caching.cpp b/sycl/test-e2e/ProgramManager/multi_device_bundle/device_libs_and_caching.cpp index 2a54744ebdef5..78872c04d97ec 100644 --- a/sycl/test-e2e/ProgramManager/multi_device_bundle/device_libs_and_caching.cpp +++ b/sycl/test-e2e/ProgramManager/multi_device_bundle/device_libs_and_caching.cpp @@ -57,7 +57,7 @@ int main() { auto res = sycl::malloc_host(3, ctx); auto KernelLambda = [=]() { - res[0] = sycl::ext::intel::math::float2int_rd(4.0) + (int)sqrtf(4.0f) + + res[0] = sycl::ext::intel::math::float2int_rd(4.0f) + (int)sqrtf(4.0f) + std::exp(std::complex(0.f, 0.f)).real(); }; // Test case 1 From eb8101e445279c31d2b29f821035dd1767456c62 Mon Sep 17 00:00:00 2001 From: Georgi Mirazchiyski Date: Tue, 21 Jan 2025 07:55:02 +0000 Subject: [PATCH 21/30] [SYCL][Bindless] Clarify the types of supported USM memory in bindless images (#16622) This PR adds more fine-grained information about the types of USM allocations are supported with bindless images. Signed-off-by: Georgi Mirazchiyski --- .../sycl_ext_oneapi_bindless_images.asciidoc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc index 130e54b386cda..d99f988d49cbb 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc @@ -428,6 +428,15 @@ The second way to allocate image memory is to use USM allocations. SYCL already provides a number of USM allocation functions. This proposal would add another, pitched memory allocation, through `pitched_alloc_device`. +Bindless images can be backed by device, host, or shared USM memory allocations. + +[NOTE] +==== +Image memory backed by USM device and host allocations is generally supported, +whereas shared USM allocations depend on the SYCL backend as well as the device +capabilities. +==== + ```cpp namespace sycl::ext::oneapi::experimental { @@ -2328,4 +2337,5 @@ These features still need to be handled: |6.4|2024-10-15| - Fix bindless spec examples and include examples in bindless spec using asciidoc include. |6.5|2024-10-22| - Allow 3-channel image formats on some backends. +|6.6|2025-01-20| - Clarify support for the specific types of USM allocations. |====================== From 903279c316fe06280f3b3a4afcfe60ec6bc6eae8 Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Tue, 21 Jan 2025 00:15:51 -0800 Subject: [PATCH 22/30] [SYCL] disabling two test clauses while opening a JIRA (#16699) to address post-commit failure here: https://github.com/intel/llvm/issues/16693 This fpga_pipe.cpp test has been passing because it was using the wrong binary operation to fold together test results. I fixed this in this and some other tests recently. My PR passed the CI because, apparently, we don't have exercise the FPGA accelerator (or emulator) there. fpga_pipe.cpp tests 12 different combinations, the last two ( `test_array_th_nb_pipe` and `test_array_th_bl_pipe` ) are failing. Opening a JIRA. --- sycl/test-e2e/Basic/fpga_tests/fpga_pipes.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/Basic/fpga_tests/fpga_pipes.cpp b/sycl/test-e2e/Basic/fpga_tests/fpga_pipes.cpp index 0eb58c2fcf66c..cf3a87c0088c6 100644 --- a/sycl/test-e2e/Basic/fpga_tests/fpga_pipes.cpp +++ b/sycl/test-e2e/Basic/fpga_tests/fpga_pipes.cpp @@ -345,8 +345,12 @@ int main() { Error |= test_multiple_bl_pipe(Queue); // Test for an array data passing through a pipe - Error |= test_array_th_nb_pipe(Queue); - Error |= test_array_th_bl_pipe(Queue); + // These two tests are failing in post-commit testing ( + // https://github.com/intel/llvm/issues/16693 ) disabling them, rather than + // the entire test. + + // Error |= test_array_th_nb_pipe(Queue); + // Error |= test_array_th_bl_pipe(Queue); // TODO Remove when #14308 is closed std::cerr << "DEBUG: Finished with result " << Error << std::endl; From 9c65739ea12b12633701c6b438ad50bf4ed0fc32 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Tue, 21 Jan 2025 13:28:52 +0000 Subject: [PATCH 23/30] [UR] Bump with DEVICE_INFO_PROGRAM_SET_SPECIALIZATION_CONSTANTS (#16659) UR: https://github.com/oneapi-src/unified-runtime/pull/2539 --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index ee0c29ea62a65..2bfa1cbef4657 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,8 @@ -# commit 8b7a99578966eb691a961d9620ea38d235196b2f -# Merge: ed095412 7b0e3b19 -# Author: Martin Grant -# Date: Mon Jan 20 09:27:22 2025 +0000 -# Merge pull request #2582 from przemektmalon/przemek/intel-host-usm-support -# Enable Host USM backed images on Level Zero -set(UNIFIED_RUNTIME_TAG 8b7a99578966eb691a961d9620ea38d235196b2f) +# commit b074893e854d28141cd67bc5935ed87e47eb3bb6 +# Merge: 71a5eab0 128ea023 +# Author: Ross Brunton +# Date: Tue Jan 21 11:21:50 2025 +0000 +# Merge pull request #2539 from RossBrunton/ross/specconst +# +# Added `DEVICE_INFO_PROGRAM_SET_SPECIALIZATION_CONSTANTS` +set(UNIFIED_RUNTIME_TAG b074893e854d28141cd67bc5935ed87e47eb3bb6) From d49fca13a2df99b85991144a1fce8e234aad7b56 Mon Sep 17 00:00:00 2001 From: Buildbot for SYCL Date: Tue, 21 Jan 2025 23:49:40 +0800 Subject: [PATCH 24/30] [GHA] Uplift Linux GPU RT version to 24.52.32224.5 (#16701) Scheduled drivers uplift Co-authored-by: GitHub Actions --- devops/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devops/dependencies.json b/devops/dependencies.json index 79892387df4c1..3d42a52a2f497 100644 --- a/devops/dependencies.json +++ b/devops/dependencies.json @@ -8,7 +8,7 @@ }, "igc": { "github_tag": "v2.5.6", - "version": "2.5.6", + "version": "v2.5.6", "url": "https://github.com/intel/intel-graphics-compiler/releases/tag/v2.5.6", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, From 2737c43451e842bfd4f6c32de850d9d78f8e3c09 Mon Sep 17 00:00:00 2001 From: Nikita Kornev Date: Tue, 21 Jan 2025 16:51:49 +0100 Subject: [PATCH 25/30] [CI][CTS] Disable test_handler (#16710) --- devops/cts_exclude_filter_compfails | 2 ++ 1 file changed, 2 insertions(+) diff --git a/devops/cts_exclude_filter_compfails b/devops/cts_exclude_filter_compfails index 44d3870b88048..b8b4bb2843ded 100644 --- a/devops/cts_exclude_filter_compfails +++ b/devops/cts_exclude_filter_compfails @@ -1,2 +1,4 @@ # Please use "#" to add comments here. # Do not delete the file even if it's empty. +# See https://github.com/intel/llvm/pull/16615 +handler From ac035eb9c6cb16fd68c55772f8f50e131a31c4ee Mon Sep 17 00:00:00 2001 From: lucyli-ca <107629053+lucyli-ca@users.noreply.github.com> Date: Tue, 21 Jan 2025 11:15:49 -0500 Subject: [PATCH 26/30] Bump jinja2 to 3.1.5 in llvm/docs/requirements-hashed.txt (#16717) PR to bump dependency version to resolve security vulnerability found. In current version, Jinja has a sandbox breakout through malicious filenames - a bug in the Jinja compiler allows an attacker that controls both the content and filename of a template to execute arbitrary Python code, regardless of if Jinja's sandbox is used. Additional details: Weaknesses: CWE-150 CVE ID: CVE-2024-56201 --- llvm/docs/requirements-hashed.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/requirements-hashed.txt b/llvm/docs/requirements-hashed.txt index 08fe7f3573605..636aa7d5ed25c 100644 --- a/llvm/docs/requirements-hashed.txt +++ b/llvm/docs/requirements-hashed.txt @@ -151,7 +151,7 @@ imagesize==1.4.1 \ --hash=sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b \ --hash=sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a # via sphinx -jinja2==3.1.4 \ +jinja2==3.1.5 \ --hash=sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369 \ --hash=sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d # via From f43153fc696a2c6e0cba2f507518c013595c2c2f Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 21 Jan 2025 08:45:47 -0800 Subject: [PATCH 27/30] [CI] Use updated nightly container in `sycl-linux-run-tests.yml` (#16681) Built on top of https://github.com/intel/llvm/pull/16680. --- .github/workflows/sycl-linux-run-tests.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml index 269507943c573..6fafe298584e8 100644 --- a/.github/workflows/sycl-linux-run-tests.yml +++ b/.github/workflows/sycl-linux-run-tests.yml @@ -127,12 +127,9 @@ on: - '["cts-cpu"]' - '["Linux", "build"]' image: - description: | - Use option ending with ":build" for AMDGPU, ":latest" for the rest. type: choice options: - - 'ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:latest' - - 'ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:build' + - 'ghcr.io/intel/llvm/sycl_ubuntu2404_nightly:latest' image_options: description: | Use option with "--device=/dev/kfd" for AMDGPU, without it for the rest. From 74610cc7236df512dfb40ee6e22902ca5a26e2cc Mon Sep 17 00:00:00 2001 From: Kseniya Tikhomirova Date: Tue, 21 Jan 2025 17:58:09 +0100 Subject: [PATCH 28/30] [SYCL][E2E] Remove WA (device selector usage) for allowlist test (#16623) Signed-off-by: Tikhomirova, Kseniya --- sycl/test-e2e/Config/allowlist.cpp | 62 ++++++++++++++++++------------ 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/sycl/test-e2e/Config/allowlist.cpp b/sycl/test-e2e/Config/allowlist.cpp index cc3a8c6b30221..450116ed05ba1 100644 --- a/sycl/test-e2e/Config/allowlist.cpp +++ b/sycl/test-e2e/Config/allowlist.cpp @@ -1,16 +1,13 @@ -// REQUIRES: opencl && cpu // RUN: %{build} -o %t.out // -// FIXME: Using ONEAPI_DEVICE_SELECTOR=\*:cpu results in seg. faults that I -// cannot reproduce under gdb. -// RUN: env PRINT_DEVICE_INFO=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} %t.out > %t1.conf -// RUN: env TEST_DEVICE_AVAILABLE=1 env SYCL_CONFIG_FILE_NAME=%t1.conf ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} %t.out +// RUN: env PRINT_DEVICE_INFO=1 %{run-unfiltered-devices} %t.out > %t1.conf +// RUN: env TEST_DEVICE_AVAILABLE=1 env SYCL_CONFIG_FILE_NAME=%t1.conf %{run-unfiltered-devices} %t.out // -// RUN: env PRINT_PLATFORM_INFO=1 ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} %t.out > %t2.conf -// RUN: env TEST_DEVICE_AVAILABLE=1 env SYCL_CONFIG_FILE_NAME=%t2.conf ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} %t.out +// RUN: env PRINT_PLATFORM_INFO=1 %{run-unfiltered-devices} %t.out > %t2.conf +// RUN: env TEST_PLATFORM_AVAILABLE=1 env SYCL_CONFIG_FILE_NAME=%t2.conf %{run-unfiltered-devices} %t.out // -// RUN: env TEST_DEVICE_IS_NOT_AVAILABLE=1 env SYCL_DEVICE_ALLOWLIST="PlatformName:{{SUCH NAME DOESN'T EXIST}}" ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} %t.out -// RUN: env TEST_INCORRECT_VALUE=1 env SYCL_DEVICE_ALLOWLIST="IncorrectKey:{{.*}}" ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} %t.out +// RUN: env TEST_DEVICE_IS_NOT_AVAILABLE=1 env SYCL_DEVICE_ALLOWLIST="PlatformName:{{SUCH NAME DOESN'T EXIST}}" %{run-unfiltered-devices} %t.out +// RUN: env TEST_INCORRECT_VALUE=1 env SYCL_DEVICE_ALLOWLIST="IncorrectKey:{{.*}}" %{run-unfiltered-devices} %t.out #include "../helpers.hpp" #include @@ -20,6 +17,16 @@ #include #include +static bool isIdenticalDevices(const std::vector &Devices) { + return std::all_of( + Devices.cbegin(), Devices.cend(), [&](const sycl::device &Dev) { + return (Dev.get_info() == + Devices.at(0).get_info()) && + (Dev.get_info() == + Devices.at(0).get_info()); + }); +} + static void replaceSpecialCharacters(std::string &Str) { // Replace common special symbols with '.' which matches to any character std::replace_if( @@ -48,7 +55,7 @@ int main() { return 0; } - throw std::runtime_error("No device is found"); + throw std::runtime_error("No platform is found"); } // Expected that the allowlist filter is not set @@ -74,12 +81,15 @@ int main() { // Expected the allowlist to be set with the "PRINT_DEVICE_INFO" run result if (env::isDefined("TEST_DEVICE_AVAILABLE")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) { - if (Platform.get_devices().size() != 1) + auto Devices = Platform.get_devices(); + if (Devices.empty()) + throw std::runtime_error("No device is found"); + + if (!(Devices.size() == 1 || isIdenticalDevices(Devices))) throw std::runtime_error("Expected only one device."); return 0; } - throw std::runtime_error("No device is found"); } // Expected the allowlist to be set but empty @@ -89,26 +99,28 @@ int main() { return 0; } + // Expected the allowlist to be set with the "PRINT_PLATFORM_INFO" run result + if (env::isDefined("TEST_PLATFORM_AVAILABLE")) { + auto Platforms = sycl::platform::get_platforms(); + if (Platforms.empty()) + throw std::runtime_error("No platform is found"); + else if (Platforms.size() != 1) + throw std::runtime_error("Expected only one platform."); + + return 0; + } + if (env::isDefined("TEST_INCORRECT_VALUE")) { try { sycl::platform::get_platforms(); } catch (sycl::exception &E) { - // Workaround to make CI pass. - // TODO: after the submission of PR intel/llvm:3826, create PR to - // intel/llvm-test-suite with removal of 1st parameter of the vector, - // and transformation of std::vector to std::string - const std::vector ExpectedMsgs{ - "Unrecognized key in device allowlist", + const std::string ExpectedMsg{ "Unrecognized key in SYCL_DEVICE_ALLOWLIST"}; const std::string GotMessage(E.what()); - bool CorrectMsg = false; - for (const auto &ExpectedMsg : ExpectedMsgs) { - if (GotMessage.find(ExpectedMsg) != std::string::npos) { - CorrectMsg = true; - break; - } + if (GotMessage.find(ExpectedMsg) != std::string::npos) { + return 0; } - return CorrectMsg ? 0 : 1; + return 1; } } From 4e62bf7f8fe13065273af833fa1e3d268ffc3ab4 Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Tue, 21 Jan 2025 18:07:19 +0100 Subject: [PATCH 29/30] [SYCL][E2E] Fix c++23 option in device_global_copy (#16707) On some Windows configurations, the /std:c++23 option is not recognized. To address this, we instead instruct the compiler to accept it as a clang flag. Signed-off-by: Larsen, Steffen --- sycl/test-e2e/DeviceGlobal/device_global_copy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/DeviceGlobal/device_global_copy.cpp b/sycl/test-e2e/DeviceGlobal/device_global_copy.cpp index e46c6286e2e7a..c15320b55d352 100644 --- a/sycl/test-e2e/DeviceGlobal/device_global_copy.cpp +++ b/sycl/test-e2e/DeviceGlobal/device_global_copy.cpp @@ -1,4 +1,4 @@ -// DEFINE: %{cpp23} = %if cl_options %{/std:c++23%} %else %{-std=c++23%} +// DEFINE: %{cpp23} = %if cl_options %{/clang:-std=c++23%} %else %{-std=c++23%} // RUN: %{build} %{cpp23} -o %t.out // RUN: %{run} %t.out From 4c7d48d5ab17a90986c5d700182fe04597d74423 Mon Sep 17 00:00:00 2001 From: przemektmalon Date: Tue, 21 Jan 2025 17:35:23 +0000 Subject: [PATCH 30/30] [SYCL][Bindless][E2E] Fix post-commit from PR #16537 (#16691) This patch should fix the post-commit failure resulting from enabling the 3-channel image PR in https://github.com/intel/llvm/pull/16537 This is done by adding a `// REQUIRES-INTEL-DRIVER:` comment for LIT to ignore the test until the necessary driver for the functionality is introduced to the GitHub CI. --- sycl/test-e2e/bindless_images/3_channel_format.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/test-e2e/bindless_images/3_channel_format.cpp b/sycl/test-e2e/bindless_images/3_channel_format.cpp index 2cf5df5b98990..3eb46c7458155 100644 --- a/sycl/test-e2e/bindless_images/3_channel_format.cpp +++ b/sycl/test-e2e/bindless_images/3_channel_format.cpp @@ -1,5 +1,8 @@ // REQUIRES: aspect-ext_oneapi_bindless_images +// Test requires at least this version of the Intel GPU driver on Arc. +// REQUIRES-INTEL-DRIVER: lin: 32370 + // RUN: %{build} -o %t.out // RUN: %{run-unfiltered-devices} env NEOReadDebugKeys=1 UseBindlessMode=1 UseExternalAllocatorForSshAndDsh=1 %t.out