Skip to content

Commit

Permalink
[SYCL] Add nested calls detection to shortcut functions (intel#13659)
Browse files Browse the repository at this point in the history
Original impl does not cover shortcut functions.
This version has thread_local global simple type variable that could
track nested call within some queue functions like submit, memset,
memcpy and others. Shortcut functions use common part submitMemOpHelper
where detection is also added.
Reduction impl updated to eliminate nested call we did internally. It is
even better since common logic with dependency tracking used in queue
methods is not needed there.

---------

Signed-off-by: Tikhomirova, Kseniya <[email protected]>
  • Loading branch information
KseniyaTikhomirova authored May 29, 2024
1 parent 05d29f3 commit fd0491c
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 74 deletions.
21 changes: 13 additions & 8 deletions sycl/include/sycl/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ using IsReduOptForFastAtomicFetch =
std::bool_constant<false>;
#else
std::bool_constant<((is_sgenfloat_v<T> && sizeof(T) == 4) ||
is_sgeninteger_v<T>)&&IsValidAtomicType<T>::value &&
is_sgeninteger_v<T>) &&
IsValidAtomicType<T>::value &&
(IsPlus<T, BinaryOperation>::value ||
IsMinimum<T, BinaryOperation>::value ||
IsMaximum<T, BinaryOperation>::value ||
Expand Down Expand Up @@ -138,11 +139,12 @@ using IsReduOptForFastReduce =
#ifdef SYCL_REDUCTION_DETERMINISTIC
std::bool_constant<false>;
#else
std::bool_constant<(
(is_sgeninteger_v<T> && (sizeof(T) == 4 || sizeof(T) == 8)) ||
is_sgenfloat_v<T>)&&(IsPlus<T, BinaryOperation>::value ||
IsMinimum<T, BinaryOperation>::value ||
IsMaximum<T, BinaryOperation>::value)>;
std::bool_constant<((is_sgeninteger_v<T> &&
(sizeof(T) == 4 || sizeof(T) == 8)) ||
is_sgenfloat_v<T>) &&
(IsPlus<T, BinaryOperation>::value ||
IsMinimum<T, BinaryOperation>::value ||
IsMaximum<T, BinaryOperation>::value)>;
#endif

// std::tuple seems to be a) too heavy and b) not copyable to device now
Expand Down Expand Up @@ -835,6 +837,10 @@ using __sycl_init_mem_for =
std::conditional_t<std::is_same_v<KernelName, auto_name>, auto_name,
reduction::InitMemKrn<KernelName>>;

__SYCL_EXPORT void
addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
std::shared_ptr<int> &Counter);

template <typename T, class BinaryOperation, int Dims, size_t Extent,
bool ExplicitIdentity, typename RedOutVar>
class reduction_impl_algo {
Expand Down Expand Up @@ -1075,8 +1081,7 @@ class reduction_impl_algo {
std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
CGH.addReduction(Counter);

auto Event = q.memset(Counter.get(), 0, sizeof(int));
CGH.depends_on(Event);
addCounterInit(CGH, CGH.MQueue, Counter);

return Counter.get();
}
Expand Down
62 changes: 60 additions & 2 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,20 @@ namespace sycl {
inline namespace _V1 {
namespace detail {
std::atomic<unsigned long long> queue_impl::MNextAvailableQueueID = 0;
thread_local bool NestedCallsDetector = false;
class NestedCallsTracker {
public:
NestedCallsTracker() {
if (NestedCallsDetector)
throw sycl::exception(
make_error_code(errc::invalid),
"Calls to sycl::queue::submit cannot be nested. Command group "
"function objects should use the sycl::handler API instead.");
NestedCallsDetector = true;
}

~NestedCallsTracker() { NestedCallsDetector = false; }
};

static std::vector<sycl::detail::pi::PiEvent>
getPIEvents(const std::vector<sycl::event> &DepEvents) {
Expand Down Expand Up @@ -330,6 +344,46 @@ void queue_impl::addSharedEvent(const event &Event) {
MEventsShared.push_back(Event);
}

event queue_impl::submit_impl(const std::function<void(handler &)> &CGF,
const std::shared_ptr<queue_impl> &Self,
const std::shared_ptr<queue_impl> &PrimaryQueue,
const std::shared_ptr<queue_impl> &SecondaryQueue,
const detail::code_location &Loc,
const SubmitPostProcessF *PostProcess) {
handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue);
Handler.saveCodeLoc(Loc);

{
NestedCallsTracker tracker;
CGF(Handler);
}

// Scheduler will later omit events, that are not required to execute tasks.
// Host and interop tasks, however, are not submitted to low-level runtimes
// and require separate dependency management.
const CG::CGTYPE Type = Handler.getType();
event Event = detail::createSyclObjFromImpl<event>(
std::make_shared<detail::event_impl>());

if (PostProcess) {
bool IsKernel = Type == CG::Kernel;
bool KernelUsesAssert = false;

if (IsKernel)
// Kernel only uses assert if it's non interop one
KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) &&
ProgramManager::getInstance().kernelUsesAssert(
Handler.MKernelName.c_str());
finalizeHandler(Handler, Event);

(*PostProcess)(IsKernel, KernelUsesAssert, Event);
} else
finalizeHandler(Handler, Event);

addEvent(Event);
return Event;
}

template <typename HandlerFuncT>
event queue_impl::submitWithHandler(const std::shared_ptr<queue_impl> &Self,
const std::vector<event> &DepEvents,
Expand Down Expand Up @@ -362,15 +416,19 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
if (MGraph.expired() && Scheduler::areEventsSafeForSchedulerBypass(
ExpandedDepEvents, MContext)) {
if (MSupportsDiscardingPiEvents) {
NestedCallsTracker tracker;
MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
/*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);
return createDiscardedEvent();
}

event ResEvent = prepareSYCLEventAssociatedWithQueue(Self);
auto EventImpl = detail::getSyclObjImpl(ResEvent);
MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
&EventImpl->getHandleRef(), EventImpl);
{
NestedCallsTracker tracker;
MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
&EventImpl->getHandleRef(), EventImpl);
}

if (MContext->is_host())
return MDiscardEvents ? createDiscardedEvent() : event();
Expand Down
48 changes: 1 addition & 47 deletions sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -849,53 +849,7 @@ class queue_impl {
const std::shared_ptr<queue_impl> &PrimaryQueue,
const std::shared_ptr<queue_impl> &SecondaryQueue,
const detail::code_location &Loc,
const SubmitPostProcessF *PostProcess) {
// Flag used to detect nested calls to submit and report an error.
thread_local static bool PreventSubmit = false;

if (PreventSubmit) {
throw sycl::exception(
make_error_code(errc::invalid),
"Calls to sycl::queue::submit cannot be nested. Command group "
"function objects should use the sycl::handler API instead.");
}

handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue);
Handler.saveCodeLoc(Loc);
PreventSubmit = true;
try {
CGF(Handler);
} catch (...) {
PreventSubmit = false;
throw;
}
PreventSubmit = false;

// Scheduler will later omit events, that are not required to execute tasks.
// Host and interop tasks, however, are not submitted to low-level runtimes
// and require separate dependency management.
const CG::CGTYPE Type = Handler.getType();
event Event = detail::createSyclObjFromImpl<event>(
std::make_shared<detail::event_impl>());

if (PostProcess) {
bool IsKernel = Type == CG::Kernel;
bool KernelUsesAssert = false;

if (IsKernel)
// Kernel only uses assert if it's non interop one
KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) &&
ProgramManager::getInstance().kernelUsesAssert(
Handler.MKernelName.c_str());
finalizeHandler(Handler, Event);

(*PostProcess)(IsKernel, KernelUsesAssert, Event);
} else
finalizeHandler(Handler, Event);

addEvent(Event);
return Event;
}
const SubmitPostProcessF *PostProcess);

/// Helper function for submitting a memory operation with a handler.
/// \param Self is a shared_ptr to this queue.
Expand Down
12 changes: 12 additions & 0 deletions sycl/source/detail/reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//

#include <detail/config.hpp>
#include <detail/memory_manager.hpp>
#include <detail/queue_impl.hpp>
#include <sycl/reduction.hpp>

Expand Down Expand Up @@ -165,6 +166,17 @@ __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem);
}

__SYCL_EXPORT void
addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
std::shared_ptr<int> &Counter) {
auto EventImpl = std::make_shared<detail::event_impl>(Queue);
EventImpl->setContextImpl(detail::getSyclObjImpl(Queue->get_context()));
EventImpl->setStateIncomplete();
MemoryManager::fill_usm(Counter.get(), Queue, sizeof(int), 0, {},
&EventImpl->getHandleRef(), EventImpl);
CGH.depends_on(createSyclObjFromImpl<event>(EventImpl));
}

} // namespace detail
} // namespace _V1
} // namespace sycl
55 changes: 44 additions & 11 deletions sycl/test-e2e/Basic/nested_queue_submit.cpp
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
// RUN: %{build} -I . -o %t.out
// RUN: %{run} %t.out

#include <cstdlib>
#include <string>
#include <sycl/detail/core.hpp>
#include <sycl/usm.hpp>
#include <vector>

void nestedSubmit() {
void checkExceptionFields(const sycl::exception &e) {
assert(e.code() == sycl::errc::invalid && "Invalid error code");
assert(std::string(e.what()) ==
"Calls to sycl::queue::submit cannot be nested. Command group "
"function objects should use the sycl::handler API instead." &&
"Invalid e.what() string");
}

void nestedSubmitParallelFor(sycl::queue &q) {
uint32_t n = 1024;
std::vector<float> array(n);
sycl::queue q{};
{
sycl::buffer<float> buf(array.data(), sycl::range<1>{n});
q.submit([&](sycl::handler &h) {
Expand All @@ -19,16 +27,41 @@ void nestedSubmit() {
}
}

int main() {
void nestedSubmitMemset(sycl::queue &q) {
uint32_t n = 1024;
int *data = sycl::malloc_device<int>(n, q);
try {
q.submit([&](sycl::handler &h) { q.memset(data, 0, n * sizeof(int)); });
} catch (...) {
sycl::free(data, q);
throw;
}
sycl::free(data, q);
}

template <typename CommandSubmitterT>
void test(sycl::queue &Queue, CommandSubmitterT QueueSubmit) {
bool ExceptionHappened = false;
try {
nestedSubmit();
QueueSubmit(Queue);
} catch (const sycl::exception &e) {
assert(e.code() == sycl::errc::invalid && "Invalid error code");
assert(std::string(e.what()) ==
"Calls to sycl::queue::submit cannot be nested. Command group "
"function objects should use the sycl::handler API instead." &&
"Invalid e.what() string");
checkExceptionFields(e);
ExceptionHappened = true;
}
std::cout << "test passed" << std::endl;
assert(ExceptionHappened);
// Checks that queue is in a valid state: nested call tracker was cleaned up
// after exception and does not prevent from submission of new commands.
Queue.submit([&](sycl::handler &h) {});
Queue.wait();
}

int main() {
sycl::queue q{};
test(q, nestedSubmitParallelFor);
// All shortcut functions has a common part where nested call detection
// happens. Testing only one of them is enough.
if (q.get_device().get_info<sycl::info::device::usm_device_allocations>())
test(q, nestedSubmitMemset);

return EXIT_SUCCESS;
}
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3314,6 +3314,7 @@ _ZN4sycl3_V16detail13lgamma_r_implEfPi
_ZN4sycl3_V16detail13make_platformEmNS0_7backendE
_ZN4sycl3_V16detail13select_deviceERKSt8functionIFiRKNS0_6deviceEEE
_ZN4sycl3_V16detail13select_deviceERKSt8functionIFiRKNS0_6deviceEEERKNS0_7contextE
_ZN4sycl3_V16detail14addCounterInitERNS0_7handlerERSt10shared_ptrINS1_10queue_implEERS4_IiE
_ZN4sycl3_V16detail14getBorderColorENS0_19image_channel_orderE
_ZN4sycl3_V16detail14tls_code_loc_t5queryEv
_ZN4sycl3_V16detail14tls_code_loc_tC1ERKNS1_13code_locationE
Expand Down
13 changes: 7 additions & 6 deletions sycl/test/abi/sycl_symbols_windows.dump
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,12 @@
??$import_external_semaphore@Uexternal_semaphore_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uexternal_semaphore_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVqueue@34@@Z
??$import_external_semaphore@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVdevice@34@AEBVcontext@34@@Z
??$import_external_semaphore@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVqueue@34@@Z
??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z
??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z
??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z
??$update_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$00@45@@Z
??$update_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$01@45@@Z
??$update_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$02@45@@Z
??0AccessorBaseHost@detail@_V1@sycl@@IEAA@AEBV?$shared_ptr@VAccessorImplHost@detail@_V1@sycl@@@std@@@Z
??0AccessorBaseHost@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z
??0AccessorBaseHost@detail@_V1@sycl@@QEAA@AEBV0123@@Z
Expand Down Expand Up @@ -3932,6 +3938,7 @@
?add@device_global_map@detail@_V1@sycl@@YAXPEBXPEBD@Z
?add@host_pipe_map@detail@_V1@sycl@@YAXPEBXPEBD@Z
?add@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA?AVnode@34567@AEBVproperty_list@67@@Z
?addCounterInit@detail@_V1@sycl@@YAXAEAVhandler@23@AEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@AEAV?$shared_ptr@H@6@@Z
?addGraphLeafDependencies@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXVnode@34567@@Z
?addHostAccessorAndWait@detail@_V1@sycl@@YAXPEAVAccessorImplHost@123@@Z
?addHostSampledImageAccessorAndWait@detail@_V1@sycl@@YAXPEAVSampledImageAccessorImplHost@123@@Z
Expand Down Expand Up @@ -4355,12 +4362,6 @@
?get_mip_level_mem_handle@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AUimage_mem_handle@23456@I@Z
?get_name@kernel_id@_V1@sycl@@QEBAPEBDXZ
?get_node_from_event@node@experimental@oneapi@ext@_V1@sycl@@SA?AV123456@Vevent@56@@Z
??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z
??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z
??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z
??$update_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$00@45@@Z
??$update_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$01@45@@Z
??$update_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$02@45@@Z
?get_nodes@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ
?get_num_channels@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBAIXZ
?get_pipe_name@pipe_base@experimental@intel@ext@_V1@sycl@@KA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEBX@Z
Expand Down

0 comments on commit fd0491c

Please sign in to comment.