Skip to content

Commit 0a27e3f

Browse files
[NFCI][SYCL] Refactor HandlerAccess::postProcess (#19203)
Implements the idea from the earlier TODO comment. Instead of having a hacky `handler::MLastEvent` just `swap` original and post-processing `handler`, so that "natural" `finalize` will work on the latest task in the chain.
1 parent b28ba4a commit 0a27e3f

File tree

6 files changed

+84
-111
lines changed

6 files changed

+84
-111
lines changed

sycl/include/sycl/detail/reduction_forward.hpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,6 @@ enum class strategy : int {
4646
multi,
4747
};
4848

49-
// Reductions implementation need access to private members of handler. Those
50-
// are limited to those below.
51-
inline void finalizeHandler(handler &CGH);
52-
template <class FunctorTy> void withAuxHandler(handler &CGH, FunctorTy Func);
53-
5449
template <int Dims>
5550
item<Dims, false> getDelinearizedItem(range<Dims> Range, id<Dims> Id) {
5651
return Builder::createItem<Dims, false>(Range, Id);

sycl/include/sycl/handler.hpp

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,9 @@ class __SYCL_EXPORT handler {
488488
/// \param Graph is a SYCL command_graph
489489
handler(std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph);
490490
#endif
491+
handler(std::unique_ptr<detail::handler_impl> &&HandlerImpl);
492+
493+
~handler();
491494

492495
void *storeRawArg(const void *Ptr, size_t Size);
493496

@@ -619,8 +622,6 @@ class __SYCL_EXPORT handler {
619622
addReduction(std::shared_ptr<const void>(ReduBuf));
620623
}
621624

622-
~handler() = default;
623-
624625
#ifdef __SYCL_DEVICE_ONLY__
625626
// In device compilation accessor isn't inherited from host base classes, so
626627
// can't detect by it. Since we don't expect it to be ever called in device
@@ -3396,9 +3397,7 @@ class __SYCL_EXPORT handler {
33963397

33973398
private:
33983399
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
3399-
// TODO: Maybe make it a reference when non-preview branch is removed.
3400-
// On the other hand, see `HandlerAccess:postProcess` to how `swap_impl` might
3401-
// be useful in future, pointer here would make that possible/easier.
3400+
std::unique_ptr<detail::handler_impl> implOwner;
34023401
detail::handler_impl *impl;
34033402
#else
34043403
std::shared_ptr<detail::handler_impl> impl;
@@ -3423,11 +3422,10 @@ class __SYCL_EXPORT handler {
34233422
std::unique_ptr<detail::HostKernelBase> MHostKernel;
34243423

34253424
detail::code_location MCodeLoc = {};
3426-
bool MIsFinalized = false;
3427-
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
3428-
detail::EventImplPtr MLastEvent;
3429-
#else
3430-
event MLastEvent;
3425+
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
3426+
// Was used for the previous reduction implementation (via `withAuxHandler`).
3427+
bool MIsFinalizedDoNotUse = false;
3428+
event MLastEventDoNotUse;
34313429
#endif
34323430

34333431
// Make queue_impl class friend to be able to call finalize method.
@@ -3452,8 +3450,6 @@ class __SYCL_EXPORT handler {
34523450
bool ExplicitIdentity, typename RedOutVar>
34533451
friend class detail::reduction_impl_algo;
34543452

3455-
friend inline void detail::reduction::finalizeHandler(handler &CGH);
3456-
34573453
template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
34583454
typename PropertiesT, typename... RestT>
34593455
friend void detail::reduction_parallel_for(handler &CGH, range<Dims> NDRange,
@@ -3920,6 +3916,30 @@ class HandlerAccess {
39203916
Handler.parallel_for_impl(Range, Props, Kernel);
39213917
}
39223918

3919+
static void swap(handler &LHS, handler &RHS) {
3920+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
3921+
std::swap(LHS.implOwner, RHS.implOwner);
3922+
#endif
3923+
std::swap(LHS.impl, RHS.impl);
3924+
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
3925+
std::swap(LHS.MQueueDoNotUse, RHS.MQueueDoNotUse);
3926+
#endif
3927+
std::swap(LHS.MLocalAccStorage, RHS.MLocalAccStorage);
3928+
std::swap(LHS.MStreamStorage, RHS.MStreamStorage);
3929+
std::swap(LHS.MKernelName, RHS.MKernelName);
3930+
std::swap(LHS.MKernel, RHS.MKernel);
3931+
std::swap(LHS.MSrcPtr, RHS.MSrcPtr);
3932+
std::swap(LHS.MDstPtr, RHS.MDstPtr);
3933+
std::swap(LHS.MLength, RHS.MLength);
3934+
std::swap(LHS.MPattern, RHS.MPattern);
3935+
std::swap(LHS.MHostKernel, RHS.MHostKernel);
3936+
std::swap(LHS.MCodeLoc, RHS.MCodeLoc);
3937+
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
3938+
std::swap(LHS.MIsFinalizedDoNotUse, RHS.MIsFinalizedDoNotUse);
3939+
std::swap(LHS.MLastEventDoNotUse, RHS.MLastEventDoNotUse);
3940+
#endif
3941+
}
3942+
39233943
// pre/postProcess are used only for reductions right now, but the
39243944
// abstractions they provide aren't reduction-specific. The main problem they
39253945
// solve is
@@ -3932,9 +3952,8 @@ class HandlerAccess {
39323952
//
39333953
// that needs to be implemented as multiple enqueues involving
39343954
// pre-/post-processing internally. SYCL prohibits recursive submits from
3935-
// inside control group function object (lambda above) so we resort to a
3936-
// somewhat hacky way of creating multiple `handler`s and manual finalization
3937-
// of them (instead of the one in `queue::submit`).
3955+
// inside control group function object (lambda above) so we need some
3956+
// internal interface to implement that.
39383957
__SYCL_EXPORT static void preProcess(handler &CGH, type_erased_cgfo_ty F);
39393958
__SYCL_EXPORT static void postProcess(handler &CGH, type_erased_cgfo_ty F);
39403959

sycl/include/sycl/reduction.hpp

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1162,10 +1162,6 @@ auto make_reduction(RedOutVar RedVar, RestTy &&...Rest) {
11621162
RedVar, std::forward<RestTy>(Rest)...};
11631163
}
11641164

1165-
namespace reduction {
1166-
inline void finalizeHandler(handler &CGH) { CGH.finalize(); }
1167-
} // namespace reduction
1168-
11691165
// This method is used for implementation of parallel_for accepting 1 reduction.
11701166
// TODO: remove this method when everything is switched to general algorithm
11711167
// implementing arbitrary number of reductions in parallel_for().
@@ -1723,8 +1719,6 @@ struct NDRangeReduction<
17231719
}
17241720
});
17251721

1726-
reduction::finalizeHandler(CGH);
1727-
17281722
// Run the additional kernel as many times as needed to reduce all partial
17291723
// sums into one scalar.
17301724

@@ -1901,8 +1895,6 @@ template <> struct NDRangeReduction<reduction::strategy::basic> {
19011895
else
19021896
First(KernelMultipleWGTag{});
19031897

1904-
reduction::finalizeHandler(CGH);
1905-
19061898
// 2. Run the additional kernel as many times as needed to reduce
19071899
// all partial sums into one scalar.
19081900

@@ -2598,7 +2590,6 @@ template <> struct NDRangeReduction<reduction::strategy::multi> {
25982590

25992591
reduCGFuncMulti<KernelName>(CGH, KernelFunc, NDRange, Properties, ReduTuple,
26002592
ReduIndices);
2601-
reduction::finalizeHandler(CGH);
26022593

26032594
size_t NWorkItems = NDRange.get_group_range().size();
26042595
while (NWorkItems > 1) {

sycl/source/handler.cpp

Lines changed: 45 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,12 @@ fill_copy_args(detail::handler_impl *impl,
318318

319319
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
320320
handler::handler(detail::handler_impl &HandlerImpl) : impl(&HandlerImpl) {}
321+
handler::handler(std::unique_ptr<detail::handler_impl> &&HandlerImpl)
322+
: implOwner(std::move(HandlerImpl)), impl(implOwner.get()) {}
321323
#else
324+
handler::handler(std::unique_ptr<detail::handler_impl> &&HandlerImpl)
325+
: impl(std::move(HandlerImpl)) {}
326+
322327
handler::handler(std::shared_ptr<detail::queue_impl> Queue,
323328
bool CallerNeedsEvent)
324329
: impl(std::make_shared<detail::handler_impl>(*Queue, nullptr,
@@ -344,6 +349,7 @@ handler::handler(
344349
: impl(std::make_shared<detail::handler_impl>(*Graph)) {}
345350

346351
#endif
352+
handler::~handler() = default;
347353

348354
// Sets the submission state to indicate that an explicit kernel bundle has been
349355
// set. Throws a sycl::exception with errc::invalid if the current state
@@ -426,12 +432,6 @@ detail::EventImplPtr handler::finalize() {
426432
#else
427433
event handler::finalize() {
428434
#endif
429-
// This block of code is needed only for reduction implementation.
430-
// It is harmless (does nothing) for everything else.
431-
if (MIsFinalized)
432-
return MLastEvent;
433-
MIsFinalized = true;
434-
435435
const auto &type = getType();
436436
detail::queue_impl *Queue = impl->get_queue_or_null();
437437
ext::oneapi::experimental::detail::graph_impl *Graph =
@@ -559,13 +559,6 @@ event handler::finalize() {
559559
std::vector<ur_event_handle_t> RawEvents = detail::Command::getUrEvents(
560560
impl->CGData.MEvents, impl->get_queue_or_null(), false);
561561

562-
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
563-
detail::EventImplPtr &LastEventImpl = MLastEvent;
564-
#else
565-
const detail::EventImplPtr &LastEventImpl =
566-
detail::getSyclObjImpl(MLastEvent);
567-
#endif
568-
569562
bool DiscardEvent =
570563
!impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents();
571564
if (DiscardEvent) {
@@ -577,11 +570,10 @@ event handler::finalize() {
577570
DiscardEvent = !KernelUsesAssert;
578571
}
579572

580-
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
581-
if (!DiscardEvent) {
582-
LastEventImpl = detail::event_impl::create_completed_host_event();
583-
}
584-
#endif
573+
std::shared_ptr<detail::event_impl> ResultEvent =
574+
DiscardEvent
575+
? nullptr
576+
: detail::event_impl::create_device_event(impl->get_queue());
585577

586578
#ifdef XPTI_ENABLE_INSTRUMENTATION
587579
const bool xptiEnabled = xptiTraceEnabled();
@@ -612,9 +604,8 @@ event handler::finalize() {
612604
enqueueImpKernel(
613605
impl->get_queue(), impl->MNDRDesc, impl->MArgs, KernelBundleImpPtr,
614606
MKernel.get(), toKernelNameStrT(MKernelName),
615-
impl->MKernelNameBasedCachePtr, RawEvents,
616-
DiscardEvent ? nullptr : LastEventImpl.get(), nullptr,
617-
impl->MKernelCacheConfig, impl->MKernelIsCooperative,
607+
impl->MKernelNameBasedCachePtr, RawEvents, ResultEvent.get(),
608+
nullptr, impl->MKernelCacheConfig, impl->MKernelIsCooperative,
618609
impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize,
619610
BinImage, impl->MKernelFuncPtr, impl->MKernelNumArgs,
620611
impl->MKernelParamDescGetter, impl->MKernelHasSpecialCaptures);
@@ -624,7 +615,7 @@ event handler::finalize() {
624615
if (!DiscardEvent) {
625616
detail::emitInstrumentationGeneral(
626617
StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
627-
static_cast<const void *>(LastEventImpl->getHandle()));
618+
static_cast<const void *>(ResultEvent->getHandle()));
628619
}
629620
detail::emitInstrumentationGeneral(StreamID, InstanceID,
630621
CmdTraceEvent,
@@ -635,29 +626,32 @@ event handler::finalize() {
635626

636627
if (DiscardEvent) {
637628
EnqueueKernel();
638-
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
639-
LastEventImpl->setStateDiscarded();
640-
#endif
641629
} else {
642630
detail::queue_impl &Queue = impl->get_queue();
643-
LastEventImpl->setQueue(Queue);
644-
LastEventImpl->setWorkerQueue(Queue.weak_from_this());
645-
LastEventImpl->setContextImpl(impl->get_context());
646-
LastEventImpl->setStateIncomplete();
647-
LastEventImpl->setSubmissionTime();
631+
ResultEvent->setQueue(Queue);
632+
ResultEvent->setWorkerQueue(Queue.weak_from_this());
633+
ResultEvent->setContextImpl(impl->get_context());
634+
ResultEvent->setStateIncomplete();
635+
ResultEvent->setSubmissionTime();
648636

649637
EnqueueKernel();
650-
LastEventImpl->setEnqueued();
638+
ResultEvent->setEnqueued();
651639
// connect returned event with dependent events
652640
if (!Queue.isInOrder()) {
653641
// MEvents is not used anymore, so can move.
654-
LastEventImpl->getPreparedDepsEvents() =
642+
ResultEvent->getPreparedDepsEvents() =
655643
std::move(impl->CGData.MEvents);
656-
// LastEventImpl is local for current thread, no need to lock.
657-
LastEventImpl->cleanDepEventsThroughOneLevelUnlocked();
644+
// ResultEvent is local for current thread, no need to lock.
645+
ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
658646
}
659647
}
660-
return MLastEvent;
648+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
649+
return ResultEvent;
650+
#else
651+
return detail::createSyclObjFromImpl<event>(
652+
ResultEvent ? ResultEvent
653+
: detail::event_impl::create_discarded_event());
654+
#endif
661655
}
662656
}
663657

@@ -939,11 +933,10 @@ event handler::finalize() {
939933
std::move(CommandGroup), *Queue, !DiscardEvent);
940934

941935
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
942-
MLastEvent = DiscardEvent ? nullptr : Event;
936+
return DiscardEvent ? nullptr : Event;
943937
#else
944-
MLastEvent = detail::createSyclObjFromImpl<event>(Event);
938+
return detail::createSyclObjFromImpl<event>(Event);
945939
#endif
946-
return MLastEvent;
947940
}
948941

949942
void handler::addReduction(const std::shared_ptr<const void> &ReduObj) {
@@ -2474,58 +2467,28 @@ __SYCL_EXPORT void HandlerAccess::preProcess(handler &CGH,
24742467
AuxHandler.copyCodeLoc(CGH);
24752468
F(AuxHandler);
24762469
auto E = AuxHandler.finalize();
2477-
assert(!CGH.MIsFinalized &&
2478-
"Can't do pre-processing if the command has been enqueued already!");
24792470
if (EventNeeded)
24802471
CGH.depends_on(E);
24812472
}
24822473
__SYCL_EXPORT void HandlerAccess::postProcess(handler &CGH,
24832474
type_erased_cgfo_ty F) {
2484-
// The "hacky" `handler`s manipulation mentioned near the declaration in
2485-
// `handler.hpp` and implemented here is far from perfect. A better approach
2486-
// would be
2487-
//
2488-
// bool OrigNeedsEvent = CGH.needsEvent()
2489-
// assert(CGH.not_finalized/enqueued());
2490-
// if (!InOrderQueue)
2491-
// CGH.setNeedsEvent()
2492-
//
2493-
// handler PostProcessHandler(Queue, OrigNeedsEvent)
2494-
// auto E = CGH.finalize(); // enqueue original or current last
2495-
// // post-process
2496-
// if (!InOrder)
2497-
// PostProcessHandler.depends_on(E)
2498-
//
2499-
// swap_impls(CGH, PostProcessHandler)
2500-
// return; // queue::submit finalizes PostProcessHandler and returns its
2501-
// // event if necessary.
2502-
//
2503-
// Still hackier than "real" `queue::submit` but at least somewhat sane.
2504-
// That, however hasn't been tried yet and we have an even hackier approach
2505-
// copied from what's been done in an old reductions implementation before
2506-
// eventless submission work has started. Not sure how feasible the approach
2507-
// above is at this moment.
2508-
2509-
// This `finalize` is wrong (at least logically) if
2510-
// `assert(!CGH.eventNeeded())`
2511-
auto E = CGH.finalize();
2475+
bool EventNeeded = CGH.impl->MEventNeeded;
25122476
queue_impl &Q = CGH.impl->get_queue();
25132477
bool InOrder = Q.isInOrder();
2514-
// Cannot use `CGH.eventNeeded()` alone as there might be subsequent
2515-
// `postProcess` calls and we cannot address them properly similarly to the
2516-
// `finalize` issue described above. `swap_impls` suggested above might be
2517-
// able to handle this scenario naturally.
2518-
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
2519-
handler_impl HandlerImpl{Q, nullptr, CGH.eventNeeded() || !InOrder};
2520-
handler AuxHandler{HandlerImpl};
2521-
#else
2522-
handler AuxHandler{Q.shared_from_this(), CGH.eventNeeded() || !InOrder};
2523-
#endif
25242478
if (!InOrder)
2525-
AuxHandler.depends_on(E);
2526-
AuxHandler.copyCodeLoc(CGH);
2527-
F(AuxHandler);
2528-
CGH.MLastEvent = AuxHandler.finalize();
2479+
CGH.impl->MEventNeeded = true;
2480+
2481+
handler PostProcessHandler{
2482+
std::make_unique<handler_impl>(Q, nullptr, EventNeeded)};
2483+
PostProcessHandler.copyCodeLoc(CGH);
2484+
// Extend lifetimes of auxiliary resources till the last kernel in the chain
2485+
// finishes:
2486+
PostProcessHandler.impl->MAuxiliaryResources = CGH.impl->MAuxiliaryResources;
2487+
auto E = CGH.finalize();
2488+
if (!InOrder)
2489+
PostProcessHandler.depends_on(E);
2490+
F(PostProcessHandler);
2491+
swap(CGH, PostProcessHandler);
25292492
}
25302493
} // namespace detail
25312494
} // namespace _V1

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3617,12 +3617,16 @@ _ZN4sycl3_V17handler8getQueueEv
36173617
_ZN4sycl3_V17handler8prefetchEPKvm
36183618
_ZN4sycl3_V17handler9clearArgsEv
36193619
_ZN4sycl3_V17handler9fill_implEPvPKvmm
3620+
_ZN4sycl3_V17handlerC1EOSt10unique_ptrINS0_6detail12handler_implESt14default_deleteIS4_EE
36203621
_ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE
36213622
_ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b
36223623
_ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEEb
3624+
_ZN4sycl3_V17handlerC2EOSt10unique_ptrINS0_6detail12handler_implESt14default_deleteIS4_EE
36233625
_ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE
36243626
_ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b
36253627
_ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEEb
3628+
_ZN4sycl3_V17handlerD1Ev
3629+
_ZN4sycl3_V17handlerD2Ev
36263630
_ZN4sycl3_V17samplerC1ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE
36273631
_ZN4sycl3_V17samplerC1EP11_cl_samplerRKNS0_7contextE
36283632
_ZN4sycl3_V17samplerC2ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE

sycl/test/abi/sycl_symbols_windows.dump

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,7 @@
417417
??0gpu_selector@_V1@sycl@@QEAA@$$QEAV012@@Z
418418
??0gpu_selector@_V1@sycl@@QEAA@AEBV012@@Z
419419
??0gpu_selector@_V1@sycl@@QEAA@XZ
420+
??0handler@_V1@sycl@@AEAA@$$QEAV?$unique_ptr@Vhandler_impl@detail@_V1@sycl@@U?$default_delete@Vhandler_impl@detail@_V1@sycl@@@std@@@std@@@Z
420421
??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@@Z
421422
??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N@Z
422423
??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@PEAVqueue_impl@detail@12@_N@Z

0 commit comments

Comments
 (0)