Skip to content

[NFCI][SYCL] Refactor HandlerAccess::postProcess #19203

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions sycl/include/sycl/detail/reduction_forward.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,6 @@ enum class strategy : int {
multi,
};

// Reductions implementation need access to private members of handler. Those
// are limited to those below.
inline void finalizeHandler(handler &CGH);
template <class FunctorTy> void withAuxHandler(handler &CGH, FunctorTy Func);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

withAuxHandler declaration should have been removed earlier when postProcess was introduced instead of it.


template <int Dims>
item<Dims, false> getDelinearizedItem(range<Dims> Range, id<Dims> Id) {
return Builder::createItem<Dims, false>(Range, Id);
Expand Down
49 changes: 34 additions & 15 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,9 @@ class __SYCL_EXPORT handler {
/// \param Graph is a SYCL command_graph
handler(std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph);
#endif
handler(std::unique_ptr<detail::handler_impl> &&HandlerImpl);

~handler();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

std::unique_ptr<T> dtor needs a complete T, so move the definition to the handler.cpp.


void *storeRawArg(const void *Ptr, size_t Size);

Expand Down Expand Up @@ -619,8 +622,6 @@ class __SYCL_EXPORT handler {
addReduction(std::shared_ptr<const void>(ReduBuf));
}

~handler() = default;

#ifdef __SYCL_DEVICE_ONLY__
// In device compilation accessor isn't inherited from host base classes, so
// can't detect by it. Since we don't expect it to be ever called in device
Expand Down Expand Up @@ -3396,9 +3397,7 @@ class __SYCL_EXPORT handler {

private:
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
// TODO: Maybe make it a reference when non-preview branch is removed.
// On the other hand, see `HandlerAccess:postProcess` to how `swap_impl` might
// be useful in future, pointer here would make that possible/easier.
std::unique_ptr<detail::handler_impl> implOwner;
detail::handler_impl *impl;
#else
std::shared_ptr<detail::handler_impl> impl;
Expand All @@ -3423,11 +3422,10 @@ class __SYCL_EXPORT handler {
std::unique_ptr<detail::HostKernelBase> MHostKernel;

detail::code_location MCodeLoc = {};
bool MIsFinalized = false;
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
detail::EventImplPtr MLastEvent;
#else
event MLastEvent;
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
// Was used for the previous reduction implementation (via `withAuxHandler`).
bool MIsFinalizedDoNotUse = false;
event MLastEventDoNotUse;
#endif

// Make queue_impl class friend to be able to call finalize method.
Expand All @@ -3452,8 +3450,6 @@ class __SYCL_EXPORT handler {
bool ExplicitIdentity, typename RedOutVar>
friend class detail::reduction_impl_algo;

friend inline void detail::reduction::finalizeHandler(handler &CGH);

template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
typename PropertiesT, typename... RestT>
friend void detail::reduction_parallel_for(handler &CGH, range<Dims> NDRange,
Expand Down Expand Up @@ -3920,6 +3916,30 @@ class HandlerAccess {
Handler.parallel_for_impl(Range, Props, Kernel);
}

static void swap(handler &LHS, handler &RHS) {
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
std::swap(LHS.implOwner, RHS.implOwner);
#endif
std::swap(LHS.impl, RHS.impl);
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
std::swap(LHS.MQueueDoNotUse, RHS.MQueueDoNotUse);
#endif
std::swap(LHS.MLocalAccStorage, RHS.MLocalAccStorage);
std::swap(LHS.MStreamStorage, RHS.MStreamStorage);
std::swap(LHS.MKernelName, RHS.MKernelName);
std::swap(LHS.MKernel, RHS.MKernel);
std::swap(LHS.MSrcPtr, RHS.MSrcPtr);
std::swap(LHS.MDstPtr, RHS.MDstPtr);
std::swap(LHS.MLength, RHS.MLength);
std::swap(LHS.MPattern, RHS.MPattern);
std::swap(LHS.MHostKernel, RHS.MHostKernel);
std::swap(LHS.MCodeLoc, RHS.MCodeLoc);
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
std::swap(LHS.MIsFinalizedDoNotUse, RHS.MIsFinalizedDoNotUse);
std::swap(LHS.MLastEventDoNotUse, RHS.MLastEventDoNotUse);
#endif
}

// pre/postProcess are used only for reductions right now, but the
// abstractions they provide aren't reduction-specific. The main problem they
// solve is
Expand All @@ -3932,9 +3952,8 @@ class HandlerAccess {
//
// that needs to be implemented as multiple enqueues involving
// pre-/post-processing internally. SYCL prohibits recursive submits from
// inside control group function object (lambda above) so we resort to a
// somewhat hacky way of creating multiple `handler`s and manual finalization
// of them (instead of the one in `queue::submit`).
// inside control group function object (lambda above) so we need some
// internal interface to implement that.
__SYCL_EXPORT static void preProcess(handler &CGH, type_erased_cgfo_ty F);
__SYCL_EXPORT static void postProcess(handler &CGH, type_erased_cgfo_ty F);

Expand Down
9 changes: 0 additions & 9 deletions sycl/include/sycl/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1162,10 +1162,6 @@ auto make_reduction(RedOutVar RedVar, RestTy &&...Rest) {
RedVar, std::forward<RestTy>(Rest)...};
}

namespace reduction {
inline void finalizeHandler(handler &CGH) { CGH.finalize(); }
} // namespace reduction

// This method is used for implementation of parallel_for accepting 1 reduction.
// TODO: remove this method when everything is switched to general algorithm
// implementing arbitrary number of reductions in parallel_for().
Expand Down Expand Up @@ -1723,8 +1719,6 @@ struct NDRangeReduction<
}
});

reduction::finalizeHandler(CGH);

// Run the additional kernel as many times as needed to reduce all partial
// sums into one scalar.

Expand Down Expand Up @@ -1901,8 +1895,6 @@ template <> struct NDRangeReduction<reduction::strategy::basic> {
else
First(KernelMultipleWGTag{});

reduction::finalizeHandler(CGH);

// 2. Run the additional kernel as many times as needed to reduce
// all partial sums into one scalar.

Expand Down Expand Up @@ -2598,7 +2590,6 @@ template <> struct NDRangeReduction<reduction::strategy::multi> {

reduCGFuncMulti<KernelName>(CGH, KernelFunc, NDRange, Properties, ReduTuple,
ReduIndices);
reduction::finalizeHandler(CGH);

size_t NWorkItems = NDRange.get_group_range().size();
while (NWorkItems > 1) {
Expand Down
127 changes: 45 additions & 82 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,12 @@ fill_copy_args(detail::handler_impl *impl,

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
handler::handler(detail::handler_impl &HandlerImpl) : impl(&HandlerImpl) {}
handler::handler(std::unique_ptr<detail::handler_impl> &&HandlerImpl)
: implOwner(std::move(HandlerImpl)), impl(implOwner.get()) {}
#else
handler::handler(std::unique_ptr<detail::handler_impl> &&HandlerImpl)
: impl(std::move(HandlerImpl)) {}

handler::handler(std::shared_ptr<detail::queue_impl> Queue,
bool CallerNeedsEvent)
: impl(std::make_shared<detail::handler_impl>(*Queue, nullptr,
Expand All @@ -344,6 +349,7 @@ handler::handler(
: impl(std::make_shared<detail::handler_impl>(*Graph)) {}

#endif
handler::~handler() = default;

// Sets the submission state to indicate that an explicit kernel bundle has been
// set. Throws a sycl::exception with errc::invalid if the current state
Expand Down Expand Up @@ -426,12 +432,6 @@ detail::EventImplPtr handler::finalize() {
#else
event handler::finalize() {
#endif
// This block of code is needed only for reduction implementation.
// It is harmless (does nothing) for everything else.
if (MIsFinalized)
return MLastEvent;
MIsFinalized = true;

const auto &type = getType();
detail::queue_impl *Queue = impl->get_queue_or_null();
ext::oneapi::experimental::detail::graph_impl *Graph =
Expand Down Expand Up @@ -559,13 +559,6 @@ event handler::finalize() {
std::vector<ur_event_handle_t> RawEvents = detail::Command::getUrEvents(
impl->CGData.MEvents, impl->get_queue_or_null(), false);

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
detail::EventImplPtr &LastEventImpl = MLastEvent;
#else
const detail::EventImplPtr &LastEventImpl =
detail::getSyclObjImpl(MLastEvent);
#endif

bool DiscardEvent =
!impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents();
if (DiscardEvent) {
Expand All @@ -577,11 +570,10 @@ event handler::finalize() {
DiscardEvent = !KernelUsesAssert;
}

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
if (!DiscardEvent) {
LastEventImpl = detail::event_impl::create_completed_host_event();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No idea why we were using create_completed_host_event before. @slawekptak , any thoughts?

}
#endif
std::shared_ptr<detail::event_impl> ResultEvent =
DiscardEvent
? nullptr
: detail::event_impl::create_device_event(impl->get_queue());

#ifdef XPTI_ENABLE_INSTRUMENTATION
const bool xptiEnabled = xptiTraceEnabled();
Expand Down Expand Up @@ -612,9 +604,8 @@ event handler::finalize() {
enqueueImpKernel(
impl->get_queue(), impl->MNDRDesc, impl->MArgs, KernelBundleImpPtr,
MKernel.get(), toKernelNameStrT(MKernelName),
impl->MKernelNameBasedCachePtr, RawEvents,
DiscardEvent ? nullptr : LastEventImpl.get(), nullptr,
impl->MKernelCacheConfig, impl->MKernelIsCooperative,
impl->MKernelNameBasedCachePtr, RawEvents, ResultEvent.get(),
nullptr, impl->MKernelCacheConfig, impl->MKernelIsCooperative,
impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize,
BinImage, impl->MKernelFuncPtr, impl->MKernelNumArgs,
impl->MKernelParamDescGetter, impl->MKernelHasSpecialCaptures);
Expand All @@ -624,7 +615,7 @@ event handler::finalize() {
if (!DiscardEvent) {
detail::emitInstrumentationGeneral(
StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
static_cast<const void *>(LastEventImpl->getHandle()));
static_cast<const void *>(ResultEvent->getHandle()));
}
detail::emitInstrumentationGeneral(StreamID, InstanceID,
CmdTraceEvent,
Expand All @@ -635,29 +626,32 @@ event handler::finalize() {

if (DiscardEvent) {
EnqueueKernel();
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
LastEventImpl->setStateDiscarded();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line 654 on the right does create_discarded_event instead, which, IMO, is even more readable.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree and it's also going away, so no real concerns either way.

#endif
} else {
detail::queue_impl &Queue = impl->get_queue();
LastEventImpl->setQueue(Queue);
LastEventImpl->setWorkerQueue(Queue.weak_from_this());
LastEventImpl->setContextImpl(impl->get_context());
LastEventImpl->setStateIncomplete();
LastEventImpl->setSubmissionTime();
ResultEvent->setQueue(Queue);
ResultEvent->setWorkerQueue(Queue.weak_from_this());
ResultEvent->setContextImpl(impl->get_context());
ResultEvent->setStateIncomplete();
ResultEvent->setSubmissionTime();

EnqueueKernel();
LastEventImpl->setEnqueued();
ResultEvent->setEnqueued();
// connect returned event with dependent events
if (!Queue.isInOrder()) {
// MEvents is not used anymore, so can move.
LastEventImpl->getPreparedDepsEvents() =
ResultEvent->getPreparedDepsEvents() =
std::move(impl->CGData.MEvents);
// LastEventImpl is local for current thread, no need to lock.
LastEventImpl->cleanDepEventsThroughOneLevelUnlocked();
// ResultEvent is local for current thread, no need to lock.
ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
}
}
return MLastEvent;
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
return ResultEvent;
#else
return detail::createSyclObjFromImpl<event>(
ResultEvent ? ResultEvent
: detail::event_impl::create_discarded_event());
#endif
}
}

Expand Down Expand Up @@ -939,11 +933,10 @@ event handler::finalize() {
std::move(CommandGroup), *Queue, !DiscardEvent);

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
MLastEvent = DiscardEvent ? nullptr : Event;
return DiscardEvent ? nullptr : Event;
#else
MLastEvent = detail::createSyclObjFromImpl<event>(Event);
return detail::createSyclObjFromImpl<event>(Event);
#endif
return MLastEvent;
}

void handler::addReduction(const std::shared_ptr<const void> &ReduObj) {
Expand Down Expand Up @@ -2474,58 +2467,28 @@ __SYCL_EXPORT void HandlerAccess::preProcess(handler &CGH,
AuxHandler.copyCodeLoc(CGH);
F(AuxHandler);
auto E = AuxHandler.finalize();
assert(!CGH.MIsFinalized &&
"Can't do pre-processing if the command has been enqueued already!");
if (EventNeeded)
CGH.depends_on(E);
}
__SYCL_EXPORT void HandlerAccess::postProcess(handler &CGH,
type_erased_cgfo_ty F) {
// The "hacky" `handler`s manipulation mentioned near the declaration in
// `handler.hpp` and implemented here is far from perfect. A better approach
// would be
//
// bool OrigNeedsEvent = CGH.needsEvent()
// assert(CGH.not_finalized/enqueued());
// if (!InOrderQueue)
// CGH.setNeedsEvent()
//
// handler PostProcessHandler(Queue, OrigNeedsEvent)
// auto E = CGH.finalize(); // enqueue original or current last
// // post-process
// if (!InOrder)
// PostProcessHandler.depends_on(E)
//
// swap_impls(CGH, PostProcessHandler)
// return; // queue::submit finalizes PostProcessHandler and returns its
// // event if necessary.
//
// Still hackier than "real" `queue::submit` but at least somewhat sane.
// That, however hasn't been tried yet and we have an even hackier approach
// copied from what's been done in an old reductions implementation before
// eventless submission work has started. Not sure how feasible the approach
// above is at this moment.

// This `finalize` is wrong (at least logically) if
// `assert(!CGH.eventNeeded())`
auto E = CGH.finalize();
bool EventNeeded = CGH.impl->MEventNeeded;
queue_impl &Q = CGH.impl->get_queue();
bool InOrder = Q.isInOrder();
// Cannot use `CGH.eventNeeded()` alone as there might be subsequent
// `postProcess` calls and we cannot address them properly similarly to the
// `finalize` issue described above. `swap_impls` suggested above might be
// able to handle this scenario naturally.
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
handler_impl HandlerImpl{Q, nullptr, CGH.eventNeeded() || !InOrder};
handler AuxHandler{HandlerImpl};
#else
handler AuxHandler{Q.shared_from_this(), CGH.eventNeeded() || !InOrder};
#endif
if (!InOrder)
AuxHandler.depends_on(E);
AuxHandler.copyCodeLoc(CGH);
F(AuxHandler);
CGH.MLastEvent = AuxHandler.finalize();
CGH.impl->MEventNeeded = true;

handler PostProcessHandler{
std::make_unique<handler_impl>(Q, nullptr, EventNeeded)};
PostProcessHandler.copyCodeLoc(CGH);
// Extend lifetimes of auxiliary resources till the last kernel in the chain
// finishes:
PostProcessHandler.impl->MAuxiliaryResources = CGH.impl->MAuxiliaryResources;
auto E = CGH.finalize();
if (!InOrder)
PostProcessHandler.depends_on(E);
F(PostProcessHandler);
swap(CGH, PostProcessHandler);
}
} // namespace detail
} // namespace _V1
Expand Down
4 changes: 4 additions & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3617,12 +3617,16 @@ _ZN4sycl3_V17handler8getQueueEv
_ZN4sycl3_V17handler8prefetchEPKvm
_ZN4sycl3_V17handler9clearArgsEv
_ZN4sycl3_V17handler9fill_implEPvPKvmm
_ZN4sycl3_V17handlerC1EOSt10unique_ptrINS0_6detail12handler_implESt14default_deleteIS4_EE
_ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE
_ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b
_ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEEb
_ZN4sycl3_V17handlerC2EOSt10unique_ptrINS0_6detail12handler_implESt14default_deleteIS4_EE
_ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE
_ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b
_ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEEb
_ZN4sycl3_V17handlerD1Ev
_ZN4sycl3_V17handlerD2Ev
_ZN4sycl3_V17samplerC1ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE
_ZN4sycl3_V17samplerC1EP11_cl_samplerRKNS0_7contextE
_ZN4sycl3_V17samplerC2ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE
Expand Down
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_windows.dump
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,7 @@
??0gpu_selector@_V1@sycl@@QEAA@$$QEAV012@@Z
??0gpu_selector@_V1@sycl@@QEAA@AEBV012@@Z
??0gpu_selector@_V1@sycl@@QEAA@XZ
??0handler@_V1@sycl@@AEAA@$$QEAV?$unique_ptr@Vhandler_impl@detail@_V1@sycl@@U?$default_delete@Vhandler_impl@detail@_V1@sycl@@@std@@@std@@@Z
??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@@Z
??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N@Z
??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@PEAVqueue_impl@detail@12@_N@Z
Expand Down
Loading