-
Notifications
You must be signed in to change notification settings - Fork 790
[NFCI][SYCL] Refactor HandlerAccess::postProcess
#19203
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9dd5ad8
4c99be4
7c5daaf
900283e
d7bf37a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -488,6 +488,9 @@ class __SYCL_EXPORT handler { | |
/// \param Graph is a SYCL command_graph | ||
handler(std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph); | ||
#endif | ||
handler(std::unique_ptr<detail::handler_impl> &&HandlerImpl); | ||
|
||
~handler(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
void *storeRawArg(const void *Ptr, size_t Size); | ||
|
||
|
@@ -619,8 +622,6 @@ class __SYCL_EXPORT handler { | |
addReduction(std::shared_ptr<const void>(ReduBuf)); | ||
} | ||
|
||
~handler() = default; | ||
|
||
#ifdef __SYCL_DEVICE_ONLY__ | ||
// In device compilation accessor isn't inherited from host base classes, so | ||
// can't detect by it. Since we don't expect it to be ever called in device | ||
|
@@ -3396,9 +3397,7 @@ class __SYCL_EXPORT handler { | |
|
||
private: | ||
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES | ||
// TODO: Maybe make it a reference when non-preview branch is removed. | ||
// On the other hand, see `HandlerAccess:postProcess` to how `swap_impl` might | ||
// be useful in future, pointer here would make that possible/easier. | ||
std::unique_ptr<detail::handler_impl> implOwner; | ||
detail::handler_impl *impl; | ||
#else | ||
std::shared_ptr<detail::handler_impl> impl; | ||
|
@@ -3423,11 +3422,10 @@ class __SYCL_EXPORT handler { | |
std::unique_ptr<detail::HostKernelBase> MHostKernel; | ||
|
||
detail::code_location MCodeLoc = {}; | ||
bool MIsFinalized = false; | ||
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES | ||
detail::EventImplPtr MLastEvent; | ||
#else | ||
event MLastEvent; | ||
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES | ||
// Was used for the previous reduction implementation (via `withAuxHandler`). | ||
bool MIsFinalizedDoNotUse = false; | ||
event MLastEventDoNotUse; | ||
#endif | ||
|
||
// Make queue_impl class friend to be able to call finalize method. | ||
|
@@ -3452,8 +3450,6 @@ class __SYCL_EXPORT handler { | |
bool ExplicitIdentity, typename RedOutVar> | ||
friend class detail::reduction_impl_algo; | ||
|
||
friend inline void detail::reduction::finalizeHandler(handler &CGH); | ||
|
||
template <typename KernelName, detail::reduction::strategy Strategy, int Dims, | ||
typename PropertiesT, typename... RestT> | ||
friend void detail::reduction_parallel_for(handler &CGH, range<Dims> NDRange, | ||
|
@@ -3920,6 +3916,30 @@ class HandlerAccess { | |
Handler.parallel_for_impl(Range, Props, Kernel); | ||
} | ||
|
||
static void swap(handler &LHS, handler &RHS) { | ||
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES | ||
std::swap(LHS.implOwner, RHS.implOwner); | ||
#endif | ||
std::swap(LHS.impl, RHS.impl); | ||
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES | ||
std::swap(LHS.MQueueDoNotUse, RHS.MQueueDoNotUse); | ||
#endif | ||
std::swap(LHS.MLocalAccStorage, RHS.MLocalAccStorage); | ||
std::swap(LHS.MStreamStorage, RHS.MStreamStorage); | ||
std::swap(LHS.MKernelName, RHS.MKernelName); | ||
std::swap(LHS.MKernel, RHS.MKernel); | ||
std::swap(LHS.MSrcPtr, RHS.MSrcPtr); | ||
std::swap(LHS.MDstPtr, RHS.MDstPtr); | ||
std::swap(LHS.MLength, RHS.MLength); | ||
std::swap(LHS.MPattern, RHS.MPattern); | ||
std::swap(LHS.MHostKernel, RHS.MHostKernel); | ||
std::swap(LHS.MCodeLoc, RHS.MCodeLoc); | ||
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES | ||
std::swap(LHS.MIsFinalizedDoNotUse, RHS.MIsFinalizedDoNotUse); | ||
std::swap(LHS.MLastEventDoNotUse, RHS.MLastEventDoNotUse); | ||
#endif | ||
} | ||
|
||
// pre/postProcess are used only for reductions right now, but the | ||
// abstractions they provide aren't reduction-specific. The main problem they | ||
// solve is | ||
|
@@ -3932,9 +3952,8 @@ class HandlerAccess { | |
// | ||
// that needs to be implemented as multiple enqueues involving | ||
// pre-/post-processing internally. SYCL prohibits recursive submits from | ||
// inside control group function object (lambda above) so we resort to a | ||
// somewhat hacky way of creating multiple `handler`s and manual finalization | ||
// of them (instead of the one in `queue::submit`). | ||
// inside control group function object (lambda above) so we need some | ||
// internal interface to implement that. | ||
__SYCL_EXPORT static void preProcess(handler &CGH, type_erased_cgfo_ty F); | ||
__SYCL_EXPORT static void postProcess(handler &CGH, type_erased_cgfo_ty F); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -318,7 +318,12 @@ fill_copy_args(detail::handler_impl *impl, | |
|
||
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES | ||
handler::handler(detail::handler_impl &HandlerImpl) : impl(&HandlerImpl) {} | ||
handler::handler(std::unique_ptr<detail::handler_impl> &&HandlerImpl) | ||
: implOwner(std::move(HandlerImpl)), impl(implOwner.get()) {} | ||
#else | ||
handler::handler(std::unique_ptr<detail::handler_impl> &&HandlerImpl) | ||
: impl(std::move(HandlerImpl)) {} | ||
|
||
handler::handler(std::shared_ptr<detail::queue_impl> Queue, | ||
bool CallerNeedsEvent) | ||
: impl(std::make_shared<detail::handler_impl>(*Queue, nullptr, | ||
|
@@ -344,6 +349,7 @@ handler::handler( | |
: impl(std::make_shared<detail::handler_impl>(*Graph)) {} | ||
|
||
#endif | ||
handler::~handler() = default; | ||
|
||
// Sets the submission state to indicate that an explicit kernel bundle has been | ||
// set. Throws a sycl::exception with errc::invalid if the current state | ||
|
@@ -426,12 +432,6 @@ detail::EventImplPtr handler::finalize() { | |
#else | ||
event handler::finalize() { | ||
#endif | ||
// This block of code is needed only for reduction implementation. | ||
// It is harmless (does nothing) for everything else. | ||
if (MIsFinalized) | ||
return MLastEvent; | ||
MIsFinalized = true; | ||
|
||
const auto &type = getType(); | ||
detail::queue_impl *Queue = impl->get_queue_or_null(); | ||
ext::oneapi::experimental::detail::graph_impl *Graph = | ||
|
@@ -559,13 +559,6 @@ event handler::finalize() { | |
std::vector<ur_event_handle_t> RawEvents = detail::Command::getUrEvents( | ||
impl->CGData.MEvents, impl->get_queue_or_null(), false); | ||
|
||
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES | ||
detail::EventImplPtr &LastEventImpl = MLastEvent; | ||
#else | ||
const detail::EventImplPtr &LastEventImpl = | ||
detail::getSyclObjImpl(MLastEvent); | ||
#endif | ||
|
||
bool DiscardEvent = | ||
!impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents(); | ||
if (DiscardEvent) { | ||
|
@@ -577,11 +570,10 @@ event handler::finalize() { | |
DiscardEvent = !KernelUsesAssert; | ||
} | ||
|
||
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES | ||
if (!DiscardEvent) { | ||
LastEventImpl = detail::event_impl::create_completed_host_event(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No idea why we were using |
||
} | ||
#endif | ||
std::shared_ptr<detail::event_impl> ResultEvent = | ||
DiscardEvent | ||
? nullptr | ||
: detail::event_impl::create_device_event(impl->get_queue()); | ||
|
||
#ifdef XPTI_ENABLE_INSTRUMENTATION | ||
const bool xptiEnabled = xptiTraceEnabled(); | ||
|
@@ -612,9 +604,8 @@ event handler::finalize() { | |
enqueueImpKernel( | ||
impl->get_queue(), impl->MNDRDesc, impl->MArgs, KernelBundleImpPtr, | ||
MKernel.get(), toKernelNameStrT(MKernelName), | ||
impl->MKernelNameBasedCachePtr, RawEvents, | ||
DiscardEvent ? nullptr : LastEventImpl.get(), nullptr, | ||
impl->MKernelCacheConfig, impl->MKernelIsCooperative, | ||
impl->MKernelNameBasedCachePtr, RawEvents, ResultEvent.get(), | ||
nullptr, impl->MKernelCacheConfig, impl->MKernelIsCooperative, | ||
impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize, | ||
BinImage, impl->MKernelFuncPtr, impl->MKernelNumArgs, | ||
impl->MKernelParamDescGetter, impl->MKernelHasSpecialCaptures); | ||
|
@@ -624,7 +615,7 @@ event handler::finalize() { | |
if (!DiscardEvent) { | ||
detail::emitInstrumentationGeneral( | ||
StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal, | ||
static_cast<const void *>(LastEventImpl->getHandle())); | ||
static_cast<const void *>(ResultEvent->getHandle())); | ||
} | ||
detail::emitInstrumentationGeneral(StreamID, InstanceID, | ||
CmdTraceEvent, | ||
|
@@ -635,29 +626,32 @@ event handler::finalize() { | |
|
||
if (DiscardEvent) { | ||
EnqueueKernel(); | ||
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES | ||
LastEventImpl->setStateDiscarded(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Line 654 on the right does There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree and it's also going away, so no real concerns either way. |
||
#endif | ||
} else { | ||
detail::queue_impl &Queue = impl->get_queue(); | ||
LastEventImpl->setQueue(Queue); | ||
LastEventImpl->setWorkerQueue(Queue.weak_from_this()); | ||
LastEventImpl->setContextImpl(impl->get_context()); | ||
LastEventImpl->setStateIncomplete(); | ||
LastEventImpl->setSubmissionTime(); | ||
ResultEvent->setQueue(Queue); | ||
ResultEvent->setWorkerQueue(Queue.weak_from_this()); | ||
ResultEvent->setContextImpl(impl->get_context()); | ||
ResultEvent->setStateIncomplete(); | ||
ResultEvent->setSubmissionTime(); | ||
|
||
EnqueueKernel(); | ||
LastEventImpl->setEnqueued(); | ||
ResultEvent->setEnqueued(); | ||
// connect returned event with dependent events | ||
if (!Queue.isInOrder()) { | ||
// MEvents is not used anymore, so can move. | ||
LastEventImpl->getPreparedDepsEvents() = | ||
ResultEvent->getPreparedDepsEvents() = | ||
std::move(impl->CGData.MEvents); | ||
// LastEventImpl is local for current thread, no need to lock. | ||
LastEventImpl->cleanDepEventsThroughOneLevelUnlocked(); | ||
// ResultEvent is local for current thread, no need to lock. | ||
ResultEvent->cleanDepEventsThroughOneLevelUnlocked(); | ||
} | ||
} | ||
return MLastEvent; | ||
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES | ||
return ResultEvent; | ||
#else | ||
return detail::createSyclObjFromImpl<event>( | ||
ResultEvent ? ResultEvent | ||
: detail::event_impl::create_discarded_event()); | ||
#endif | ||
} | ||
} | ||
|
||
|
@@ -939,11 +933,10 @@ event handler::finalize() { | |
std::move(CommandGroup), *Queue, !DiscardEvent); | ||
|
||
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES | ||
MLastEvent = DiscardEvent ? nullptr : Event; | ||
return DiscardEvent ? nullptr : Event; | ||
#else | ||
MLastEvent = detail::createSyclObjFromImpl<event>(Event); | ||
return detail::createSyclObjFromImpl<event>(Event); | ||
#endif | ||
return MLastEvent; | ||
} | ||
|
||
void handler::addReduction(const std::shared_ptr<const void> &ReduObj) { | ||
|
@@ -2474,58 +2467,28 @@ __SYCL_EXPORT void HandlerAccess::preProcess(handler &CGH, | |
AuxHandler.copyCodeLoc(CGH); | ||
F(AuxHandler); | ||
auto E = AuxHandler.finalize(); | ||
assert(!CGH.MIsFinalized && | ||
"Can't do pre-processing if the command has been enqueued already!"); | ||
if (EventNeeded) | ||
CGH.depends_on(E); | ||
} | ||
__SYCL_EXPORT void HandlerAccess::postProcess(handler &CGH, | ||
type_erased_cgfo_ty F) { | ||
// The "hacky" `handler`s manipulation mentioned near the declaration in | ||
// `handler.hpp` and implemented here is far from perfect. A better approach | ||
// would be | ||
// | ||
// bool OrigNeedsEvent = CGH.needsEvent() | ||
// assert(CGH.not_finalized/enqueued()); | ||
// if (!InOrderQueue) | ||
// CGH.setNeedsEvent() | ||
// | ||
// handler PostProcessHandler(Queue, OrigNeedsEvent) | ||
// auto E = CGH.finalize(); // enqueue original or current last | ||
// // post-process | ||
// if (!InOrder) | ||
// PostProcessHandler.depends_on(E) | ||
// | ||
// swap_impls(CGH, PostProcessHandler) | ||
// return; // queue::submit finalizes PostProcessHandler and returns its | ||
// // event if necessary. | ||
// | ||
// Still hackier than "real" `queue::submit` but at least somewhat sane. | ||
// That, however hasn't been tried yet and we have an even hackier approach | ||
// copied from what's been done in an old reductions implementation before | ||
// eventless submission work has started. Not sure how feasible the approach | ||
// above is at this moment. | ||
|
||
// This `finalize` is wrong (at least logically) if | ||
// `assert(!CGH.eventNeeded())` | ||
auto E = CGH.finalize(); | ||
bool EventNeeded = CGH.impl->MEventNeeded; | ||
queue_impl &Q = CGH.impl->get_queue(); | ||
bool InOrder = Q.isInOrder(); | ||
// Cannot use `CGH.eventNeeded()` alone as there might be subsequent | ||
// `postProcess` calls and we cannot address them properly similarly to the | ||
// `finalize` issue described above. `swap_impls` suggested above might be | ||
// able to handle this scenario naturally. | ||
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES | ||
handler_impl HandlerImpl{Q, nullptr, CGH.eventNeeded() || !InOrder}; | ||
handler AuxHandler{HandlerImpl}; | ||
#else | ||
handler AuxHandler{Q.shared_from_this(), CGH.eventNeeded() || !InOrder}; | ||
#endif | ||
if (!InOrder) | ||
AuxHandler.depends_on(E); | ||
AuxHandler.copyCodeLoc(CGH); | ||
F(AuxHandler); | ||
CGH.MLastEvent = AuxHandler.finalize(); | ||
CGH.impl->MEventNeeded = true; | ||
|
||
handler PostProcessHandler{ | ||
std::make_unique<handler_impl>(Q, nullptr, EventNeeded)}; | ||
PostProcessHandler.copyCodeLoc(CGH); | ||
// Extend lifetimes of auxiliary resources till the last kernel in the chain | ||
// finishes: | ||
PostProcessHandler.impl->MAuxiliaryResources = CGH.impl->MAuxiliaryResources; | ||
auto E = CGH.finalize(); | ||
if (!InOrder) | ||
PostProcessHandler.depends_on(E); | ||
F(PostProcessHandler); | ||
swap(CGH, PostProcessHandler); | ||
} | ||
} // namespace detail | ||
} // namespace _V1 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
withAuxHandler
declaration should have been removed earlier whenpostProcess
was introduced instead of it.