intel · aelovikov-intel · Jun 30, 2025 · Jun 27, 2025 · Jun 27, 2025 · Jun 27, 2025
@@ -46,11 +46,6 @@ enum class strategy : int {
   multi,
 };
 
-// Reductions implementation need access to private members of handler. Those
-// are limited to those below.
-inline void finalizeHandler(handler &CGH);
-template <class FunctorTy> void withAuxHandler(handler &CGH, FunctorTy Func);
-
 template <int Dims>
 item<Dims, false> getDelinearizedItem(range<Dims> Range, id<Dims> Id) {
   return Builder::createItem<Dims, false>(Range, Id);

@@ -488,6 +488,9 @@ class __SYCL_EXPORT handler {
   /// \param Graph is a SYCL command_graph
   handler(std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph);
 #endif
+  handler(std::unique_ptr<detail::handler_impl> &&HandlerImpl);
+
+  ~handler();
 
   void *storeRawArg(const void *Ptr, size_t Size);
 
@@ -619,8 +622,6 @@ class __SYCL_EXPORT handler {
     addReduction(std::shared_ptr<const void>(ReduBuf));
   }
 
-  ~handler() = default;
-
 #ifdef __SYCL_DEVICE_ONLY__
   // In device compilation accessor isn't inherited from host base classes, so
   // can't detect by it. Since we don't expect it to be ever called in device
@@ -3396,9 +3397,7 @@ class __SYCL_EXPORT handler {
 
 private:
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-  // TODO: Maybe make it a reference when non-preview branch is removed.
-  // On the other hand, see `HandlerAccess:postProcess` to how `swap_impl` might
-  // be useful in future, pointer here would make that possible/easier.
+  std::unique_ptr<detail::handler_impl> implOwner;
   detail::handler_impl *impl;
 #else
   std::shared_ptr<detail::handler_impl> impl;
@@ -3423,11 +3422,10 @@ class __SYCL_EXPORT handler {
   std::unique_ptr<detail::HostKernelBase> MHostKernel;
 
   detail::code_location MCodeLoc = {};
-  bool MIsFinalized = false;
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-  detail::EventImplPtr MLastEvent;
-#else
-  event MLastEvent;
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+  // Was used for the previous reduction implementation (via `withAuxHandler`).
+  bool MIsFinalizedDoNotUse = false;
+  event MLastEventDoNotUse;
 #endif
 
   // Make queue_impl class friend to be able to call finalize method.
@@ -3452,8 +3450,6 @@ class __SYCL_EXPORT handler {
             bool ExplicitIdentity, typename RedOutVar>
   friend class detail::reduction_impl_algo;
 
-  friend inline void detail::reduction::finalizeHandler(handler &CGH);
-
   template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
             typename PropertiesT, typename... RestT>
   friend void detail::reduction_parallel_for(handler &CGH, range<Dims> NDRange,
@@ -3920,6 +3916,30 @@ class HandlerAccess {
     Handler.parallel_for_impl(Range, Props, Kernel);
   }
 
+  static void swap(handler &LHS, handler &RHS) {
+#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
+    std::swap(LHS.implOwner, RHS.implOwner);
+#endif
+    std::swap(LHS.impl, RHS.impl);
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+    std::swap(LHS.MQueueDoNotUse, RHS.MQueueDoNotUse);
+#endif
+    std::swap(LHS.MLocalAccStorage, RHS.MLocalAccStorage);
+    std::swap(LHS.MStreamStorage, RHS.MStreamStorage);
+    std::swap(LHS.MKernelName, RHS.MKernelName);
+    std::swap(LHS.MKernel, RHS.MKernel);
+    std::swap(LHS.MSrcPtr, RHS.MSrcPtr);
+    std::swap(LHS.MDstPtr, RHS.MDstPtr);
+    std::swap(LHS.MLength, RHS.MLength);
+    std::swap(LHS.MPattern, RHS.MPattern);
+    std::swap(LHS.MHostKernel, RHS.MHostKernel);
+    std::swap(LHS.MCodeLoc, RHS.MCodeLoc);
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+    std::swap(LHS.MIsFinalizedDoNotUse, RHS.MIsFinalizedDoNotUse);
+    std::swap(LHS.MLastEventDoNotUse, RHS.MLastEventDoNotUse);
+#endif
+  }
+
   // pre/postProcess are used only for reductions right now, but the
   // abstractions they provide aren't reduction-specific. The main problem they
   // solve is
@@ -3932,9 +3952,8 @@ class HandlerAccess {
   //
   // that needs to be implemented as multiple enqueues involving
   // pre-/post-processing internally. SYCL prohibits recursive submits from
-  // inside control group function object (lambda above) so we resort to a
-  // somewhat hacky way of creating multiple `handler`s and manual finalization
-  // of them (instead of the one in `queue::submit`).
+  // inside control group function object (lambda above) so we need some
+  // internal interface to implement that.
   __SYCL_EXPORT static void preProcess(handler &CGH, type_erased_cgfo_ty F);
   __SYCL_EXPORT static void postProcess(handler &CGH, type_erased_cgfo_ty F);
 

@@ -1162,10 +1162,6 @@ auto make_reduction(RedOutVar RedVar, RestTy &&...Rest) {
       RedVar, std::forward<RestTy>(Rest)...};
 }
 
-namespace reduction {
-inline void finalizeHandler(handler &CGH) { CGH.finalize(); }
-} // namespace reduction
-
 // This method is used for implementation of parallel_for accepting 1 reduction.
 // TODO: remove this method when everything is switched to general algorithm
 // implementing arbitrary number of reductions in parallel_for().
@@ -1723,8 +1719,6 @@ struct NDRangeReduction<
       }
     });
 
-    reduction::finalizeHandler(CGH);
-
     // Run the additional kernel as many times as needed to reduce all partial
     // sums into one scalar.
 
@@ -1901,8 +1895,6 @@ template <> struct NDRangeReduction<reduction::strategy::basic> {
     else
       First(KernelMultipleWGTag{});
 
-    reduction::finalizeHandler(CGH);
-
     // 2. Run the additional kernel as many times as needed to reduce
     // all partial sums into one scalar.
 
@@ -2598,7 +2590,6 @@ template <> struct NDRangeReduction<reduction::strategy::multi> {
 
     reduCGFuncMulti<KernelName>(CGH, KernelFunc, NDRange, Properties, ReduTuple,
                                 ReduIndices);
-    reduction::finalizeHandler(CGH);
 
     size_t NWorkItems = NDRange.get_group_range().size();
     while (NWorkItems > 1) {

@@ -318,7 +318,12 @@ fill_copy_args(detail::handler_impl *impl,
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
 handler::handler(detail::handler_impl &HandlerImpl) : impl(&HandlerImpl) {}
+handler::handler(std::unique_ptr<detail::handler_impl> &&HandlerImpl)
+    : implOwner(std::move(HandlerImpl)), impl(implOwner.get()) {}
 #else
+handler::handler(std::unique_ptr<detail::handler_impl> &&HandlerImpl)
+    : impl(std::move(HandlerImpl)) {}
+
 handler::handler(std::shared_ptr<detail::queue_impl> Queue,
                  bool CallerNeedsEvent)
     : impl(std::make_shared<detail::handler_impl>(*Queue, nullptr,
@@ -344,6 +349,7 @@ handler::handler(
     : impl(std::make_shared<detail::handler_impl>(*Graph)) {}
 
 #endif
+handler::~handler() = default;
 
 // Sets the submission state to indicate that an explicit kernel bundle has been
 // set. Throws a sycl::exception with errc::invalid if the current state
@@ -426,12 +432,6 @@ detail::EventImplPtr handler::finalize() {
 #else
 event handler::finalize() {
 #endif
-  // This block of code is needed only for reduction implementation.
-  // It is harmless (does nothing) for everything else.
-  if (MIsFinalized)
-    return MLastEvent;
-  MIsFinalized = true;
-
   const auto &type = getType();
   detail::queue_impl *Queue = impl->get_queue_or_null();
   ext::oneapi::experimental::detail::graph_impl *Graph =
@@ -559,13 +559,6 @@ event handler::finalize() {
       std::vector<ur_event_handle_t> RawEvents = detail::Command::getUrEvents(
           impl->CGData.MEvents, impl->get_queue_or_null(), false);
 
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-      detail::EventImplPtr &LastEventImpl = MLastEvent;
-#else
-      const detail::EventImplPtr &LastEventImpl =
-          detail::getSyclObjImpl(MLastEvent);
-#endif
-
       bool DiscardEvent =
           !impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents();
       if (DiscardEvent) {
@@ -577,11 +570,10 @@ event handler::finalize() {
         DiscardEvent = !KernelUsesAssert;
       }
 
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-      if (!DiscardEvent) {
-        LastEventImpl = detail::event_impl::create_completed_host_event();
-      }
-#endif
+      std::shared_ptr<detail::event_impl> ResultEvent =
+          DiscardEvent
+              ? nullptr
+              : detail::event_impl::create_device_event(impl->get_queue());
 
 #ifdef XPTI_ENABLE_INSTRUMENTATION
       const bool xptiEnabled = xptiTraceEnabled();
@@ -612,9 +604,8 @@ event handler::finalize() {
         enqueueImpKernel(
             impl->get_queue(), impl->MNDRDesc, impl->MArgs, KernelBundleImpPtr,
             MKernel.get(), toKernelNameStrT(MKernelName),
-            impl->MKernelNameBasedCachePtr, RawEvents,
-            DiscardEvent ? nullptr : LastEventImpl.get(), nullptr,
-            impl->MKernelCacheConfig, impl->MKernelIsCooperative,
+            impl->MKernelNameBasedCachePtr, RawEvents, ResultEvent.get(),
+            nullptr, impl->MKernelCacheConfig, impl->MKernelIsCooperative,
             impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize,
             BinImage, impl->MKernelFuncPtr, impl->MKernelNumArgs,
             impl->MKernelParamDescGetter, impl->MKernelHasSpecialCaptures);
@@ -624,7 +615,7 @@ event handler::finalize() {
           if (!DiscardEvent) {
             detail::emitInstrumentationGeneral(
                 StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
-                static_cast<const void *>(LastEventImpl->getHandle()));
+                static_cast<const void *>(ResultEvent->getHandle()));
           }
           detail::emitInstrumentationGeneral(StreamID, InstanceID,
                                              CmdTraceEvent,
@@ -635,29 +626,32 @@ event handler::finalize() {
 
       if (DiscardEvent) {
         EnqueueKernel();
-#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
-        LastEventImpl->setStateDiscarded();
-#endif
       } else {
         detail::queue_impl &Queue = impl->get_queue();
-        LastEventImpl->setQueue(Queue);
-        LastEventImpl->setWorkerQueue(Queue.weak_from_this());
-        LastEventImpl->setContextImpl(impl->get_context());
-        LastEventImpl->setStateIncomplete();
-        LastEventImpl->setSubmissionTime();
+        ResultEvent->setQueue(Queue);
+        ResultEvent->setWorkerQueue(Queue.weak_from_this());
+        ResultEvent->setContextImpl(impl->get_context());
+        ResultEvent->setStateIncomplete();
+        ResultEvent->setSubmissionTime();
 
         EnqueueKernel();
-        LastEventImpl->setEnqueued();
+        ResultEvent->setEnqueued();
         // connect returned event with dependent events
         if (!Queue.isInOrder()) {
           // MEvents is not used anymore, so can move.
-          LastEventImpl->getPreparedDepsEvents() =
+          ResultEvent->getPreparedDepsEvents() =
               std::move(impl->CGData.MEvents);
-          // LastEventImpl is local for current thread, no need to lock.
-          LastEventImpl->cleanDepEventsThroughOneLevelUnlocked();
+          // ResultEvent is local for current thread, no need to lock.
+          ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
         }
       }
-      return MLastEvent;
+#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
+      return ResultEvent;
+#else
+      return detail::createSyclObjFromImpl<event>(
+          ResultEvent ? ResultEvent
+                      : detail::event_impl::create_discarded_event());
+#endif
     }
   }
 
@@ -939,11 +933,10 @@ event handler::finalize() {
       std::move(CommandGroup), *Queue, !DiscardEvent);
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-  MLastEvent = DiscardEvent ? nullptr : Event;
+  return DiscardEvent ? nullptr : Event;
 #else
-  MLastEvent = detail::createSyclObjFromImpl<event>(Event);
+  return detail::createSyclObjFromImpl<event>(Event);
 #endif
-  return MLastEvent;
 }
 
 void handler::addReduction(const std::shared_ptr<const void> &ReduObj) {
@@ -2474,58 +2467,28 @@ __SYCL_EXPORT void HandlerAccess::preProcess(handler &CGH,
   AuxHandler.copyCodeLoc(CGH);
   F(AuxHandler);
   auto E = AuxHandler.finalize();
-  assert(!CGH.MIsFinalized &&
-         "Can't do pre-processing if the command has been enqueued already!");
   if (EventNeeded)
     CGH.depends_on(E);
 }
 __SYCL_EXPORT void HandlerAccess::postProcess(handler &CGH,
                                               type_erased_cgfo_ty F) {
-  // The "hacky" `handler`s manipulation mentioned near the declaration in
-  // `handler.hpp` and implemented here is far from perfect. A better approach
-  // would be
-  //
-  //    bool OrigNeedsEvent = CGH.needsEvent()
-  //    assert(CGH.not_finalized/enqueued());
-  //    if (!InOrderQueue)
-  //      CGH.setNeedsEvent()
-  //
-  //    handler PostProcessHandler(Queue, OrigNeedsEvent)
-  //    auto E = CGH.finalize(); // enqueue original or current last
-  //                             // post-process
-  //    if (!InOrder)
-  //      PostProcessHandler.depends_on(E)
-  //
-  //    swap_impls(CGH, PostProcessHandler)
-  //    return; // queue::submit finalizes PostProcessHandler and returns its
-  //            // event if necessary.
-  //
-  // Still hackier than "real" `queue::submit` but at least somewhat sane.
-  // That, however hasn't been tried yet and we have an even hackier approach
-  // copied from what's been done in an old reductions implementation before
-  // eventless submission work has started. Not sure how feasible the approach
-  // above is at this moment.
-
-  // This `finalize` is wrong (at least logically) if
-  //   `assert(!CGH.eventNeeded())`
-  auto E = CGH.finalize();
+  bool EventNeeded = CGH.impl->MEventNeeded;
   queue_impl &Q = CGH.impl->get_queue();
   bool InOrder = Q.isInOrder();
-  // Cannot use `CGH.eventNeeded()` alone as there might be subsequent
-  // `postProcess` calls and we cannot address them properly similarly to the
-  // `finalize` issue described above. `swap_impls` suggested above might be
-  // able to handle this scenario naturally.
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-  handler_impl HandlerImpl{Q, nullptr, CGH.eventNeeded() || !InOrder};
-  handler AuxHandler{HandlerImpl};
-#else
-  handler AuxHandler{Q.shared_from_this(), CGH.eventNeeded() || !InOrder};
-#endif
   if (!InOrder)
-    AuxHandler.depends_on(E);
-  AuxHandler.copyCodeLoc(CGH);
-  F(AuxHandler);
-  CGH.MLastEvent = AuxHandler.finalize();
+    CGH.impl->MEventNeeded = true;
+
+  handler PostProcessHandler{
+      std::make_unique<handler_impl>(Q, nullptr, EventNeeded)};
+  PostProcessHandler.copyCodeLoc(CGH);
+  // Extend lifetimes of auxiliary resources till the last kernel in the chain
+  // finishes:
+  PostProcessHandler.impl->MAuxiliaryResources = CGH.impl->MAuxiliaryResources;
+  auto E = CGH.finalize();
+  if (!InOrder)
+    PostProcessHandler.depends_on(E);
+  F(PostProcessHandler);
+  swap(CGH, PostProcessHandler);
 }
 } // namespace detail
 } // namespace _V1

@@ -3617,12 +3617,16 @@ _ZN4sycl3_V17handler8getQueueEv
 _ZN4sycl3_V17handler8prefetchEPKvm
 _ZN4sycl3_V17handler9clearArgsEv
 _ZN4sycl3_V17handler9fill_implEPvPKvmm
+_ZN4sycl3_V17handlerC1EOSt10unique_ptrINS0_6detail12handler_implESt14default_deleteIS4_EE
 _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE
 _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b
 _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEEb
+_ZN4sycl3_V17handlerC2EOSt10unique_ptrINS0_6detail12handler_implESt14default_deleteIS4_EE
 _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE
 _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b
 _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEEb
+_ZN4sycl3_V17handlerD1Ev
+_ZN4sycl3_V17handlerD2Ev
 _ZN4sycl3_V17samplerC1ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE
 _ZN4sycl3_V17samplerC1EP11_cl_samplerRKNS0_7contextE
 _ZN4sycl3_V17samplerC2ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE

@@ -417,6 +417,7 @@
 ??0gpu_selector@_V1@sycl@@QEAA@$$QEAV012@@Z
 ??0gpu_selector@_V1@sycl@@QEAA@AEBV012@@Z
 ??0gpu_selector@_V1@sycl@@QEAA@XZ
+??0handler@_V1@sycl@@AEAA@$$QEAV?$unique_ptr@Vhandler_impl@detail@_V1@sycl@@U?$default_delete@Vhandler_impl@detail@_V1@sycl@@@std@@@std@@@Z
 ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@@Z
 ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N@Z
 ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@PEAVqueue_impl@detail@12@_N@Z