From ba940da555461d9082e93deb92b5d2ab713f1b20 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Fri, 23 May 2025 07:36:49 -0700 Subject: [PATCH 1/3] [SYCL] Cache implicit local arg position info --- .../detail/kernel_name_based_cache_t.hpp | 1 + .../program_manager/program_manager.cpp | 24 ++++++++++++++----- .../program_manager/program_manager.hpp | 5 ++-- sycl/source/detail/scheduler/commands.cpp | 12 ++++++---- 4 files changed, 29 insertions(+), 13 deletions(-) diff --git a/sycl/source/detail/kernel_name_based_cache_t.hpp b/sycl/source/detail/kernel_name_based_cache_t.hpp index f1ecd3ec4cd9d..bf1d9eb7b4018 100644 --- a/sycl/source/detail/kernel_name_based_cache_t.hpp +++ b/sycl/source/detail/kernel_name_based_cache_t.hpp @@ -36,6 +36,7 @@ struct FastKernelSubcacheT { struct KernelNameBasedCacheT { FastKernelSubcacheT FastKernelSubcache; + std::optional<std::optional<int>> ImplicitLocalArgPos; }; } // namespace detail diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 5836a215b3216..cad84de8e822a 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -1851,12 +1851,24 @@ void ProgramManager::cacheKernelImplicitLocalArg(RTDeviceBinaryImage &Img) { } } -std::optional<int> -ProgramManager::kernelImplicitLocalArgPos(KernelNameStrRefT KernelName) const { - auto it = m_KernelImplicitLocalArgPos.find(KernelName); - if (it != m_KernelImplicitLocalArgPos.end()) - return it->second; - return {}; +std::optional<int> ProgramManager::kernelImplicitLocalArgPos( + KernelNameStrRefT KernelName, + KernelNameBasedCacheT *KernelNameBasedCachePtr) const { + auto getLocalArgPos = [&]() -> std::optional<int> { + auto it = m_KernelImplicitLocalArgPos.find(KernelName); + if (it != m_KernelImplicitLocalArgPos.end()) + return it->second; + return {}; + }; + + if (!KernelNameBasedCachePtr) + return getLocalArgPos(); + std::optional<std::optional<int>> &ImplicitLocalArgPos = 
KernelNameBasedCachePtr->ImplicitLocalArgPos; + if (!ImplicitLocalArgPos.has_value()) { + ImplicitLocalArgPos = getLocalArgPos(); + } + return ImplicitLocalArgPos.value(); } static bool isBfloat16DeviceLibImage(sycl_device_binary RawImg, diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 27c4610421ca4..83f7051920438 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -365,8 +365,9 @@ class ProgramManager { SanitizerType kernelUsesSanitizer() const { return m_SanitizerFoundInImage; } - std::optional<int> - kernelImplicitLocalArgPos(KernelNameStrRefT KernelName) const; + std::optional<int> kernelImplicitLocalArgPos( + KernelNameStrRefT KernelName, + KernelNameBasedCacheT *KernelNameBasedCachePtr) const; std::set getRawDeviceImages(const std::vector &KernelIDs); diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 4309b6f1c0395..8fa5d0aac2d47 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2390,8 +2390,9 @@ static ur_result_t SetKernelParamsAndLaunch( const std::function &getMemAllocationFunc, bool IsCooperative, bool KernelUsesClusterLaunch, uint32_t WorkGroupMemorySize, const RTDeviceBinaryImage *BinImage, - KernelNameStrRefT KernelName, void *KernelFuncPtr = nullptr, - int KernelNumArgs = 0, + KernelNameStrRefT KernelName, + KernelNameBasedCacheT *KernelNameBasedCachePtr, + void *KernelFuncPtr = nullptr, int KernelNumArgs = 0, detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr, bool KernelHasSpecialCaptures = true) { assert(Queue && "Kernel submissions should have an associated queue"); @@ -2439,7 +2440,8 @@ static ur_result_t SetKernelParamsAndLaunch( } std::optional<int> ImplicitLocalArg = - ProgramManager::getInstance().kernelImplicitLocalArgPos(KernelName); + 
ProgramManager::getInstance().kernelImplicitLocalArgPos( + KernelName, KernelNameBasedCachePtr); // Set the implicit local memory buffer to support // get_work_group_scratch_memory. This is for backend not supporting // CUDA-style local memory setting. Note that we may have -1 as a position, @@ -2775,8 +2777,8 @@ void enqueueImpKernel( Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList, OutEventImpl, EliminatedArgMask, getMemAllocationFunc, KernelIsCooperative, KernelUsesClusterLaunch, WorkGroupMemorySize, - BinImage, KernelName, KernelFuncPtr, KernelNumArgs, - KernelParamDescGetter, KernelHasSpecialCaptures); + BinImage, KernelName, KernelNameBasedCachePtr, KernelFuncPtr, + KernelNumArgs, KernelParamDescGetter, KernelHasSpecialCaptures); const AdapterPtr &Adapter = Queue->getAdapter(); if (!SyclKernelImpl && !MSyclKernel) { From eae1db07347639a1b1ef8873aea9d79ed810a084 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Mon, 2 Jun 2025 05:15:36 -0700 Subject: [PATCH 2/3] Fix default value of cache ptr --- sycl/source/detail/handler_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index 9dac6a7f435d4..e54aaa662a335 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -210,7 +210,7 @@ class handler_impl { bool MKernelHasSpecialCaptures = true; // A pointer to a kernel name based cache retrieved on the application side. 
- KernelNameBasedCacheT *MKernelNameBasedCachePtr; + KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; }; } // namespace detail From 6abce4eafdba80e24de7c82d1e0eca43379eb501 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Mon, 2 Jun 2025 07:59:23 -0700 Subject: [PATCH 3/3] Add a comment --- sycl/source/detail/kernel_name_based_cache_t.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/source/detail/kernel_name_based_cache_t.hpp b/sycl/source/detail/kernel_name_based_cache_t.hpp index a4bd1d349ce76..300df4e4cdf57 100644 --- a/sycl/source/detail/kernel_name_based_cache_t.hpp +++ b/sycl/source/detail/kernel_name_based_cache_t.hpp @@ -38,6 +38,9 @@ struct FastKernelSubcacheT { struct KernelNameBasedCacheT { FastKernelSubcacheT FastKernelSubcache; std::optional UsesAssert; + // Implicit local argument position is represented by an optional int, this + // uses another optional on top of that to represent lazy initialization of + // the cached value. std::optional<std::optional<int>> ImplicitLocalArgPos; };