intel · uditagarwal97 · May 5, 2025 · May 28, 2025 · May 28, 2025 · vinser52
@@ -403,6 +403,38 @@ class KernelProgramCache {
               << "][Kernel Cache]" << Identifier << Msg << std::endl;
   }
 
+  // (1) state >= 0 means that there are no writers holding the lock and there
+  //     are zero or more readers holding the lock.
+  // (2) state < 0 means that there is one writer holding the lock.
+  // (3) Reader can not acquire the lock if state < 0 (there's a writer).
+  // (4) Writer can not acquire the lock if state >= 0 (there are readers).
+  // (5) There's only one writer at a time, but there can be multiple readers.
+  // (6) The writer is only allowed to delete an entry, we anyway do not support
+  //     modyfying an entry in the cache.
+
+  void acquireReaderLock() {
+    // If state >= 0, increment it.
+    // If state < 0, wait until it becomes non-negative.
+    int expected;
+    do {
+      expected = state.load();
+      while (expected < 0) { // Wait if a writer holds the lock
+        expected = state.load();
+      }
+    } while (!state.compare_exchange_weak(expected, expected + 1));
+  }
+
+  void releaseReaderLock() { state--; }
+
+  void acquireWriterLock() {
+    int expected = 0;
+    while (!state.compare_exchange_weak(expected, -1)) {
+      expected = 0;
+    }
+  }
+
+  void releaseWriterLock() { state.store(0); }
+
   Locked<ProgramCache> acquireCachedPrograms() {
     return {MCachedPrograms, MProgramCacheMutex};
   }
@@ -526,6 +558,12 @@ class KernelProgramCache {
                             ProgramCache &ProgCache) {
     auto It = ProgCache.Cache.find(CacheKey);
 
+    // Make sure there are no readers using handle of kernel/program we are
+    // about to remove. We use one lock for the whole cache and not individual
+    // locks for each program, because we removing items from cache is a very
+    // rare operation.
+    acquireWriterLock();
+
     if (It != ProgCache.Cache.end()) {
       // We are about to remove this program now.
       // (1) Remove it from KernelPerProgram cache.
@@ -609,6 +647,8 @@ class KernelProgramCache {
       throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
                             "Program not found in the cache.");
 
+    releaseWriterLock();
+
     return MCachedPrograms.ProgramCacheSizeInBytes;
   }
 
@@ -713,6 +753,11 @@ class KernelProgramCache {
   ///
   /// This member function should only be used in unit tests.
   void reset() {
+
+    // We are about to delete items from the cache. Make sure there are no
+    // readers.
+    acquireWriterLock();
+
     std::lock_guard<std::mutex> EvictionListLock(MProgramEvictionListMutex);
     std::lock_guard<std::mutex> L1(MProgramCacheMutex);
     std::lock_guard<std::mutex> L2(MKernelsPerProgramCacheMutex);
@@ -723,6 +768,9 @@ class KernelProgramCache {
     MProgramToFastKernelCacheKeyMap.clear();
     // Clear the eviction lists and its mutexes.
     MEvictionList.clear();
+
+    // Release the writer lock.
+    releaseWriterLock();
   }
 
   /// Try to fetch entity (kernel or program) from cache. If there is no such
@@ -860,6 +908,14 @@ class KernelProgramCache {
   // Mutexes that will be used when accessing the eviction lists.
   std::mutex MProgramEvictionListMutex;
 
+  // Implements a reader-writer lock.
+  // Cache might own kernel and program handles that it stores.
+  // So, we need to be careful while deleting items from cache as
+  // another thread might be using a kernel/program handle that we just
+  // deleted. That will lead to use-after-free and undefined behavior.
+  // The state variable is used to implement a reader-writer lock.
+  std::atomic<int> state{0};
+
   friend class ::MockKernelProgramCache;
 
   const AdapterPtr &getAdapter();

@@ -851,7 +851,8 @@ CheckAndDecompressImage([[maybe_unused]] RTDeviceBinaryImage *Img) {
 // its ref count incremented.
 ur_program_handle_t ProgramManager::getBuiltURProgram(
     const ContextImplPtr &ContextImpl, device_impl &DeviceImpl,
-    KernelNameStrRefT KernelName, const NDRDescT &NDRDesc) {
+    KernelNameStrRefT KernelName, const NDRDescT &NDRDesc,
+    const bool TransferOwnershipToCache) {
   device_impl *RootDevImpl;
   ur_bool_t MustBuildOnSubdevice = true;
 
@@ -898,13 +899,14 @@ ur_program_handle_t ProgramManager::getBuiltURProgram(
             std::back_inserter(AllImages));
 
   return getBuiltURProgram(std::move(AllImages), ContextImpl,
-                           {std::move(Device)});
+                           {std::move(Device)}, nullptr, {},
+                           TransferOwnershipToCache);
 }
 
 ur_program_handle_t ProgramManager::getBuiltURProgram(
     const BinImgWithDeps &ImgWithDeps, const ContextImplPtr &ContextImpl,
     const std::vector<device> &Devs, const DevImgPlainWithDeps *DevImgWithDeps,
-    const SerializedObj &SpecConsts) {
+    const SerializedObj &SpecConsts, const bool TransferOwnershipToCache) {
   std::string CompileOpts;
   std::string LinkOpts;
   applyOptionsFromEnvironment(CompileOpts, LinkOpts);
@@ -1105,7 +1107,9 @@ ur_program_handle_t ProgramManager::getBuiltURProgram(
   // stored in the cache, and one handle is returned to the
   // caller. In that case, we need to increase the ref count of the
   // program.
-  Adapter->call<UrApiKind::urProgramRetain>(ResProgram);
+  if (!TransferOwnershipToCache)
+    Adapter->call<UrApiKind::urProgramRetain>(ResProgram);
+
   return ResProgram;
 }
 // When caching is enabled, the returned UrProgram and UrKernel will
@@ -1115,7 +1119,8 @@ std::tuple<ur_kernel_handle_t, std::mutex *, const KernelArgMask *,
 ProgramManager::getOrCreateKernel(
     const ContextImplPtr &ContextImpl, device_impl &DeviceImpl,
     KernelNameStrRefT KernelName,
-    KernelNameBasedCacheT *KernelNameBasedCachePtr, const NDRDescT &NDRDesc) {
+    KernelNameBasedCacheT *KernelNameBasedCachePtr, const NDRDescT &NDRDesc,
+    const bool TransferOwnershipToCache) {
   if constexpr (DbgProgMgr > 0) {
     std::cerr << ">>> ProgramManager::getOrCreateKernel(" << ContextImpl.get()
               << ", " << &DeviceImpl << ", " << KernelName << ")\n";
@@ -1134,18 +1139,22 @@ ProgramManager::getOrCreateKernel(
     constexpr size_t Kernel = 0;  // see FastKernelCacheValT tuple
     constexpr size_t Program = 3; // see FastKernelCacheValT tuple
     if (std::get<Kernel>(ret_tuple)) {
-      // Pulling a copy of a kernel and program from the cache,
-      // so we need to retain those resources.
-      ContextImpl->getAdapter()->call<UrApiKind::urKernelRetain>(
-          std::get<Kernel>(ret_tuple));
-      ContextImpl->getAdapter()->call<UrApiKind::urProgramRetain>(
-          std::get<Program>(ret_tuple));
+      // No need to retain if cache is the owner as we won't be
+      // releasing them elsewhere.
+      if (!TransferOwnershipToCache) {
+        // Pulling a copy of a kernel and program from the cache,
+        // so we need to retain those resources.
+        ContextImpl->getAdapter()->call<UrApiKind::urKernelRetain>(
+            std::get<Kernel>(ret_tuple));
+        ContextImpl->getAdapter()->call<UrApiKind::urProgramRetain>(
+            std::get<Program>(ret_tuple));
+      }
       return ret_tuple;
     }
   }
 
-  ur_program_handle_t Program =
-      getBuiltURProgram(ContextImpl, DeviceImpl, KernelName, NDRDesc);
+  ur_program_handle_t Program = getBuiltURProgram(
+      ContextImpl, DeviceImpl, KernelName, NDRDesc, TransferOwnershipToCache);
 
   auto BuildF = [this, &Program, &KernelName, &ContextImpl] {
     ur_kernel_handle_t Kernel = nullptr;
@@ -1189,12 +1198,13 @@ ProgramManager::getOrCreateKernel(
   auto ret_val = std::make_tuple(KernelArgMaskPair.first,
                                  &(BuildResult->MBuildResultMutex),
                                  KernelArgMaskPair.second, Program);
-  // If caching is enabled, one copy of the kernel handle will be
-  // stored in the cache, and one handle is returned to the
-  // caller. In that case, we need to increase the ref count of the
-  // kernel.
-  ContextImpl->getAdapter()->call<UrApiKind::urKernelRetain>(
-      KernelArgMaskPair.first);
+
+  // If cache is the owner, we won't be releasing them elsewhere,
+  // so no need to retain.
+  if (!TransferOwnershipToCache)
+    ContextImpl->getAdapter()->call<UrApiKind::urKernelRetain>(
+        KernelArgMaskPair.first);
+
   Cache.saveKernel(KernelName, UrDevice, ret_val, CacheHintPtr);
   return ret_val;
 }

@@ -175,10 +175,10 @@ class ProgramManager {
   /// \param Context the context to build the program with
   /// \param Device the device for which the program is built
   /// \param KernelName the kernel's name
-  ur_program_handle_t getBuiltURProgram(const ContextImplPtr &ContextImpl,
-                                        device_impl &DeviceImpl,
-                                        KernelNameStrRefT KernelName,
-                                        const NDRDescT &NDRDesc = {});
+  ur_program_handle_t
+  getBuiltURProgram(const ContextImplPtr &ContextImpl, device_impl &DeviceImpl,
+                    KernelNameStrRefT KernelName, const NDRDescT &NDRDesc = {},
+                    const bool TransferOwnershipToCache = false);
 
   /// Builds a program from a given set of images or retrieves that program from
   /// cache.
@@ -195,14 +195,16 @@ class ProgramManager {
                     const ContextImplPtr &ContextImpl,
                     const std::vector<device> &Devs,
                     const DevImgPlainWithDeps *DevImgWithDeps = nullptr,
-                    const SerializedObj &SpecConsts = {});
+                    const SerializedObj &SpecConsts = {},
+                    const bool TransferOwnershipToCache = false);
 
   std::tuple<ur_kernel_handle_t, std::mutex *, const KernelArgMask *,
              ur_program_handle_t>
   getOrCreateKernel(const ContextImplPtr &ContextImpl, device_impl &DeviceImpl,
                     KernelNameStrRefT KernelName,
                     KernelNameBasedCacheT *KernelNameBasedCachePtr,
-                    const NDRDescT &NDRDesc = {});
+                    const NDRDescT &NDRDesc = {},
+                    const bool TransferOwnershipToCache = false);
 
   ur_kernel_handle_t getCachedMaterializedKernel(
       KernelNameStrRefT KernelName,

@@ -2703,6 +2703,8 @@ void enqueueImpKernel(
 
   std::shared_ptr<kernel_impl> SyclKernelImpl;
   std::shared_ptr<device_image_impl> DeviceImageImpl;
+  // Transfer ownership only of cache is enabled.
+  const bool TransferownerShipToCache = SYCLConfig<SYCL_CACHE_IN_MEM>::get();
 
   if (nullptr != MSyclKernel) {
     assert(MSyclKernel->get_info<info::kernel::context>() ==
@@ -2730,10 +2732,17 @@ void enqueueImpKernel(
     EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
     KernelMutex = SyclKernelImpl->getCacheMutex();
   } else {
+
+    // Acquire the reader lock if cache holds the ownership. This ensures
+    // that the fetched kernel and program are not evicted while we are
+    // using them.
+    if (TransferownerShipToCache)
+      ContextImpl->getKernelProgramCache().acquireReaderLock();
+
     std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
         detail::ProgramManager::getInstance().getOrCreateKernel(
             ContextImpl, DeviceImpl, KernelName, KernelNameBasedCachePtr,
-            NDRDesc);
+            NDRDesc, TransferownerShipToCache);
   }
 
   // We may need more events for the launch, so we make another reference.
@@ -2779,10 +2788,19 @@ void enqueueImpKernel(
         BinImage, KernelName, KernelFuncPtr, KernelNumArgs,
         KernelParamDescGetter, KernelHasSpecialCaptures);
 
-    const AdapterPtr &Adapter = Queue->getAdapter();
-    if (!SyclKernelImpl && !MSyclKernel) {
-      Adapter->call<UrApiKind::urKernelRelease>(Kernel);
-      Adapter->call<UrApiKind::urProgramRelease>(Program);
+    // If cache is owning the kernel and programs, we don't have to release
+    // them here, as they will be released when the cache is destroyed or
+    // when the kernel is evicted from the cache.
+    if (!SyclKernelImpl && !MSyclKernel && !TransferownerShipToCache) {
+
+      if (TransferownerShipToCache)
+        ContextImpl->getKernelProgramCache().releaseReaderLock();
+      else {
+        // If cache is disabled, we need to release the kernel and program.
+        const AdapterPtr &Adapter = Queue->getAdapter();
+        Adapter->call<UrApiKind::urKernelRelease>(Kernel);
+        Adapter->call<UrApiKind::urProgramRelease>(Program);
+      }
     }
   }
   if (UR_RESULT_SUCCESS != Error) {

@@ -29,14 +29,10 @@ int main() {
   // CHECK: <--- urEventWait
 
   // CHECK-CACHE: <--- urProgramCreate
-  // CHECK-CACHE: <--- urProgramRetain
   // CHECK-CACHE-NOT: <--- urProgramRetain
   // CHECK-CACHE: <--- urKernelCreate
-  // CHECK-CACHE: <--- urKernelRetain
   // CHECK-CACHE-NOT: <--- urKernelCreate
   // CHECK-CACHE: <--- urEnqueueKernelLaunch
-  // CHECK-CACHE: <--- urKernelRelease
-  // CHECK-CACHE: <--- urProgramRelease
   // CHECK-CACHE: <--- urEventWait
   q.single_task([] {}).wait();
 

@@ -43,8 +43,6 @@
 // CHECK:      UR Call Begin : urKernelSetArgPointer
 // CHECK-NEXT: UR Call Begin : urKernelGetGroupInfo
 // CHECK-NEXT: UR Call Begin : urEnqueueKernelLaunch
-// CHECK-NEXT: UR Call Begin : urKernelRelease
-// CHECK-NEXT: UR Call Begin : urProgramRelease
 // CHECK-NEXT: Signal
 // CHECK-DAG:    queue_id : {{.*}}
 // CHECK-DAG:    sym_line_no : {{.*}}

@@ -119,7 +119,7 @@ TEST(GetNative, GetNativeHandle) {
   EXPECT_EQ(mockOpenCLNumQueueRetains(), 0ul);
   EXPECT_EQ(mockOpenCLNumDeviceRetains(), 0ul);
   EXPECT_EQ(mockOpenCLNumEventRetains(), 0ul);
-  ASSERT_EQ(TestCounter, 2 + DeviceRetainCounter - 1)
+  ASSERT_EQ(TestCounter, DeviceRetainCounter)
       << "Not all the retain methods were called";
 
   get_native<backend::opencl>(Context);
@@ -135,6 +135,6 @@ TEST(GetNative, GetNativeHandle) {
 
   // get_native shouldn't retain the SYCL objects, but instead retains the
   // underlying handles
-  ASSERT_EQ(TestCounter, 2 + DeviceRetainCounter - 1)
+  ASSERT_EQ(TestCounter, DeviceRetainCounter)
       << "get_native retained SYCL objects";
 }
@@ -158,7 +158,8 @@ TEST_P(MultipleDeviceCacheTest, ProgramRetain) {
     // MultipleDevsCacheTestKernel.
     EXPECT_EQ(BundleImpl->size(), size_t{1});
 
-    int NumRetains = BundleImpl->size() * std::pow(2, NumDevices) - 1;
+    // '-2' because fetching the same program from cache won't call retain.
+    int NumRetains = BundleImpl->size() * std::pow(2, NumDevices) - 2;
     EXPECT_EQ(RetainCounter, NumRetains)
         << "Expect " << NumRetains << " piProgramRetain calls";
 
@@ -180,9 +181,11 @@ TEST_P(MultipleDeviceCacheTest, ProgramRetain) {
   // expect 3 urKernelRelease calls.
 
   // We create 2 kernels in the test. So, we expect
-  // 4 urKernelRelease calls (correpsonding to 2 create calls + 2 retain calls
+  // 3 urKernelRelease calls (correpsonding to 2 create calls + 1 retain calls
   // when handle is returned to the caller).
-  EXPECT_EQ(KernelReleaseCounter, 4) << "Expect 4 piKernelRelease calls";
+  // While kernel is created using handler::single_task(), the retain call
+  // will be made when cache is destroyed (during shutdown).
+  EXPECT_EQ(KernelReleaseCounter, 3) << "Expect 3 piKernelRelease calls";
 }
 
 INSTANTIATE_TEST_SUITE_P(