diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp index 7317b9fd9b309..b498352611227 100644 --- a/sycl/source/detail/kernel_program_cache.hpp +++ b/sycl/source/detail/kernel_program_cache.hpp @@ -403,6 +403,38 @@ class KernelProgramCache { << "][Kernel Cache]" << Identifier << Msg << std::endl; } + // (1) state >= 0 means that there are no writers holding the lock and there + // are zero or more readers holding the lock. + // (2) state < 0 means that there is one writer holding the lock. + // (3) Reader can not acquire the lock if state < 0 (there's a writer). + // (4) Writer can not acquire the lock if state >= 0 (there are readers). + // (5) There's only one writer at a time, but there can be multiple readers. + // (6) The writer is only allowed to delete an entry, we anyway do not support + // modyfying an entry in the cache. + + void acquireReaderLock() { + // If state >= 0, increment it. + // If state < 0, wait until it becomes non-negative. + int expected; + do { + expected = state.load(); + while (expected < 0) { // Wait if a writer holds the lock + expected = state.load(); + } + } while (!state.compare_exchange_weak(expected, expected + 1)); + } + + void releaseReaderLock() { state--; } + + void acquireWriterLock() { + int expected = 0; + while (!state.compare_exchange_weak(expected, -1)) { + expected = 0; + } + } + + void releaseWriterLock() { state.store(0); } + Locked acquireCachedPrograms() { return {MCachedPrograms, MProgramCacheMutex}; } @@ -526,6 +558,12 @@ class KernelProgramCache { ProgramCache &ProgCache) { auto It = ProgCache.Cache.find(CacheKey); + // Make sure there are no readers using handle of kernel/program we are + // about to remove. We use one lock for the whole cache and not individual + // locks for each program, because we removing items from cache is a very + // rare operation. + acquireWriterLock(); + if (It != ProgCache.Cache.end()) { // We are about to remove this program now. // (1) Remove it from KernelPerProgram cache. @@ -609,6 +647,8 @@ class KernelProgramCache { throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), "Program not found in the cache."); + releaseWriterLock(); + return MCachedPrograms.ProgramCacheSizeInBytes; } @@ -713,6 +753,11 @@ class KernelProgramCache { /// /// This member function should only be used in unit tests. void reset() { + + // We are about to delete items from the cache. Make sure there are no + // readers. + acquireWriterLock(); + std::lock_guard EvictionListLock(MProgramEvictionListMutex); std::lock_guard L1(MProgramCacheMutex); std::lock_guard L2(MKernelsPerProgramCacheMutex); @@ -723,6 +768,9 @@ class KernelProgramCache { MProgramToFastKernelCacheKeyMap.clear(); // Clear the eviction lists and its mutexes. MEvictionList.clear(); + + // Release the writer lock. + releaseWriterLock(); } /// Try to fetch entity (kernel or program) from cache. If there is no such @@ -860,6 +908,14 @@ class KernelProgramCache { // Mutexes that will be used when accessing the eviction lists. std::mutex MProgramEvictionListMutex; + // Implements a reader-writer lock. + // Cache might own kernel and program handles that it stores. + // So, we need to be careful while deleting items from cache as + // another thread might be using a kernel/program handle that we just + // deleted. That will lead to use-after-free and undefined behavior. + // The state variable is used to implement a reader-writer lock. + std::atomic state{0}; + friend class ::MockKernelProgramCache; const AdapterPtr &getAdapter(); diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 5836a215b3216..384e67dcbbeed 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -851,7 +851,8 @@ CheckAndDecompressImage([[maybe_unused]] RTDeviceBinaryImage *Img) { // its ref count incremented. ur_program_handle_t ProgramManager::getBuiltURProgram( const ContextImplPtr &ContextImpl, device_impl &DeviceImpl, - KernelNameStrRefT KernelName, const NDRDescT &NDRDesc) { + KernelNameStrRefT KernelName, const NDRDescT &NDRDesc, + const bool TransferOwnershipToCache) { device_impl *RootDevImpl; ur_bool_t MustBuildOnSubdevice = true; @@ -898,13 +899,14 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( std::back_inserter(AllImages)); return getBuiltURProgram(std::move(AllImages), ContextImpl, - {std::move(Device)}); + {std::move(Device)}, nullptr, {}, + TransferOwnershipToCache); } ur_program_handle_t ProgramManager::getBuiltURProgram( const BinImgWithDeps &ImgWithDeps, const ContextImplPtr &ContextImpl, const std::vector &Devs, const DevImgPlainWithDeps *DevImgWithDeps, - const SerializedObj &SpecConsts) { + const SerializedObj &SpecConsts, const bool TransferOwnershipToCache) { std::string CompileOpts; std::string LinkOpts; applyOptionsFromEnvironment(CompileOpts, LinkOpts); @@ -1105,7 +1107,9 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( // stored in the cache, and one handle is returned to the // caller. In that case, we need to increase the ref count of the // program. - Adapter->call(ResProgram); + if (!TransferOwnershipToCache) + Adapter->call(ResProgram); + return ResProgram; } // When caching is enabled, the returned UrProgram and UrKernel will @@ -1115,7 +1119,8 @@ std::tuple 0) { std::cerr << ">>> ProgramManager::getOrCreateKernel(" << ContextImpl.get() << ", " << &DeviceImpl << ", " << KernelName << ")\n"; @@ -1134,18 +1139,22 @@ ProgramManager::getOrCreateKernel( constexpr size_t Kernel = 0; // see FastKernelCacheValT tuple constexpr size_t Program = 3; // see FastKernelCacheValT tuple if (std::get(ret_tuple)) { - // Pulling a copy of a kernel and program from the cache, - // so we need to retain those resources. - ContextImpl->getAdapter()->call( - std::get(ret_tuple)); - ContextImpl->getAdapter()->call( - std::get(ret_tuple)); + // No need to retain if cache is the owner as we won't be + // releasing them elsewhere. + if (!TransferOwnershipToCache) { + // Pulling a copy of a kernel and program from the cache, + // so we need to retain those resources. + ContextImpl->getAdapter()->call( + std::get(ret_tuple)); + ContextImpl->getAdapter()->call( + std::get(ret_tuple)); + } return ret_tuple; } } - ur_program_handle_t Program = - getBuiltURProgram(ContextImpl, DeviceImpl, KernelName, NDRDesc); + ur_program_handle_t Program = getBuiltURProgram( + ContextImpl, DeviceImpl, KernelName, NDRDesc, TransferOwnershipToCache); auto BuildF = [this, &Program, &KernelName, &ContextImpl] { ur_kernel_handle_t Kernel = nullptr; @@ -1189,12 +1198,13 @@ ProgramManager::getOrCreateKernel( auto ret_val = std::make_tuple(KernelArgMaskPair.first, &(BuildResult->MBuildResultMutex), KernelArgMaskPair.second, Program); - // If caching is enabled, one copy of the kernel handle will be - // stored in the cache, and one handle is returned to the - // caller. In that case, we need to increase the ref count of the - // kernel. - ContextImpl->getAdapter()->call( - KernelArgMaskPair.first); + + // If cache is the owner, we won't be releasing them elsewhere, + // so no need to retain. + if (!TransferOwnershipToCache) + ContextImpl->getAdapter()->call( + KernelArgMaskPair.first); + Cache.saveKernel(KernelName, UrDevice, ret_val, CacheHintPtr); return ret_val; } diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 27c4610421ca4..76bbcb22b6eab 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -175,10 +175,10 @@ class ProgramManager { /// \param Context the context to build the program with /// \param Device the device for which the program is built /// \param KernelName the kernel's name - ur_program_handle_t getBuiltURProgram(const ContextImplPtr &ContextImpl, - device_impl &DeviceImpl, - KernelNameStrRefT KernelName, - const NDRDescT &NDRDesc = {}); + ur_program_handle_t + getBuiltURProgram(const ContextImplPtr &ContextImpl, device_impl &DeviceImpl, + KernelNameStrRefT KernelName, const NDRDescT &NDRDesc = {}, + const bool TransferOwnershipToCache = false); /// Builds a program from a given set of images or retrieves that program from /// cache. @@ -195,14 +195,16 @@ class ProgramManager { const ContextImplPtr &ContextImpl, const std::vector &Devs, const DevImgPlainWithDeps *DevImgWithDeps = nullptr, - const SerializedObj &SpecConsts = {}); + const SerializedObj &SpecConsts = {}, + const bool TransferOwnershipToCache = false); std::tuple getOrCreateKernel(const ContextImplPtr &ContextImpl, device_impl &DeviceImpl, KernelNameStrRefT KernelName, KernelNameBasedCacheT *KernelNameBasedCachePtr, - const NDRDescT &NDRDesc = {}); + const NDRDescT &NDRDesc = {}, + const bool TransferOwnershipToCache = false); ur_kernel_handle_t getCachedMaterializedKernel( KernelNameStrRefT KernelName, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 739242128f0a2..ce223c91190d3 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2703,6 +2703,8 @@ void enqueueImpKernel( std::shared_ptr SyclKernelImpl; std::shared_ptr DeviceImageImpl; + // Transfer ownership only of cache is enabled. + const bool TransferownerShipToCache = SYCLConfig::get(); if (nullptr != MSyclKernel) { assert(MSyclKernel->get_info() == @@ -2730,10 +2732,17 @@ void enqueueImpKernel( EliminatedArgMask = SyclKernelImpl->getKernelArgMask(); KernelMutex = SyclKernelImpl->getCacheMutex(); } else { + + // Acquire the reader lock if cache holds the ownership. This ensures + // that the fetched kernel and program are not evicted while we are + // using them. + if (TransferownerShipToCache) + ContextImpl->getKernelProgramCache().acquireReaderLock(); + std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) = detail::ProgramManager::getInstance().getOrCreateKernel( ContextImpl, DeviceImpl, KernelName, KernelNameBasedCachePtr, - NDRDesc); + NDRDesc, TransferownerShipToCache); } // We may need more events for the launch, so we make another reference. @@ -2779,10 +2788,19 @@ void enqueueImpKernel( BinImage, KernelName, KernelFuncPtr, KernelNumArgs, KernelParamDescGetter, KernelHasSpecialCaptures); - const AdapterPtr &Adapter = Queue->getAdapter(); - if (!SyclKernelImpl && !MSyclKernel) { - Adapter->call(Kernel); - Adapter->call(Program); + // If cache is owning the kernel and programs, we don't have to release + // them here, as they will be released when the cache is destroyed or + // when the kernel is evicted from the cache. + if (!SyclKernelImpl && !MSyclKernel && !TransferownerShipToCache) { + + if (TransferownerShipToCache) + ContextImpl->getKernelProgramCache().releaseReaderLock(); + else { + // If cache is disabled, we need to release the kernel and program. + const AdapterPtr &Adapter = Queue->getAdapter(); + Adapter->call(Kernel); + Adapter->call(Program); + } } } if (UR_RESULT_SUCCESS != Error) { diff --git a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp index 8a0b15b12311f..051a1223ab97c 100644 --- a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp +++ b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp @@ -29,14 +29,10 @@ int main() { // CHECK: <--- urEventWait // CHECK-CACHE: <--- urProgramCreate - // CHECK-CACHE: <--- urProgramRetain // CHECK-CACHE-NOT: <--- urProgramRetain // CHECK-CACHE: <--- urKernelCreate - // CHECK-CACHE: <--- urKernelRetain // CHECK-CACHE-NOT: <--- urKernelCreate // CHECK-CACHE: <--- urEnqueueKernelLaunch - // CHECK-CACHE: <--- urKernelRelease - // CHECK-CACHE: <--- urProgramRelease // CHECK-CACHE: <--- urEventWait q.single_task([] {}).wait(); diff --git a/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp b/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp index 4dfe5928bd5ee..cb452728abee2 100644 --- a/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp +++ b/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp @@ -43,8 +43,6 @@ // CHECK: UR Call Begin : urKernelSetArgPointer // CHECK-NEXT: UR Call Begin : urKernelGetGroupInfo // CHECK-NEXT: UR Call Begin : urEnqueueKernelLaunch -// CHECK-NEXT: UR Call Begin : urKernelRelease -// CHECK-NEXT: UR Call Begin : urProgramRelease // CHECK-NEXT: Signal // CHECK-DAG: queue_id : {{.*}} // CHECK-DAG: sym_line_no : {{.*}} diff --git a/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp b/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp index db36d59fb0153..2be62d258ec0f 100644 --- a/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp +++ b/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp @@ -119,7 +119,7 @@ TEST(GetNative, GetNativeHandle) { EXPECT_EQ(mockOpenCLNumQueueRetains(), 0ul); EXPECT_EQ(mockOpenCLNumDeviceRetains(), 0ul); EXPECT_EQ(mockOpenCLNumEventRetains(), 0ul); - ASSERT_EQ(TestCounter, 2 + DeviceRetainCounter - 1) + ASSERT_EQ(TestCounter, DeviceRetainCounter) << "Not all the retain methods were called"; get_native(Context); @@ -135,6 +135,6 @@ TEST(GetNative, GetNativeHandle) { // get_native shouldn't retain the SYCL objects, but instead retains the // underlying handles - ASSERT_EQ(TestCounter, 2 + DeviceRetainCounter - 1) + ASSERT_EQ(TestCounter, DeviceRetainCounter) << "get_native retained SYCL objects"; } diff --git a/sycl/unittests/kernel-and-program/MultipleDevsCache.cpp b/sycl/unittests/kernel-and-program/MultipleDevsCache.cpp index 3f040d3fa0aed..7df910298c54c 100644 --- a/sycl/unittests/kernel-and-program/MultipleDevsCache.cpp +++ b/sycl/unittests/kernel-and-program/MultipleDevsCache.cpp @@ -158,7 +158,8 @@ TEST_P(MultipleDeviceCacheTest, ProgramRetain) { // MultipleDevsCacheTestKernel. EXPECT_EQ(BundleImpl->size(), size_t{1}); - int NumRetains = BundleImpl->size() * std::pow(2, NumDevices) - 1; + // '-2' because fetching the same program from cache won't call retain. + int NumRetains = BundleImpl->size() * std::pow(2, NumDevices) - 2; EXPECT_EQ(RetainCounter, NumRetains) << "Expect " << NumRetains << " piProgramRetain calls"; @@ -180,9 +181,11 @@ TEST_P(MultipleDeviceCacheTest, ProgramRetain) { // expect 3 urKernelRelease calls. // We create 2 kernels in the test. So, we expect - // 4 urKernelRelease calls (correpsonding to 2 create calls + 2 retain calls + // 3 urKernelRelease calls (correpsonding to 2 create calls + 1 retain calls // when handle is returned to the caller). - EXPECT_EQ(KernelReleaseCounter, 4) << "Expect 4 piKernelRelease calls"; + // While kernel is created using handler::single_task(), the retain call + // will be made when cache is destroyed (during shutdown). + EXPECT_EQ(KernelReleaseCounter, 3) << "Expect 3 piKernelRelease calls"; } INSTANTIATE_TEST_SUITE_P(