diff --git a/libdevice/sanitizer/tsan_rtl.cpp b/libdevice/sanitizer/tsan_rtl.cpp index 84ebae901d68b..4a2c909efda97 100644 --- a/libdevice/sanitizer/tsan_rtl.cpp +++ b/libdevice/sanitizer/tsan_rtl.cpp @@ -22,7 +22,7 @@ static const __SYCL_CONSTANT__ char __tsan_print_generic_to[] = "[kernel] %p(4) - %p(%d)\n"; static const __SYCL_CONSTANT__ char __tsan_print_raw_shadow[] = - "[kernel] %p(%d) -> %p: {%x, %x, %x, %x}\n"; + "[kernel] %p(%d) -> %p: {%x, %x}\n"; static const __SYCL_CONSTANT__ char __tsan_print_shadow_value[] = "[kernel] %p(%d) : {size: %d, access: %x, sid: %d, clock: %d, is_write: " @@ -90,26 +90,36 @@ inline __SYCL_GLOBAL__ RawShadow *MemToShadow_PVC(uptr addr, uint32_t as) { ConvertGenericPointer(addr, as); } - if (as != ADDRESS_SPACE_GLOBAL) - return nullptr; - addr = RoundDownTo(addr, kShadowCell); - if (addr & 0xff00'0000'0000'0000ULL) { - // device usm - return addr < TsanLaunchInfo->GlobalShadowOffset - ? reinterpret_cast<__SYCL_GLOBAL__ RawShadow *>( - addr + (TsanLaunchInfo->GlobalShadowOffset + - 0x200'0000'0000ULL - 0xff00'0000'0000'0000ULL)) - : reinterpret_cast<__SYCL_GLOBAL__ RawShadow *>( - addr - (0xff00'ffff'ffff'ffffULL - - TsanLaunchInfo->GlobalShadowOffsetEnd + 1)); - } else { - // host & shared usm - return reinterpret_cast<__SYCL_GLOBAL__ RawShadow *>( - (addr & 0xffffffffffULL) + TsanLaunchInfo->GlobalShadowOffset + - ((addr & 0x800000000000ULL) >> 7)); + if (as == ADDRESS_SPACE_GLOBAL) { + if (addr & 0xff00'0000'0000'0000ULL) { + // device usm + return addr < TsanLaunchInfo->GlobalShadowOffset + ? reinterpret_cast<__SYCL_GLOBAL__ RawShadow *>( + addr + (TsanLaunchInfo->GlobalShadowOffset + + 0x200'0000'0000ULL - 0xff00'0000'0000'0000ULL)) + : reinterpret_cast<__SYCL_GLOBAL__ RawShadow *>( + addr - (0xff00'ffff'ffff'ffffULL - + TsanLaunchInfo->GlobalShadowOffsetEnd + 1)); + } else { + // host & shared usm + return reinterpret_cast<__SYCL_GLOBAL__ RawShadow *>( + (addr & 0xffffffffffULL) + TsanLaunchInfo->GlobalShadowOffset + + ((addr & 0x800000000000ULL) >> 7)); + } + } else if (as == ADDRESS_SPACE_LOCAL) { + const auto shadow_offset = TsanLaunchInfo->LocalShadowOffset; + if (shadow_offset != 0) { + // The size of SLM is 128KB on PVC + constexpr unsigned SLM_SIZE = 128 * 1024; + const size_t wid = WorkGroupLinearId(); + return reinterpret_cast<__SYCL_GLOBAL__ RawShadow *>( + shadow_offset + (wid * SLM_SIZE) + (addr & (SLM_SIZE - 1))); + } } + + return nullptr; } inline __SYCL_GLOBAL__ RawShadow *MemToShadow(uptr addr, uint32_t as) { @@ -151,7 +161,7 @@ inline void StoreShadow(__SYCL_GLOBAL__ RawShadow *p, RawShadow s) { } inline void DoReportRace(__SYCL_GLOBAL__ RawShadow *s, AccessType type, - uptr addr, uint32_t size, + uptr addr, uint32_t size, uint32_t as, const char __SYCL_CONSTANT__ *file, uint32_t line, const char __SYCL_CONSTANT__ *func) { // This prevents trapping on this address in future. @@ -167,6 +177,11 @@ inline void DoReportRace(__SYCL_GLOBAL__ RawShadow *s, AccessType type, return; } + if (as == ADDRESS_SPACE_GENERIC && + TsanLaunchInfo->DeviceTy != DeviceType::CPU) { + ConvertGenericPointer(addr, as); + } + // Check if current address already being recorded before. 
for (uint32_t i = 0; i < TsanLaunchInfo->RecordedReportCount; i++) { auto &SanitizerReport = TsanLaunchInfo->Report[i]; @@ -180,7 +195,8 @@ inline void DoReportRace(__SYCL_GLOBAL__ RawShadow *s, AccessType type, TsanLaunchInfo->Report[TsanLaunchInfo->RecordedReportCount++]; SanitizerReport.Address = addr; - SanitizerReport.Type = type; + SanitizerReport.Type = + type | (as == ADDRESS_SPACE_LOCAL ? kAccessLocal : 0); SanitizerReport.AccessSize = size; int FileLength = 0; @@ -224,7 +240,7 @@ inline void DoReportRace(__SYCL_GLOBAL__ RawShadow *s, AccessType type, } inline bool CheckRace(__SYCL_GLOBAL__ RawShadow *s, Shadow cur, AccessType type, - uptr addr, uint32_t size, + uptr addr, uint32_t size, uint32_t as, const char __SYCL_CONSTANT__ *file, uint32_t line, const char __SYCL_CONSTANT__ *func) { bool stored = false; @@ -258,7 +274,7 @@ inline bool CheckRace(__SYCL_GLOBAL__ RawShadow *s, Shadow cur, AccessType type, if (TsanLaunchInfo->Clock[cur.sid()].clk_[old.sid()] >= old.clock()) continue; - DoReportRace(s, type, addr, size, file, line, func); + DoReportRace(s, type, addr, size, as, file, line, func); return true; } @@ -301,9 +317,9 @@ inline bool ContainsSameAccess(__SYCL_GLOBAL__ RawShadow *s, Shadow cur, return; \ Sid sid = GetCurrentSid(); \ uint16_t current_clock = IncrementEpoch(sid) + 1; \ - TSAN_DEBUG(__spirv_ocl_printf( \ - __tsan_print_raw_shadow, (void *)addr, as, (void *)shadow_mem, \ - shadow_mem[0], shadow_mem[1], shadow_mem[2], shadow_mem[3])); \ + TSAN_DEBUG(__spirv_ocl_printf(__tsan_print_raw_shadow, (void *)addr, as, \ + (void *)shadow_mem, shadow_mem[0], \ + shadow_mem[1])); \ AccessType type = is_write ? kAccessWrite : kAccessRead; \ Shadow cur(addr, size, current_clock, sid, type); \ TSAN_DEBUG(__spirv_ocl_printf(__tsan_print_shadow_value, (void *)addr, as, \ @@ -311,7 +327,7 @@ inline bool ContainsSameAccess(__SYCL_GLOBAL__ RawShadow *s, Shadow cur, is_write)); \ if (ContainsSameAccess(shadow_mem, cur, type)) \ return; \ - CheckRace(shadow_mem, cur, type, addr, size, file, line, func); \ + CheckRace(shadow_mem, cur, type, addr, size, as, file, line, func); \ } TSAN_CHECK(read, false, 1) @@ -349,16 +365,16 @@ __tsan_read16(uptr addr, uint32_t as, const char __SYCL_CONSTANT__ *file, AccessType type = is_write ? 
kAccessWrite : kAccessRead; \
    uptr size1 = Min(size, RoundUpTo(addr + 1, kShadowCell) - addr); \
    { \
-      TSAN_DEBUG(__spirv_ocl_printf( \
-          __tsan_print_raw_shadow, (void *)addr, as, (void *)shadow_mem, \
-          shadow_mem[0], shadow_mem[1], shadow_mem[2], shadow_mem[3])); \
+      TSAN_DEBUG(__spirv_ocl_printf(__tsan_print_raw_shadow, (void *)addr, as, \
+                                    (void *)shadow_mem, shadow_mem[0], \
+                                    shadow_mem[1])); \
      Shadow cur(addr, size1, current_clock, sid, type); \
      TSAN_DEBUG(__spirv_ocl_printf(__tsan_print_shadow_value, (void *)addr, \
                                    as, size1, cur.access(), cur.sid(), \
                                    cur.clock(), is_write)); \
      if (ContainsSameAccess(shadow_mem, cur, type)) \
        goto SECOND; \
-      if (CheckRace(shadow_mem, cur, type, addr, size1, file, line, func)) \
+      if (CheckRace(shadow_mem, cur, type, addr, size1, as, file, line, func)) \
        return; \
    } \
  SECOND: \
@@ -367,17 +383,17 @@ __tsan_read16(uptr addr, uint32_t as, const char __SYCL_CONSTANT__ *file,
      return; \
    shadow_mem += kShadowCnt; \
    { \
-      TSAN_DEBUG( \
-          __spirv_ocl_printf(__tsan_print_raw_shadow, (void *)(addr + size1), \
-                             as, (void *)shadow_mem, shadow_mem[0], \
-                             shadow_mem[1], shadow_mem[2], shadow_mem[3])); \
+      TSAN_DEBUG(__spirv_ocl_printf( \
+          __tsan_print_raw_shadow, (void *)(addr + size1), as, \
+          (void *)shadow_mem, shadow_mem[0], shadow_mem[1])); \
      Shadow cur(0, size2, current_clock, sid, type); \
      TSAN_DEBUG(__spirv_ocl_printf( \
          __tsan_print_shadow_value, (void *)(addr + size1), as, size2, \
          cur.access(), cur.sid(), cur.clock(), is_write)); \
      if (ContainsSameAccess(shadow_mem, cur, type)) \
        return; \
-      CheckRace(shadow_mem, cur, type, addr + size1, size2, file, line, func); \
+      CheckRace(shadow_mem, cur, type, addr + size1, size2, as, file, line, \
+                func); \
    } \
  }

@@ -420,7 +436,7 @@ static inline void __tsan_cleanup_private_cpu_impl(uptr addr, uint32_t size) {
  }
}

-DEVICE_EXTERN_C_NOINLINE void __tsan_cleanup_private(uptr addr, uint32_t size) {
+DEVICE_EXTERN_C_NOINLINE void __tsan_cleanup_private(uptr addr, size_t size) {
#if defined(__LIBDEVICE_CPU__)
  __tsan_cleanup_private_cpu_impl(addr, size);
#elif defined(__LIBDEVICE_PVC__)
@@ -433,6 +449,55 @@ DEVICE_EXTERN_C_NOINLINE void __tsan_cleanup_private(uptr addr, uint32_t size) {
#endif
}

+static __SYCL_CONSTANT__ const char __tsan_print_cleanup_local[] =
+    "[kernel] cleanup shadow (%p ~ %p) for local %p\n";
+
+DEVICE_EXTERN_C_NOINLINE void __tsan_cleanup_static_local(uptr addr,
+                                                          size_t size) {
+  // Update the shadow memory of local memory only on the first work-item
+  if (__spirv_LocalInvocationId_x() + __spirv_LocalInvocationId_y() +
+          __spirv_LocalInvocationId_z() ==
+      0) {
+    if (TsanLaunchInfo->LocalShadowOffset == 0)
+      return;
+
+    addr = RoundDownTo(addr, kShadowCell);
+    size = RoundUpTo(size, kShadowCell);
+
+    RawShadow *Begin = MemToShadow(addr, ADDRESS_SPACE_LOCAL);
+    for (uptr i = 0; i < size / kShadowCell * kShadowCnt; i++)
+      Begin[i] = 0;
+
+    TSAN_DEBUG(__spirv_ocl_printf(
+        __tsan_print_cleanup_local, Begin,
+        (uptr)Begin + size / kShadowCell * kShadowCnt * kShadowSize - 1,
+        addr));
+  }
+}
+
+static __SYCL_CONSTANT__ const char __tsan_print_report_arg_count_incorrect[] =
+    "[kernel] ERROR: The number of local args is incorrect, expected %d, "
+    "actual %d\n";
+
+DEVICE_EXTERN_C_NOINLINE void __tsan_cleanup_dynamic_local(uptr ptr,
+                                                           uint32_t num_args) {
+  if (!TsanLaunchInfo->LocalShadowOffset)
+    return;
+
+  if (num_args != TsanLaunchInfo->NumLocalArgs) {
+    __spirv_ocl_printf(__tsan_print_report_arg_count_incorrect, num_args,
+                       TsanLaunchInfo->NumLocalArgs);
+    return;
+  }
+
+  uptr *args = (uptr *)ptr;
+
+  for (uint32_t i = 0; i < num_args; ++i) {
+    auto *local_arg = &TsanLaunchInfo->LocalArgs[i];
+
+    __tsan_cleanup_static_local(args[i], local_arg->Size);
+  }
+}
+
 DEVICE_EXTERN_C_INLINE void __tsan_device_barrier() {
  Sid sid = GetCurrentSid();
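The local branch added to MemToShadow_PVC above reduces to fixed per-work-group arithmetic. A minimal host-side sketch of that mapping (assuming the 128 KB-per-work-group PVC layout from the patch; the function name, offsets, and the main() driver are illustrative only):

#include <cstdint>
#include <cstdio>

// Each work-group owns one SLM-sized slot in the local shadow region, so the
// same SLM offset in two different work-groups maps to disjoint shadow cells.
static uint64_t LocalToShadow(uint64_t Addr, uint64_t LocalShadowOffset,
                              uint64_t WorkGroupLinearId) {
  constexpr uint64_t SlmSize = 128 * 1024; // SLM size per work-group on PVC
  return LocalShadowOffset + WorkGroupLinearId * SlmSize +
         (Addr & (SlmSize - 1));
}

int main() {
  // Same SLM offset, different work-groups: shadow addresses differ by 128 KB.
  std::printf("%#llx\n", (unsigned long long)LocalToShadow(0x40, 0x10000000, 0));
  std::printf("%#llx\n", (unsigned long long)LocalToShadow(0x40, 0x10000000, 1));
  return 0;
}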
diff --git a/llvm/include/llvm/Transforms/Instrumentation/SPIRVSanitizerCommonUtils.h b/llvm/include/llvm/Transforms/Instrumentation/SPIRVSanitizerCommonUtils.h
index f6edabcc3bbf3..11056ae197ac5 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/SPIRVSanitizerCommonUtils.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/SPIRVSanitizerCommonUtils.h
@@ -12,6 +12,7 @@
#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_SPIRVSANITIZERCOMMONUTILS_H
#define LLVM_TRANSFORMS_INSTRUMENTATION_SPIRVSANITIZERCOMMONUTILS_H

+#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -24,8 +25,17 @@ constexpr unsigned kSpirOffloadConstantAS = 2;
constexpr unsigned kSpirOffloadLocalAS = 3;
constexpr unsigned kSpirOffloadGenericAS = 4;

+// If the type is, or contains, a target extension type, return that type;
+// otherwise return nullptr.
TargetExtType *getTargetExtType(Type *Ty);
+
+// Check if it's a joint matrix access operation.
bool isJointMatrixAccess(Value *V);
+
+// If the User is an instruction or a constant expr, collect the functions in
+// which it is used.
+void getFunctionsOfUser(User *User, SmallVectorImpl<Function *> &Functions);
+
} // namespace llvm

#endif // LLVM_TRANSFORMS_INSTRUMENTATION_SPIRVSANITIZERCOMMONUTILS_H
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index e5c7958ac644f..182346941e891 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -2968,15 +2968,6 @@ void ModuleAddressSanitizer::instrumentDeviceGlobal(IRBuilder<> &IRB) {
    G->eraseFromParent();
}

-static void getFunctionsOfUser(User *User, DenseSet<Function *> &Functions) {
-  if (Instruction *Inst = dyn_cast<Instruction>(User)) {
-    Functions.insert(Inst->getFunction());
-  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(User)) {
-    for (auto *U : CE->users())
-      getFunctionsOfUser(U, Functions);
-  }
-}
-
void ModuleAddressSanitizer::initializeRetVecMap(Function *F) {
  if (KernelToRetVecMap.find(F) != KernelToRetVecMap.end())
    return;
@@ -3109,19 +3100,23 @@ void ModuleAddressSanitizer::instrumentSyclStaticLocalMemory(IRBuilder<> &IRB) {
  // We only instrument on spir_kernel, because local variables are
  // kind of global variable
  for (auto *G : LocalGlobals) {
-    DenseSet<Function *> InstrumentedFunc;
+    SmallVector<Function *> WorkList;
+    DenseSet<Function *> InstrumentedKernel;
    for (auto *User : G->users())
-      getFunctionsOfUser(User, InstrumentedFunc);
-    for (Function *F : InstrumentedFunc) {
+      getFunctionsOfUser(User, WorkList);
+    while (!WorkList.empty()) {
+      Function *F = WorkList.pop_back_val();
      if (F->getCallingConv() == CallingConv::SPIR_KERNEL) {
-        Instrument(G, F);
+        if (!InstrumentedKernel.contains(F)) {
+          Instrument(G, F);
+          InstrumentedKernel.insert(F);
+        }
        continue;
      }
      // Get root spir_kernel of spir_func
      initializeKernelCallerMap(F);
-      for (Function *Kernel : FuncToKernelCallerMap[F])
-        if (!InstrumentedFunc.contains(Kernel))
-          Instrument(G, Kernel);
+      for (auto *Kernel : FuncToKernelCallerMap[F])
+        WorkList.push_back(Kernel);
    }
  }
}
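The AddressSanitizer change above swaps the one-shot DenseSet for a worklist so that kernels reached only indirectly through FuncToKernelCallerMap are instrumented too, each exactly once. A self-contained sketch of the traversal, with toy stand-ins for Function and the caller map (nothing below is the real LLVM API):

#include <set>
#include <string>
#include <vector>

struct Func {
  std::string Name;
  bool IsKernel = false;
  std::vector<Func *> KernelCallers; // stand-in for FuncToKernelCallerMap[F]
};

// Drain a worklist of candidate functions; instrument each kernel once and
// push the kernel roots of non-kernel functions back onto the worklist.
void instrumentAll(std::vector<Func *> WorkList) {
  std::set<Func *> InstrumentedKernel;
  while (!WorkList.empty()) {
    Func *F = WorkList.back();
    WorkList.pop_back();
    if (F->IsKernel) {
      if (InstrumentedKernel.insert(F).second) {
        // Instrument(G, F) would go here.
      }
      continue;
    }
    for (Func *Kernel : F->KernelCallers)
      WorkList.push_back(Kernel);
  }
}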
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index f95d3ca03b4bd..bbbc3cfe5a55f 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -1012,15 +1012,6 @@ void MemorySanitizerOnSpirv::initializeKernelCallerMap(Function *F) {
  }
}

-static void getFunctionsOfUser(User *User, DenseSet<Function *> &Functions) {
-  if (Instruction *Inst = dyn_cast<Instruction>(User)) {
-    Functions.insert(Inst->getFunction());
-  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(User)) {
-    for (auto *U : CE->users())
-      getFunctionsOfUser(U, Functions);
-  }
-}
-
void MemorySanitizerOnSpirv::instrumentStaticLocalMemory() {
  if (!ClSpirOffloadLocals)
    return;
@@ -1057,18 +1048,23 @@ void MemorySanitizerOnSpirv::instrumentStaticLocalMemory() {
  // kind of global variable, which must be initialized only once.
  for (auto &G : M.globals()) {
    if (G.getAddressSpace() == kSpirOffloadLocalAS) {
-      DenseSet<Function *> InstrumentedFunc;
+      SmallVector<Function *> WorkList;
+      DenseSet<Function *> InstrumentedKernel;
      for (auto *User : G.users())
-        getFunctionsOfUser(User, InstrumentedFunc);
-      for (Function *F : InstrumentedFunc) {
+        getFunctionsOfUser(User, WorkList);
+      while (!WorkList.empty()) {
+        Function *F = WorkList.pop_back_val();
        if (F->getCallingConv() == CallingConv::SPIR_KERNEL) {
-          Instrument(&G, F);
+          if (!InstrumentedKernel.contains(F)) {
+            Instrument(&G, F);
+            InstrumentedKernel.insert(F);
+          }
          continue;
        }
        // Get root spir_kernel of spir_func
        initializeKernelCallerMap(F);
-        for (Function *Kernel : FuncToKernelCallerMap[F])
-          Instrument(&G, Kernel);
+        for (auto *Kernel : FuncToKernelCallerMap[F])
+          WorkList.push_back(Kernel);
      }
    }
  }
diff --git a/llvm/lib/Transforms/Instrumentation/SPIRVSanitizerCommonUtils.cpp b/llvm/lib/Transforms/Instrumentation/SPIRVSanitizerCommonUtils.cpp
index f08d931b96375..2c483caff1b61 100644
--- a/llvm/lib/Transforms/Instrumentation/SPIRVSanitizerCommonUtils.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SPIRVSanitizerCommonUtils.cpp
@@ -58,4 +58,14 @@ bool isJointMatrixAccess(Value *V) {
  }
  return false;
}
+
+void getFunctionsOfUser(User *User, SmallVectorImpl<Function *> &Functions) {
+  if (Instruction *Inst = dyn_cast<Instruction>(User)) {
+    Functions.push_back(Inst->getFunction());
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(User)) {
+    for (auto *U : CE->users())
+      getFunctionsOfUser(U, Functions);
+  }
+}
+
} // namespace llvm
diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 6b449ad1ab3c0..10e08b4b685d4 100644
--- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -84,6 +84,10 @@ static cl::opt<bool> ClCompoundReadBeforeWrite(
    cl::desc("Emit special compound instrumentation for reads-before-writes"),
    cl::Hidden);

+static cl::opt<bool> ClSpirOffloadLocals("tsan-spir-locals",
+                                         cl::desc("instrument local pointers"),
+                                         cl::Hidden, cl::init(true));
+
STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
STATISTIC(NumOmittedReadsBeforeWrite,
@@ -117,6 +121,8 @@ struct ThreadSanitizerOnSpirv {
  bool instrumentAllocInst(Function *F,
                           SmallVectorImpl<Instruction *> &AllocaInsts);

+  void instrumentDynamicLocalMemory(Function &F);
+
  bool instrumentControlBarrier(CallInst *CI);

  void appendDebugInfoToArgs(Instruction *I, SmallVectorImpl<Value *> &Args);
@@ -126,8 +132,12 @@ struct ThreadSanitizerOnSpirv {
private:
  void instrumentGlobalVariables();

+  void instrumentStaticLocalMemory();
+
  void instrumentKernelsMetadata();

+  void initializeKernelCallerMap(Function *F);
+
  bool isSupportedSPIRKernel(Function &F);
  bool isUnsupportedDeviceGlobal(const GlobalVariable &G);
@@ -143,9 +153,13 @@ struct ThreadSanitizerOnSpirv {

  StringMap<GlobalVariable *> GlobalStringMap;

+  DenseMap<Function *, DenseSet<Function *>> FuncToKernelCallerMap;
+
  // Accesses sizes are powers of two: 1, 2, 4, 8, 16.
  static const size_t kNumberOfAccessSizes = 5;

  FunctionCallee TsanCleanupPrivate;
+  FunctionCallee TsanCleanupStaticLocal;
+  FunctionCallee TsanCleanupDynamicLocal;
  FunctionCallee TsanDeviceBarrier;
  FunctionCallee TsanGroupBarrier;
  FunctionCallee TsanRead[kNumberOfAccessSizes];
@@ -238,7 +252,7 @@ void insertModuleCtor(Module &M) {
      // time. Hook them into the global ctors list in that case:
      [&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); });
}
-} // namespace
+} // namespace

PreservedAnalyses ThreadSanitizerPass::run(Function &F,
                                           FunctionAnalysisManager &FAM) {
@@ -255,6 +269,7 @@ PreservedAnalyses ModuleThreadSanitizerPass::run(Module &M,
    return PreservedAnalyses::all();
  if (Triple(M.getTargetTriple()).isSPIROrSPIRV()) {
    ThreadSanitizerOnSpirv Spirv(M);
+    Spirv.initialize();
    Spirv.instrumentModule();
  } else
    insertModuleCtor(M);
@@ -267,9 +282,27 @@ void ThreadSanitizerOnSpirv::initialize() {
  Attr = Attr.addFnAttribute(C, Attribute::NoUnwind);
  Type *Int8PtrTy = IRB.getInt8PtrTy(kSpirOffloadConstantAS);

-  TsanCleanupPrivate =
-      M.getOrInsertFunction("__tsan_cleanup_private", Attr, IRB.getVoidTy(),
-                            IntptrTy, IRB.getInt32Ty());
+  // __tsan_cleanup_private(
+  //   uptr ptr,
+  //   size_t size
+  // )
+  TsanCleanupPrivate = M.getOrInsertFunction(
+      "__tsan_cleanup_private", Attr, IRB.getVoidTy(), IntptrTy, IntptrTy);
+
+  // __tsan_cleanup_static_local(
+  //   uptr ptr,
+  //   size_t size
+  // )
+  TsanCleanupStaticLocal = M.getOrInsertFunction(
+      "__tsan_cleanup_static_local", Attr, IRB.getVoidTy(), IntptrTy, IntptrTy);
+
+  // __tsan_cleanup_dynamic_local(
+  //   uptr ptr,
+  //   uint32_t num_args
+  // )
+  TsanCleanupDynamicLocal =
+      M.getOrInsertFunction("__tsan_cleanup_dynamic_local", Attr,
+                            IRB.getVoidTy(), IntptrTy, IRB.getInt32Ty());

  TsanDeviceBarrier = M.getOrInsertFunction(
      "__tsan_device_barrier", Attr.addFnAttribute(C, Attribute::Convergent),
@@ -325,10 +358,9 @@ bool ThreadSanitizerOnSpirv::instrumentAllocInst(
        continue;

      if (auto AllocSize = AI->getAllocationSize(DL)) {
-        AtExit->CreateCall(
-            TsanCleanupPrivate,
-            {AtExit->CreatePtrToInt(AI, IntptrTy),
-             ConstantInt::get(AtExit->getInt32Ty(), *AllocSize)});
+        AtExit->CreateCall(TsanCleanupPrivate,
+                           {AtExit->CreatePtrToInt(AI, IntptrTy),
+                            ConstantInt::get(IntptrTy, *AllocSize)});
        Changed |= true;
      }
    }
@@ -401,9 +433,10 @@ bool ThreadSanitizerOnSpirv::isUnsupportedSPIRAccess(Value *Addr,
                     ->getPointerAddressSpace();
  switch (AddrAS) {
  case kSpirOffloadPrivateAS:
-  case kSpirOffloadLocalAS:
  case kSpirOffloadConstantAS:
    return true;
+  case kSpirOffloadLocalAS:
+    return !ClSpirOffloadLocals;
  case kSpirOffloadGlobalAS:
  case kSpirOffloadGenericAS:
    return false;
@@ -447,9 +480,6 @@ bool ThreadSanitizerOnSpirv::isUnsupportedDeviceGlobal(
    return true;
  if (G.getName().starts_with("__usid_str"))
    return true;
-  // TODO: Will support global variable with local address space later.
-  if (G.getAddressSpace() == kSpirOffloadLocalAS)
-    return true;
  // Global variables have constant address space will not trigger race
  // condition.
  if (G.getAddressSpace() == kSpirOffloadConstantAS)
@@ -459,6 +489,7 @@

void ThreadSanitizerOnSpirv::instrumentModule() {
  instrumentGlobalVariables();
+  instrumentStaticLocalMemory();
  instrumentKernelsMetadata();
}

@@ -483,6 +514,10 @@ void ThreadSanitizerOnSpirv::instrumentGlobalVariables() {
      continue;
    }

+    // This case is handled by instrumentStaticLocalMemory
+    if (G.getAddressSpace() == kSpirOffloadLocalAS)
+      continue;
+
    DeviceGlobalMetadata.push_back(ConstantStruct::get(
        StructTy,
        ConstantInt::get(IntptrTy, DL.getTypeAllocSize(G.getValueType())),
@@ -503,6 +538,140 @@ void ThreadSanitizerOnSpirv::instrumentGlobalVariables() {
  MsanDeviceGlobalMetadata->setUnnamedAddr(GlobalValue::UnnamedAddr::Local);
}

+void ThreadSanitizerOnSpirv::instrumentStaticLocalMemory() {
+  if (!ClSpirOffloadLocals)
+    return;
+
+  auto Instrument = [this](GlobalVariable *G, Function *F) {
+    const uint64_t SizeInBytes = DL.getTypeAllocSize(G->getValueType());
+
+    if (!F->hasMetadata("tsan_instrumented_local")) {
+      IRBuilder<> Builder(&F->getEntryBlock().front());
+      Builder.CreateCall(TsanGroupBarrier);
+    }
+
+    // Poison shadow of static local memory
+    {
+      IRBuilder<> Builder(&F->getEntryBlock().front());
+      Builder.CreateCall(TsanCleanupStaticLocal,
+                         {Builder.CreatePointerCast(G, IntptrTy),
+                          ConstantInt::get(IntptrTy, SizeInBytes)});
+    }
+
+    // Unpoison shadow of static local memory, required by CPU device
+    EscapeEnumerator EE(*F, "tsan_cleanup_static_local", false);
+    while (IRBuilder<> *AtExit = EE.Next()) {
+      if (!F->hasMetadata("tsan_instrumented_local"))
+        AtExit->CreateCall(TsanGroupBarrier);
+      AtExit->CreateCall(TsanCleanupStaticLocal,
+                         {AtExit->CreatePointerCast(G, IntptrTy),
+                          ConstantInt::get(IntptrTy, SizeInBytes)});
+    }
+
+    if (!F->hasMetadata("tsan_instrumented_local")) {
+      Constant *One = ConstantInt::get(Type::getInt32Ty(C), 1);
+      MDNode *NewNode = MDNode::get(C, ConstantAsMetadata::get(One));
+      F->addMetadata("tsan_instrumented_local", *NewNode);
+    }
+  };
+
+  // We only instrument spir_kernels, because local variables behave like
+  // global variables and must be initialized only once.
+  for (auto &G : M.globals()) {
+    if (G.getAddressSpace() == kSpirOffloadLocalAS) {
+      SmallVector<Function *> WorkList;
+      DenseSet<Function *> InstrumentedKernel;
+      for (auto *User : G.users())
+        getFunctionsOfUser(User, WorkList);
+      while (!WorkList.empty()) {
+        Function *F = WorkList.pop_back_val();
+        if (F->getCallingConv() == CallingConv::SPIR_KERNEL) {
+          if (!InstrumentedKernel.contains(F)) {
+            Instrument(&G, F);
+            InstrumentedKernel.insert(F);
+          }
+          continue;
+        }
+        // Get root spir_kernel of spir_func
+        initializeKernelCallerMap(F);
+        for (auto *Kernel : FuncToKernelCallerMap[F])
+          WorkList.push_back(Kernel);
+      }
+    }
+  }
+}
+
+void ThreadSanitizerOnSpirv::instrumentDynamicLocalMemory(Function &F) {
+  if (!ClSpirOffloadLocals)
+    return;
+
+  // Poison shadow of local memory in kernel arguments, required by CPU device
+  SmallVector<Argument *> LocalArgs;
+  for (auto &Arg : F.args()) {
+    Type *PtrTy = dyn_cast<PointerType>(Arg.getType()->getScalarType());
+    if (PtrTy && PtrTy->getPointerAddressSpace() == kSpirOffloadLocalAS)
+      LocalArgs.push_back(&Arg);
+  }
+
+  if (LocalArgs.empty())
+    return;
+
+  if (!F.hasMetadata("tsan_instrumented_local")) {
+    IRBuilder<> Builder(&F.getEntryBlock().front());
+    Builder.CreateCall(TsanGroupBarrier);
+  }
+
+  IRBuilder<> IRB(&F.getEntryBlock().front());
+
+  AllocaInst *ArgsArray = IRB.CreateAlloca(
+      IntptrTy, ConstantInt::get(IRB.getInt32Ty(), LocalArgs.size()),
+      "local_args");
+  for (size_t i = 0; i < LocalArgs.size(); i++) {
+    auto *StoreDest = IRB.CreateGEP(IntptrTy, ArgsArray,
+                                    ConstantInt::get(IRB.getInt32Ty(), i));
+    IRB.CreateStore(IRB.CreatePointerCast(LocalArgs[i], IntptrTy), StoreDest);
+  }
+
+  auto *ArgsArrayAddr = IRB.CreatePointerCast(ArgsArray, IntptrTy);
+  IRB.CreateCall(
+      TsanCleanupDynamicLocal,
+      {ArgsArrayAddr, ConstantInt::get(IRB.getInt32Ty(), LocalArgs.size())});

+  // Unpoison shadow of dynamic local memory, required by CPU device
+  EscapeEnumerator EE(F, "tsan_cleanup_dynamic_local", false);
+  while (IRBuilder<> *AtExit = EE.Next()) {
+    if (!F.hasMetadata("tsan_instrumented_local"))
+      AtExit->CreateCall(TsanGroupBarrier);
+    AtExit->CreateCall(TsanCleanupDynamicLocal,
+                       {ArgsArrayAddr, ConstantInt::get(AtExit->getInt32Ty(),
+                                                        LocalArgs.size())});
+  }
+
+  if (!F.hasMetadata("tsan_instrumented_local")) {
+    Constant *One = ConstantInt::get(Type::getInt32Ty(C), 1);
+    MDNode *NewNode = MDNode::get(C, ConstantAsMetadata::get(One));
+    F.addMetadata("tsan_instrumented_local", *NewNode);
+  }
+}
+
+void ThreadSanitizerOnSpirv::initializeKernelCallerMap(Function *F) {
+  if (FuncToKernelCallerMap.find(F) != FuncToKernelCallerMap.end())
+    return;
+
+  for (auto *U : F->users()) {
+    if (Instruction *Inst = dyn_cast<Instruction>(U)) {
+      Function *Caller = Inst->getFunction();
+      if (Caller->getCallingConv() == CallingConv::SPIR_KERNEL) {
+        FuncToKernelCallerMap[F].insert(Caller);
+        continue;
+      }
+      initializeKernelCallerMap(Caller);
+      FuncToKernelCallerMap[F].insert(FuncToKernelCallerMap[Caller].begin(),
+                                      FuncToKernelCallerMap[Caller].end());
+    }
+  }
+}
+
void ThreadSanitizerOnSpirv::instrumentKernelsMetadata() {
  SmallVector<Constant *, 8> SpirKernelsMetadata;
@@ -590,12 +759,12 @@ void ThreadSanitizer::initialize(Module &M, const TargetLibraryInfo &TLI) {
    std::string ByteSizeStr = utostr(ByteSize);
    std::string BitSizeStr = utostr(BitSize);
    SmallString<32> ReadName("__tsan_read" + ByteSizeStr);
-    TsanRead[i] = M.getOrInsertFunction(ReadName, Attr, IRB.getVoidTy(),
-                                        IRB.getPtrTy());
+    TsanRead[i] =
+        M.getOrInsertFunction(ReadName, Attr, IRB.getVoidTy(), IRB.getPtrTy());

    SmallString<32> WriteName("__tsan_write" + ByteSizeStr);
-    TsanWrite[i] = M.getOrInsertFunction(WriteName, Attr, IRB.getVoidTy(),
-                                         IRB.getPtrTy());
+    TsanWrite[i] =
+        M.getOrInsertFunction(WriteName, Attr, IRB.getVoidTy(), IRB.getPtrTy());

    SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr);
    TsanUnalignedRead[i] = M.getOrInsertFunction(
@@ -624,8 +793,8 @@ void ThreadSanitizer::initialize(Module &M, const TargetLibraryInfo &TLI) {
        UnalignedVolatileWriteName, Attr, IRB.getVoidTy(), IRB.getPtrTy());

    SmallString<64> CompoundRWName("__tsan_read_write" + ByteSizeStr);
-    TsanCompoundRW[i] = M.getOrInsertFunction(
-        CompoundRWName, Attr, IRB.getVoidTy(), IRB.getPtrTy());
+    TsanCompoundRW[i] = M.getOrInsertFunction(CompoundRWName, Attr,
+                                              IRB.getVoidTy(), IRB.getPtrTy());

    SmallString<64> UnalignedCompoundRWName("__tsan_unaligned_read_write" +
                                            ByteSizeStr);
@@ -643,7 +812,7 @@ void ThreadSanitizer::initialize(Module &M, const TargetLibraryInfo &TLI) {

    // Args of type Ty need extension only when BitSize is 32 or less.
    using Idxs = std::vector<unsigned>;
-    Idxs Idxs2Or12 ((BitSize <= 32) ? Idxs({1, 2}) : Idxs({2}));
+    Idxs Idxs2Or12((BitSize <= 32) ? Idxs({1, 2}) : Idxs({2}));
    Idxs Idxs34Or1234((BitSize <= 32) ? Idxs({1, 2, 3, 4}) : Idxs({3, 4}));
    SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store");
    TsanAtomicStore[i] = M.getOrInsertFunction(
@@ -702,12 +871,10 @@ void ThreadSanitizer::initialize(Module &M, const TargetLibraryInfo &TLI) {
      TLI.getAttrList(&Ctx, {0}, /*Signed=*/true, /*Ret=*/false, Attr),
      IRB.getVoidTy(), OrdTy);

-  MemmoveFn =
-      M.getOrInsertFunction("__tsan_memmove", Attr, IRB.getPtrTy(),
-                            IRB.getPtrTy(), IRB.getPtrTy(), IntptrTy);
-  MemcpyFn =
-      M.getOrInsertFunction("__tsan_memcpy", Attr, IRB.getPtrTy(),
-                            IRB.getPtrTy(), IRB.getPtrTy(), IntptrTy);
+  MemmoveFn = M.getOrInsertFunction("__tsan_memmove", Attr, IRB.getPtrTy(),
+                                    IRB.getPtrTy(), IRB.getPtrTy(), IntptrTy);
+  MemcpyFn = M.getOrInsertFunction("__tsan_memcpy", Attr, IRB.getPtrTy(),
+                                   IRB.getPtrTy(), IRB.getPtrTy(), IntptrTy);
  MemsetFn = M.getOrInsertFunction(
      "__tsan_memset",
      TLI.getAttrList(&Ctx, {1}, /*Signed=*/true, /*Ret=*/false, Attr),
@@ -882,9 +1049,9 @@ bool ThreadSanitizer::sanitizeFunction(Function &F,
    initialize(*F.getParent(), TLI);
  SmallVector<InstructionInfo, 8> AllLoadsAndStores;
-  SmallVector<Instruction*, 8> LocalLoadsAndStores;
-  SmallVector<Instruction*, 8> AtomicAccesses;
-  SmallVector<Instruction*, 8> MemIntrinCalls;
+  SmallVector<Instruction *, 8> LocalLoadsAndStores;
+  SmallVector<Instruction *, 8> AtomicAccesses;
+  SmallVector<Instruction *, 8> MemIntrinCalls;
  SmallVector<Instruction *, 8> Allocas;
  SmallVector<CallInst *, 8> SpirControlBarrierCalls;
  bool Res = false;
@@ -986,6 +1153,9 @@ bool ThreadSanitizer::sanitizeFunction(Function &F,
    }
    Res = true;
  }
+
+  if (Spirv && F.getCallingConv() == CallingConv::SPIR_KERNEL)
+    Spirv->instrumentDynamicLocalMemory(F);
  return Res;
}

@@ -1081,16 +1251,27 @@ bool ThreadSanitizer::instrumentLoadOrStore(const InstructionInfo &II,
static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
  uint32_t v = 0;
  switch (ord) {
-    case AtomicOrdering::NotAtomic:
-      llvm_unreachable("unexpected atomic ordering!");
-    case AtomicOrdering::Unordered: [[fallthrough]];
-    case AtomicOrdering::Monotonic: v = 0; break;
-    // Not specified yet:
-    // case AtomicOrdering::Consume: v = 1; break;
-    case AtomicOrdering::Acquire: v = 2; break;
-    case AtomicOrdering::Release: v = 3; break;
-    case AtomicOrdering::AcquireRelease: v = 4; break;
-    case AtomicOrdering::SequentiallyConsistent: v = 5; break;
+  case AtomicOrdering::NotAtomic:
+    llvm_unreachable("unexpected atomic ordering!");
+  case AtomicOrdering::Unordered:
+    [[fallthrough]];
+  case AtomicOrdering::Monotonic:
+    v = 0;
+    break;
+  // Not specified yet:
+  // case AtomicOrdering::Consume: v = 1; break;
+  case AtomicOrdering::Acquire:
+    v = 2;
+    break;
+  case AtomicOrdering::Release:
+    v = 3;
+    break;
+  case AtomicOrdering::AcquireRelease:
+    v = 4;
+    break;
+  case AtomicOrdering::SequentiallyConsistent:
+    v = 5;
+    break;
  }
  return IRB->getInt32(v);
}
@@ -1106,20 +1287,15 @@ static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
  InstrumentationIRBuilder IRB(I);
  if (MemSetInst *M = dyn_cast<MemSetInst>(I)) {
-    Value *Cast1 = IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false);
+    Value *Cast1 =
+        IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false);
    Value *Cast2 = IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false);
-    IRB.CreateCall(
-        MemsetFn,
-        {M->getArgOperand(0),
-         Cast1,
-         Cast2});
+    IRB.CreateCall(MemsetFn, {M->getArgOperand(0), Cast1, Cast2});
    I->eraseFromParent();
  } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) {
-    IRB.CreateCall(
-        isa<MemCpyInst>(M) ? MemcpyFn : MemmoveFn,
-        {M->getArgOperand(0),
-         M->getArgOperand(1),
-         IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+    IRB.CreateCall(isa<MemCpyInst>(M) ? MemcpyFn : MemmoveFn,
+                   {M->getArgOperand(0), M->getArgOperand(1),
+                    IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
    I->eraseFromParent();
  }
  return false;
@@ -1141,8 +1317,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
    int Idx = getMemoryAccessFuncIndex(OrigTy, Addr, DL);
    if (Idx < 0)
      return false;
-    Value *Args[] = {Addr,
-                     createOrdering(&IRB, LI->getOrdering())};
+    Value *Args[] = {Addr, createOrdering(&IRB, LI->getOrdering())};
    Value *C = IRB.CreateCall(TsanAtomicLoad[Idx], Args);
    Value *Cast = IRB.CreateBitOrPointerCast(C, OrigTy);
    I->replaceAllUsesWith(Cast);
@@ -1188,12 +1363,10 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
    const unsigned BitSize = ByteSize * 8;
    Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
    Value *CmpOperand =
-      IRB.CreateBitOrPointerCast(CASI->getCompareOperand(), Ty);
+        IRB.CreateBitOrPointerCast(CASI->getCompareOperand(), Ty);
    Value *NewOperand =
-      IRB.CreateBitOrPointerCast(CASI->getNewValOperand(), Ty);
+        IRB.CreateBitOrPointerCast(CASI->getNewValOperand(), Ty);
-    Value *Args[] = {Addr,
-                     CmpOperand,
-                     NewOperand,
+    Value *Args[] = {Addr, CmpOperand, NewOperand,
                     createOrdering(&IRB, CASI->getSuccessOrdering()),
                     createOrdering(&IRB, CASI->getFailureOrdering())};
    CallInst *C = IRB.CreateCall(TsanAtomicCAS[Idx], Args);
@@ -1205,7 +1378,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
    }
    Value *Res =
-      IRB.CreateInsertValue(PoisonValue::get(CASI->getType()), OldVal, 0);
+        IRB.CreateInsertValue(PoisonValue::get(CASI->getType()), OldVal, 0);
    Res = IRB.CreateInsertValue(Res, Success, 1);
    I->replaceAllUsesWith(Res);
@@ -1229,8 +1402,8 @@ int ThreadSanitizer::getMemoryAccessFuncIndex(Type *OrigTy, Value *Addr,
    return -1;
  }
  uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
-  if (TypeSize != 8 && TypeSize != 16 &&
-      TypeSize != 32 && TypeSize != 64 && TypeSize != 128) {
+  if (TypeSize != 8 && TypeSize != 16 && TypeSize != 32 && TypeSize != 64 &&
+      TypeSize != 128) {
    NumAccessesWithBadSize++;
    // Ignore all unusual sizes.
return -1; diff --git a/llvm/test/Instrumentation/ThreadSanitizer/SPIRV/cleanup_private_shadow.ll b/llvm/test/Instrumentation/ThreadSanitizer/SPIRV/cleanup_private_shadow.ll index 28ac1f0c980d4..aa4b13e76ed40 100644 --- a/llvm/test/Instrumentation/ThreadSanitizer/SPIRV/cleanup_private_shadow.ll +++ b/llvm/test/Instrumentation/ThreadSanitizer/SPIRV/cleanup_private_shadow.ll @@ -22,7 +22,7 @@ for.body: ; preds = %for.body.preheader exit: ; CHECK: [[REG1:%[0-9]+]] = ptrtoint ptr %agg.tmp to i64 -; CHECK-NEXT: call void @__tsan_cleanup_private(i64 [[REG1]], i32 8) +; CHECK-NEXT: call void @__tsan_cleanup_private(i64 [[REG1]], i64 8) ; CHECK-NOT: ptrtoint ptr %device-byval-temp.ascast234298 to i64 ret void } diff --git a/llvm/test/Instrumentation/ThreadSanitizer/SPIRV/instrument_local.ll b/llvm/test/Instrumentation/ThreadSanitizer/SPIRV/instrument_local.ll new file mode 100644 index 0000000000000..fbff879b68564 --- /dev/null +++ b/llvm/test/Instrumentation/ThreadSanitizer/SPIRV/instrument_local.ll @@ -0,0 +1,40 @@ +; RUN: opt < %s -passes='function(tsan),module(tsan-module)' -tsan-instrument-func-entry-exit=0 -tsan-instrument-memintrinsics=0 -S | FileCheck %s +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spir64-unknown-unknown" + +@WGLocalMem.0 = external addrspace(3) global i32 + +define spir_func void @foo() #0 { +entry: +; CHECK-LABEL: define spir_func void @foo() +; CHECK: call void @__tsan_write4(i64 ptrtoint (ptr addrspace(3) @WGLocalMem.0 to i64), i32 3 + store i32 1, ptr addrspace(3) @WGLocalMem.0, align 4 + br label %exit + +exit: + ret void +} + +define spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E8MyKernel(ptr addrspace(3) noundef align 4 %_arg_acc) #0 { +entry: +; CHECK-LABEL: define spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E8MyKernel(ptr addrspace(3) noundef align 4 %_arg_acc) +; CHECK: call void @__tsan_cleanup_static_local(i64 ptrtoint (ptr addrspace(3) @WGLocalMem.0 to i64), i64 4) +; CHECK: %local_args = alloca i64, align 8 +; CHECK-NEXT: [[REG1:%[0-9]+]] = getelementptr i64, ptr %local_args, i32 0 +; CHECK-NEXT: [[REG2:%[0-9]+]] = ptrtoint ptr addrspace(3) %_arg_acc to i64 +; CHECK-NEXT: store i64 [[REG2]], ptr [[REG1]], align 8 +; CHECK-NEXT: [[REG3:%[0-9]+]] = ptrtoint ptr %local_args to i64 +; CHECK-NEXT: call void @__tsan_cleanup_dynamic_local(i64 [[REG3]], i32 1) +; CHECK-NEXT: call void @__tsan_group_barrier() + store i32 0, ptr addrspace(3) @WGLocalMem.0, align 4 + store i32 0, ptr addrspace(3) %_arg_acc, align 4 + call void @foo() + br label %exit + +exit: ; preds = %entry +; CHECK: call void @__tsan_cleanup_dynamic_local(i64 [[REG3]], i32 1) +; CHECK-NEXT: call void @__tsan_cleanup_static_local(i64 ptrtoint (ptr addrspace(3) @WGLocalMem.0 to i64), i64 4) + ret void +} + +attributes #0 = { sanitize_thread } diff --git a/sycl/test-e2e/ThreadSanitizer/group_local_memory.cpp b/sycl/test-e2e/ThreadSanitizer/group_local_memory.cpp new file mode 100644 index 0000000000000..f9a0545dcf36e --- /dev/null +++ b/sycl/test-e2e/ThreadSanitizer/group_local_memory.cpp @@ -0,0 +1,37 @@ +// REQUIRES: linux, cpu || (gpu && level_zero) +// ALLOW_RETRIES: 10 +// RUN: %{build} %device_tsan_flags -O0 -g -o %t.out +// RUN: %{run} %t.out 2>&1 | FileCheck %s +#include "sycl/ext/oneapi/group_local_memory.hpp" +#include "sycl/detail/core.hpp" +#include "sycl/usm.hpp" + +__attribute__((noinline)) void check(int *ptr, size_t val) { *ptr += val; } + +int main() { + 
sycl::queue Q;
+  auto *sum = sycl::malloc_shared<int>(1, Q);
+
+  Q.submit([&](sycl::handler &h) {
+    h.parallel_for<class Test>(
+        sycl::nd_range<1>(128, 16), [=](sycl::nd_item<1> item) {
+          auto ptr =
+              sycl::ext::oneapi::group_local_memory<int>(item.get_group());
+          *ptr += item.get_global_linear_id();
+
+          check(ptr, item.get_local_linear_id());
+
+          item.barrier();
+
+          if (item.get_global_linear_id() == 0)
+            *sum = *ptr;
+        });
+  });
+  Q.wait();
+  // CHECK: WARNING: DeviceSanitizer: data race
+  // CHECK-NEXT: When write of size 4 at 0x{{.*}} in kernel <{{.*}}Test>
+  // CHECK-NEXT: #0 {{.*}}group_local_memory.cpp
+
+  sycl::free(sum, Q);
+  return 0;
+}
diff --git a/sycl/test-e2e/ThreadSanitizer/local_accessor.cpp b/sycl/test-e2e/ThreadSanitizer/local_accessor.cpp
new file mode 100644
index 0000000000000..9a8292c1160fa
--- /dev/null
+++ b/sycl/test-e2e/ThreadSanitizer/local_accessor.cpp
@@ -0,0 +1,32 @@
+// REQUIRES: linux, cpu || (gpu && level_zero)
+// ALLOW_RETRIES: 10
+// RUN: %{build} %device_tsan_flags -O0 -g -o %t.out
+// RUN: %{run} %t.out 2>&1 | FileCheck %s
+#include "sycl/detail/core.hpp"
+#include "sycl/usm.hpp"
+
+__attribute__((noinline)) void check(int *ptr, size_t val) { *ptr += val; }
+
+int main() {
+  sycl::queue Q;
+  auto *sum = sycl::malloc_device<int>(1, Q);
+
+  Q.submit([&](sycl::handler &cgh) {
+    auto acc = sycl::local_accessor<int>(1, cgh);
+    cgh.parallel_for<class Test>(sycl::nd_range<1>(128, 16),
+                                 [=](sycl::nd_item<1> item) {
+                                   acc[0] += item.get_global_linear_id();
+                                   check(&acc[0], item.get_local_linear_id());
+
+                                   item.barrier();
+                                   if (item.get_global_linear_id() == 0)
+                                     *sum = acc[0];
+                                 });
+  });
+  Q.wait();
+  // CHECK: WARNING: DeviceSanitizer: data race
+  // CHECK-NEXT: When write of size 4 at 0x{{.*}} in kernel <{{.*}}Test>
+  // CHECK-NEXT: #0 {{.*}}local_accessor.cpp
+
+  return 0;
+}
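The unified-runtime changes that follow serialize kernel launches while the local/private shadow region is shared. A minimal sketch of that locking pattern with standard C++ types (assuming ur_shared_mutex behaves like std::shared_mutex; the surrounding names are illustrative):

#include <mutex>
#include <shared_mutex>

struct Interceptor {
  // Launches from all queues contend on this one lock.
  std::shared_mutex KernelLaunchMutex;
};

void enqueueKernelLaunch(Interceptor &I /*, kernel, queue, ... */) {
  // Exclusive ownership for the whole launch path, so two queues can never
  // prepare or reuse the shared local/private shadow region concurrently.
  std::scoped_lock<std::shared_mutex> Guard(I.KernelLaunchMutex);
  // ... prepare shadow memory and submit the kernel ...
}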
diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp
index 13aa868cbf0f0..f9967ff357da5 100644
--- a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp
+++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp
@@ -426,6 +426,12 @@ ur_result_t urEnqueueKernelLaunch(
    /// [out][optional] return an event object that identifies this
    /// particular kernel execution instance.
    ur_event_handle_t *phEvent) {
+  // This mutex prevents concurrent kernel launches across different queues,
+  // because the DeviceMSAN local/private shadow memory does not yet support
+  // concurrent kernel launches.
+  std::scoped_lock<ur_shared_mutex> Guard(
+      getMsanInterceptor()->KernelLaunchMutex);
+
  UR_LOG_L(getContext()->logger, DEBUG, "==== urEnqueueKernelLaunch");

  USMLaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue),
diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.hpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.hpp
index b7c5d274ae76a..82438f5f419c1 100644
--- a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.hpp
+++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.hpp
@@ -337,6 +337,8 @@ class MsanInterceptor {

  bool isNormalExit() { return m_NormalExit; }

+  ur_shared_mutex KernelLaunchMutex;
+
private:
  /// Initialize Global Variables & Kernel Name at first Launch
  ur_result_t prepareLaunch(std::shared_ptr<DeviceInfo> &DeviceInfo,
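The tsan_ddi.cpp hook below records each local argument's size at urKernelSetArgLocal time, keyed by argument index. A small sketch of why a std::map is used there rather than an unordered container (the struct mirrors TsanLocalArgsInfo; the helper name is hypothetical):

#include <cstdint>
#include <map>
#include <vector>

struct TsanLocalArgsInfo {
  uint64_t Size = 0;
};

// Keyed by argument index; std::map iterates in ascending key order, so the
// vector handed to the device lists sizes in argument order, matching the
// index-based lookup in __tsan_cleanup_dynamic_local.
std::vector<TsanLocalArgsInfo>
packLocalArgs(const std::map<uint32_t, TsanLocalArgsInfo> &LocalArgs) {
  std::vector<TsanLocalArgsInfo> Out;
  for (const auto &[ArgIndex, ArgInfo] : LocalArgs)
    Out.push_back(ArgInfo);
  return Out;
}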
diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp
index f3802f652d614..4dc89f6ddca80 100644
--- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp
+++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp
@@ -1044,6 +1044,32 @@ ur_result_t urKernelSetArgMemObj(
  return UR_RESULT_SUCCESS;
}

+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelSetArgLocal
+__urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal(
+    /// [in] handle of the kernel object
+    ur_kernel_handle_t hKernel,
+    /// [in] argument index in range [0, num args - 1]
+    uint32_t argIndex,
+    /// [in] size of the local buffer to be allocated by the runtime
+    size_t argSize,
+    /// [in][optional] pointer to local buffer properties.
+    const ur_kernel_arg_local_properties_t *pProperties) {
+  auto pfnSetArgLocal = getContext()->urDdiTable.Kernel.pfnSetArgLocal;
+
+  UR_LOG_L(getContext()->logger, DEBUG,
+           "==== urKernelSetArgLocal (argIndex={}, argSize={})", argIndex,
+           argSize);
+
+  {
+    auto &KI = getTsanInterceptor()->getKernelInfo(hKernel);
+    std::scoped_lock<ur_shared_mutex> Guard(KI.Mutex);
+    KI.LocalArgs[argIndex] = TsanLocalArgsInfo{argSize};
+  }
+
+  return pfnSetArgLocal(hKernel, argIndex, argSize, pProperties);
+}
+
///////////////////////////////////////////////////////////////////////////////
/// @brief Intercept function for urUSMDeviceAlloc
__urdlllocal ur_result_t UR_APICALL urUSMDeviceAlloc(
@@ -1142,9 +1168,16 @@ ur_result_t urEnqueueKernelLaunch(
    /// [out][optional] return an event object that identifies this
    /// particular kernel execution instance.
    ur_event_handle_t *phEvent) {
+  // This mutex prevents concurrent kernel launches across different queues,
+  // because the DeviceTSAN local shadow memory does not yet support
+  // concurrent kernel launches.
+  std::scoped_lock<ur_shared_mutex> Guard(
+      getTsanInterceptor()->KernelLaunchMutex);
+
  UR_LOG_L(getContext()->logger, DEBUG, "==== urEnqueueKernelLaunch");

-  LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue));
+  LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize,
+                        pLocalWorkSize, pGlobalWorkOffset, workDim);

  UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo));
@@ -1250,6 +1283,7 @@ ur_result_t urGetKernelProcAddrTable(
  pDdiTable->pfnRelease = ur_sanitizer_layer::tsan::urKernelRelease;
  pDdiTable->pfnSetArgValue = ur_sanitizer_layer::tsan::urKernelSetArgValue;
  pDdiTable->pfnSetArgMemObj = ur_sanitizer_layer::tsan::urKernelSetArgMemObj;
+  pDdiTable->pfnSetArgLocal = ur_sanitizer_layer::tsan::urKernelSetArgLocal;

  return UR_RESULT_SUCCESS;
}
diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp
index e40df91a9076e..c98adc25180c4 100644
--- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp
+++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp
@@ -21,6 +21,11 @@ namespace tsan {

TsanRuntimeDataWrapper::~TsanRuntimeDataWrapper() {
  [[maybe_unused]] ur_result_t Result;
+  if (Host.LocalArgs) {
+    Result =
+        getContext()->urDdiTable.USM.pfnFree(Context, (void *)Host.LocalArgs);
+    assert(Result == UR_RESULT_SUCCESS);
+  }
  if (DevicePtr) {
    Result = getContext()->urDdiTable.USM.pfnFree(Context, DevicePtr);
    assert(Result == UR_RESULT_SUCCESS);
@@ -56,6 +61,24 @@ ur_result_t TsanRuntimeDataWrapper::syncToDevice(ur_queue_handle_t Queue) {
  return UR_RESULT_SUCCESS;
}

+ur_result_t TsanRuntimeDataWrapper::importLocalArgsInfo(
+    ur_queue_handle_t Queue, const std::vector<TsanLocalArgsInfo> &LocalArgs) {
+  assert(!LocalArgs.empty());
+
+  Host.NumLocalArgs = LocalArgs.size();
+  const size_t LocalArgsInfoSize =
+      sizeof(TsanLocalArgsInfo) * Host.NumLocalArgs;
+  UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc(
+      Context, Device, nullptr, nullptr, LocalArgsInfoSize,
+      ur_cast<void **>(&Host.LocalArgs)));
+
+  UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
+      Queue, true, Host.LocalArgs, &LocalArgs[0], LocalArgsInfoSize, 0, nullptr,
+      nullptr));
+
+  return UR_RESULT_SUCCESS;
+}
+
ur_result_t DeviceInfo::allocShadowMemory() {
  ur_context_handle_t ShadowContext;
  UR_CALL(getContext()->urDdiTable.Context.pfnCreate(1, &Handle, nullptr,
@@ -303,12 +326,65 @@ ur_result_t TsanInterceptor::prepareLaunch(std::shared_ptr &,
    }
  }

+  // Get the suggested local work size if the user didn't specify one.
+  if (LaunchInfo.LocalWorkSize.empty()) {
+    LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim);
+    auto URes = getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize(
+        Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset,
+        LaunchInfo.GlobalWorkSize, LaunchInfo.LocalWorkSize.data());
+    if (URes != UR_RESULT_SUCCESS) {
+      if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
+        return URes;
+      }
+      // If urKernelGetSuggestedLocalWorkSize is not supported by the driver,
+      // fall back to an inefficient implementation.
+      for (size_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) {
+        LaunchInfo.LocalWorkSize[Dim] = 1;
+      }
+    }
+  }
+
  // Prepare launch info data
  LaunchInfo.Data.Host.GlobalShadowOffset = DI->Shadow->ShadowBegin;
  LaunchInfo.Data.Host.GlobalShadowOffsetEnd = DI->Shadow->ShadowEnd;
  LaunchInfo.Data.Host.DeviceTy = DI->Type;
  LaunchInfo.Data.Host.Debug = getContext()->Options.Debug ? 1 : 0;
+
+  const size_t *LocalWorkSize = LaunchInfo.LocalWorkSize.data();
+  uint32_t NumWG = 1;
+  for (uint32_t Dim = 0; Dim < LaunchInfo.WorkDim; ++Dim) {
+    NumWG *= (LaunchInfo.GlobalWorkSize[Dim] + LocalWorkSize[Dim] - 1) /
+             LocalWorkSize[Dim];
+  }
+
+  if (DI->Shadow->AllocLocalShadow(
+          Queue, NumWG, LaunchInfo.Data.Host.LocalShadowOffset,
+          LaunchInfo.Data.Host.LocalShadowOffsetEnd) != UR_RESULT_SUCCESS) {
+    UR_LOG_L(getContext()->logger, WARN,
+             "Failed to allocate shadow memory for local memory, "
+             "maybe the number of workgroups ({}) is too large",
+             NumWG);
+    UR_LOG_L(getContext()->logger, WARN,
+             "Skip checking local memory of kernel <{}> ",
+             GetKernelName(Kernel));
+  } else {
+    UR_LOG_L(getContext()->logger, DEBUG,
+             "ShadowMemory(Local, WorkGroup={}, {} - {})", NumWG,
+             (void *)LaunchInfo.Data.Host.LocalShadowOffset,
+             (void *)LaunchInfo.Data.Host.LocalShadowOffsetEnd);
+
+    // Write local arguments info
+    if (!KernelInfo.LocalArgs.empty()) {
+      std::vector<TsanLocalArgsInfo> LocalArgsInfo;
+      for (auto [ArgIndex, ArgInfo] : KernelInfo.LocalArgs) {
+        LocalArgsInfo.push_back(ArgInfo);
+        UR_LOG_L(getContext()->logger, DEBUG,
+                 "LocalArgs (argIndex={}, size={})", ArgIndex, ArgInfo.Size);
+      }
+      UR_CALL(LaunchInfo.Data.importLocalArgsInfo(Queue, LocalArgsInfo));
+    }
+  }
+
  LaunchInfo.Data.syncToDevice(Queue);

  // EnqueueWrite __TsanLaunchInfo
diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp
index c98df6fb59550..bf9023d4b582f 100644
--- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp
+++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp
@@ -84,6 +84,9 @@ struct KernelInfo {
  ur_shared_mutex Mutex;
  std::unordered_map<uint32_t, std::shared_ptr<MemBuffer>> BufferArgs;

+  // Need to preserve the order of local arguments
+  std::map<uint32_t, TsanLocalArgsInfo> LocalArgs;
+
  KernelInfo() = default;

  explicit KernelInfo(ur_kernel_handle_t Kernel) : Handle(Kernel) {
@@ -126,20 +129,36 @@ struct TsanRuntimeDataWrapper {

  ur_result_t syncFromDevice(ur_queue_handle_t Queue);

  ur_result_t syncToDevice(ur_queue_handle_t Queue);
+
+  ur_result_t
+  importLocalArgsInfo(ur_queue_handle_t Queue,
+                      const std::vector<TsanLocalArgsInfo> &LocalArgs);
};

struct LaunchInfo {
  ur_context_handle_t Context = nullptr;
  ur_device_handle_t Device = nullptr;
+  const size_t *GlobalWorkSize = nullptr;
+  const size_t *GlobalWorkOffset = nullptr;
+  std::vector<size_t> LocalWorkSize;
+  uint32_t WorkDim = 0;

  TsanRuntimeDataWrapper Data;

-  LaunchInfo(ur_context_handle_t Context, ur_device_handle_t Device)
-      : Context(Context), Device(Device), Data(Context, Device) {
+  LaunchInfo(ur_context_handle_t Context, ur_device_handle_t Device,
+             const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
+             const size_t *GlobalWorkOffset, uint32_t WorkDim)
+      : Context(Context), Device(Device), GlobalWorkSize(GlobalWorkSize),
+        GlobalWorkOffset(GlobalWorkOffset), WorkDim(WorkDim),
+        Data(Context, Device) {
    [[maybe_unused]] auto Result =
        getContext()->urDdiTable.Context.pfnRetain(Context);
    assert(Result == UR_RESULT_SUCCESS);
    Result = getContext()->urDdiTable.Device.pfnRetain(Device);
    assert(Result == UR_RESULT_SUCCESS);
+    if (LocalWorkSize) {
+      this->LocalWorkSize =
+          std::vector<size_t>(LocalWorkSize, LocalWorkSize + WorkDim);
+    }
  }

  ~LaunchInfo() {
@@ -207,6 +226,8 @@ class TsanInterceptor {
    return m_KernelMap[Kernel];
  }

+  ur_shared_mutex KernelLaunchMutex;
+
private:
  ur_result_t updateShadowMemory(std::shared_ptr<ContextInfo> &CI,
                                 std::shared_ptr<DeviceInfo> &DI,
diff --git
a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_libdevice.hpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_libdevice.hpp index ef24bc14ece5f..79d359265273f 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_libdevice.hpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_libdevice.hpp @@ -46,6 +46,7 @@ enum : AccessType { kAccessWrite = 0, kAccessRead = 1 << 0, kAccessAtomic = 1 << 1, + kAccessLocal = 1 << 2, }; // Fixed-size vector clock, used both for threads and sync objects. @@ -74,6 +75,10 @@ struct TsanErrorReport { uint32_t AccessSize = 0; }; +struct TsanLocalArgsInfo { + uint64_t Size = 0; +}; + constexpr uint64_t TSAN_MAX_NUM_REPORTS = 128; struct TsanRuntimeData { @@ -81,6 +86,14 @@ struct TsanRuntimeData { uintptr_t GlobalShadowOffsetEnd = 0; + uintptr_t LocalShadowOffset = 0; + + uintptr_t LocalShadowOffsetEnd = 0; + + TsanLocalArgsInfo *LocalArgs = nullptr; // Ordered by ArgIndex + + uint32_t NumLocalArgs = 0; + // The last one is to record global state VectorClock Clock[kThreadSlotCount + 1]; diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_report.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_report.cpp index 8aefda1cd5025..e96933b7ff890 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_report.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_report.cpp @@ -27,7 +27,8 @@ void ReportDataRace(const TsanErrorReport &Report, ur_kernel_handle_t Kernel) { KernelName = DemangleName(KernelName); UR_LOG_L(getContext()->logger, QUIET, - "====WARNING: DeviceSanitizer: data race"); + "====WARNING: DeviceSanitizer: data race on {}", + Report.Type & kAccessLocal ? "Local Memory" : "Global Memory"); UR_LOG_L(getContext()->logger, QUIET, "When {} of size {} at {} in kernel <{}> LID({}, {}, {}) GID({}, " "{}, {})", diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_shadow.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_shadow.cpp index 495de6bf48e63..fbe1411f0e967 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_shadow.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_shadow.cpp @@ -13,6 +13,7 @@ #include "tsan_shadow.hpp" #include "sanitizer_common/sanitizer_utils.hpp" +#include "tsan_interceptor.hpp" namespace ur_sanitizer_layer { namespace tsan { @@ -190,6 +191,44 @@ ur_result_t ShadowMemoryGPU::CleanShadow(ur_queue_handle_t Queue, uptr Ptr, return UR_RESULT_SUCCESS; } +ur_result_t ShadowMemoryGPU::AllocLocalShadow(ur_queue_handle_t Queue, + uint32_t NumWG, uptr &Begin, + uptr &End) { + const size_t LocalMemorySize = GetDeviceLocalMemorySize(Device); + const size_t RequiredShadowSize = NumWG * LocalMemorySize; + static size_t LastAllocatedSize = 0; + if (RequiredShadowSize > LastAllocatedSize) { + if (LocalShadowOffset) { + UR_CALL(getContext()->urDdiTable.USM.pfnFree(Context, + (void *)LocalShadowOffset)); + LocalShadowOffset = 0; + LastAllocatedSize = 0; + } + + UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc( + Context, Device, nullptr, nullptr, RequiredShadowSize, + (void **)&LocalShadowOffset)); + + // Initialize shadow memory + ur_result_t URes = EnqueueUSMBlockingSet(Queue, (void *)LocalShadowOffset, + 0, RequiredShadowSize); + if (URes != UR_RESULT_SUCCESS) { + UR_CALL(getContext()->urDdiTable.USM.pfnFree(Context, + (void *)LocalShadowOffset)); + LocalShadowOffset = 0; + LastAllocatedSize = 0; + + return URes; + } + + LastAllocatedSize = RequiredShadowSize; + } + + Begin = 
LocalShadowOffset;
+  End = LocalShadowOffset + RequiredShadowSize - 1;
+
+  return UR_RESULT_SUCCESS;
+}
+
RawShadow *ShadowMemoryPVC::MemToShadow(uptr Ptr) {
  Ptr = RoundDownTo(Ptr, kShadowCell);
  if (Ptr & 0xff00'0000'0000'0000ULL) {
diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_shadow.hpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_shadow.hpp
index ed9d3a241dad2..1ad9ad4e9db2f 100644
--- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_shadow.hpp
+++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_shadow.hpp
@@ -35,6 +35,9 @@ struct ShadowMemory {
  virtual ur_result_t CleanShadow(ur_queue_handle_t Queue, uptr Ptr,
                                  uptr Size) = 0;

+  virtual ur_result_t AllocLocalShadow(ur_queue_handle_t Queue, uint32_t NumWG,
+                                       uptr &Begin, uptr &End) = 0;
+
  virtual size_t GetShadowSize() = 0;

  ur_context_handle_t Context{};
@@ -73,6 +76,13 @@ struct ShadowMemoryCPU final : public ShadowMemory {
  ur_result_t CleanShadow(ur_queue_handle_t Queue, uptr Ptr,
                          uptr Size) override;

+  ur_result_t AllocLocalShadow(ur_queue_handle_t, uint32_t, uptr &Begin,
+                               uptr &End) override {
+    Begin = ShadowBegin;
+    End = ShadowEnd;
+    return UR_RESULT_SUCCESS;
+  }
+
  size_t GetShadowSize() override { return 0x2000'0000'0000ULL; }
};

@@ -87,11 +97,16 @@ struct ShadowMemoryGPU : public ShadowMemory {
  ur_result_t CleanShadow(ur_queue_handle_t Queue, uptr Ptr,
                          uptr Size) override;

+  ur_result_t AllocLocalShadow(ur_queue_handle_t Queue, uint32_t NumWG,
+                               uptr &Begin, uptr &End) override final;
+
  virtual uptr GetStartAddress() { return 0; }

  ur_mutex VirtualMemMapsMutex;

  std::unordered_map<uptr, ur_physical_mem_handle_t> VirtualMemMaps;
+
+  uptr LocalShadowOffset = 0;
};

// clang-format off
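Taken together, the launch-time sizing above is just ceil-division arithmetic: one local-memory-sized shadow slot per work-group. A sketch under those assumptions (parameter names mirror the LaunchInfo fields; the helper itself is illustrative):

#include <cstddef>
#include <cstdint>

// NumWG is the product of per-dimension ceil-divisions of the global range by
// the local range; the GPU path then reserves one local-memory-sized shadow
// slot per work-group.
static uint64_t RequiredLocalShadowSize(const size_t *GlobalWorkSize,
                                        const size_t *LocalWorkSize,
                                        uint32_t WorkDim,
                                        size_t LocalMemorySize) {
  uint64_t NumWG = 1;
  for (uint32_t Dim = 0; Dim < WorkDim; ++Dim)
    NumWG *= (GlobalWorkSize[Dim] + LocalWorkSize[Dim] - 1) /
             LocalWorkSize[Dim]; // ceil-div
  return NumWG * LocalMemorySize;
}

For example, the 1-D launch used by the tests (global 128, local 16) yields 8 work-groups; with a 128 KB local memory that is 8 x 128 KB = 1 MB of local shadow.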