diff --git a/backends/qualcomm/runtime/QnnExecuTorch.h b/backends/qualcomm/runtime/QnnExecuTorch.h index d8fbade3b3b..ccd02273c4f 100644 --- a/backends/qualcomm/runtime/QnnExecuTorch.h +++ b/backends/qualcomm/runtime/QnnExecuTorch.h @@ -69,10 +69,6 @@ void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment); /// handle to tensor wrapper during execution void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem); -/// Add custom mem tensor info. Help to bring forward the memHandle creating -/// time from execution to initialization. -void QnnExecuTorchAddCustomMemTensorInfo(const CustomMemTensorInfo& info); - /// Free the allocated shared memory. void QnnExecuTorchFreeCustomMem(void* buffer_ptr); diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 5e3220f25d9..da4c7fbc069 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -124,52 +124,6 @@ Error QnnManager::LoadQnnLibrary() { return ret; } -Error QnnManager::PreRegisterMem() { - SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager(); - for (const auto info : shared_buffer_manager.GetCustomMemTensorInfoSet()) { - void* unaligned_custom_mem_base = - shared_buffer_manager.GetUnAlignedAddr(info.custom_mem); - - size_t tensor_offset = (static_cast(info.custom_mem) - - static_cast(unaligned_custom_mem_base)) + - info.pos; - size_t total_custom_mem_size = - shared_buffer_manager.GetAllocatedSize(info.custom_mem); - - int32_t mem_fd = shared_buffer_manager.MemToFd(unaligned_custom_mem_base); - if (mem_fd == -1) { - QNN_EXECUTORCH_LOG_WARN( - "PreRegisterMem failed to get file descriptor.", - "custom_mem: %p", - "tensor_addr: %p", - "pos: %uz", - "tensor_bytes: %uz", - "shape: %p", - "rank: %zu", - "qnn_dtype: %X", - info.custom_mem, - info.tensor_addr, - info.pos, - info.tensor_bytes, - info.shape, - info.rank, - info.dtype); - return Error::Internal; - } - - ET_CHECK_OR_RETURN_ERROR( - backend_params_ptr_->qnn_mem_manager_ptr_->PreRegisterCustomMemHandle( - mem_fd, - unaligned_custom_mem_base, - total_custom_mem_size, - tensor_offset, - info) == Error::Ok, - Internal, - "Fail to register to shared memory."); - } - return Error::Ok; -} - Error QnnManager::RegisterMem( void* data_ptr, const std::shared_ptr& tensor_wrapper) { @@ -256,6 +210,9 @@ Error QnnManager::RegisterCustomMem( Qnn_MemHandle_t pre_registered_handle = backend_params_ptr_->qnn_mem_manager_ptr_->GetPreRegisteredHandle(info); + // If this memory block has already been registered, we can use it directly. + // This applies when running llama in lookahead mode with the same AR-N model + // handling both the prompt processor and the token generator. if (pre_registered_handle != nullptr) { if (get_option(options_->log_level()) >= QnnExecuTorchLogLevel::kLogLevelInfo) { @@ -268,15 +225,15 @@ Error QnnManager::RegisterCustomMem( } SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager(); - void* unaligned_custom_mem_base = - shared_buffer_manager.GetUnAlignedAddr(custom_mem_base); - size_t tensor_offset = static_cast(custom_mem_base) - - static_cast(unaligned_custom_mem_base) + info.pos; + size_t tensor_offset = info.pos; size_t total_custom_mem_size = shared_buffer_manager.GetAllocatedSize(custom_mem_base); - int32_t mem_fd = shared_buffer_manager.MemToFd(unaligned_custom_mem_base); + int32_t mem_fd = shared_buffer_manager.MemToFd(custom_mem_base); + // Note: If obtaining the file descriptor fails, it may be due to memory not + // being released with QnnExecuTorchFreeCustomMem. In this situation, we could + // consider adding a map to monitor it. if (mem_fd == -1) { QNN_EXECUTORCH_LOG_WARN( "Tensor name %s failed to get file descriptor.", @@ -289,7 +246,6 @@ Error QnnManager::RegisterCustomMem( tensor_wrapper, mem_fd, data_ptr, - unaligned_custom_mem_base, total_custom_mem_size, tensor_offset, info) == Error::Ok, @@ -355,13 +311,6 @@ Error QnnManager::Init() { BackendInitializeState::INITIALIZED; } -#if defined(__aarch64__) - ET_CHECK_OR_RETURN_ERROR( - PreRegisterMem() == Error::Ok, - Internal, - "Fail to pre register custom memory handle"); -#endif - if (IsOnlinePrepare()) { Qnn_ApiVersion_t qnn_version = {QNN_VERSION_INIT}; qnn_loaded_backend_.GetQnnInterface().qnn_backend_get_api_version( @@ -697,8 +646,3 @@ void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem) { executorch::backends::qnn::SharedBuffer::GetSharedBufferManager() .AddCusomMemTensorAddr(tensor_addr, custom_mem); } - -void QnnExecuTorchAddCustomMemTensorInfo(const CustomMemTensorInfo& info) { - executorch::backends::qnn::SharedBuffer::GetSharedBufferManager() - .AddCusomMemTensorInfo(info); -} diff --git a/backends/qualcomm/runtime/SharedBuffer.cpp b/backends/qualcomm/runtime/SharedBuffer.cpp index 99dee7c9a7b..d79f8041932 100644 --- a/backends/qualcomm/runtime/SharedBuffer.cpp +++ b/backends/qualcomm/runtime/SharedBuffer.cpp @@ -69,14 +69,6 @@ void* SharedBuffer::GetCustomMemBase(void* buf) { return it->second; } -void* SharedBuffer::GetUnAlignedAddr(void* buf) { - auto it = restore_map_.find(buf); - if (it == restore_map_.end()) { - return nullptr; - } - return it->second; -} - size_t SharedBuffer::GetAllocatedSize(void* buf) { auto it = allocated_size_map_.find(buf); if (it == allocated_size_map_.end()) { @@ -123,10 +115,10 @@ void* SharedBuffer::AllocMem(size_t bytes, size_t alignment) { QNN_EXECUTORCH_LOG_WARN("Failed to allocate the tensor by RPC memory."); return nullptr; } - allocated_size_map_.insert({buf, allocate_bytes}); auto aligned_buf = reinterpret_cast( alignTo(alignment, reinterpret_cast(buf))); bool status = restore_map_.insert({aligned_buf, buf}).second; + allocated_size_map_.insert({aligned_buf, allocate_bytes}); if (!status) { QNN_EXECUTORCH_LOG_ERROR("Failed to allocate the tensor by RPC memory."); rpc_mem_free_(buf); @@ -152,6 +144,15 @@ void SharedBuffer::FreeMem(void* buf) { } else { rpc_mem_free_(restore_map_[buf]); restore_map_.erase(buf); + allocated_size_map_.erase(buf); + // Unbind the custom memory from tensor address. + auto mit = custom_mem_to_tensor_addr_.find(buf); + if (mit != custom_mem_to_tensor_addr_.end()) { + for (auto it = mit->second.begin(); it != mit->second.end(); ++it) { + tensor_addr_to_custom_mem_.erase(*it); + } + custom_mem_to_tensor_addr_.erase(buf); + } } } @@ -185,14 +186,18 @@ Error SharedBuffer::Load() { } void SharedBuffer::AddCusomMemTensorAddr(void* tensor_addr, void* custom_mem) { - tensor_addr_to_custom_mem_.insert({tensor_addr, custom_mem}); + bool status = + tensor_addr_to_custom_mem_.insert({tensor_addr, custom_mem}).second; + if (!status) { + QNN_EXECUTORCH_LOG_WARN( + "Tensor address %p already associated with custom memory %p", + tensor_addr, + custom_mem); + return; + } + custom_mem_to_tensor_addr_[custom_mem].insert(tensor_addr); }; -void SharedBuffer::AddCusomMemTensorInfo(const CustomMemTensorInfo& info) { - custom_mem_tensor_info_set_.insert(info); - tensor_addr_to_custom_mem_.insert({info.tensor_addr, info.custom_mem}); -} - Error SharedBuffer::UnLoad() { if (dlclose(lib_cdsp_rpc_) != 0) { QNN_EXECUTORCH_LOG_ERROR( diff --git a/backends/qualcomm/runtime/SharedBuffer.h b/backends/qualcomm/runtime/SharedBuffer.h index a02ea0e4c25..6bf06a6350b 100644 --- a/backends/qualcomm/runtime/SharedBuffer.h +++ b/backends/qualcomm/runtime/SharedBuffer.h @@ -59,19 +59,10 @@ class SharedBuffer final { // memory handle is registered during execution void AddCusomMemTensorAddr(void* tensor_addr, void* custom_mem); - // memory handle can be registered before execution - void AddCusomMemTensorInfo(const CustomMemTensorInfo& info); - size_t GetAllocatedSize(void* buf); void* GetCustomMemBase(void* buf); - void* GetUnAlignedAddr(void* buf); - - const std::unordered_set& GetCustomMemTensorInfoSet() { - return custom_mem_tensor_info_set_; - }; - private: SharedBuffer() = default; @@ -93,7 +84,10 @@ class SharedBuffer final { std::unordered_map allocated_size_map_; // Maps for the custom memory std::unordered_map tensor_addr_to_custom_mem_; - std::unordered_set custom_mem_tensor_info_set_; + // After the custom memory is freed, we will ensure that no tensor addresses + // remain linked to this custom memory. + std::unordered_map> + custom_mem_to_tensor_addr_; std::atomic_bool initialize_{false}; static std::mutex init_mutex_; }; diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.cpp b/backends/qualcomm/runtime/backends/QnnMemManager.cpp index 3b99dd10868..05a819286dd 100644 --- a/backends/qualcomm/runtime/backends/QnnMemManager.cpp +++ b/backends/qualcomm/runtime/backends/QnnMemManager.cpp @@ -56,13 +56,10 @@ Error QnnMemManager::RegisterIonMem( return Error::Ok; } -// TODO: Find a better way to unify RegisterCustomMem and -// PreRegisterCustomMemHandle Error QnnMemManager::RegisterCustomMem( const std::shared_ptr& tensor_wrapper, int32_t mem_fd, void* mem_ptr, - void* unaligned_custom_mem_base, size_t total_custom_mem_size, size_t tensor_offset, const CustomMemTensorInfo& info) { @@ -107,46 +104,6 @@ Error QnnMemManager::RegisterCustomMem( return Error::Ok; } -Error QnnMemManager::PreRegisterCustomMemHandle( - int32_t mem_fd, - void* unaligned_custom_mem_base, - size_t total_custom_mem_size, - size_t tensor_offset, - const CustomMemTensorInfo& info) { - const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); - Qnn_MemDescriptor_t descriptor = { - {info.rank, info.shape, nullptr}, - scalar_type_to_qnn_dtype_[info.dtype], - QNN_MEM_TYPE_CUSTOM, - {{mem_fd}}}; - Qnn_MemHandle_t handle = nullptr; - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - QnnMemHtp_Descriptor_t htp_descriptor; - htp_descriptor.type = QNN_HTP_MEM_SHARED_BUFFER; - htp_descriptor.size = total_custom_mem_size; - - QnnHtpMem_SharedBufferConfig_t htpSharedBuffConfig = {mem_fd, tensor_offset}; - htp_descriptor.sharedBufferConfig = htpSharedBuffConfig; - - descriptor.customInfo = &htp_descriptor; - - error = qnn_interface.qnn_mem_register( - context_->GetHandle(), - &descriptor, - /*numDescriptors=*/1, - &handle); - if (error != QNN_SUCCESS) { - QNN_EXECUTORCH_LOG_WARN( - "PreRegisterCustomMemHandle fail", QNN_GET_ERROR_CODE(error)); - return Error::Internal; - } - - pre_registered_handles_.insert({info, handle}); - registered_map_.insert({handle, nullptr}); - return Error::Ok; -} - void* QnnMemManager::GetPreRegisteredHandle(const CustomMemTensorInfo& info) { auto it = pre_registered_handles_.find(info); if (it == pre_registered_handles_.end()) { diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.h b/backends/qualcomm/runtime/backends/QnnMemManager.h index 6a7f00b016a..f4d3beda2c2 100644 --- a/backends/qualcomm/runtime/backends/QnnMemManager.h +++ b/backends/qualcomm/runtime/backends/QnnMemManager.h @@ -39,16 +39,6 @@ class QnnMemManager { const std::shared_ptr& tensor_wrapper, int32_t mem_fd, void* mem_ptr, - void* unaligned_custom_mem_base, - size_t total_custom_mem_size, - size_t tensor_offset, - const CustomMemTensorInfo& info); - - // Pre-register custom mem handle from SharedBuffer. Bring forward the - // memHandle creating time from execution to initialization. - executorch::runtime::Error PreRegisterCustomMemHandle( - int32_t mem_fd, - void* unaligned_custom_mem_base, size_t total_custom_mem_size, size_t tensor_offset, const CustomMemTensorInfo& info); diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp index f0cc6d9a7a2..67d7ec80aab 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp @@ -44,20 +44,10 @@ void RpcMem::add_memory_info( it == io_pos_map_.end()) { ET_LOG(Error, "Shared buffer pointer %p is not found", data_ptr); } - size_t pos = io_pos_map_[static_cast(data_ptr)]; - uint32_t* shape = const_cast( - reinterpret_cast(tensor_info.sizes().data())); - uint32_t rank = static_cast(tensor_info.sizes().size()); - executorch::aten::ScalarType scalar_type = tensor_info.scalar_type(); - CustomMemTensorInfo info = { - shared_buffer_base_ptr_, - data_ptr, - pos, - data_size, - shape, - rank, - scalar_type}; - QnnExecuTorchAddCustomMemTensorInfo(info); + if (binded_tensor_addr_set_.find(data_ptr) == binded_tensor_addr_set_.end()) { + QnnExecuTorchAddCustomMemTensorAddr(data_ptr, shared_buffer_base_ptr_); + binded_tensor_addr_set_.insert(data_ptr); + } }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h index 08c92741545..99e9cb1dec1 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h +++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h @@ -9,6 +9,7 @@ #pragma once #include #include +#include namespace example { /** @@ -58,6 +59,7 @@ tensor. void* shared_buffer_base_ptr_; size_t calculated_offsets_; std::unordered_map io_pos_map_; + std::unordered_set binded_tensor_addr_set_; }; } // namespace example