Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions backends/qualcomm/runtime/QnnExecuTorch.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,6 @@ void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment);
/// handle to tensor wrapper during execution
void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem);

/// Add custom mem tensor info. Help to bring forward the memHandle creating
/// time from execution to initialization.
void QnnExecuTorchAddCustomMemTensorInfo(const CustomMemTensorInfo& info);

/// Free the allocated shared memory.
void QnnExecuTorchFreeCustomMem(void* buffer_ptr);

Expand Down
72 changes: 8 additions & 64 deletions backends/qualcomm/runtime/QnnManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,52 +124,6 @@ Error QnnManager::LoadQnnLibrary() {
return ret;
}

// Walks every CustomMemTensorInfo recorded with the SharedBuffer manager and
// creates the corresponding QNN memory handle ahead of time, moving the
// registration cost from first execution to initialization.
// Returns Error::Ok on success; Error::Internal when a file descriptor cannot
// be obtained for the shared buffer or the handle registration fails.
Error QnnManager::PreRegisterMem() {
  SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager();
  // Bind by const reference: iterating by value copies each
  // CustomMemTensorInfo per iteration (clang-tidy performance-for-range-copy).
  for (const auto& info : shared_buffer_manager.GetCustomMemTensorInfoSet()) {
    void* unaligned_custom_mem_base =
        shared_buffer_manager.GetUnAlignedAddr(info.custom_mem);

    // Byte offset of the tensor inside the unaligned RPC allocation:
    // the alignment padding plus the tensor's position within the aligned
    // region.
    size_t tensor_offset = (static_cast<char*>(info.custom_mem) -
                            static_cast<char*>(unaligned_custom_mem_base)) +
        info.pos;
    size_t total_custom_mem_size =
        shared_buffer_manager.GetAllocatedSize(info.custom_mem);

    int32_t mem_fd = shared_buffer_manager.MemToFd(unaligned_custom_mem_base);
    if (mem_fd == -1) {
      // The message pieces below are adjacent string literals, so they
      // concatenate into one format string. The original separated them with
      // commas, which passed them as extra (ignored) varargs, and used the
      // invalid "%uz" conversion — size_t takes "%zu".
      // NOTE(review): "%u" assumes info.rank is uint32_t (it is built as
      // uint32_t in rpc_mem.cpp) — confirm against CustomMemTensorInfo.
      QNN_EXECUTORCH_LOG_WARN(
          "PreRegisterMem failed to get file descriptor. "
          "custom_mem: %p, "
          "tensor_addr: %p, "
          "pos: %zu, "
          "tensor_bytes: %zu, "
          "shape: %p, "
          "rank: %u, "
          "qnn_dtype: %X",
          info.custom_mem,
          info.tensor_addr,
          info.pos,
          info.tensor_bytes,
          info.shape,
          info.rank,
          info.dtype);
      return Error::Internal;
    }

    ET_CHECK_OR_RETURN_ERROR(
        backend_params_ptr_->qnn_mem_manager_ptr_->PreRegisterCustomMemHandle(
            mem_fd,
            unaligned_custom_mem_base,
            total_custom_mem_size,
            tensor_offset,
            info) == Error::Ok,
        Internal,
        "Fail to register to shared memory.");
  }
  return Error::Ok;
}

Error QnnManager::RegisterMem(
void* data_ptr,
const std::shared_ptr<TensorWrapper>& tensor_wrapper) {
Expand Down Expand Up @@ -256,6 +210,9 @@ Error QnnManager::RegisterCustomMem(

Qnn_MemHandle_t pre_registered_handle =
backend_params_ptr_->qnn_mem_manager_ptr_->GetPreRegisteredHandle(info);
// If this memory block has already been registered, we can use it directly.
// This applies when running llama in lookahead mode with the same AR-N model
// handling both the prompt processor and the token generator.
if (pre_registered_handle != nullptr) {
if (get_option(options_->log_level()) >=
QnnExecuTorchLogLevel::kLogLevelInfo) {
Expand All @@ -268,15 +225,15 @@ Error QnnManager::RegisterCustomMem(
}

SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager();
void* unaligned_custom_mem_base =
shared_buffer_manager.GetUnAlignedAddr(custom_mem_base);

size_t tensor_offset = static_cast<char*>(custom_mem_base) -
static_cast<char*>(unaligned_custom_mem_base) + info.pos;
size_t tensor_offset = info.pos;
size_t total_custom_mem_size =
shared_buffer_manager.GetAllocatedSize(custom_mem_base);

int32_t mem_fd = shared_buffer_manager.MemToFd(unaligned_custom_mem_base);
int32_t mem_fd = shared_buffer_manager.MemToFd(custom_mem_base);
// Note: If obtaining the file descriptor fails, it may be due to memory not
// being released with QnnExecuTorchFreeCustomMem. In this situation, we could
// consider adding a map to monitor it.
if (mem_fd == -1) {
QNN_EXECUTORCH_LOG_WARN(
"Tensor name %s failed to get file descriptor.",
Expand All @@ -289,7 +246,6 @@ Error QnnManager::RegisterCustomMem(
tensor_wrapper,
mem_fd,
data_ptr,
unaligned_custom_mem_base,
total_custom_mem_size,
tensor_offset,
info) == Error::Ok,
Expand Down Expand Up @@ -355,13 +311,6 @@ Error QnnManager::Init() {
BackendInitializeState::INITIALIZED;
}

#if defined(__aarch64__)
ET_CHECK_OR_RETURN_ERROR(
PreRegisterMem() == Error::Ok,
Internal,
"Fail to pre register custom memory handle");
#endif

if (IsOnlinePrepare()) {
Qnn_ApiVersion_t qnn_version = {QNN_VERSION_INIT};
qnn_loaded_backend_.GetQnnInterface().qnn_backend_get_api_version(
Expand Down Expand Up @@ -697,8 +646,3 @@ void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem) {
executorch::backends::qnn::SharedBuffer::GetSharedBufferManager()
.AddCusomMemTensorAddr(tensor_addr, custom_mem);
}

/// Records the custom-mem tensor info with the shared buffer manager so its
/// QNN memory handle can be created at initialization time instead of during
/// execution.
void QnnExecuTorchAddCustomMemTensorInfo(const CustomMemTensorInfo& info) {
  auto& manager =
      executorch::backends::qnn::SharedBuffer::GetSharedBufferManager();
  manager.AddCusomMemTensorInfo(info);
}
35 changes: 20 additions & 15 deletions backends/qualcomm/runtime/SharedBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,6 @@ void* SharedBuffer::GetCustomMemBase(void* buf) {
return it->second;
}

// Maps an aligned custom-mem pointer back to the original (unaligned) base
// address returned by the RPC allocator; returns nullptr when the pointer is
// not tracked in restore_map_.
void* SharedBuffer::GetUnAlignedAddr(void* buf) {
  const auto found = restore_map_.find(buf);
  return found == restore_map_.end() ? nullptr : found->second;
}

size_t SharedBuffer::GetAllocatedSize(void* buf) {
auto it = allocated_size_map_.find(buf);
if (it == allocated_size_map_.end()) {
Expand Down Expand Up @@ -123,10 +115,10 @@ void* SharedBuffer::AllocMem(size_t bytes, size_t alignment) {
QNN_EXECUTORCH_LOG_WARN("Failed to allocate the tensor by RPC memory.");
return nullptr;
}
allocated_size_map_.insert({buf, allocate_bytes});
auto aligned_buf = reinterpret_cast<void*>(
alignTo(alignment, reinterpret_cast<intptr_t>(buf)));
bool status = restore_map_.insert({aligned_buf, buf}).second;
allocated_size_map_.insert({aligned_buf, allocate_bytes});
if (!status) {
QNN_EXECUTORCH_LOG_ERROR("Failed to allocate the tensor by RPC memory.");
rpc_mem_free_(buf);
Expand All @@ -152,6 +144,15 @@ void SharedBuffer::FreeMem(void* buf) {
} else {
rpc_mem_free_(restore_map_[buf]);
restore_map_.erase(buf);
allocated_size_map_.erase(buf);
// Unbind the custom memory from tensor address.
auto mit = custom_mem_to_tensor_addr_.find(buf);
if (mit != custom_mem_to_tensor_addr_.end()) {
for (auto it = mit->second.begin(); it != mit->second.end(); ++it) {
tensor_addr_to_custom_mem_.erase(*it);
}
custom_mem_to_tensor_addr_.erase(buf);
}
}
}

Expand Down Expand Up @@ -185,14 +186,18 @@ Error SharedBuffer::Load() {
}

void SharedBuffer::AddCusomMemTensorAddr(void* tensor_addr, void* custom_mem) {
tensor_addr_to_custom_mem_.insert({tensor_addr, custom_mem});
bool status =
tensor_addr_to_custom_mem_.insert({tensor_addr, custom_mem}).second;
if (!status) {
QNN_EXECUTORCH_LOG_WARN(
"Tensor address %p already associated with custom memory %p",
tensor_addr,
custom_mem);
return;
}
custom_mem_to_tensor_addr_[custom_mem].insert(tensor_addr);
};

void SharedBuffer::AddCusomMemTensorInfo(const CustomMemTensorInfo& info) {
custom_mem_tensor_info_set_.insert(info);
tensor_addr_to_custom_mem_.insert({info.tensor_addr, info.custom_mem});
}

Error SharedBuffer::UnLoad() {
if (dlclose(lib_cdsp_rpc_) != 0) {
QNN_EXECUTORCH_LOG_ERROR(
Expand Down
14 changes: 4 additions & 10 deletions backends/qualcomm/runtime/SharedBuffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,10 @@ class SharedBuffer final {
// memory handle is registered during execution
void AddCusomMemTensorAddr(void* tensor_addr, void* custom_mem);

// memory handle can be registered before execution
void AddCusomMemTensorInfo(const CustomMemTensorInfo& info);

size_t GetAllocatedSize(void* buf);

void* GetCustomMemBase(void* buf);

void* GetUnAlignedAddr(void* buf);

const std::unordered_set<CustomMemTensorInfo>& GetCustomMemTensorInfoSet() {
return custom_mem_tensor_info_set_;
};

private:
SharedBuffer() = default;

Expand All @@ -93,7 +84,10 @@ class SharedBuffer final {
std::unordered_map<void*, size_t> allocated_size_map_;
// Maps for the custom memory
std::unordered_map<void*, void*> tensor_addr_to_custom_mem_;
std::unordered_set<CustomMemTensorInfo> custom_mem_tensor_info_set_;
// After the custom memory is freed, we will ensure that no tensor addresses
// remain linked to this custom memory.
std::unordered_map<void*, std::unordered_set<void*>>
custom_mem_to_tensor_addr_;
std::atomic_bool initialize_{false};
static std::mutex init_mutex_;
};
Expand Down
43 changes: 0 additions & 43 deletions backends/qualcomm/runtime/backends/QnnMemManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,10 @@ Error QnnMemManager::RegisterIonMem(
return Error::Ok;
}

// TODO: Find a better way to unify RegisterCustomMem and
// PreRegisterCustomMemHandle
Error QnnMemManager::RegisterCustomMem(
const std::shared_ptr<TensorWrapper>& tensor_wrapper,
int32_t mem_fd,
void* mem_ptr,
void* unaligned_custom_mem_base,
size_t total_custom_mem_size,
size_t tensor_offset,
const CustomMemTensorInfo& info) {
Expand Down Expand Up @@ -107,46 +104,6 @@ Error QnnMemManager::RegisterCustomMem(
return Error::Ok;
}

// Creates a QNN memory handle for a custom (shared) memory region before
// execution starts and caches it keyed by tensor info, so a later
// GetPreRegisteredHandle lookup can reuse it instead of registering again.
// Returns Error::Ok on success; Error::Internal when qnn_mem_register fails.
Error QnnMemManager::PreRegisterCustomMemHandle(
    int32_t mem_fd,
    void* unaligned_custom_mem_base,
    size_t total_custom_mem_size,
    size_t tensor_offset,
    const CustomMemTensorInfo& info) {
  const QnnInterface& qnn_interface = implementation_.GetQnnInterface();
  Qnn_MemDescriptor_t descriptor = {
      {info.rank, info.shape, nullptr},
      scalar_type_to_qnn_dtype_[info.dtype],
      QNN_MEM_TYPE_CUSTOM,
      {{mem_fd}}};
  Qnn_MemHandle_t handle = nullptr;
  Qnn_ErrorHandle_t error = QNN_SUCCESS;

  // HTP-specific descriptor: the whole shared buffer is registered once and
  // the individual tensor is addressed by its byte offset into it.
  QnnMemHtp_Descriptor_t htp_descriptor;
  htp_descriptor.type = QNN_HTP_MEM_SHARED_BUFFER;
  htp_descriptor.size = total_custom_mem_size;

  QnnHtpMem_SharedBufferConfig_t htpSharedBuffConfig = {mem_fd, tensor_offset};
  htp_descriptor.sharedBufferConfig = htpSharedBuffConfig;

  descriptor.customInfo = &htp_descriptor;

  error = qnn_interface.qnn_mem_register(
      context_->GetHandle(),
      &descriptor,
      /*numDescriptors=*/1,
      &handle);
  if (error != QNN_SUCCESS) {
    // The original format string had no conversion specifier, so the error
    // code argument was silently dropped from the log.
    // NOTE(review): assumes QNN_GET_ERROR_CODE yields an int-compatible
    // value as logged elsewhere — confirm against the QNN SDK headers.
    QNN_EXECUTORCH_LOG_WARN(
        "PreRegisterCustomMemHandle fail: %d", QNN_GET_ERROR_CODE(error));
    return Error::Internal;
  }

  pre_registered_handles_.insert({info, handle});
  // NOTE(review): nullptr appears to mark a handle not yet bound to a tensor
  // wrapper — confirm against the other registered_map_ call sites.
  registered_map_.insert({handle, nullptr});
  return Error::Ok;
}

void* QnnMemManager::GetPreRegisteredHandle(const CustomMemTensorInfo& info) {
auto it = pre_registered_handles_.find(info);
if (it == pre_registered_handles_.end()) {
Expand Down
10 changes: 0 additions & 10 deletions backends/qualcomm/runtime/backends/QnnMemManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,6 @@ class QnnMemManager {
const std::shared_ptr<TensorWrapper>& tensor_wrapper,
int32_t mem_fd,
void* mem_ptr,
void* unaligned_custom_mem_base,
size_t total_custom_mem_size,
size_t tensor_offset,
const CustomMemTensorInfo& info);

// Pre-register custom mem handle from SharedBuffer. Bring forward the
// memHandle creating time from execution to initialization.
executorch::runtime::Error PreRegisterCustomMemHandle(
int32_t mem_fd,
void* unaligned_custom_mem_base,
size_t total_custom_mem_size,
size_t tensor_offset,
const CustomMemTensorInfo& info);
Expand Down
18 changes: 4 additions & 14 deletions examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,20 +44,10 @@ void RpcMem::add_memory_info(
it == io_pos_map_.end()) {
ET_LOG(Error, "Shared buffer pointer %p is not found", data_ptr);
}
size_t pos = io_pos_map_[static_cast<std::byte*>(data_ptr)];
uint32_t* shape = const_cast<uint32_t*>(
reinterpret_cast<const uint32_t*>(tensor_info.sizes().data()));
uint32_t rank = static_cast<uint32_t>(tensor_info.sizes().size());
executorch::aten::ScalarType scalar_type = tensor_info.scalar_type();
CustomMemTensorInfo info = {
shared_buffer_base_ptr_,
data_ptr,
pos,
data_size,
shape,
rank,
scalar_type};
QnnExecuTorchAddCustomMemTensorInfo(info);
if (binded_tensor_addr_set_.find(data_ptr) == binded_tensor_addr_set_.end()) {
QnnExecuTorchAddCustomMemTensorAddr(data_ptr, shared_buffer_base_ptr_);
binded_tensor_addr_set_.insert(data_ptr);
}
};

} // namespace example
2 changes: 2 additions & 0 deletions examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/imem_alloc.h>
#include <unordered_map>
#include <unordered_set>

namespace example {
/**
Expand Down Expand Up @@ -58,6 +59,7 @@ tensor.
void* shared_buffer_base_ptr_;
size_t calculated_offsets_;
std::unordered_map<std::byte*, size_t> io_pos_map_;
std::unordered_set<void*> binded_tensor_addr_set_;
};

} // namespace example