Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions backends/qualcomm/runtime/QnnExecuTorch.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,6 @@ void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment);
/// handle to tensor wrapper during execution
void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem);

/// Add custom mem tensor info. Help to bring forward the memHandle creating
/// time from execution to initialization.
void QnnExecuTorchAddCustomMemTensorInfo(const CustomMemTensorInfo& info);

/// Free the allocated shared memory.
void QnnExecuTorchFreeCustomMem(void* buffer_ptr);

Expand Down
72 changes: 8 additions & 64 deletions backends/qualcomm/runtime/QnnManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,52 +124,6 @@ Error QnnManager::LoadQnnLibrary() {
return ret;
}

// Walks every CustomMemTensorInfo recorded with the SharedBuffer manager and
// creates the corresponding QNN memory handle ahead of time, moving the
// registration cost from first execution to initialization.
// Returns Error::Ok on success; Error::Internal when a file descriptor cannot
// be obtained for the shared buffer or the handle registration fails.
Error QnnManager::PreRegisterMem() {
  SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager();
  // Bind by const reference: iterating by value copies each
  // CustomMemTensorInfo per iteration (clang-tidy performance-for-range-copy).
  for (const auto& info : shared_buffer_manager.GetCustomMemTensorInfoSet()) {
    void* unaligned_custom_mem_base =
        shared_buffer_manager.GetUnAlignedAddr(info.custom_mem);

    // Byte offset of the tensor inside the unaligned RPC allocation:
    // the alignment padding plus the tensor's position within the aligned
    // region.
    size_t tensor_offset = (static_cast<char*>(info.custom_mem) -
                            static_cast<char*>(unaligned_custom_mem_base)) +
        info.pos;
    size_t total_custom_mem_size =
        shared_buffer_manager.GetAllocatedSize(info.custom_mem);

    int32_t mem_fd = shared_buffer_manager.MemToFd(unaligned_custom_mem_base);
    if (mem_fd == -1) {
      // The message pieces below are adjacent string literals, so they
      // concatenate into one format string. The original separated them with
      // commas, which passed them as extra (ignored) varargs, and used the
      // invalid "%uz" conversion — size_t takes "%zu".
      // NOTE(review): "%u" assumes info.rank is uint32_t (it is built as
      // uint32_t in rpc_mem.cpp) — confirm against CustomMemTensorInfo.
      QNN_EXECUTORCH_LOG_WARN(
          "PreRegisterMem failed to get file descriptor. "
          "custom_mem: %p, "
          "tensor_addr: %p, "
          "pos: %zu, "
          "tensor_bytes: %zu, "
          "shape: %p, "
          "rank: %u, "
          "qnn_dtype: %X",
          info.custom_mem,
          info.tensor_addr,
          info.pos,
          info.tensor_bytes,
          info.shape,
          info.rank,
          info.dtype);
      return Error::Internal;
    }

    ET_CHECK_OR_RETURN_ERROR(
        backend_params_ptr_->qnn_mem_manager_ptr_->PreRegisterCustomMemHandle(
            mem_fd,
            unaligned_custom_mem_base,
            total_custom_mem_size,
            tensor_offset,
            info) == Error::Ok,
        Internal,
        "Fail to register to shared memory.");
  }
  return Error::Ok;
}

Error QnnManager::RegisterMem(
void* data_ptr,
const std::shared_ptr<TensorWrapper>& tensor_wrapper) {
Expand Down Expand Up @@ -256,6 +210,9 @@ Error QnnManager::RegisterCustomMem(

Qnn_MemHandle_t pre_registered_handle =
backend_params_ptr_->qnn_mem_manager_ptr_->GetPreRegisteredHandle(info);
// If this memory block has already been registered, we can use it directly.
// This applies when running llama in lookahead mode with the same AR-N model
// handling both the prompt processor and the token generator.
if (pre_registered_handle != nullptr) {
if (get_option(options_->log_level()) >=
QnnExecuTorchLogLevel::kLogLevelInfo) {
Expand All @@ -268,15 +225,15 @@ Error QnnManager::RegisterCustomMem(
}

SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager();
void* unaligned_custom_mem_base =
shared_buffer_manager.GetUnAlignedAddr(custom_mem_base);

size_t tensor_offset = static_cast<char*>(custom_mem_base) -
static_cast<char*>(unaligned_custom_mem_base) + info.pos;
size_t tensor_offset = info.pos;
size_t total_custom_mem_size =
shared_buffer_manager.GetAllocatedSize(custom_mem_base);

int32_t mem_fd = shared_buffer_manager.MemToFd(unaligned_custom_mem_base);
int32_t mem_fd = shared_buffer_manager.MemToFd(custom_mem_base);
// Note: If obtaining the file descriptor fails, it may be due to memory not
// being released with QnnExecuTorchFreeCustomMem. In this situation, we could
// consider adding a map to monitor it.
if (mem_fd == -1) {
QNN_EXECUTORCH_LOG_WARN(
"Tensor name %s failed to get file descriptor.",
Expand All @@ -289,7 +246,6 @@ Error QnnManager::RegisterCustomMem(
tensor_wrapper,
mem_fd,
data_ptr,
unaligned_custom_mem_base,
total_custom_mem_size,
tensor_offset,
info) == Error::Ok,
Expand Down Expand Up @@ -355,13 +311,6 @@ Error QnnManager::Init() {
BackendInitializeState::INITIALIZED;
}

#if defined(__aarch64__)
ET_CHECK_OR_RETURN_ERROR(
PreRegisterMem() == Error::Ok,
Internal,
"Fail to pre register custom memory handle");
#endif

if (IsOnlinePrepare()) {
Qnn_ApiVersion_t qnn_version = {QNN_VERSION_INIT};
qnn_loaded_backend_.GetQnnInterface().qnn_backend_get_api_version(
Expand Down Expand Up @@ -697,8 +646,3 @@ void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem) {
executorch::backends::qnn::SharedBuffer::GetSharedBufferManager()
.AddCusomMemTensorAddr(tensor_addr, custom_mem);
}

/// Records the custom-mem tensor info with the shared buffer manager so its
/// QNN memory handle can be created at initialization time instead of during
/// execution.
void QnnExecuTorchAddCustomMemTensorInfo(const CustomMemTensorInfo& info) {
  auto& manager =
      executorch::backends::qnn::SharedBuffer::GetSharedBufferManager();
  manager.AddCusomMemTensorInfo(info);
}
35 changes: 20 additions & 15 deletions backends/qualcomm/runtime/SharedBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,6 @@ void* SharedBuffer::GetCustomMemBase(void* buf) {
return it->second;
}

// Maps an aligned custom-mem pointer back to the original (unaligned) base
// address returned by the RPC allocator; returns nullptr when the pointer is
// not tracked in restore_map_.
void* SharedBuffer::GetUnAlignedAddr(void* buf) {
  const auto found = restore_map_.find(buf);
  return found == restore_map_.end() ? nullptr : found->second;
}

size_t SharedBuffer::GetAllocatedSize(void* buf) {
auto it = allocated_size_map_.find(buf);
if (it == allocated_size_map_.end()) {
Expand Down Expand Up @@ -123,10 +115,10 @@ void* SharedBuffer::AllocMem(size_t bytes, size_t alignment) {
QNN_EXECUTORCH_LOG_WARN("Failed to allocate the tensor by RPC memory.");
return nullptr;
}
allocated_size_map_.insert({buf, allocate_bytes});
auto aligned_buf = reinterpret_cast<void*>(
alignTo(alignment, reinterpret_cast<intptr_t>(buf)));
bool status = restore_map_.insert({aligned_buf, buf}).second;
allocated_size_map_.insert({aligned_buf, allocate_bytes});
if (!status) {
QNN_EXECUTORCH_LOG_ERROR("Failed to allocate the tensor by RPC memory.");
rpc_mem_free_(buf);
Expand All @@ -152,6 +144,15 @@ void SharedBuffer::FreeMem(void* buf) {
} else {
rpc_mem_free_(restore_map_[buf]);
restore_map_.erase(buf);
allocated_size_map_.erase(buf);
// Unbind the custom memory from tensor address.
auto mit = custom_mem_to_tensor_addr_.find(buf);
if (mit != custom_mem_to_tensor_addr_.end()) {
for (auto it = mit->second.begin(); it != mit->second.end(); ++it) {
tensor_addr_to_custom_mem_.erase(*it);
}
custom_mem_to_tensor_addr_.erase(buf);
}
}
}

Expand Down Expand Up @@ -185,14 +186,18 @@ Error SharedBuffer::Load() {
}

void SharedBuffer::AddCusomMemTensorAddr(void* tensor_addr, void* custom_mem) {
tensor_addr_to_custom_mem_.insert({tensor_addr, custom_mem});
bool status =
tensor_addr_to_custom_mem_.insert({tensor_addr, custom_mem}).second;
if (!status) {
QNN_EXECUTORCH_LOG_WARN(
"Tensor address %p already associated with custom memory %p",
tensor_addr,
custom_mem);
return;
}
custom_mem_to_tensor_addr_[custom_mem].insert(tensor_addr);
};

void SharedBuffer::AddCusomMemTensorInfo(const CustomMemTensorInfo& info) {
custom_mem_tensor_info_set_.insert(info);
tensor_addr_to_custom_mem_.insert({info.tensor_addr, info.custom_mem});
}

Error SharedBuffer::UnLoad() {
if (dlclose(lib_cdsp_rpc_) != 0) {
QNN_EXECUTORCH_LOG_ERROR(
Expand Down
14 changes: 4 additions & 10 deletions backends/qualcomm/runtime/SharedBuffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,10 @@ class SharedBuffer final {
// memory handle is registered during execution
void AddCusomMemTensorAddr(void* tensor_addr, void* custom_mem);

// memory handle can be registered before execution
void AddCusomMemTensorInfo(const CustomMemTensorInfo& info);

size_t GetAllocatedSize(void* buf);

void* GetCustomMemBase(void* buf);

void* GetUnAlignedAddr(void* buf);

const std::unordered_set<CustomMemTensorInfo>& GetCustomMemTensorInfoSet() {
return custom_mem_tensor_info_set_;
};

private:
SharedBuffer() = default;

Expand All @@ -93,7 +84,10 @@ class SharedBuffer final {
std::unordered_map<void*, size_t> allocated_size_map_;
// Maps for the custom memory
std::unordered_map<void*, void*> tensor_addr_to_custom_mem_;
std::unordered_set<CustomMemTensorInfo> custom_mem_tensor_info_set_;
// After the custom memory is freed, we will ensure that no tensor addresses
// remain linked to this custom memory.
std::unordered_map<void*, std::unordered_set<void*>>
custom_mem_to_tensor_addr_;
std::atomic_bool initialize_{false};
static std::mutex init_mutex_;
};
Expand Down
43 changes: 0 additions & 43 deletions backends/qualcomm/runtime/backends/QnnMemManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,10 @@ Error QnnMemManager::RegisterIonMem(
return Error::Ok;
}

// TODO: Find a better way to unify RegisterCustomMem and
// PreRegisterCustomMemHandle
Error QnnMemManager::RegisterCustomMem(
const std::shared_ptr<TensorWrapper>& tensor_wrapper,
int32_t mem_fd,
void* mem_ptr,
void* unaligned_custom_mem_base,
size_t total_custom_mem_size,
size_t tensor_offset,
const CustomMemTensorInfo& info) {
Expand Down Expand Up @@ -107,46 +104,6 @@ Error QnnMemManager::RegisterCustomMem(
return Error::Ok;
}

// Creates a QNN memory handle for a custom (shared) memory region before
// execution starts and caches it keyed by tensor info, so a later
// GetPreRegisteredHandle lookup can reuse it instead of registering again.
// Returns Error::Ok on success; Error::Internal when qnn_mem_register fails.
Error QnnMemManager::PreRegisterCustomMemHandle(
    int32_t mem_fd,
    void* unaligned_custom_mem_base,
    size_t total_custom_mem_size,
    size_t tensor_offset,
    const CustomMemTensorInfo& info) {
  const QnnInterface& qnn_interface = implementation_.GetQnnInterface();
  Qnn_MemDescriptor_t descriptor = {
      {info.rank, info.shape, nullptr},
      scalar_type_to_qnn_dtype_[info.dtype],
      QNN_MEM_TYPE_CUSTOM,
      {{mem_fd}}};
  Qnn_MemHandle_t handle = nullptr;
  Qnn_ErrorHandle_t error = QNN_SUCCESS;

  // HTP-specific descriptor: the whole shared buffer is registered once and
  // the individual tensor is addressed by its byte offset into it.
  QnnMemHtp_Descriptor_t htp_descriptor;
  htp_descriptor.type = QNN_HTP_MEM_SHARED_BUFFER;
  htp_descriptor.size = total_custom_mem_size;

  QnnHtpMem_SharedBufferConfig_t htpSharedBuffConfig = {mem_fd, tensor_offset};
  htp_descriptor.sharedBufferConfig = htpSharedBuffConfig;

  descriptor.customInfo = &htp_descriptor;

  error = qnn_interface.qnn_mem_register(
      context_->GetHandle(),
      &descriptor,
      /*numDescriptors=*/1,
      &handle);
  if (error != QNN_SUCCESS) {
    // The original format string had no conversion specifier, so the error
    // code argument was silently dropped from the log.
    // NOTE(review): assumes QNN_GET_ERROR_CODE yields an int-compatible
    // value as logged elsewhere — confirm against the QNN SDK headers.
    QNN_EXECUTORCH_LOG_WARN(
        "PreRegisterCustomMemHandle fail: %d", QNN_GET_ERROR_CODE(error));
    return Error::Internal;
  }

  pre_registered_handles_.insert({info, handle});
  // NOTE(review): nullptr appears to mark a handle not yet bound to a tensor
  // wrapper — confirm against the other registered_map_ call sites.
  registered_map_.insert({handle, nullptr});
  return Error::Ok;
}

void* QnnMemManager::GetPreRegisteredHandle(const CustomMemTensorInfo& info) {
auto it = pre_registered_handles_.find(info);
if (it == pre_registered_handles_.end()) {
Expand Down
10 changes: 0 additions & 10 deletions backends/qualcomm/runtime/backends/QnnMemManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,6 @@ class QnnMemManager {
const std::shared_ptr<TensorWrapper>& tensor_wrapper,
int32_t mem_fd,
void* mem_ptr,
void* unaligned_custom_mem_base,
size_t total_custom_mem_size,
size_t tensor_offset,
const CustomMemTensorInfo& info);

// Pre-register custom mem handle from SharedBuffer. Bring forward the
// memHandle creating time from execution to initialization.
executorch::runtime::Error PreRegisterCustomMemHandle(
int32_t mem_fd,
void* unaligned_custom_mem_base,
size_t total_custom_mem_size,
size_t tensor_offset,
const CustomMemTensorInfo& info);
Expand Down
18 changes: 4 additions & 14 deletions examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,20 +44,10 @@ void RpcMem::add_memory_info(
it == io_pos_map_.end()) {
ET_LOG(Error, "Shared buffer pointer %p is not found", data_ptr);
}
size_t pos = io_pos_map_[static_cast<std::byte*>(data_ptr)];
uint32_t* shape = const_cast<uint32_t*>(
reinterpret_cast<const uint32_t*>(tensor_info.sizes().data()));
uint32_t rank = static_cast<uint32_t>(tensor_info.sizes().size());
executorch::aten::ScalarType scalar_type = tensor_info.scalar_type();
CustomMemTensorInfo info = {
shared_buffer_base_ptr_,
data_ptr,
pos,
data_size,
shape,
rank,
scalar_type};
QnnExecuTorchAddCustomMemTensorInfo(info);
if (binded_tensor_addr_set_.find(data_ptr) == binded_tensor_addr_set_.end()) {
QnnExecuTorchAddCustomMemTensorAddr(data_ptr, shared_buffer_base_ptr_);
binded_tensor_addr_set_.insert(data_ptr);
}
};

} // namespace example
2 changes: 2 additions & 0 deletions examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/imem_alloc.h>
#include <unordered_map>
#include <unordered_set>

namespace example {
/**
Expand Down Expand Up @@ -58,6 +59,7 @@ tensor.
void* shared_buffer_base_ptr_;
size_t calculated_offsets_;
std::unordered_map<std::byte*, size_t> io_pos_map_;
std::unordered_set<void*> binded_tensor_addr_set_;
};

} // namespace example