Commit ac53c11
[ET-VK] Split up prepack command buffer (#12533)
## Changes

* Introduce a `run_prepack()` API which combines the functionality of `encode_prepack()` and `prepack()`, but submits prepacking shaders incrementally rather than all at once.
* Introduce graph config options to control command buffer submission behaviour during prepacking.

Note that the current default values for the prepack submission thresholds were determined through experimentation. I will leave determining optimal values for specific devices as a later exercise. The goal of this diff is simply to introduce this mechanism in order to fix the Llama model loading crash on the Samsung S24 (described below).

## Context

Currently, ET-VK encodes all prepacking shaders and then performs prepacking by submitting a single command buffer. This approach has some drawbacks:

* CPU/GPU parallelism is decreased, since the command buffer is submitted only after all commands have been encoded.
* There can be performance issues at the Vulkan API level when processing a single "large" command buffer.

By splitting prepacking across multiple command buffers, both of these issues are avoided and performance is improved.

## Llama 3.2 1B crash on Samsung S24

When running large models (e.g. Llama 3.2 1B) on the Samsung S24 with ET-VK, the device's display will crash (the screen goes black and becomes unresponsive), and sometimes the device will shut down entirely. Fortunately, this change fixes that behaviour, in addition to providing a significant boost to model load time for Llama models (from 9s to 3s).

## Performance Impact

* Improves model load time, especially for larger models.

## Future Work

* Deprecate the `encode_prepack()` + `prepack()` pattern in favor of the `run_prepack()` pattern.

Differential Revision: [D78275586](https://our.internmc.facebook.com/intern/diff/D78275586/)
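For delegate code that drives a `ComputeGraph` directly, the migration is mechanical. A sketch of the before/after, mirroring the `VulkanBackend.cpp` change below:

```cpp
// Before: encode all prepacking shaders, then submit one large command buffer.
compute_graph->encode_prepack();
compute_graph->prepack();

// After: encode and submit incrementally, bounded by the prepack submission
// thresholds configured in GraphConfig.
compute_graph->run_prepack();
```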
1 parent f77bb5e commit ac53c11

File tree

7 files changed: +111 −3 lines

backends/vulkan/runtime/VulkanBackend.cpp (1 addition, 2 deletions)

@@ -507,8 +507,7 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
   compute_graph->prepare();
   compute_graph->prepare_pipelines();
 
-  compute_graph->encode_prepack();
-  compute_graph->prepack();
+  compute_graph->run_prepack();
 
   // If dynamic shapes are not expected, then the command buffer only needs to
   // be encoded once. Otherwise, wait until the first inference to encode the

backends/vulkan/runtime/graph/ComputeGraph.cpp (51 additions, 1 deletion)

@@ -145,7 +145,12 @@ ComputeGraph::ComputeGraph(GraphConfig config)
   execute_descriptor_counts_.descriptor_combined_sampler_count = 0;
   execute_descriptor_counts_.descriptor_storage_image_count = 0;
 
-  context_->set_cmd(/*reusable = */ true);
+  // If certain graph config variables are not specified, then set them
+  // automatically.
+  if (config_.prepack_threshold_nbytes == 0) {
+    config_.prepack_threshold_nbytes = 10 * MB;
+    config_.prepack_initial_threshold_nbytes = 10 * MB;
+  }
 }
 
 ComputeGraph::~ComputeGraph() {
@@ -431,6 +436,7 @@ ValueRef ComputeGraph::add_tensorref(
   ValueRef idx(static_cast<int>(values_.size()));
   check_no_active_value_ptrs();
   values_.emplace_back(TensorRef(sizes, dtype, data));
+  total_constant_nbytes_ += values_.back().toConstTensorRef().nbytes();
   return idx;
 }
 
@@ -750,6 +756,19 @@ void ComputeGraph::prepare_pipelines() {
       vkapi::ComputePipelineCache::Hasher>();
 }
 
+void ComputeGraph::submit_current_cmd(const bool final_use) {
+  context_->submit_cmd_to_gpu(VK_NULL_HANDLE, final_use);
+}
+
+void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) {
+  vkapi::VulkanFence fence = context_->fences().get_fence();
+  context_->submit_cmd_to_gpu(fence.get_submit_handle(), final_use);
+  fence.wait();
+  context_->fences().return_fence(fence);
+
+  context_->flush();
+}
+
 void ComputeGraph::encode_prepack() {
   for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
     node->encode(this);
@@ -766,6 +785,37 @@ void ComputeGraph::prepack() const {
   context_->flush();
 }
 
+void ComputeGraph::run_prepack() {
+  int i = 0;
+  bool submitted = false;
+  const bool reduce_peak_memory = total_constant_nbytes_ > 500 * MB;
+
+  context_->set_cmd();
+  for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
+    // Do not trigger on the first or last prepack node.
+    const bool not_terminal = i != 0 && i != (prepack_nodes_.size() - 1);
+    size_t threshold = submitted ? config_.prepack_threshold_nbytes
+                                 : config_.prepack_initial_threshold_nbytes;
+    if (not_terminal && staging_nbytes_in_cmd_ > threshold) {
+      // If reducing peak memory usage, wait for the current command buffer to
+      // finish executing and flush to recycle the staging memory. This will
+      // reduce peak memory usage, but will slightly increase load latency.
+      // Otherwise, just submit the current command buffer for execution and
+      // proceed. This results in lower load latency at the cost of higher peak
+      // memory usage.
+      reduce_peak_memory ? submit_current_cmd_and_wait() : submit_current_cmd();
+      staging_nbytes_in_cmd_ = 0;
+      context_->set_cmd();
+      submitted = true;
+    }
+
+    node->encode(this);
+    i++;
+  }
+  submit_current_cmd_and_wait(/*final_use=*/true);
+  staging_nbytes_in_cmd_ = 0;
+}
+
 void ComputeGraph::encode_execute() {
   context_->flush();
   context_->set_cmd(/*reusable = */ true);
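The fence-based wait in `submit_current_cmd_and_wait()` follows the standard Vulkan submit-and-wait pattern. For readers unfamiliar with it, a minimal raw-Vulkan sketch of the same idea (handle parameters are placeholders and error checking is omitted; the runtime's `VulkanFence` pool above additionally amortizes fence creation):

```cpp
#include <vulkan/vulkan.h>

// Minimal sketch: submit one command buffer and block until the GPU finishes.
void submit_and_wait(VkDevice device, VkQueue queue, VkCommandBuffer cmd_buf) {
  VkFenceCreateInfo fence_info{VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, nullptr, 0};
  VkFence fence = VK_NULL_HANDLE;
  vkCreateFence(device, &fence_info, nullptr, &fence);

  VkSubmitInfo submit_info{};
  submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
  submit_info.commandBufferCount = 1;
  submit_info.pCommandBuffers = &cmd_buf;
  vkQueueSubmit(queue, 1, &submit_info, fence);

  // Once the fence signals, the GPU has drained the command buffer, so any
  // staging memory it referenced is safe to recycle (which is what the
  // context_->flush() in submit_current_cmd_and_wait() relies on).
  vkWaitForFences(device, 1, &fence, VK_TRUE, UINT64_MAX);
  vkDestroyFence(device, fence, nullptr);
}
```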

backends/vulkan/runtime/graph/ComputeGraph.h (36 additions, 0 deletions)

@@ -190,10 +190,20 @@ class ComputeGraph final {
       vkapi::ComputePipelineCache::Hasher>
       pipeline_descriptors_;
 
+  // Utility constexpr to express byte quantities
+  constexpr static size_t MB = 1024 * 1024;
+
 protected:
   size_t values_in_use_ = 0;
   size_t execute_count_ = 0;
 
+  // Total number of bytes needed to store model weights
+  size_t total_constant_nbytes_ = 0;
+
+  // Represents the amount of staging buffer data that will be copied if the
+  // current Context's command buffer is submitted now.
+  size_t staging_nbytes_in_cmd_ = 0;
+
 public:
   //
   // Accessors
@@ -827,13 +837,39 @@ class ComputeGraph final {
   copy_into_staging(const ValueRef idx, const void* data, const size_t numel);
   void copy_from_staging(const ValueRef idx, void* data, const size_t numel);
 
+ protected:
+  // Command Buffer Management
+
+  /*
+   * Submits the current command buffer in the Context to the GPU for execution.
+   */
+  void submit_current_cmd(const bool final_use = false);
+
+  /*
+   * Submits the current command buffer in the Context to the GPU for execution,
+   * and waits for it to complete before returning. This function will also
+   * flush the Context after execution.
+   */
+  void submit_current_cmd_and_wait(const bool final_use = false);
+
+ public:
   //
   // Graph Prepacking
   //
 
+  inline void update_staging_nbytes_in_cmd(const size_t staging_bytes) {
+    staging_nbytes_in_cmd_ += staging_bytes;
+  }
+
   void encode_prepack();
   void prepack() const;
 
+  /*
+   * Executes prepacking operations to transfer model weight data from the CPU
+   * to the GPU.
+   */
+  void run_prepack();
+
   //
   // Graph Execution
   //

backends/vulkan/runtime/graph/GraphConfig.h (14 additions, 0 deletions)

@@ -36,6 +36,20 @@ struct GraphConfig final {
   // Whether or not the ComputeGraph should expect input shapes to be dynamic
   bool expect_dynamic_shapes;
 
+  // Properties that control command buffer submission during prepacking.
+  // A value of 0 means the field is not set.
+
+  // Once this many bytes of staging data have been recorded in the current
+  // command buffer during prepacking, submit it for execution. Distributing
+  // the work over multiple submissions can improve model load performance
+  // and prevent crashes when loading large models.
+  size_t prepack_threshold_nbytes = 0;
+  // Threshold for the first command buffer submission during prepacking.
+  // Setting this lower than prepack_threshold_nbytes submits a command
+  // buffer earlier, which can improve performance by taking more advantage
+  // of parallelism between the CPU and GPU.
+  size_t prepack_initial_threshold_nbytes = 0;
+
   vkapi::Adapter* external_adapter;
 
   // Generate a default graph config with pre-configured settings
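A hedged usage sketch of these fields (assuming `GraphConfig` is default-constructible and that the include path matches the file tree above; the byte values are illustrative, not tuned recommendations):

```cpp
#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>

using namespace vkcompute;

void load_with_tuned_prepack_thresholds() {
  GraphConfig config;
  // Submit the first prepack command buffer after ~2 MB of staging data so the
  // GPU starts transferring weights while the CPU is still encoding, then
  // submit every ~10 MB thereafter. Leaving both fields at 0 selects the
  // built-in defaults (10 MB each, set in ComputeGraph's constructor).
  config.prepack_initial_threshold_nbytes = 2 * 1024 * 1024;
  config.prepack_threshold_nbytes = 10 * 1024 * 1024;

  ComputeGraph graph(config);
  // ... build the graph from the serialized model ...
  graph.prepare();
  graph.prepare_pipelines();
  graph.run_prepack(); // weights are transferred over multiple submissions
}
```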

backends/vulkan/runtime/graph/containers/Constant.h (4 additions, 0 deletions)

@@ -28,6 +28,10 @@ struct TensorRef final {
       const std::vector<int64_t>& t_sizes,
       vkapi::ScalarType t_dtype,
       const void* const t_data);
+
+  inline size_t nbytes() const {
+    return utils::multiply_integers(sizes) * vkapi::element_size(dtype);
+  }
 };
 
 } // namespace vkcompute
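A quick worked example of `nbytes()` (hypothetical tensor; `vkapi::kFloat` is assumed here to name the 4-byte float dtype):

```cpp
// A {512, 1024} float tensor: 512 * 1024 = 524,288 elements at 4 bytes each.
const float* weights = /* pointer to weight data */ nullptr;
TensorRef tref({512, 1024}, vkapi::kFloat, weights);
size_t bytes = tref.nbytes(); // 524,288 * 4 = 2,097,152 bytes (2 MiB)
```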

backends/vulkan/runtime/graph/ops/PrepackNode.cpp (1 addition, 0 deletions)

@@ -62,6 +62,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
   TensorRefPtr tref = graph->get_tref(tref_);
   size_t numel = utils::multiply_integers(tref->sizes);
   api::StagingBuffer staging(graph->context(), tref->dtype, numel);
+  graph->update_staging_nbytes_in_cmd(staging.buffer().mem_size_as_size_t());
   size_t nbytes = numel * vkapi::element_size(tref->dtype);
   staging.copy_from(tref->data, nbytes);
   return staging;

backends/vulkan/runtime/vk_api/memory/Buffer.h (4 additions, 0 deletions)

@@ -138,6 +138,10 @@ class VulkanBuffer final {
     return buffer_properties_.size;
   }
 
+  inline size_t mem_size_as_size_t() const {
+    return utils::safe_downcast<size_t>(mem_size());
+  }
+
   inline bool has_memory() const {
     return (memory_.allocation != VK_NULL_HANDLE);
   }
