[ET-VK] Using push constants for buffer to image prepack nodes.

trivedivivek · trivedivivek · commit 81f8a35adeb1 · 2025-05-30T07:56:49.000-07:00
This diff enables the use of push constants for buffer-to-image prepack nodes in the Vulkan runtime graph. Push constants are a more efficient way than uniform buffers to pass small amounts of data to shaders.

* The `nchw_to_*.yaml` files have been updated to include the `USE_PUSH_CONST` flag, which is `True` by default and enables the use of push constants for all `nchw_to_*` operations.
* New variants of the `nchw_to_*` operations have been added that do not use push constants. Their names carry a `_no_pc` tag, placed before any storage-type or dtype suffix (e.g. `nchw_to_image_no_pc_texture3d`). These variants are used for compatibility with testing and utility functions.
* The `Clone.cpp`, `Convolution.cpp` and `Staging.cpp` files have been updated to pass empty parameter buffers and instead use push constants.

Differential Revision: [D70102398](https://our.internmc.facebook.com/intern/diff/D70102398/)
ghstack-source-id: 287186740
Pull Request resolved: #11252
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl
@@ -22,7 +22,13 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_buffer(B, "r", "nchw_in", "int")}
-${layout_declare_ubo(B, "ivec4", "sizes")}
+
+$if USE_PUSH_CONST:
+  layout(push_constant) uniform restrict Block {
+    ivec4 sizes;
+  };
+$else:
+  ${layout_declare_ubo(B, "ivec4", "sizes")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml
@@ -8,6 +8,7 @@ nchw_to_bitw8_image_nobitw8buffer:
   parameter_names_with_default_values:
     STORAGE: texture3d
     DTYPE: int8
+    USE_PUSH_CONST: True
   generate_variant_forall:
     STORAGE:
       - VALUE: texture2d
@@ -17,3 +18,5 @@ nchw_to_bitw8_image_nobitw8buffer:
       - VALUE: uint8
   shader_variants:
     - NAME: nchw_to_bitw8_image_nobitw8buffer
+    - NAME: nchw_to_bitw8_image_nobitw8buffer_no_pc
+      USE_PUSH_CONST: False
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
@@ -12,9 +12,17 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)}
-${layout_declare_ubo(2, "ivec4", "out_sizes")}
-${layout_declare_ubo(3, "ivec4", "out_strides")}
-${layout_declare_ubo(4, "int", "numel")}
+
+$if USE_PUSH_CONST:
+  layout(push_constant) uniform restrict Block {
+    ivec4 out_sizes;
+    ivec4 out_strides;
+    int numel;
+  };
+$else:
+  ${layout_declare_ubo(2, "ivec4", "out_sizes")}
+  ${layout_declare_ubo(3, "ivec4", "out_strides")}
+  ${layout_declare_ubo(4, "int", "numel")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml
@@ -8,6 +8,7 @@ nchw_to_buffer:
   parameter_names_with_default_values:
     DTYPE: float
     STORAGE: buffer
+    USE_PUSH_CONST: True
   generate_variant_forall:
     DTYPE:
       - VALUE: half
@@ -16,3 +17,5 @@ nchw_to_buffer:
       - VALUE: int8
   shader_variants:
     - NAME: nchw_to_buffer
+    - NAME: nchw_to_buffer_no_pc
+      USE_PUSH_CONST: False
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
@@ -21,9 +21,17 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_buffer(B, "r", "buf_in", DTYPE)}
-${layout_declare_ubo(B, "ivec4", "sizes")}
-$if not FROM_STAGING:
-  ${layout_declare_ubo(B, "ivec4", "buf_strides")}
+
+$if USE_PUSH_CONST:
+  layout(push_constant) uniform restrict Block {
+    ivec4 sizes;
+  $if not FROM_STAGING:
+    ivec4 buf_strides;
+  };
+$else:
+  ${layout_declare_ubo(B, "ivec4", "sizes")}
+  $if not FROM_STAGING:
+    ${layout_declare_ubo(B, "ivec4", "buf_strides")}
 
 #include "indexing_utils.h"
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml
@@ -9,6 +9,7 @@ nchw_to_image:
     STORAGE: texture3d
     DTYPE: float
     FROM_STAGING: True
+    USE_PUSH_CONST: True
   generate_variant_forall:
     DTYPE:
       - VALUE: half
@@ -21,3 +22,11 @@ nchw_to_image:
       STORAGE: texture2d
     - NAME: clone_buffer_to_image
       FROM_STAGING: False
+    - NAME: nchw_to_image_no_pc_texture3d
+      USE_PUSH_CONST: False
+    - NAME: nchw_to_image_no_pc_texture2d
+      STORAGE: texture2d
+      USE_PUSH_CONST: False
+    - NAME: clone_buffer_to_image_no_pc
+      FROM_STAGING: False
+      USE_PUSH_CONST: False
diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
@@ -105,9 +105,9 @@ void add_buffer_to_image_node(
       // Input and Outputs
       {{image, vkapi::kWrite}, {buffer, vkapi::kRead}},
       // Parameter Buffers
-      {graph.sizes_ubo(image), graph.strides_ubo(buffer)},
-      // Push Constants
       {},
+      // Push Constants
+      {graph.sizes_pc_of(image), graph.strides_pc_of(buffer)},
       // Specialization Constants
       {graph.hashed_layout_of(image)},
       // Resize Args
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -106,9 +106,10 @@ ValueRef prepack_biases(
       graph.create_local_wg_size(v),
       vref,
       v,
-      {t->sizes_ubo()},
+      {},
       // Specialization constants
-      {t->hashed_layout()}));
+      {t->hashed_layout()},
+      {graph.sizes_pc_of(v)}));
 
   return v;
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -28,14 +28,14 @@ void add_staging_to_tensor_node(
   vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
       *graph.get_tensor(out_tensor), graph.int8_buffers_enabled());
 
-  vkapi::ParamsBindList ubos;
+  std::vector<PushConstantDataInfo> pcs;
   if (graph.is_buffer_storage(out_tensor)) {
-    ubos.append(
-        {graph.sizes_ubo(out_tensor),
-         graph.strides_ubo(out_tensor),
-         graph.numel_ubo(out_tensor)});
+    pcs = {
+        graph.sizes_pc_of(out_tensor),
+        graph.strides_pc_of(out_tensor),
+        graph.numel_pc_of(out_tensor)};
   } else {
-    ubos.append({graph.sizes_ubo(out_tensor)});
+    pcs = {graph.sizes_pc_of(out_tensor)};
   }
 
   graph.execute_nodes().emplace_back(new DispatchNode(
@@ -46,9 +46,9 @@ void add_staging_to_tensor_node(
       // Input and Outputs
       {{out_tensor, vkapi::kWrite}, {in_staging, vkapi::kRead}},
       // Parameter Buffers
-      ubos,
-      // Push Constants
       {},
+      // Push Constants
+      pcs,
       // Specialization Constants
       {graph.hashed_layout_of(out_tensor)},
       // Resize Args
@@ -127,14 +127,14 @@ void add_prepack_standard_node(
   vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
       *graph.get_tensor(tensor), graph.int8_buffers_enabled());
 
-  vkapi::ParamsBindList ubos;
+  std::vector<PushConstantDataInfo> pcs;
   if (graph.is_buffer_storage(tensor)) {
-    ubos.append(
-        {graph.sizes_ubo(tensor),
-         graph.strides_ubo(tensor),
-         graph.numel_ubo(tensor)});
+    pcs = {
+        graph.sizes_pc_of(tensor),
+        graph.strides_pc_of(tensor),
+        graph.numel_pc_of(tensor)};
   } else {
-    ubos.append({graph.sizes_ubo(tensor)});
+    pcs = {graph.sizes_pc_of(tensor)};
   }
 
   int transpose_hw_spec = transpose_hw ? 1 : 0;
@@ -148,9 +148,10 @@ void add_prepack_standard_node(
       tensor_data,
       tensor,
       // Parameter Buffers
-      ubos,
+      {},
       // Specialization Constants
-      {graph.hashed_layout_of(tensor), transpose_hw_spec}));
+      {graph.hashed_layout_of(tensor), transpose_hw_spec},
+      pcs));
 }
 
 ValueRef prepack_standard(
diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
@@ -22,25 +22,35 @@ bool is_bitw8(vkapi::ScalarType dtype) {
 
 vkapi::ShaderInfo get_nchw_to_tensor_shader(
     const api::vTensor& v_dst,
-    const bool int8_buffer_enabled) {
+    bool int8_buffer_enabled,
+    bool push_constant_variant) {
   std::string kernel_name;
   kernel_name.reserve(kShaderNameReserve);
 
   if (is_bitw8(v_dst.dtype()) && v_dst.storage_type() != utils::kBuffer &&
       !int8_buffer_enabled) {
     kernel_name = "nchw_to_bitw8_image_nobitw8buffer";
+    if (!push_constant_variant) {
+      kernel_name += "_no_pc";
+    }
     add_storage_type_suffix(kernel_name, v_dst);
     add_dtype_suffix(kernel_name, v_dst);
     return VK_KERNEL_FROM_STR(kernel_name);
   }
 
   if (v_dst.storage_type() == utils::kBuffer) {
     kernel_name = "nchw_to_buffer";
+    if (!push_constant_variant) {
+      kernel_name += "_no_pc";
+    }
     add_dtype_suffix(kernel_name, v_dst);
     return VK_KERNEL_FROM_STR(kernel_name);
   }
 
   kernel_name = "nchw_to_image";
+  if (!push_constant_variant) {
+    kernel_name += "_no_pc";
+  }
   add_storage_type_suffix(kernel_name, v_dst);
   add_dtype_suffix(kernel_name, v_dst);
 
diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h
@@ -14,7 +14,8 @@ namespace vkcompute {
 
 vkapi::ShaderInfo get_nchw_to_tensor_shader(
     const api::vTensor& v_dst,
-    bool int8_buffer_enabled = true);
+    bool int8_buffer_enabled = true,
+    bool push_constant_variant = true);
 vkapi::ShaderInfo get_tensor_to_nchw_shader(
     const api::vTensor& v_src,
     bool int8_buffer_enabled = true);
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
@@ -28,7 +28,7 @@ void record_nchw_to_buffer_op(
   vkapi::PipelineBarrier pipeline_barrier{};
 
   context->submit_compute_job(
-      get_nchw_to_tensor_shader(v_dst),
+      get_nchw_to_tensor_shader(v_dst, true, false),
       pipeline_barrier,
       {uint32_t(v_dst.numel()), 1, 1},
       {64, 1, 1},
@@ -74,7 +74,7 @@ void record_nchw_to_image_op(
 
   context->submit_compute_job(
       get_nchw_to_tensor_shader(
-          v_dst, context->adapter_ptr()->has_full_int8_buffers_support()),
+          v_dst, context->adapter_ptr()->has_full_int8_buffers_support(), false),
       pipeline_barrier,
       v_dst.logical_limits(),
       adaptive_work_group_size(v_dst.logical_limits()),
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1600,8 +1600,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
       /*shared_object_idx = */ 4);
 
-  // +2: t.sizes_ubo() for each staging shader
   // +2: staging buffer for each input tensor
-  expected_vma_allocation_count += 4;
+  expected_vma_allocation_count += 2;
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   ValueRef c = graph.add_tensor(
@@ -1621,8 +1620,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
       /*shared_object_idx = */ 2);
 
-  // +1: t.sizes_ubo() uniform buffer for staging shader
   // +1: staging buffer for the input tensor
-  expected_vma_allocation_count += 2;
+  expected_vma_allocation_count += 1;
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   ValueRef e = graph.add_tensor(