Implement grid_priors op (#4440)

Yujie Hui · facebook-github-bot · commit 9aeceeee3df8 · 2024-07-30T15:29:34.000-07:00
Summary: Pull Request resolved: #4440 Modify the spec of the customized op `grid_priors` to take a tensor as input. Compared to the previous definition, the `height` and `width` arguments are now determined by the input tensor as `height, width = self.shape[-2:]`. The reason for changing the spec is that the input has to be a tensor if we want to support dynamic shapes. Implement the customized op `grid_priors`. This op is used to generate the x,y points that map feature maps at different levels back to the original image. Op spec: ``` (Tensor self, int stride, float offset) -> Tensor ``` Example: ``` input_tensor = torch.rand(size = [1, 5, 2, 3]) stride = 8 offset = 0.5 output.shape = [3x2, 2] output = tensor([[ 4., 4.], [12., 4.], [20., 4.], [ 4., 12.], [12., 12.], [20., 12.]]) ``` Only a smoke test is added for now, because of an issue with lowering customized ops to the Vulkan backend. A unit test and an nn.Module test will be added once customized ops can be lowered from PyTorch to the Vulkan backend. bypass-github-export-checks bypass-github-pytorch-ci-checks bypass-github-executorch-ci-checks Reviewed By: copyrightly Differential Revision: D60203196 fbshipit-source-id: 93e5180e80e07cc0b9acb50890a1187ce0f82951
diff --git a/backends/vulkan/passes/custom_ops_defs.py b/backends/vulkan/passes/custom_ops_defs.py
@@ -49,11 +49,11 @@ def conv_with_clamp_impl(
 
 
 def grid_priors_impl(
-    height,
-    width,
+    x,
     stride,
     offset,
 ):
+    height, width = x.shape[-2:]
     shift_x = (torch.arange(0, width) + offset) * stride
     shift_y = (torch.arange(0, height) + offset) * stride
     shift_xx, shift_yy = torch.meshgrid(shift_y, shift_x)
@@ -64,6 +64,6 @@ def grid_priors_impl(
 
 
 name = "grid_priors"
-lib.define(f"{name}(int height, int width, int stride, float offset) -> Tensor")
-lib.impl(name, grid_priors_impl)
+lib.define(f"{name}(Tensor self, int stride, float offset) -> Tensor")
+lib.impl(name, grid_priors_impl, "CompositeExplicitAutograd")
 grid_priors_op = getattr(getattr(torch.ops, namespace), name)
diff --git a/backends/vulkan/passes/test_custom_ops.py b/backends/vulkan/passes/test_custom_ops.py
@@ -97,14 +97,15 @@ class GridPriors(torch.nn.Module):
             def __init__(self):
                 super().__init__()
 
-            def forward(self, height, width, stride, offset):
-                return torch.ops.et_vk.grid_priors(height, width, stride, offset)
+            def forward(self, x, stride, offset):
+                return torch.ops.et_vk.grid_priors(x, stride, offset)
 
         model = GridPriors()
-        sample_input = (2, 3, 4, 0.5)
+        sample_input = (torch.rand(2, 5, 2, 3), 4, 0.5)
         custom_out = model(*sample_input)
 
-        def calculate_expected_output(height, width, stride, offset):
+        def calculate_expected_output(x, stride, offset):
+            height, width = x.shape[-2:]
             shift_x = (torch.arange(0, width) + offset) * stride
             shift_y = (torch.arange(0, height) + offset) * stride
             shift_xx, shift_yy = torch.meshgrid(shift_y, shift_x)
diff --git a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl
@@ -0,0 +1,38 @@
+#version 450 core
+
+// ${...} placeholders are template parameters; DTYPE and STORAGE get their
+// defaults from grid_priors.yaml.
+#define PRECISION ${PRECISION}
+
+// Texel type written to the output image, chosen from DTYPE.
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+// Bindings, in the order add_grid_priors_node() supplies them:
+//   0: output tensor t_out (declared with access "w")
+//   1: sizes of the input tensor
+//   2: sizes of the output tensor
+//   3: parameter block {int stride; float offset}, mirrored on the host by
+//      GridPriorsParam
+${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_ubo(1, "ivec4", "in_sizes")}
+${layout_declare_ubo(2, "ivec4", "out_sizes")}
+${layout_declare_ubo(3, "int", "stride", "float", "offset")}
+
+// Work group size comes from specialization constants 0..2.
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// NOTE(review): add_grid_priors_node() passes no explicit specialization
+// constants, so this presumably stays at its default C_DIM - confirm.
+layout(constant_id = 3) const int packed_dim = C_DIM;
+
+// One invocation per output texel position. pos.y is the flattened index of a
+// grid point and pos.x selects which coordinate of that point is produced:
+//   pos.x == 0 -> ((pos.y % width) + offset) * stride
+//   pos.x == 1 -> ((pos.y / width) + offset) * stride
+// where width is in_sizes.x. Only the first component of the stored texel
+// carries a value; the remaining components are written as 0.
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+    return;
+  }
+
+  const int width = in_sizes.x;
+  // Start from a defined value so that imageStore never writes an
+  // uninitialised texel, even for a pos.x other than 0 or 1.
+  VEC4_T outtex = VEC4_T(0);
+  if (pos.x == 0) {
+    const float value = (pos.y % width + offset) * stride;
+    outtex = VEC4_T(value, 0, 0, 0);
+  } else if (pos.x == 1) {
+    const float value = (pos.y / width + offset) * stride;
+    outtex = VEC4_T(value, 0, 0, 0);
+  }
+
+  imageStore(t_out, pos, outtex);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml
@@ -0,0 +1,12 @@
+# Variant description for the grid_priors.glsl shader template.
+grid_priors:
+  # Defaults for the template parameters. Of these, the shader source added in
+  # this commit references only DTYPE and STORAGE.
+  parameter_names_with_default_values:
+    NDIM: 3
+    DTYPE: float
+    PACKING: C_packed
+    STORAGE: texture3d
+  # DTYPE values to generate variants for (presumably one shader per value;
+  # the C++ side picks one via add_dtype_suffix on the output tensor).
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: grid_priors
diff --git a/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp b/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+// Host-side copy of the shader parameter block. add_grid_priors_node() uploads
+// it with create_params_buffer(), and grid_priors.glsl declares the matching
+// UBO as {int stride; float offset}; keep field order and types in sync with
+// that declaration.
+struct GridPriorsParam final {
+  int32_t stride; // read by the shader as `stride`
+  float offset; // read by the shader as `offset`
+};
+
+// Resize callback of the grid_priors node. Recomputes the output shape from
+// the current sizes of the input tensor: with H and W being the last two input
+// dimensions, the output is virtually resized to {H * W, 2}.
+//   args[0].refs[0]  output tensor
+//   extra_args[0]    input tensor
+void resize_grid_priors_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  vTensorPtr out = graph->get_tensor(args[0].refs[0]);
+  vTensorPtr in = graph->get_tensor(extra_args[0]);
+
+  const std::vector<int64_t> in_dims = in->sizes();
+  const auto rank = in_dims.size();
+  // at() is bounds checked, so an input with fewer than two dimensions throws
+  // instead of reading out of range.
+  const int64_t height = in_dims.at(rank - 2);
+  const int64_t width = in_dims.at(rank - 1);
+
+  std::vector<int64_t> out_dims = {height * width, 2};
+  out->virtual_resize(out_dims);
+}
+
+// Appends to `graph` the ExecuteNode that runs the grid_priors shader.
+//   in          input tensor; only its sizes UBO is handed to the shader
+//   stride_ref  scalar, extracted as int32_t
+//   offset_ref  scalar, extracted as float
+//   out         output tensor, bound with WRITE access
+// `in` is also forwarded as the resize argument so resize_grid_priors_node()
+// can recompute the output shape from it.
+void add_grid_priors_node(
+    ComputeGraph& graph,
+    const ValueRef& in,
+    const ValueRef& stride_ref,
+    const ValueRef& offset_ref,
+    const ValueRef& out) {
+  vTensorPtr t_out = graph.get_tensor(out);
+  vTensorPtr t_in = graph.get_tensor(in);
+
+  // Values uploaded as the shader parameter block.
+  GridPriorsParam param = {
+      graph.extract_scalar<int32_t>(stride_ref),
+      graph.extract_scalar<float>(offset_ref)};
+
+  // The shader variant is selected by the dtype of the output tensor.
+  std::string kernel_name("grid_priors");
+  kernel_name.reserve(kShaderNameReserve);
+  add_dtype_suffix(kernel_name, *t_out);
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
+      // Inputs and Outputs
+      {
+          {out, vkapi::MemoryAccessType::WRITE},
+      },
+      // Shader params buffers, in the binding order declared by the shader
+      {
+          t_in->sizes_ubo(),
+          t_out->sizes_ubo(),
+          graph.create_params_buffer(param),
+      },
+      // Specialization Constants
+      {},
+      resize_grid_priors_node,
+      {in}));
+}
+
+// Operator entry point. `args` is laid out as {input, stride, offset, output}.
+void grid_priors(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  const ValueRef& in = args[0];
+  const ValueRef& stride_ref = args[1];
+  const ValueRef& offset_ref = args[2];
+  const ValueRef& out = args[3];
+  add_grid_priors_node(graph, in, stride_ref, offset_ref, out);
+}
+
+// Registers grid_priors() under the operator name "grid_priors.default", the
+// same name the smoke test in vulkan_compute_api_test.cpp resolves through
+// VK_GET_OP_FN.
+// NOTE(review): the Python side calls this op as torch.ops.et_vk.grid_priors;
+// confirm whether the registered name needs the namespace prefix once the op
+// is lowered from PyTorch.
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(grid_priors.default, grid_priors);
+}
+} // namespace vkcompute
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -2203,3 +2203,75 @@ TEST(VulkanComputeGraphOpsTest, conv2d_prepack_test) {
                                 0, 3, 9, 0, 0, 6, 12, 0, 0, 5,  11,
                                 0, 0, 0, 0, 0, 0, 0,  0, 0, 0});
 }
+
+// Builds a single-op graph around "grid_priors.default", executes it and
+// compares the staged output against data_out_expected.
+//   input_sizes        sizes of the float input tensor; only its shape is
+//                      used, the input is never filled with data
+//   output_sizes       sizes of the float output tensor
+//   stride, offset     operator arguments, added to the graph as int64_t and
+//                      double scalars respectively
+//   data_out_expected  expected values, indexed as row * width + column of
+//                      the output; must hold at least height * width entries
+void test_grid_priors(
+    std::vector<int64_t> input_sizes,
+    std::vector<int64_t> output_sizes,
+    int stride,
+    double offset,
+    const std::vector<float>& data_out_expected) {
+  GraphConfig config;
+  ComputeGraph graph(config);
+
+  // Build graph
+  IOValueRef in = graph.add_input_tensor(
+      input_sizes,
+      vkapi::kFloat,
+      utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
+  IOValueRef out;
+  out.value = graph.add_tensor(
+      output_sizes,
+      vkapi::kFloat,
+      utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
+
+  VK_GET_OP_FN("grid_priors.default")
+  (graph,
+   {in.value,
+    graph.add_scalar<int64_t>(stride),
+    graph.add_scalar<double>(offset),
+    out.value});
+
+  out.staging = graph.set_output_tensor(out.value);
+
+  graph.prepare();
+  graph.encode_prepack();
+  graph.prepack();
+  graph.encode_execute();
+
+  vTensorPtr t_out = graph.get_tensor(out.value);
+  // The input keeps the sizes it was created with; propagate_resize() is
+  // still called before execution.
+  graph.propagate_resize();
+
+  // run graph
+  graph.execute();
+
+  std::vector<float> output_data(t_out->gpu_numel());
+  graph.copy_from_staging(out.staging, output_data.data(), output_data.size());
+
+  // check results
+  const size_t h_out = static_cast<size_t>(utils::val_at(-2, t_out->sizes()));
+  const size_t w_out = static_cast<size_t>(utils::val_at(-1, t_out->sizes()));
+  const size_t numel_out = h_out * w_out;
+  // Fail the test instead of reading past the end of either buffer.
+  ASSERT_GE(data_out_expected.size(), numel_out);
+  ASSERT_GE(output_data.size(), numel_out);
+  // idx_out walks the same row * width + column order as a nested loop would.
+  for (size_t idx_out = 0; idx_out < numel_out; ++idx_out) {
+    CHECK_VALUE(output_data, idx_out, data_out_expected[idx_out]);
+  }
+}
+
+// Smoke test for grid_priors: a {1, 5, 2, 3} input (height 2, width 3) must
+// yield a {6, 2} output whose rows are the (x, y) pairs listed below.
+TEST(VulkanComputeGraphOpsTest, grid_priors_test) {
+  const std::vector<int64_t> input_sizes = {1, 5, 2, 3};
+  const std::vector<int64_t> output_sizes = {6, 2};
+
+  // stride 1, offset 0: the points are the plain (column, row) indices.
+  const std::vector<float> expected_unit_stride = {
+      0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 2, 1};
+  test_grid_priors(
+      input_sizes,
+      output_sizes,
+      /*stride = */ 1,
+      /*offset = */ 0.0,
+      expected_unit_stride);
+
+  // stride 8, offset 0.5: each index i maps to (i + 0.5) * 8.
+  const std::vector<float> expected_stride_8 = {
+      4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12};
+  test_grid_priors(
+      input_sizes,
+      output_sizes,
+      /*stride = */ 8,
+      /*offset = */ 0.5,
+      expected_stride_8);
+}