Commit 801006d
[QNN-EP] Define SpaceToDepth fusion for YOLOv2. (#24848)

### Description

- Add a SpaceToDepth fusion to the QNN preprocess.
- The pattern found in YOLOv2 is uncommon; the commonly seen variant is left as future work.
- Add an entry point/API for non-quantization users to preprocess models for QNN execution.
- Revise cmake to package the newly introduced directory into the Python wheel.

### Motivation and Context

- When executing the YOLOv2 model on QNN-EP, a sequence of Reshape and Transpose nodes with 6D shapes falls back to CPU due to an HTP limitation. This change adds a fusion that folds the sequence into a single SpaceToDepth node, which QNN-EP can execute directly.
- The current QNN preprocess lives in `onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py`, under the quantization directory, so the path may be confusing for non-quantization users. To let non-quantization users preprocess models for QNN, this change introduces `onnxruntime/python/tools/qnn/preprocess.py` as the entry point and provides an API to preprocess models.
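As a quick illustration (not part of the diff), the new entry point can be run as a script with the flags it defines; `model.onnx` and `model_qnn.onnx` are placeholder paths:

    python onnxruntime/python/tools/qnn/preprocess.py --input_model_path model.onnx --output_model_path model_qnn.onnx --fuse_layernorm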
1 parent 9349c37 commit 801006d

File tree

5 files changed: +317 -0 lines changed

cmake/onnxruntime_python.cmake

Lines changed: 7 additions & 0 deletions
@@ -453,6 +453,9 @@ endif()
 file(GLOB onnxruntime_python_tools_srcs CONFIGURE_DEPENDS
   "${ONNXRUNTIME_ROOT}/python/tools/*.py"
 )
+file(GLOB onnxruntime_python_tools_qnn_src CONFIGURE_DEPENDS
+  "${ONNXRUNTIME_ROOT}/python/tools/qnn/*.py"
+)
 file(GLOB onnxruntime_python_quantization_src CONFIGURE_DEPENDS
   "${ONNXRUNTIME_ROOT}/python/tools/quantization/*.py"
 )
@@ -564,6 +567,7 @@ add_custom_command(
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/qdq_helpers
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/ort_format_model
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/ort_format_model/ort_flatbuffers_py
+  COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/qnn
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/bart
@@ -649,6 +653,9 @@ add_custom_command(
   COMMAND ${CMAKE_COMMAND} -E copy_directory
     ${ONNXRUNTIME_ROOT}/core/flatbuffers/ort_flatbuffers_py
     $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/ort_format_model/ort_flatbuffers_py
+  COMMAND ${CMAKE_COMMAND} -E copy
+    ${onnxruntime_python_tools_qnn_src}
+    $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/qnn/
   COMMAND ${CMAKE_COMMAND} -E copy
     ${onnxruntime_python_quantization_src}
     $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/quantization/
onnxruntime/python/tools/qnn/preprocess.py

Lines changed: 139 additions & 0 deletions

@@ -0,0 +1,139 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""Provide entry point to preprocess ONNX model especially for QNN."""

import argparse
import pathlib

import onnx

from onnxruntime.quantization.execution_providers import qnn


def _parse_arguments():
    """Parse cmdline arguments."""
    parser = argparse.ArgumentParser(description="Arguments for QNN model preprocess.")

    parser.add_argument("--input_model_path", "-i", required=True, help="Path to the input ONNX model.")
    parser.add_argument("--output_model_path", "-o", required=True, help="Path to the output ONNX model.")

    # Save preprocessed model with external data.
    parser.add_argument(
        "--save_as_external_data",
        action="store_true",
        help="Whether the output model would be saved with external data.",
    )
    parser.add_argument(
        "--all_tensors_to_one_file",
        action="store_true",
        help="Whether to save all external data in one file or save each tensor to a file named with the tensor name.",
    )
    parser.add_argument(
        "--external_data_location",
        help="Filename of the external file where all tensors are saved. The path is relative to the model path.",
    )
    parser.add_argument(
        "--external_data_size_threshold",
        default=1024,
        type=int,
        help="Tensors with data size larger than this threshold are converted to external data.",
    )
    parser.add_argument(
        "--external_data_convert_attribute",
        action="store_true",
        help="Whether to save all tensors, including attribute tensors, to external data.",
    )

    # Preprocess options.
    parser.add_argument(
        "--fuse_layernorm",
        action="store_true",
        help="Whether to fuse matched sequences into LayerNormalization nodes if possible.",
    )

    # I/O layouts.
    parser.add_argument(
        "--inputs_to_make_channel_last",
        nargs="+",
        default=None,
        help="List of graph input names to be transposed into channel-last.",
    )

    parser.add_argument(
        "--outputs_to_make_channel_last",
        nargs="+",
        default=None,
        help="List of graph output names to be transposed into channel-last.",
    )

    return parser.parse_args()


def qnn_preprocess_model(
    model_input: str | pathlib.Path | onnx.ModelProto,
    model_output: str | pathlib.Path,
    fuse_layernorm: bool = False,
    save_as_external_data: bool = False,
    all_tensors_to_one_file: bool = False,
    external_data_location: str | None = None,
    external_data_size_threshold: int = 1024,
    external_data_convert_attribute: bool = False,
    inputs_to_make_channel_last: list[str] | None = None,
    outputs_to_make_channel_last: list[str] | None = None,
) -> bool:
    """Preprocess ONNX model for QNN.

    Args:
        model_input: A path or ONNX ModelProto specifying the model to be preprocessed.
        model_output: A path specifying where the preprocessed model is to be saved.
        fuse_layernorm: A bool specifying whether to fuse the matched sequence into a single LayerNormalization node.
            Defaults to False.
        save_as_external_data: A bool specifying whether to save the model with external data. Defaults to False.
        all_tensors_to_one_file: A bool specifying whether to save all external data in one file or save each tensor
            to a file named with the tensor name. This argument is effective only when `save_as_external_data` is
            True. Defaults to False.
        external_data_location: A str specifying where to save the external data. The path is relative to the model
            path. This argument is effective only when `save_as_external_data` is True. Defaults to the model name.
        external_data_size_threshold: An int specifying the size threshold above which tensors are saved as external
            data. This argument is effective only when `save_as_external_data` is True. Defaults to 1024.
        external_data_convert_attribute: A bool specifying whether to save all tensors, including attribute tensors,
            as external data. This argument is effective only when `save_as_external_data` is True. Defaults to
            False.
        inputs_to_make_channel_last: A list of strs specifying graph input names to be transposed into channel-last.
            Defaults to None.
        outputs_to_make_channel_last: A list of strs specifying graph output names to be transposed into channel-last.
            Defaults to None.

    Returns:
        A bool indicating whether the model is modified.
    """
    return qnn.qnn_preprocess_model(
        model_input,
        model_output,
        fuse_layernorm=fuse_layernorm,
        save_as_external_data=save_as_external_data,
        all_tensors_to_one_file=all_tensors_to_one_file,
        external_data_location=external_data_location,
        external_data_size_threshold=external_data_size_threshold,
        external_data_convert_attribute=external_data_convert_attribute,
        inputs_to_make_channel_last=inputs_to_make_channel_last,
        outputs_to_make_channel_last=outputs_to_make_channel_last,
    )


if __name__ == "__main__":
    args = _parse_arguments()
    qnn_preprocess_model(
        args.input_model_path,
        args.output_model_path,
        fuse_layernorm=args.fuse_layernorm,
        save_as_external_data=args.save_as_external_data,
        all_tensors_to_one_file=args.all_tensors_to_one_file,
        external_data_location=args.external_data_location,
        external_data_size_threshold=args.external_data_size_threshold,
        external_data_convert_attribute=args.external_data_convert_attribute,
        inputs_to_make_channel_last=args.inputs_to_make_channel_last,
        outputs_to_make_channel_last=args.outputs_to_make_channel_last,
    )
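A minimal usage sketch of the API defined above (not part of the diff): the model paths are placeholders, and the `onnxruntime.tools.qnn` import path assumes the wheel layout set up by this commit's cmake and setup.py changes.

```python
from onnxruntime.tools.qnn.preprocess import qnn_preprocess_model  # import path assumes the packaged wheel layout

# Apply QNN-friendly fusions (e.g., SpaceToDepth) plus the optional LayerNormalization fusion.
modified = qnn_preprocess_model(
    "yolov2.onnx",               # placeholder input model path
    "yolov2_preprocessed.onnx",  # placeholder output model path
    fuse_layernorm=True,
)
print("Model modified:", modified)
```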
onnxruntime/python/tools/quantization/execution_providers/qnn/fusion_spacetodepth.py

Lines changed: 162 additions & 0 deletions

@@ -0,0 +1,162 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""Define SpaceToDepth fusion."""

import onnx

from ... import fusions, onnx_model


class FusionSpaceToDepth(fusions.Fusion):
    """Fusion for SpaceToDepth."""

    def __init__(self, model: onnx_model.ONNXModel):
        """Initialize.

        Args:
            model: An onnx_model.ONNXModel instance.
        """
        super().__init__(model, "SpaceToDepth", "Reshape")

    def _fuse_yolo(
        self,
        node: onnx.NodeProto,
        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
        output_name_to_node: dict[str, onnx.NodeProto],
    ):
        """Fuse for early versions of YOLO.

        Pattern:

              | [N, C, H, W]
            Reshape
              | [N, C, H/blk, blk, W/blk, blk]
            Transpose
              | [N, C, H/blk, W/blk, blk, blk]
            Reshape
              | [N, C, H/blk * W/blk, blk * blk]
            Transpose
              | [N, C, blk * blk, H/blk * W/blk]
            Reshape
              | [N, C, blk * blk, H/blk, W/blk]
            Transpose
              | [N, blk * blk, C, H/blk, W/blk]
            Reshape
              | [N, blk * blk * C, H/blk, W/blk]

        This sequence can be fused into a single SpaceToDepth with blocksize `blk`. Note that unlike DepthToSpace,
        which supports DCR or CRD mode, SpaceToDepth only supports DCR mode in its latest opset version (13), which
        matches the pattern here.
        """
        reshape_node1 = node

        def get_target_child(parent_node, target_op_type):
            """Get target child of given node."""
            if parent_node.output[0] not in input_name_to_nodes:
                return None

            children = input_name_to_nodes[parent_node.output[0]]
            if len(children) > 1 or children[0].op_type != target_op_type:
                return None

            return children[0]

        if (
            (transpose_node1 := get_target_child(reshape_node1, "Transpose")) is None
            or (reshape_node2 := get_target_child(transpose_node1, "Reshape")) is None
            or (transpose_node2 := get_target_child(reshape_node2, "Transpose")) is None
            or (reshape_node3 := get_target_child(transpose_node2, "Reshape")) is None
            or (transpose_node3 := get_target_child(reshape_node3, "Transpose")) is None
            or (reshape_node4 := get_target_child(transpose_node3, "Reshape")) is None
        ):
            return False

        def get_tensor_shape(tensor_name):
            """Get shape for given tensor name."""
            tensor_type = self.model.get_tensor_type(tensor_name)
            if not tensor_type:
                return None

            tensor_shape = self.tensor_shape_to_list(tensor_type)
            if not tensor_shape:
                return None

            return tensor_shape

        if (
            (input_shape := get_tensor_shape(reshape_node1.input[0])) is None
            or (reshape_shape1 := get_tensor_shape(reshape_node1.output[0])) is None
            or (reshape_shape2 := get_tensor_shape(reshape_node2.output[0])) is None
            or (reshape_shape3 := get_tensor_shape(reshape_node3.output[0])) is None
            or (reshape_shape4 := get_tensor_shape(reshape_node4.output[0])) is None
        ):
            return False

        transpose_perm1 = self.get_node_attribute(transpose_node1, "perm")
        transpose_perm2 = self.get_node_attribute(transpose_node2, "perm")
        transpose_perm3 = self.get_node_attribute(transpose_node3, "perm")

        # Check rank.
        if (
            len(input_shape) != 4
            or len(reshape_shape1) != 6
            or len(reshape_shape2) != 4
            or len(reshape_shape3) != 5
            or len(reshape_shape4) != 4
        ):
            return False

        # Check shape and perm.
        batch, channel, height, width = input_shape
        blocksize = reshape_shape1[3]
        if (
            reshape_shape1 != [batch, channel, height // blocksize, blocksize, width // blocksize, blocksize]
            or transpose_perm1 != [0, 1, 2, 4, 3, 5]
            or reshape_shape2 != [batch, channel, (height // blocksize) * (width // blocksize), blocksize**2]
            or transpose_perm2 != [0, 1, 3, 2]
            or reshape_shape3 != [batch, channel, blocksize**2, height // blocksize, width // blocksize]
            or transpose_perm3 != [0, 2, 1, 3, 4]
            or reshape_shape4 != [batch, blocksize**2 * channel, height // blocksize, width // blocksize]
        ):
            return False

        self.nodes_to_remove.extend(
            [
                reshape_node1,
                transpose_node1,
                reshape_node2,
                transpose_node2,
                reshape_node3,
                transpose_node3,
                reshape_node4,
            ]
        )

        s2d_node = onnx.helper.make_node(
            self.fused_op_type,
            name=self.create_unique_node_name(),
            inputs=[reshape_node1.input[0]],
            outputs=[reshape_node4.output[0]],
            blocksize=blocksize,
        )
        self.nodes_to_add.append(s2d_node)

        return True

    def fuse(
        self,
        node: onnx.NodeProto,
        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
        output_name_to_node: dict[str, onnx.NodeProto],
    ):
        """Fuse a sequence of Reshape and Transpose nodes into a single SpaceToDepth node.

        Args:
            node: An onnx.NodeProto matching the specified search type (i.e., Reshape).
            input_name_to_nodes: A dict mapping tensor name to consuming nodes.
            output_name_to_node: A dict mapping tensor name to producing node.
        """
        self._fuse_yolo(node, input_name_to_nodes, output_name_to_node)
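As a sanity check (not part of the commit), the following numpy sketch replays the Reshape/Transpose chain from the `_fuse_yolo` docstring and compares it against the SpaceToDepth (DCR) reference computation from the ONNX operator spec; the shapes and `blk` are arbitrary placeholders:

```python
import numpy as np

N, C, H, W, blk = 1, 3, 4, 6, 2
x = np.arange(N * C * H * W, dtype=np.float32).reshape(N, C, H, W)

# The YOLOv2 Reshape/Transpose chain matched by _fuse_yolo.
y = x.reshape(N, C, H // blk, blk, W // blk, blk)
y = y.transpose(0, 1, 2, 4, 3, 5)
y = y.reshape(N, C, (H // blk) * (W // blk), blk * blk)
y = y.transpose(0, 1, 3, 2)
y = y.reshape(N, C, blk * blk, H // blk, W // blk)
y = y.transpose(0, 2, 1, 3, 4)
y = y.reshape(N, blk * blk * C, H // blk, W // blk)

# SpaceToDepth (DCR) reference computation from the ONNX operator spec.
z = x.reshape(N, C, H // blk, blk, W // blk, blk)
z = z.transpose(0, 3, 5, 1, 2, 4)
z = z.reshape(N, C * blk * blk, H // blk, W // blk)

assert np.array_equal(y, z)  # the chain equals SpaceToDepth with blocksize=blk
```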

onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py

Lines changed: 8 additions & 0 deletions
@@ -12,7 +12,9 @@
 
 from ...fusions import FusionGelu, FusionLayerNormalization
 from ...onnx_model import ONNXModel
+from ...quant_utils import save_and_reload_model_with_shape_infer
 from .fusion_lpnorm import FusionLpNormalization
+from .fusion_spacetodepth import FusionSpaceToDepth
 
 
 def qnn_preprocess_model(
@@ -83,6 +85,7 @@ def qnn_preprocess_model(
     """
     modified = False
     model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load_model(model_input)
+    model = save_and_reload_model_with_shape_infer(model)
     onnx_model = ONNXModel(model)
 
     # Fuse Erf sequence into a single Gelu
@@ -95,6 +98,11 @@ def qnn_preprocess_model(
     if fusion_lpnorm.apply():
         modified = True
 
+    # Fuse Reshape/Transpose sequence into a single SpaceToDepth.
+    fusion_s2d = FusionSpaceToDepth(onnx_model)
+    if fusion_s2d.apply():
+        modified = True
+
     # Optionally, fuse ReduceMean sequence into a single LayerNormalization node.
     if fuse_layernorm:
         onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")

setup.py

Lines changed: 1 addition & 0 deletions
@@ -514,6 +514,7 @@ def finalize_options(self):
     "onnxruntime.tools.ort_format_model.ort_flatbuffers_py",
     "onnxruntime.tools.ort_format_model.ort_flatbuffers_py.fbs",
     "onnxruntime.tools.qdq_helpers",
+    "onnxruntime.tools.qnn",
     "onnxruntime.quantization",
     "onnxruntime.quantization.operators",
     "onnxruntime.quantization.CalTableFlatBuffers",
