Commit f1654f5

fix bugs and clean codes
1 parent bcb50b2 commit f1654f5

File tree

4 files changed: +89 -62 lines changed

examples/hierarchical_partitioner_example.py

Lines changed: 21 additions & 19 deletions
@@ -1,9 +1,6 @@
 import torch
 import torch.nn as nn
 import torch_tensorrt
-from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
-    DYNAMO_ATEN_CONVERTERS,
-)
 from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
     DYNAMO_CONVERTERS as CONVERTERS,
 )
@@ -15,6 +12,7 @@
 from torch_tensorrt.dynamo.partitioning._hierarchical_partitioner import (
     hierarchical_adjacency_partition,
 )
+from torchvision import models


 class SimpleModel(nn.Module):
@@ -50,18 +48,18 @@ def main():

     gm = exported_program.module()

-    print(gm.graph)
+    print(gm)

     original_output = model(example_input)

-    # Partition the model using the adjacency partitioner
+    # Partition the model using the adjacency partitioner, compared with below
     # partitioned_model, op_support = partition(
     #     gm,
     #     verbose=True,
     #     min_block_size=1,
-    #     torch_executed_ops=[
-    #         torch.ops.aten.relu.default,
-    #     ],
+    #     torch_executed_ops={
+    #         "torch.ops.aten.relu.default",
+    #     },
     # )

     partitioned_model, op_support = hierarchical_adjacency_partition(
@@ -71,21 +69,18 @@ def main():
         backend_priority=["inductor", "tensorrt"],
         backend_support_map={
             "inductor": {
-                # operator.getitem,
-                torch.ops.aten.conv2d.default,
-                torch.ops.aten.convolution.default,
+                "torch.ops.aten.convolution.default",
             },
-            "tensorrt": set(DYNAMO_ATEN_CONVERTERS.keys()),
+            "tensorrt": CONVERTERS.keys(),
+        },
+        torch_executed_ops={
+            "torch.ops.aten._native_batch_norm_legit_no_training.default"
         },
-        torch_executed_ops=[
-            torch.ops.aten._native_batch_norm_legit_no_training.default
-        ],
         require_full_compilation=False,
-        skip_fusion=False,
+        skip_fusion=True,
     )

-    print("\nPartitioned Model Structure:")
-    print(partitioned_model)
+    print("\nPartitioned Model Structure:\n", partitioned_model)

     print("0. Original_output:", original_output)

@@ -98,8 +93,15 @@ def main():
     )

     compiled_model = torch_tensorrt.compile(
-        model, inputs=[example_input], min_block_size=1
+        model,
+        inputs=[example_input],
+        min_block_size=1,
+        torch_executed_ops={
+            "torch.ops.aten._native_batch_norm_legit_no_training.default"
+        },
     )
+    print("\nCompiled Model Structure:\n", compiled_model)
+
     with torch.no_grad():
         compiled_output = compiled_model(example_input)
         print("2. Compiled_output:", compiled_output)

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 25 additions & 19 deletions
@@ -4,7 +4,17 @@
 import logging
 import platform
 import warnings
-from typing import Any, Collection, List, Optional, Sequence, Set, Tuple, Union
+from typing import (
+    Any,
+    Callable,
+    Collection,
+    List,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+)

 import torch
 from torch.export import ExportedProgram
@@ -28,9 +38,6 @@
     interpret_module_to_result,
     repair_double_inputs,
 )
-from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
-    DYNAMO_ATEN_CONVERTERS,
-)
 from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
     DYNAMO_CONVERTERS as CONVERTERS,
 )
@@ -792,16 +799,15 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
     )

     ############ TODO: testing only ############
-    use_hierarchical_partitioner = False
+    use_hierarchical_partitioner = True
     backend_priority = ["inductor", "tensorrt"]
     backend_support_map = {
         "inductor": {
-            # operator.getitem,
-            torch.ops.aten.conv2d.default,
-            torch.ops.aten.convolution.default,
+            "torch.ops.aten.convolution.default",
         },
-        "tensorrt": set(DYNAMO_ATEN_CONVERTERS.keys()),
+        "tensorrt": CONVERTERS.keys(),
     }
+    skip_fusion = True
     #############################################
     # Partition module into components that can be TRT-accelerated
     fast_partitioner_failed = False
@@ -819,7 +825,7 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
                 min_block_size=settings.min_block_size,
                 torch_executed_ops=settings.torch_executed_ops,
                 require_full_compilation=settings.require_full_compilation,
-                skip_fusion=(num_supported_ops == total_ops),
+                skip_fusion=skip_fusion,
                 backend_priority=backend_priority,
                 backend_support_map=backend_support_map,
             )
@@ -953,19 +959,17 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
             if "_run_on_acc_inductor" in name:
                 sub_inputs = []
                 for input in submodule_inputs:
-                    sub_input = (
-                        torch.randn(input.shape)
-                        .to(dtype.to(input.dtype, t=torch.dtype))
-                        .cuda()
-                    )
+                    sub_input = input.torch_tensor.to(
+                        dtype.to(input.dtype, t=torch.dtype)
+                    ).cuda()
                     sub_inputs.append(sub_input)

                 compiled_func = torch._inductor.compile(
                     submodule,
                     sub_inputs,
                 )
                 # Wrap the compiled function to be a torch.nn.Module
-                compiled_submodule = FunctionWrapper(compiled_func)
+                compiled_submodule = InductorModule(compiled_func)

             elif "_run_on_acc_tensorrt" in name:
                 compiled_submodule = convert_module(
@@ -1345,10 +1349,12 @@ def load_cross_compiled_exported_program(file_path: str = "") -> Any:
     return replace_execute_engine_no_op_node(exp_program)


-class FunctionWrapper(torch.nn.Module):
-    def __init__(self, func):
+class InductorModule(torch.nn.Module):  # type: ignore[misc]
+    """Wrapper module for inductor compiled function."""
+
+    def __init__(self, func: Callable[..., Any]) -> None:
         super().__init__()
         self.func = func

-    def forward(self, *args, **kwargs):
+    def forward(self, *args: Any, **kwargs: Any) -> Any:
         return self.func(*args, **kwargs)
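
InductorModule (the renamed FunctionWrapper) is only a thin nn.Module shim around whatever callable torch._inductor.compile returns, so the compiled submodule can be registered on the parent fx.GraphModule like any other child module. A self-contained sketch of the wrapper's behavior, with a plain function standing in for the inductor-compiled callable:

    from typing import Any, Callable

    import torch


    class InductorModule(torch.nn.Module):  # type: ignore[misc]
        """Wrapper module for an inductor-compiled function."""

        def __init__(self, func: Callable[..., Any]) -> None:
            super().__init__()
            self.func = func

        def forward(self, *args: Any, **kwargs: Any) -> Any:
            return self.func(*args, **kwargs)


    def compiled_func(x: torch.Tensor) -> torch.Tensor:
        # Stands in for the callable returned by torch._inductor.compile.
        return x * 2


    compiled_submodule = InductorModule(compiled_func)
    print(compiled_submodule(torch.ones(2)))  # tensor([2., 2.])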

py/torch_tensorrt/dynamo/partitioning/_hierarchical_partitioner.py

Lines changed: 25 additions & 12 deletions
@@ -1,12 +1,11 @@
 import logging
 from dataclasses import dataclass
-from typing import Collection, Dict, List, Optional, Set, Tuple
+from typing import Collection, Dict, List, Optional, Tuple

 import torch
 import torch.fx.passes.operator_support as ops
-from torch._ops import OpOverload
 from torch.fx._compatibility import compatibility
-from torch.fx.node import Target, _get_qualified_name
+from torch.fx.node import Target
 from torch.fx.passes.splitter_base import (
     _SplitterBase,
     _SplitterSettingBase,
@@ -24,12 +23,15 @@
     REQUIRE_FULL_COMPILATION,
 )
 from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
-    DYNAMO_ATEN_CONVERTERS,
+    DYNAMO_CONVERTERS,
     ConverterRegistry,
 )

 logger = logging.getLogger(__name__)

+NON_COMPUTE_NODES = {"torch.ops.aten.view", "_operator.getitem"}
+NON_ACC_BACKEND_NAME = "None"
+

 @compatibility(is_backward_compatible=False)
 @dataclass
@@ -45,7 +47,7 @@ class BackendOpSupportTester(ops.OperatorSupportBase):  # type: ignore

     def __init__(
         self,
-        backend_support_map: Dict[str, Set[OpOverload]],
+        backend_support_map: Dict[str, Collection[Target]],
         backend_priority: List[str],
         torch_executed_ops: Collection[Target] = set(),
     ) -> None:
@@ -62,12 +64,14 @@ def __init__(

     def is_node_supported(
         self, submodules: Dict[str, torch.nn.Module], node: torch.fx.Node
-    ) -> Tuple[bool, Optional[str]]:
+    ) -> Tuple[bool, str]:
         node_name = ConverterRegistry.qualified_name_or_str(node.target)

         for i, backend_name in enumerate(self.backend_priority):
             supported_ops = self.backend_support_map.get(backend_name, set())
-            supported_ops = {_get_qualified_name(op) for op in supported_ops}
+            supported_ops = {
+                ConverterRegistry.qualified_name_or_str(op) for op in supported_ops
+            }

             if (
                 (node_name in supported_ops or node.op == "get_attr")
@@ -89,7 +93,7 @@ def is_node_supported(
         else:
             self.unsupported_operators[node_name] += 1

-        return False, None
+        return False, NON_ACC_BACKEND_NAME

     def print_support_overview(self, num_acc_subgraphs: Optional[int] = None) -> None:
         if num_acc_subgraphs is not None:
@@ -137,7 +141,7 @@ def __init__(
         self,
         module: torch.fx.GraphModule,
         operator_support: ops.OperatorSupportBase,
-        backend_support_map: Dict[str, Set[Target]],
+        backend_support_map: Dict[str, Collection[Target]],
         backend_priority: List[str],
         allowed_single_node_partition_ops: Optional[Collection[str]] = None,
         min_block_size: int = MIN_BLOCK_SIZE,
@@ -488,15 +492,24 @@ def reduce_acc_nodes_non_tensor_output(self):

     def __call__(self) -> NodeSet:
         submodules = dict(self.module.named_modules())
+        backend = NON_ACC_BACKEND_NAME
         for n in self.module.graph.nodes:
-            n.backend = "None"
+            # Group non-compute nodes with previous compute nodes
+            if ConverterRegistry.qualified_name_or_str(n.target) in NON_COMPUTE_NODES:
+                n.backend = backend
+                if backend != NON_ACC_BACKEND_NAME:
+                    self.acc_nodes.add(n)
+                continue
+
             if n.op in CALLABLE_NODE_OPS:
                 is_supported, backend = self.operator_support.is_node_supported(
                     submodules, n
                 )
                 if is_supported:
                     n.backend = backend
                     self.acc_nodes.add(n)
+                else:
+                    n.backend = NON_ACC_BACKEND_NAME

         if not self.allow_non_tensor:
             self.reduce_acc_nodes_non_tensor_input()
@@ -515,7 +528,7 @@ def hierarchical_adjacency_partition(
     verbose: bool = DEBUG,
     min_block_size: int = MIN_BLOCK_SIZE,
     torch_executed_ops: Collection[Target] = set(),
-    backend_support_map: Optional[Dict[str, Set[OpOverload]]] = None,
+    backend_support_map: Optional[Dict[str, Collection[Target]]] = None,
     backend_priority: Optional[List[str]] = None,
     require_full_compilation: bool = REQUIRE_FULL_COMPILATION,
     skip_fusion: bool = False,
@@ -542,7 +555,7 @@ def hierarchical_adjacency_partition(
     # Default backend support map if none provided
     if backend_support_map is None:
         backend_support_map = {
-            "tensorrt": set(DYNAMO_ATEN_CONVERTERS.keys()),
+            "tensorrt": DYNAMO_CONVERTERS.keys(),
             "inductor": set(),
         }
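
The new NON_COMPUTE_NODES handling in __call__ makes shape-only nodes (aten.view, operator.getitem) follow whatever backend was assigned to the most recent compute node, instead of always landing in the unassigned "None" bucket and breaking up partitions. A standalone toy re-statement of that grouping rule (not the splitter class itself) is sketched below:

    NON_COMPUTE_NODES = {"torch.ops.aten.view", "_operator.getitem"}
    NON_ACC_BACKEND_NAME = "None"


    def assign_backends(node_names, is_node_supported):
        """Toy version of the backend-assignment loop: non-compute nodes
        inherit the backend of the previous compute node."""
        assignments = {}
        backend = NON_ACC_BACKEND_NAME
        for name in node_names:
            if name in NON_COMPUTE_NODES:
                # Group with the preceding compute node's backend.
                assignments[name] = backend
                continue
            _supported, backend = is_node_supported(name)
            assignments[name] = backend
        return assignments


    support = {
        "torch.ops.aten.convolution.default": (True, "inductor"),
        "torch.ops.aten.relu.default": (True, "tensorrt"),
    }
    graph_order = [
        "torch.ops.aten.convolution.default",
        "torch.ops.aten.view",  # follows convolution -> "inductor"
        "torch.ops.aten.relu.default",
        "_operator.getitem",  # follows relu -> "tensorrt"
    ]
    print(
        assign_backends(
            graph_order, lambda n: support.get(n, (False, NON_ACC_BACKEND_NAME))
        )
    )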

py/torch_tensorrt/dynamo/partitioning/common.py

Lines changed: 18 additions & 12 deletions
@@ -84,43 +84,49 @@ def construct_submodule_inputs(module: torch.fx.GraphModule) -> Sequence[Input]:
         module_inputs = [
             node for node in module.graph.nodes if node.op == "placeholder"
         ]
-        for input in module_inputs:
-            if input.meta:
-                if "val" in input.meta:
-                    input_meta = input.meta["val"]
+        for input_node in module_inputs:
+            if input_node.meta:
+                if "val" in input_node.meta:
+                    input_meta = input_node.meta["val"]
+
+                    if isinstance(input_meta, Sequence):
+                        input_meta = input_meta[0]
+
                     if isinstance(input_meta, (FakeTensor, torch.Tensor)):
                         input_shape = input_meta.size()
                         torchtrt_inputs.append(
-                            get_input(input_shape, input_meta.dtype, name=input.name)
+                            get_input(
+                                input_shape, input_meta.dtype, name=input_node.name
+                            )
                         )
                     elif isinstance(input_meta, torch.SymInt):
                         # Assuming sym_integers | shape inputs always have torch.int64 dtype
                         torchtrt_inputs.append(
                             get_input(
                                 [input_meta],
                                 torch.int64,
-                                name=input.name,
+                                name=input_node.name,
                                 is_shape_tensor=True,
                             )
                         )
                     else:
                         raise ValueError(
-                            f"The meta val for input node {input.target} is of type : {type(input_meta)}. Supported types: torch.Tensor|FakeTensor|torch.SymInt"
+                            f"The meta val for input node {input_node.target} is of type : {type(input_meta)}. Supported types: torch.Tensor|FakeTensor|torch.SymInt"
                         )

-                elif "tensor_meta" in input.meta:
-                    input_meta = input.meta["tensor_meta"]
+                elif "tensor_meta" in input_node.meta:
+                    input_meta = input_node.meta["tensor_meta"]
                     input_shape = input_meta.shape
                     torchtrt_inputs.append(
-                        get_input(input_shape, input_meta.dtype, name=input.name)
+                        get_input(input_shape, input_meta.dtype, name=input_node.name)
                     )
                 else:
                     raise AssertionError(
-                        f"Input {input.name} does not contain val and tensor_meta fields in the metadata. Please ensure you have exported the graph correctly"
+                        f"Input {input_node.name} does not contain val and tensor_meta fields in the metadata. Please ensure you have exported the graph correctly"
                     )
             else:
                 raise AssertionError(
-                    f"Input {input.name} does not contain metadata. Please ensure you have exported the graph correctly"
+                    f"Input {input_node.name} does not contain metadata. Please ensure you have exported the graph correctly"
                 )

         return torchtrt_inputs
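
Beyond renaming the loop variable so it no longer shadows the Python builtin input, the functional change here is the new Sequence check: when a placeholder's meta["val"] is a list or tuple of fake tensors, the first element is used to derive the shape and dtype. A toy sketch of that unwrapping, separate from the torch_tensorrt helper:

    from collections.abc import Sequence

    import torch


    def shape_and_dtype_from_val(val):
        """Toy extraction of shape/dtype from a placeholder's meta["val"],
        unwrapping a sequence val by taking its first element, as the
        updated construct_submodule_inputs does."""
        if isinstance(val, Sequence):
            val = val[0]
        if isinstance(val, torch.Tensor):
            return tuple(val.size()), val.dtype
        raise ValueError(f"Unsupported meta val type: {type(val)}")


    print(shape_and_dtype_from_val([torch.empty(2, 3), torch.empty(4)]))
    # ((2, 3), torch.float32)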
