Added engine visualization

cehongwang · cehongwang · commit 12619831dffc · 2025-05-28T01:29:37.000Z
diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp
@@ -281,6 +281,18 @@ void TRTEngine::enable_profiling() {
   exec_ctx->setProfiler(trt_engine_profiler.get());
 }
 
+// Selects the trace layout handed to dump_trace(): "trex" -> kTREX, "perfetto" -> kPERFETTO.
+// Any other value is reported through TORCHTRT_THROW_ERROR; profile_format is not modified.
+void TRTEngine::set_profile_format(std::string format) {
+  if (format == "trex") {
+    profile_format = TraceFormat::kTREX;
+  } else if (format == "perfetto") {
+    profile_format = TraceFormat::kPERFETTO;
+  } else {
+    TORCHTRT_THROW_ERROR("Invalid profile format: " + format);
+  }
+}
+
 std::string TRTEngine::get_engine_layer_info() {
   auto inspector = cuda_engine->createEngineInspector();
   return inspector->getEngineInformation(nvinfer1::LayerInformationFormat::kJSON);
diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h
@@ -147,6 +147,7 @@ struct TRTEngine : torch::CustomClassHolder {
   std::string to_str() const;
   static void verify_serialization_fmt(const std::vector<std::string>& serialized_info);
   void enable_profiling();
+  void set_profile_format(std::string profile_format);
   void disable_profiling();
   std::string get_engine_layer_info();
 
@@ -191,6 +192,7 @@ struct TRTEngine : torch::CustomClassHolder {
 #else
   bool profile_execution = false;
 #endif
+  TraceFormat profile_format = TraceFormat::kPERFETTO;
   std::string device_profile_path;
   std::string input_profile_path;
   std::string output_profile_path;
diff --git a/core/runtime/TRTEngineProfiler.cpp b/core/runtime/TRTEngineProfiler.cpp
@@ -32,25 +32,36 @@ TRTEngineProfiler::TRTEngineProfiler(const std::string& name, const std::vector<
   }
 }
 
-void dump_trace(const std::string& path, const TRTEngineProfiler& value) {
+void dump_trace(const std::string& path, const TRTEngineProfiler& value, TraceFormat format) {
   std::stringstream out;
   out << "[" << std::endl;
   double ts = 0.0;
+  double running_time = 0.0;
+  for (size_t i = 0; i < value.layer_names.size(); i++) {
+    auto layer_name = value.layer_names[i];
+    auto elem = value.profile.at(layer_name);
+    ts += elem.time;
+  }
   for (size_t i = 0; i < value.layer_names.size(); i++) {
     auto layer_name = value.layer_names[i];
     auto elem = value.profile.at(layer_name);
 
     out << "  {" << std::endl;
     out << "    \"name\": \"" << layer_name << "\"," << std::endl;
-    out << "    \"ph\": \"X\"," << std::endl;
-    out << "    \"ts\": " << ts * 1000 << "," << std::endl;
-    out << "    \"dur\": " << elem.time * 1000 << "," << std::endl;
-    out << "    \"tid\": 1," << std::endl;
-    out << "    \"pid\": \"" << value.name << " Engine Execution\"," << std::endl;
-    out << "    \"args\": {}" << std::endl;
+    if (format == TraceFormat::kPERFETTO) { // trace-event style record, one "X" (complete) event per layer
+      out << "    \"ph\": \"X\"," << std::endl;
+      out << "    \"ts\": " << running_time * 1000 << "," << std::endl;
+      out << "    \"dur\": " << elem.time * 1000 << "," << std::endl;
+      out << "    \"tid\": 1," << std::endl;
+      out << "    \"pid\": \"" << value.name << " Engine Execution\"," << std::endl;
+      out << "    \"args\": {}" << std::endl; // last member carries no trailing comma so the object stays valid JSON
+    } else { // kTREX: per-layer timing summary; `ts` holds the summed layer time computed before this loop
+      out << "    \"timeMs\": " << elem.time << "," << std::endl;
+      out << "    \"averageMs\": " << elem.time / elem.count << "," << std::endl;
+      out << "    \"percentage\": " << (elem.time * 100.0 / ts) << std::endl; // last member, no trailing comma
+    }
     out << "  }," << std::endl;
-
-    ts += elem.time;
+    running_time += elem.time;
   }
   out.seekp(-2, out.cur);
   out << "\n]" << std::endl;
diff --git a/core/runtime/TRTEngineProfiler.h b/core/runtime/TRTEngineProfiler.h
@@ -10,6 +10,10 @@ namespace torch_tensorrt {
 namespace core {
 namespace runtime {
 
+enum TraceFormat { kPERFETTO, kTREX }; // Layout dump_trace() emits for each layer record
+
+// dump_trace(path, profiler, format) is declared as a friend inside TRTEngineProfiler below
+
 struct TRTEngineProfiler : public nvinfer1::IProfiler {
   struct Record {
     float time{0};
@@ -21,7 +25,7 @@ struct TRTEngineProfiler : public nvinfer1::IProfiler {
       const std::string& name,
       const std::vector<TRTEngineProfiler>& srcProfilers = std::vector<TRTEngineProfiler>());
   friend std::ostream& operator<<(std::ostream& out, const TRTEngineProfiler& value);
-  friend void dump_trace(const std::string& path, const TRTEngineProfiler& value);
+  friend void dump_trace(const std::string& path, const TRTEngineProfiler& value, TraceFormat format);
 
  private:
   std::string name;
diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp
@@ -339,7 +339,10 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
 
     if (compiled_engine->profile_execution) {
       LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
-      dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler);
+      dump_trace(
+          compiled_engine->trt_engine_profile_path,
+          *compiled_engine->trt_engine_profiler,
+          compiled_engine->profile_format);
       compiled_engine->dump_engine_layer_info();
     }
 
@@ -440,7 +443,10 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
 
     if (compiled_engine->profile_execution) {
       LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
-      dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler);
+      dump_trace(
+          compiled_engine->trt_engine_profile_path,
+          *compiled_engine->trt_engine_profiler,
+          compiled_engine->profile_format);
       compiled_engine->dump_engine_layer_info();
     }
 
diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp
@@ -82,6 +82,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
         .def("__repr__", &TRTEngine::to_str)
         .def("__obj_flatten__", &TRTEngine::__obj_flatten__)
         .def("enable_profiling", &TRTEngine::enable_profiling)
+        .def("set_profile_format", &TRTEngine::set_profile_format)
         .def("disable_profiling", &TRTEngine::disable_profiling)
         .def_readwrite("profile_path_prefix", &TRTEngine::profile_path_prefix)
         .def("dump_engine_layer_info_to_file", &TRTEngine::dump_engine_layer_info_to_file)
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
@@ -2,6 +2,7 @@
 
 import collections.abc
 import logging
+import os
 import platform
 import warnings
 from typing import Any, Collection, List, Optional, Sequence, Set, Tuple, Union
@@ -421,6 +422,7 @@ def compile(
     enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING,
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
+    engine_vis_dir: Optional[str] = _defaults.ENGINE_VIS_DIR,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -674,6 +676,7 @@ def compile(
         "enable_weight_streaming": enable_weight_streaming,
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
+        "engine_vis_dir": engine_vis_dir,
     }
 
     settings = CompilationSettings(**compilation_options)
@@ -904,6 +907,19 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
 
             trt_modules[name] = trt_module
 
+            if settings.debug and settings.engine_vis_dir:
+                if settings.use_python_runtime:
+                    logger.warning(
+                        "Profiling can only be enabled when using the C++ runtime"
+                    )
+                else:
+                    if not os.path.exists(settings.engine_vis_dir):
+                        os.makedirs(settings.engine_vis_dir)
+                    trt_module.enable_profiling(
+                        profiling_results_dir=settings.engine_vis_dir,
+                        profile_format="trex",
+                    )
+
     # Parse the graph I/O and store it in dryrun tracker
     parse_graph_io(gm, dryrun_tracker)
 
diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py
@@ -15,6 +15,7 @@
 DLA_SRAM_SIZE = 1048576
 ENGINE_CAPABILITY = EngineCapability.STANDARD
 WORKSPACE_SIZE = 0
+ENGINE_VIS_DIR = None
 MIN_BLOCK_SIZE = 5
 PASS_THROUGH_BUILD_FAILURES = False
 MAX_AUX_STREAMS = None
diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
@@ -18,6 +18,7 @@
     ENABLE_WEIGHT_STREAMING,
     ENABLED_PRECISIONS,
     ENGINE_CAPABILITY,
+    ENGINE_VIS_DIR,
     HARDWARE_COMPATIBLE,
     IMMUTABLE_WEIGHTS,
     L2_LIMIT_FOR_TILING,
@@ -140,6 +141,7 @@ class CompilationSettings:
     tiling_optimization_level: str = TILING_OPTIMIZATION_LEVEL
     l2_limit_for_tiling: int = L2_LIMIT_FOR_TILING
     use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
+    engine_vis_dir: Optional[str] = ENGINE_VIS_DIR
 
 
 _SETTINGS_TO_BE_ENGINE_INVARIANT = (
diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
@@ -334,7 +334,9 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]:
 
         return tuple(outputs)
 
-    def enable_profiling(self, profiling_results_dir: Optional[str] = None) -> None:
+    def enable_profiling(
+        self, profiling_results_dir: Optional[str] = None, profile_format: str = "trex"
+    ) -> None:
         """Enable the profiler to collect latency information about the execution of the engine
 
         Traces can be visualized using https://ui.perfetto.dev/ or compatible alternatives
@@ -347,7 +349,9 @@ def enable_profiling(self, profiling_results_dir: Optional[str] = None) -> None:
 
         if profiling_results_dir is not None:
             self.engine.profile_path_prefix = profiling_results_dir
+        assert profile_format in ["trex", "perfetto"]
         self.engine.enable_profiling()
+        self.engine.set_profile_format(profile_format)
 
     def disable_profiling(self) -> None:
         """Disable the profiler"""
diff --git a/tools/debug/engine_visualization/README.md b/tools/debug/engine_visualization/README.md
@@ -0,0 +1,11 @@
+## Introduction
+We use the TRT Engine Explorer (TREX) to visualize the engine graph structure. TREX is a diagnostic and profiling tool for TensorRT engine files. It allows you to inspect, benchmark, and debug TensorRT engines with ease.
+
+## Installation
+```bash
+git clone https://github.com/NVIDIA/TensorRT.git
+cd TensorRT/tools/experimental/trt-engine-explorer
+python3 -m pip install -e .[notebook]
+sudo apt --yes install graphviz
+```
+
diff --git a/tools/debug/engine_visualization/draw_engine_graph.py b/tools/debug/engine_visualization/draw_engine_graph.py
@@ -0,0 +1,44 @@
+import argparse
+import os
+import re
+import shutil
+import subprocess
+import warnings
+from typing import Tuple
+
+import networkx as nx
+import trex
+import trex.engine_plan
+import trex.graphing
+
+
+def draw_engine(dir_path: str):
+    try:
+        import trex
+    except ImportError:
+        print("trex is required but it is not installed.\n")
+        print("Check README.md for installation instructions.")
+        exit()
+
+    engine_json_fname = os.path.join(
+        dir_path, "_run_on_acc_0_engine_layer_information.json"
+    )
+    profiling_json_fname = os.path.join(
+        dir_path, "_run_on_acc_0_engine_engine_exectuion_profile.trace"
+    )
+
+    graphviz_is_installed = shutil.which("dot") is not None
+    if not graphviz_is_installed:
+        print("graphviz is required but it is not installed.\n")
+        print("To install on Ubuntu:")
+        print("sudo apt --yes install graphviz")
+        exit()
+
+    plan = trex.engine_plan.EnginePlan(
+        engine_json_fname, profiling_file=profiling_json_fname
+    )
+    layer_node_formatter = trex.graphing.layer_type_formatter
+    graph = trex.graphing.to_dot(plan, layer_node_formatter)
+    output_format = "png"  # svg or jpg
+
+    trex.graphing.render_dot(graph, engine_json_fname, output_format)
diff --git a/tools/debug/engine_visualization/llama_hlo.py b/tools/debug/engine_visualization/llama_hlo.py
@@ -0,0 +1,28 @@
+import numpy as np
+import torch
+import torch_tensorrt as torch_tensorrt
+import torchvision.models as models
+
+inputs = [torch.rand((1, 3, 224, 224)).to("cuda")]
+model = models.resnet18(pretrained=False).eval().to("cuda")
+exp_program = torch.export.export(model, tuple(inputs))
+enabled_precisions = {torch.float}
+debug = False
+workspace_size = 20 << 30
+min_block_size = 0
+use_python_runtime = False
+torch_executed_ops = {}
+trt_gm = torch_tensorrt.dynamo.compile(
+    exp_program,
+    inputs=inputs,
+    enabled_precisions=enabled_precisions,
+    truncate_double=True,
+    debug=True,
+    use_python_runtime=False,
+    engine_vis_dir="/home/profile",
+)
+trt_output = trt_gm(*inputs)
+
+from draw_engine_graph import draw_engine
+
+draw_engine("/home/profile")