
Commit ba76f6d: "trying to add quantization to Flux"
Committed Apr 18, 2025 (1 parent: 9e390da)

File tree: 6 files changed (+381, -9 lines)
 
Lines changed: 168 additions & 0 deletions
@@ -0,0 +1,168 @@ (new file)
# %%
# Import the following libraries
# -----------------------------
import re

import modelopt.torch.opt as mto
import modelopt.torch.quantization as mtq
import torch
import torch_tensorrt
from diffusers import FluxPipeline
from diffusers.models.attention_processor import Attention
from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel
from modelopt.torch.quantization.utils import export_torch_mode
from torch.export._trace import _export
from transformers import AutoModelForCausalLM

# %%
DEVICE = "cuda:0"
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.float32,
)
# Use a tiny dummy transformer (one double block, one single block) so the
# quantization flow can be debugged without the full model's memory footprint.
pipe.transformer = FluxTransformer2DModel(
    num_layers=1, num_single_layers=1, guidance_embeds=True
)

pipe.to(DEVICE).to(torch.float32)
# Store the config and transformer backbone
config = pipe.transformer.config
backbone = pipe.transformer
backbone.eval()


def filter_func(name):
    # Matches layers that should stay in high precision (embeddings, norms, I/O projections)
    pattern = re.compile(
        r".*(time_emb_proj|time_embedding|conv_in|conv_out|conv_shortcut|add_embedding|pos_embed|time_text_embed|context_embedder|norm_out|x_embedder).*"
    )
    return pattern.match(name) is not None


def generate_image(pipe, prompt, image_name):
    seed = 42
    image = pipe(
        prompt,
        output_type="pil",
        num_inference_steps=20,
        generator=torch.Generator("cuda").manual_seed(seed),
    ).images[0]
    image.save(f"{image_name}.png")
    print(f"Image generated using {image_name} model saved as {image_name}.png")


generate_image(pipe, ["A golden retriever holding a sign to code"], "dog_code")

# %%
# Quantization


def do_calibrate(
    pipe,
    prompt: str,
) -> None:
    """
    Run calibration steps on the pipeline using the given prompt.
    """
    image = pipe(
        prompt,
        output_type="pil",
        num_inference_steps=20,
        generator=torch.Generator("cuda").manual_seed(0),
    ).images[0]


def forward_loop(mod):
    # Swap the module under calibration in as the pipeline's backbone, then run it
    pipe.transformer = mod
    do_calibrate(
        pipe=pipe,
        prompt="test",
    )


ptq_config = mtq.FP8_DEFAULT_CFG
backbone = mtq.quantize(backbone, ptq_config, forward_loop)
mtq.disable_quantizer(backbone, filter_func)


# %%
# Export the backbone using torch.export
# --------------------------------------------------
# Define the dummy inputs and their respective dynamic shapes. The transformer
# backbone is exported with dynamic shapes and ``batch_size=2`` due to
# `0/1 specialization <https://docs.google.com/document/d/16VPOa3d-Liikf48teAOmxLc92rgvJdfosIy-yoT38Io/edit?fbclid=IwAR3HNwmmexcitV0pbZm_x1a4ykdXZ9th_eJWK-3hBtVgKnrkmemz6Pm5jRQ&tab=t.0#heading=h.ez923tomjvyk>`_

batch_size = 2
BATCH = torch.export.Dim("batch", min=1, max=2)
SEQ_LEN = torch.export.Dim("seq_len", min=1, max=512)
# These min/max values for the img_id input are recommended by torch dynamo
# during export of the model. To see the recommendation, try exporting with
# min=1, max=4096.
IMG_ID = torch.export.Dim("img_id", min=3586, max=4096)
dynamic_shapes = {
    "hidden_states": {0: BATCH},
    "encoder_hidden_states": {0: BATCH, 1: SEQ_LEN},
    "pooled_projections": {0: BATCH},
    "timestep": {0: BATCH},
    "txt_ids": {0: SEQ_LEN},
    "img_ids": {0: IMG_ID},
    "guidance": {0: BATCH},
    "joint_attention_kwargs": {},
    "return_dict": None,
}
# The guidance factor is of type torch.float32
dummy_inputs = {
    "hidden_states": torch.randn((batch_size, 4096, 64), dtype=torch.float32).to(
        DEVICE
    ),
    "encoder_hidden_states": torch.randn(
        (batch_size, 512, 4096), dtype=torch.float32
    ).to(DEVICE),
    "pooled_projections": torch.randn((batch_size, 768), dtype=torch.float32).to(
        DEVICE
    ),
    "timestep": torch.tensor([1.0, 1.0], dtype=torch.float32).to(DEVICE),
    "txt_ids": torch.randn((512, 3), dtype=torch.float32).to(DEVICE),
    "img_ids": torch.randn((4096, 3), dtype=torch.float32).to(DEVICE),
    "guidance": torch.tensor([1.0, 1.0], dtype=torch.float32).to(DEVICE),
    "joint_attention_kwargs": {},
    "return_dict": False,
}

# This creates an exported program which is then compiled with Torch-TensorRT
with export_torch_mode():
    ep = _export(
        backbone,
        args=(),
        kwargs=dummy_inputs,
        dynamic_shapes=dynamic_shapes,
        strict=False,
        allow_complex_guards_as_runtime_asserts=True,
    )

with torch_tensorrt.logging.debug():
    trt_gm = torch_tensorrt.dynamo.compile(
        ep,
        inputs=dummy_inputs,
        enabled_precisions={torch.float8_e4m3fn},
        truncate_double=True,
        min_block_size=1,
        debug=False,
        use_python_runtime=True,
        immutable_weights=True,
        offload_module_to_cpu=True,
    )


del ep
pipe.transformer = trt_gm
pipe.transformer.config = config


# %%
trt_gm.device = torch.device(DEVICE)
# Generate images from the Flux pipeline using the compiled TRT backbone

for _ in range(2):
    generate_image(pipe, ["A golden retriever holding a sign to code"], "dog_code")

# For this dummy model, the fp16 engine size is around 1GB; the fp32 engine size is around 2GB
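Note that modelopt.torch.opt is imported as mto but never used in this script; presumably the calibrated state would eventually be checkpointed so calibration does not have to be rerun. A minimal sketch of that step with ModelOpt's save/restore utilities (a hypothetical follow-up, not part of this commit; the path is a placeholder):

# Hypothetical follow-up, not in this commit: persist the quantized backbone
# state after calibration (path is a placeholder).
mto.save(backbone, "./flux_fp8_modelopt_state.pth")
# Later: rebuild the backbone and restore the quantized state onto it.
# backbone = mto.restore(backbone, "./flux_fp8_modelopt_state.pth")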

examples/apps/flux-quantization.py

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@ (new file)
# %%
# Import the following libraries
# -----------------------------
import re

import modelopt.torch.opt as mto
import modelopt.torch.quantization as mtq
import torch
import torch_tensorrt
from diffusers import FluxPipeline
from diffusers.models.attention_processor import Attention
from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel
from modelopt.torch.quantization.utils import export_torch_mode
from torch.export._trace import _export
from transformers import AutoModelForCausalLM

# Load the ModelOpt-modified model architecture and weights using Hugging Face APIs

# %%
DEVICE = "cuda:0"
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.float16,
)
# Use a tiny dummy transformer (one double block, one single block) so the
# quantization flow can be debugged without the full model's memory footprint.
pipe.transformer = FluxTransformer2DModel(
    num_layers=1, num_single_layers=1, guidance_embeds=True
)

pipe.to(DEVICE).to(torch.float16)
# Store the config and transformer backbone
config = pipe.transformer.config
backbone = pipe.transformer
backbone.eval()


def filter_func(name):
    # Matches layers that should stay in high precision (embeddings, norms, I/O projections)
    pattern = re.compile(
        r".*(time_emb_proj|time_embedding|conv_in|conv_out|conv_shortcut|add_embedding|pos_embed|time_text_embed|context_embedder|norm_out|x_embedder).*"
    )
    return pattern.match(name) is not None


def generate_image(pipe, prompt, image_name):
    seed = 42
    image = pipe(
        prompt,
        output_type="pil",
        num_inference_steps=20,
        generator=torch.Generator("cuda").manual_seed(seed),
    ).images[0]
    image.save(f"{image_name}.png")
    print(f"Image generated using {image_name} model saved as {image_name}.png")


generate_image(pipe, ["A golden retriever holding a sign to code"], "dog_code")

# %%
# Quantization


def do_calibrate(
    pipe,
    prompt: str,
) -> None:
    """
    Run calibration steps on the pipeline using the given prompt.
    """
    image = pipe(
        prompt,
        output_type="pil",
        num_inference_steps=20,
        generator=torch.Generator("cuda").manual_seed(0),
    ).images[0]


def forward_loop(mod):
    # Swap the module under calibration in as the pipeline's backbone, then run it
    pipe.transformer = mod
    do_calibrate(
        pipe=pipe,
        prompt="test",
    )


ptq_config = mtq.FP8_DEFAULT_CFG
backbone = mtq.quantize(backbone, ptq_config, forward_loop)
mtq.disable_quantizer(backbone, filter_func)

batch_size = 1
BATCH = torch.export.Dim("batch", min=1, max=2)
SEQ_LEN = torch.export.Dim("seq_len", min=1, max=512)
# These min/max values for the img_id input are recommended by torch dynamo
# during export of the model. To see the recommendation, try exporting with
# min=1, max=4096.
IMG_ID = torch.export.Dim("img_id", min=3586, max=4096)
dynamic_shapes = {
    "hidden_states": {0: BATCH},
    "encoder_hidden_states": {0: BATCH, 1: SEQ_LEN},
    "pooled_projections": {0: BATCH},
    "timestep": {0: BATCH},
    "txt_ids": {0: SEQ_LEN},
    "img_ids": {0: IMG_ID},
    "guidance": {0: BATCH},
    "joint_attention_kwargs": {},
    "return_dict": None,
}
# The guidance factor is of type torch.float32
dummy_inputs = {
    "hidden_states": torch.randn((batch_size, 4096, 64), dtype=torch.float16).to(
        DEVICE
    ),
    "encoder_hidden_states": torch.randn(
        (batch_size, 512, 4096), dtype=torch.float16
    ).to(DEVICE),
    "pooled_projections": torch.randn((batch_size, 768), dtype=torch.float16).to(
        DEVICE
    ),
    "timestep": torch.tensor([1.0] * batch_size, dtype=torch.float16).to(DEVICE),
    "txt_ids": torch.randn((512, 3), dtype=torch.float16).to(DEVICE),
    "img_ids": torch.randn((4096, 3), dtype=torch.float16).to(DEVICE),
    "guidance": torch.tensor([1.0] * batch_size, dtype=torch.float32).to(DEVICE),
    "joint_attention_kwargs": {},
    "return_dict": False,
}

# This creates an exported program which is then compiled with Torch-TensorRT
with export_torch_mode():
    ep = _export(
        backbone,
        args=(),
        kwargs=dummy_inputs,
        # dynamic_shapes=dynamic_shapes,  # defined above but disabled for this export
        strict=False,
        allow_complex_guards_as_runtime_asserts=True,
    )

with torch_tensorrt.logging.debug():
    trt_gm = torch_tensorrt.dynamo.compile(
        ep,
        inputs=dummy_inputs,
        enabled_precisions={torch.float8_e4m3fn, torch.float16},
        truncate_double=True,
        min_block_size=1,
        debug=True,
        use_python_runtime=True,
        immutable_weights=True,
        offload_module_to_cpu=True,
    )


del ep
pipe.transformer = trt_gm
pipe.transformer.config = config


# %%
trt_gm.device = torch.device(DEVICE)
# Generate images from the Flux pipeline using the compiled TRT backbone

for _ in range(2):
    generate_image(pipe, ["A golden retriever holding a sign to code"], "dog_code")

# For this dummy model, the fp16 engine size is around 1GB; the fp32 engine size is around 2GB
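A possible follow-up not in this commit: serialize the compiled module so the TensorRT engine does not have to be rebuilt on every run. A sketch, assuming the torch_tensorrt.save API and its kwarg_inputs parameter are available on this branch; the output path is a placeholder:

# Hypothetical sketch; torch_tensorrt.save and its kwarg_inputs parameter are
# assumed here, and the path is a placeholder.
torch_tensorrt.save(
    trt_gm,
    "./flux_trt.ep",
    output_format="exported_program",
    kwarg_inputs=dummy_inputs,
)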

py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py

Lines changed: 3 additions & 1 deletion
@@ -597,7 +597,9 @@ def aten_ops_neg(
     )
 else:
 
-    @dynamo_tensorrt_converter(torch.ops.tensorrt.quantize_op.default)
+    @dynamo_tensorrt_converter(
+        torch.ops.tensorrt.quantize_op.default, supports_dynamic_shapes=True
+    )
     def aten_ops_quantize_op(
         ctx: ConversionContext,
         target: Target,
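For context: converters registered without supports_dynamic_shapes=True are treated as not validated for dynamic shapes, so ops hitting them can be partitioned out of the TensorRT engine when the graph carries dynamic dimensions. This change keeps quantize_op convertible for the dynamically shaped Flux export above. A sketch of the same registration pattern on another op (illustrative only; it assumes it sits in aten_ops_converters.py, where ConversionContext, Target, impl, SourceIR, and the typing helpers are already in scope):

@dynamo_tensorrt_converter(torch.ops.aten.relu.default, supports_dynamic_shapes=True)
def aten_ops_relu(
    ctx: ConversionContext,
    target: Target,
    args: Tuple[Argument, ...],
    kwargs: Dict[str, Argument],
    name: str,
) -> Union[TRTTensor, Sequence[TRTTensor]]:
    # Delegate to the shared activation implementation, as the real
    # converters in this file do.
    return impl.activation.relu(ctx, target, SourceIR.ATEN, name, args[0])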

py/torch_tensorrt/dynamo/conversion/impl/quantize.py

Lines changed: 41 additions & 7 deletions
@@ -6,12 +6,31 @@
 from torch.fx.experimental.proxy_tensor import unset_fake_temporarily
 from torch.fx.node import Target
 from torch_tensorrt.dynamo._SourceIR import SourceIR
+from torch_tensorrt.dynamo.conversion import impl
 from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
 from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor, to_torch
 from torch_tensorrt.fx.converters.converter_utils import set_layer_name
 from torch_tensorrt.fx.types import TRTTensor
 
 
+def get_ir(target: Target) -> SourceIR:
+    target_module = getattr(target, "__module__", "None")
+    if any(
+        target_module.startswith(prefix)
+        for prefix in ("torch.ops.aten", "torch._ops.aten")
+    ):
+        return SourceIR.ATEN
+    elif any(
+        target_module.startswith(prefix)
+        for prefix in ("torch.ops.prims", "torch._ops.prims")
+    ):
+        return SourceIR.PRIM
+    elif target_module.startswith("torch.nn"):
+        return SourceIR.NN
+
+    return SourceIR.UNKNOWN
+
+
 def quantize(
     ctx: ConversionContext,
     target: Target,
@@ -44,20 +63,35 @@ def quantize(
     elif num_bits == 8 and exponent_bits == 4:
         max_bound = 448
 
-    amax = to_torch(amax, None)
-    scale = torch.divide(amax, max_bound)
-    scale = get_trt_tensor(ctx, scale, name + "_scale")
+    if not isinstance(amax, trt.ITensor):
+        amax = to_torch(amax, None)
+        scale = torch.divide(amax, max_bound)
+        scale = get_trt_tensor(ctx, scale, name + "_scale")
+    else:
+        scale = impl.elementwise.div(
+            ctx,
+            target,
+            get_ir(target),
+            name,
+            amax,
+            max_bound,
+        )
+        scale = get_trt_tensor(ctx, scale, name + "_scale")
+
     # Add Q node
-    quantize_layer = ctx.net.add_quantize(input_tensor, scale)
     if num_bits == 8 and exponent_bits == 0:
-        quantize_layer.set_output_type(0, trt.DataType.INT8)
+        dtype = trt.DataType.INT8
     elif num_bits == 8 and exponent_bits == 4:
-        quantize_layer.set_output_type(0, trt.DataType.FP8)
+        dtype = trt.DataType.FP8
+
+    quantize_layer = ctx.net.add_quantize(input_tensor, scale, dtype)
 
     set_layer_name(quantize_layer, target, name + "_quantize", source_ir)
     q_output = quantize_layer.get_output(0)
     # Add DQ node
-    dequantize_layer = ctx.net.add_dequantize(q_output, scale)
+    dequantize_layer = ctx.net.add_dequantize(
+        q_output, scale, output_type=input_tensor.dtype
+    )
     set_layer_name(dequantize_layer, target, name + "_dequantize", source_ir)
     if num_bits == 8 and exponent_bits == 0:
         dequantize_layer.precision = trt.DataType.INT8
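The net effect of this converter is a Q node followed by a DQ node sharing one scale, where scale = amax / max_bound and max_bound is 448 for FP8 (the largest finite E4M3 value); with this patch, amax may also arrive as a TensorRT ITensor, in which case the division is built into the graph via impl.elementwise.div. A standalone PyTorch sketch of the FP8 round trip (illustrative arithmetic only; inside the engine TensorRT executes it via the quantize/dequantize layers built above, and the amax value here is an assumption):

import torch

amax = torch.tensor(3.5)        # calibrated per-tensor absolute maximum (assumed value)
scale = amax / 448.0            # the same division the converter performs for FP8

x = torch.randn(16)
q = (x / scale).to(torch.float8_e4m3fn)   # Q node: scale, then cast to FP8
dq = q.to(torch.float32) * scale          # DQ node: cast back to the input dtype
print(torch.max(torch.abs(x - dq)))       # small quantization error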

py/torch_tensorrt/dynamo/utils.py

Lines changed: 5 additions & 1 deletion
@@ -419,7 +419,9 @@ def unwrap_tensor_dtype(tensor: Union[torch.Tensor, FakeTensor, torch.SymInt]) -
     """
     Returns the dtype of torch.tensor or FakeTensor. For symbolic integers, we return int64
     """
-    if isinstance(tensor, (torch.Tensor, FakeTensor, int, float, bool)):
+    if isinstance(tensor, (torch.Tensor, FakeTensor)):
+        return tensor.dtype
+    elif isinstance(tensor, (int, float, bool)):
         return torch.tensor(tensor).dtype
     elif isinstance(tensor, torch.SymInt):
         return torch.int64
@@ -791,6 +793,8 @@ def get_output_dtypes(output: Any, truncate_doulbe: bool = False) -> List[dtype]
                 output_dtypes.append(dtype.float32)
             else:
                 output_dtypes.append(dtype._from(output_meta.dtype))
+        elif isinstance(output_meta, torch.SymInt):
+            output_dtypes.append(dtype.int64)
         elif "tensor_meta" in output.meta:
             output_meta = output.meta["tensor_meta"]
             output_dtypes.append(dtype._from(output_meta.dtype))
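With this change, real tensors and FakeTensors report their dtype directly instead of being round-tripped through torch.tensor(), while Python scalars keep the old path; SymInts produced by dynamic shapes now also yield int64 in get_output_dtypes. Expected behavior, a sketch inferred from the branch logic above (illustrative, not part of the commit):

import torch

unwrap_tensor_dtype(torch.ones(2, dtype=torch.float16))  # -> torch.float16
unwrap_tensor_dtype(3)     # -> torch.int64 (via torch.tensor(3).dtype)
unwrap_tensor_dtype(2.0)   # -> torch.float32
unwrap_tensor_dtype(True)  # -> torch.bool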

tools/perf/Flux/create_env.sh

Lines changed: 1 addition & 0 deletions
@@ -24,3 +24,4 @@ pip install sentencepiece=="0.2.0" transformers=="4.48.2" accelerate=="1.3.0" di
 
 pip install notebook
 pip install gradio safetensors peft pyinstrument
+pip install nvidia-modelopt onnx torchprofile pulp onnxruntime
