
Commit a4ff6bb

Fixed fp16 quantization error
1 parent 18b6455 commit a4ff6bb

File tree

2 files changed (+7, -5)


examples/apps/flux-demo.py

Lines changed: 5 additions & 2 deletions
@@ -15,7 +15,7 @@
 parser.add_argument(
     "--dtype",
     choices=["fp8", "int8", "fp16"],
-    default="int8",
+    default="fp16",
     help="Select the data type to use (fp8 or int8 or fp16)",
 )
 args = parser.parse_args()
@@ -30,7 +30,7 @@
     ptq_config["quant_cfg"]["*weight_quantizer"]["axis"] = None
 elif args.dtype == "fp16":
     enabled_precisions = {torch.float16}
-print(f"\nUsing {args.dtype} quantization")
+print(f"\nUsing {args.dtype}")
 
 
 DEVICE = "cuda:0"
@@ -152,6 +152,9 @@ def load_lora(path):
     print("Refitting Finished!")
 
 
+load_lora("/home/TensorRT/examples/apps/NGRVNG.safetensors")
+
+
 # Create Gradio interface
 with gr.Blocks(title="Flux Demo with Torch-TensorRT") as demo:
     gr.Markdown("# Flux Image Generation Demo Accelerated by Torch-TensorRT")
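
For context on the fp16 branch: it skips the ModelOpt PTQ config entirely and only sets enabled_precisions, which the demo later passes to Torch-TensorRT compilation. A minimal sketch of that flow, with a placeholder module standing in for the Flux pipeline (not the demo's actual model or settings):

import torch
import torch_tensorrt

# Placeholder module; the demo compiles the Flux transformer instead.
model = torch.nn.Linear(64, 64).half().eval().cuda()
inputs = [torch.randn(1, 64, dtype=torch.float16, device="cuda")]

# fp16 selects a compilation precision rather than a quantization recipe,
# so no calibration step is needed before compiling.
trt_model = torch_tensorrt.compile(
    model,
    ir="dynamo",
    inputs=inputs,
    enabled_precisions={torch.float16},  # the set built by the fp16 branch
)

With the new default, running the script without --dtype now takes this fp16 path; --dtype fp8 and --dtype int8 still go through the ModelOpt quantization branches.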

py/torch_tensorrt/dynamo/conversion/impl/quantize.py

Lines changed: 2 additions & 3 deletions
@@ -66,7 +66,7 @@ def quantize(
         if not isinstance(amax, trt.ITensor):
             amax = to_torch(amax, None)
             scale = torch.divide(amax, max_bound)
-            scale = get_trt_tensor(ctx, scale, name + "_scale")
+            scale = get_trt_tensor(ctx, scale, name + "_scale", dtype=torch.float32)
         else:
             scale = impl.elementwise.div(
                 ctx,
@@ -76,7 +76,7 @@
                 amax,
                 max_bound,
             )
-            scale = get_trt_tensor(ctx, scale, name + "_scale")
+            scale = get_trt_tensor(ctx, scale, name + "_scale", dtype=torch.float32)
 
     # Add Q node
     if num_bits == 8 and exponent_bits == 0:
@@ -96,7 +96,6 @@
             q_output, scale, output_type=input_tensor.dtype
         )
         set_layer_name(dequantize_layer, target, name + "_dequantize", source_ir)
-        dequantize_layer.precision = dtype
 
         dq_output = dequantize_layer.get_output(0)
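
My reading of the converter fix (the commit message doesn't spell this out): with an fp16 model, amax arrives as a float16 tensor, so the scale derived from it is float16 as well, while the TensorRT Q/DQ pair expects an fp32 scale constant; forcing dtype=torch.float32 in get_trt_tensor avoids that mismatch, and dropping the dequantize_layer.precision override lets output_type=input_tensor.dtype control the output precision on its own. A small sketch of the dtype propagation, using a made-up amax value:

import torch

amax = torch.tensor(4.2, dtype=torch.float16)  # hypothetical fp16 calibration value
max_bound = 127  # int8 bound, mirroring the converter

# Dividing an fp16 tensor by a Python scalar stays fp16, so without the
# explicit dtype the Q/DQ scale would be created as a float16 constant.
scale = torch.divide(amax, max_bound)
print(scale.dtype)  # torch.float16

# Effect of dtype=torch.float32 in get_trt_tensor: an fp32 scale constant.
scale = scale.to(torch.float32)
print(scale.dtype)  # torch.float32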
