feat(//py/torch_tensorrt/dynamo): Allow the refit system to cache complex numerics

narendasan · narendasan · commit ea3e98a75c50 · 2026-03-05T18:10:10.000Z
diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py
@@ -43,6 +43,7 @@
 from torch_tensorrt.dynamo.utils import (
     CPU_DEVICE,
     check_module_output,
+    check_output_equal,
     get_model_device,
     get_torch_inputs,
     to_torch_device,
@@ -110,6 +111,17 @@ def construct_refit_mapping_from_weight_name_map(
                 engine_weight_name.split(" ")[-1].lower()
             )
 
+        elif isinstance(sd_weight_name, tuple):
+            # Buffer-slice mapping created by Stage 3 of _save_weight_mapping.
+            # Encodes (state_dict_key, dim, index) for weights that are slices
+            # of a source buffer (e.g. real/imag parts of an unpacked complex buffer).
+            sd_key, dim, idx = sd_weight_name
+            if sd_key not in state_dict:
+                continue
+            engine_weight_map[engine_weight_name] = (
+                state_dict[sd_key].select(dim, idx).to(to_torch_device(settings.device))
+            )
+
         elif sd_weight_name not in state_dict:
             # If weights is not in sd, we can leave it unchanged
             continue
@@ -585,14 +597,31 @@ def refit_module_weights(
 
     if verify_output and arg_inputs is not None:
         new_gm.to(to_torch_device(settings.device))
-        if check_module_output(
-            new_module=new_gm,
-            refitted_module=compiled_module,
-            arg_inputs=torch_inputs,
-            kwarg_inputs=torch_kwarg_inputs,
-        ):
+        # complex_graph_detection rewrites complex placeholders to real (view_as_real).
+        # The compiled TRT module handles complex→real internally, but the lowered
+        # PyTorch reference module (new_gm) expects real-unpacked inputs directly.
+        has_complex_inputs = any(
+            isinstance(x, torch.Tensor) and x.is_complex() for x in torch_inputs
+        )
+        if has_complex_inputs:
+            lowered_inputs = [
+                torch.view_as_real(x).contiguous()
+                if isinstance(x, torch.Tensor) and x.is_complex()
+                else x
+                for x in torch_inputs
+            ]
+            trt_outputs = compiled_module(*torch_inputs)
+            ref_outputs = new_gm(*lowered_inputs, **torch_kwarg_inputs)
+            outputs_match = check_output_equal(trt_outputs, ref_outputs)
+        else:
+            outputs_match = check_module_output(
+                new_module=new_gm,
+                refitted_module=compiled_module,
+                arg_inputs=torch_inputs,
+                kwarg_inputs=torch_kwarg_inputs,
+            )
+        if outputs_match:
             logger.info("Refitting Succeed!")
-            new_gm.to(CPU_DEVICE)
         else:
             if weight_name_map:
                 logger.warning(
@@ -608,7 +637,6 @@ def refit_module_weights(
                     in_place=in_place,
                 )
             logger.error("Refitting Failed! The outputs do not match.")
-            new_gm.to(CPU_DEVICE)
     else:
         logger.info("Refitting Completed! Output verification skipped.")
 
diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -587,6 +587,41 @@ def _save_weight_mapping(self) -> None:
                 weight_refit_map[engine_weight_name].dtype,
             ]
 
+        # Stage 3: Slice matching for unmatched non-scalar CONSTANT weights.
+        # complex_graph_detection unpacks complex buffers to real:
+        #   freqs (S,D complex64) → freqs_unpacked_complex (S,D,2 float32)
+        # The real and imag slices (freqs_unpacked_complex[...,0] and [...,1]) are
+        # embedded as separate TRT constants, but their shapes differ from the source
+        # buffer, so Stage 2 value matching fails. Here we try selecting each slice
+        # along the last dimension of every sd entry to find the match.
+        for engine_weight_name, val in list(weight_name_map.items()):
+            if not isinstance(val, list) or len(val) != 2:
+                continue
+            sd_weight_name, dtype_val = val
+            if sd_weight_name != "" or engine_weight_name not in weight_refit_map:
+                continue
+            ew_tensor = weight_refit_map[engine_weight_name].to(torch_device)
+            if ew_tensor.numel() <= 1:
+                continue  # scalars are handled via constant_mapping
+            matched = False
+            for sd_key, sd_tensor in sd.items():
+                if sd_tensor.dim() < 1 or sd_tensor.shape[-1] < 2:
+                    continue
+                last_dim = sd_tensor.dim() - 1
+                for idx in range(sd_tensor.shape[last_dim]):
+                    sd_slice = sd_tensor.select(last_dim, idx)
+                    if TRTInterpreter.check_weight_equal(
+                        sd_slice, ew_tensor, torch_device
+                    ):
+                        weight_name_map[engine_weight_name] = [
+                            (sd_key, last_dim, idx),
+                            dtype_val,
+                        ]
+                        matched = True
+                        break
+                if matched:
+                    break
+
         weight_name_map["constant_mapping"] = constant_mapping
         self.weight_name_map = weight_name_map
 
diff --git a/tests/py/dynamo/models/test_model_refit.py b/tests/py/dynamo/models/test_model_refit.py
@@ -521,6 +521,10 @@ def test_refit_one_engine_bert_with_weightmap():
     torch._dynamo.reset()
 
 
+@unittest.skipIf(
+    not importlib.util.find_spec("torchvision"),
+    "torchvision is not installed",
+)
 @unittest.skipIf(
     not torch_trt.ENABLED_FEATURES.torch_tensorrt_runtime,
     "TorchScript Frontend is not available",
@@ -577,6 +581,10 @@ def test_refit_one_engine_inline_runtime_with_weightmap(tmpdir):
     torch._dynamo.reset()
 
 
+@unittest.skipIf(
+    not importlib.util.find_spec("torchvision"),
+    "torchvision is not installed",
+)
 @unittest.skipIf(
     not torch_trt.ENABLED_FEATURES.refit,
     "Refit feature is not supported in Python 3.13 or higher",
@@ -764,6 +772,10 @@ def forward(self, x):
     torch._dynamo.reset()
 
 
+@unittest.skipIf(
+    not importlib.util.find_spec("torchvision"),
+    "torchvision is not installed",
+)
 @unittest.skipIf(
     not torch_trt.ENABLED_FEATURES.torch_tensorrt_runtime,
     "TorchScript Frontend is not available",
@@ -879,6 +891,10 @@ def test_refit_one_engine_bert_without_weightmap():
     torch._dynamo.reset()
 
 
+@unittest.skipIf(
+    not importlib.util.find_spec("torchvision"),
+    "torchvision is not installed",
+)
 @unittest.skipIf(
     not torch_trt.ENABLED_FEATURES.torch_tensorrt_runtime,
     "TorchScript Frontend is not available",
@@ -932,6 +948,10 @@ def test_refit_one_engine_inline_runtime_without_weightmap(tmpdir):
     torch._dynamo.reset()
 
 
+@unittest.skipIf(
+    not importlib.util.find_spec("torchvision"),
+    "torchvision is not installed",
+)
 @unittest.skipIf(
     not torch_trt.ENABLED_FEATURES.refit,
     "Refit feature is not supported in Python 3.13 or higher",
@@ -1107,3 +1127,220 @@ def forward(self, x):
         # Clean up model env
 
     torch._dynamo.reset()
+
+
+@unittest.skipIf(
+    not torch_trt.ENABLED_FEATURES.torch_tensorrt_runtime,
+    "TorchScript Frontend is not available",
+)
+@unittest.skipIf(
+    not torch_trt.ENABLED_FEATURES.refit,
+    "Refit feature is not supported in Python 3.13 or higher",
+)
+@pytest.mark.unit
+def test_complex_buffer_refit():
+    """Refit a model whose weights include a complex-valued buffer (e.g. RoPE freqs).
+
+    Exercises the combined complex_graph_detection + refit_module_weights path:
+      - complex get_attr buffer is unpacked to real by the lowering pass
+      - complex placeholder input goes through view_as_real at the TRT boundary
+      - after refitting with new frequencies the output matches the new model
+    """
+
+    class ComplexFreqModel(nn.Module):
+        def __init__(self, freqs: torch.Tensor):
+            super().__init__()
+            self.register_buffer("freqs", freqs.cuda())
+
+        def forward(self, z: torch.Tensor) -> torch.Tensor:
+            # complex mul then expose as real so TRT can produce a real output
+            return torch.view_as_real(z * self.freqs)
+
+    SEQ, DIM = 8, 32
+
+    def make_freqs() -> torch.Tensor:
+        angles = torch.rand(SEQ, DIM // 2)
+        return torch.polar(torch.ones_like(angles), angles).cuda()
+
+    freqs1 = make_freqs()
+    freqs2 = make_freqs()
+
+    model1 = ComplexFreqModel(freqs1).eval()
+    model2 = ComplexFreqModel(freqs2).eval()
+
+    z = torch.randn(SEQ, DIM // 2, dtype=torch.complex64).cuda()
+    inputs = [z]
+
+    exp_program1 = torch.export.export(model1, tuple(inputs))
+    exp_program2 = torch.export.export(model2, tuple(inputs))
+
+    trt_gm = torchtrt.dynamo.compile(
+        exp_program1,
+        tuple(inputs),
+        use_python_runtime=True,
+        enabled_precisions={torch.float},
+        min_block_size=1,
+        immutable_weights=False,
+    )
+
+    new_trt_gm = refit_module_weights(
+        compiled_module=trt_gm,
+        new_weight_module=exp_program2,
+        arg_inputs=inputs,
+        use_weight_map_cache=True,
+        verify_output=True,
+    )
+
+    expected_output = exp_program2.module()(*inputs)
+    refitted_output = new_trt_gm(*inputs)
+
+    assertions.assertTrue(
+        torch.allclose(expected_output, refitted_output, atol=1e-2, rtol=1e-2),
+        "Refit with complex buffer failed: output mismatch after refit",
+    )
+
+    torch._dynamo.reset()
+
+
+@unittest.skipIf(
+    not torch_trt.ENABLED_FEATURES.torch_tensorrt_runtime,
+    "TorchScript Frontend is not available",
+)
+@unittest.skipIf(
+    not torch_trt.ENABLED_FEATURES.refit,
+    "Refit feature is not supported in Python 3.13 or higher",
+)
+@pytest.mark.unit
+def test_complex_buffer_with_real_param_refit():
+    """Refit a model that mixes a complex buffer with a real nn.Linear weight.
+
+    Verifies that Stage 3 slice-matching for complex buffer constants coexists
+    correctly with ordinary weight-name-map entries for real parameters.
+    After refitting both the frequencies and the projection matrix, the output
+    should match the new model within the test tolerance (atol=rtol=1e-2).
+    """
+
+    SEQ, DIM = 8, 32
+
+    class ComplexRotateAndProject(nn.Module):
+        def __init__(self, freqs: torch.Tensor):
+            super().__init__()
+            self.register_buffer("freqs", freqs.cuda())
+            self.proj = nn.Linear(DIM, DIM, bias=False)
+
+        def forward(self, z: torch.Tensor) -> torch.Tensor:
+            rotated = z * self.freqs  # complex mul, (SEQ, DIM//2)
+            r = torch.view_as_real(rotated)  # (SEQ, DIM//2, 2)
+            flat = r.reshape(z.shape[0], -1)  # (SEQ, DIM)
+            return self.proj(flat)  # (SEQ, DIM) real output
+
+    def make_freqs() -> torch.Tensor:
+        angles = torch.rand(SEQ, DIM // 2)
+        return torch.polar(torch.ones_like(angles), angles).cuda()
+
+    model1 = ComplexRotateAndProject(make_freqs()).eval().cuda()
+    model2 = ComplexRotateAndProject(make_freqs()).eval().cuda()
+
+    z = torch.randn(SEQ, DIM // 2, dtype=torch.complex64).cuda()
+    inputs = [z]
+
+    exp_program1 = torch.export.export(model1, tuple(inputs))
+    exp_program2 = torch.export.export(model2, tuple(inputs))
+
+    trt_gm = torchtrt.dynamo.compile(
+        exp_program1,
+        tuple(inputs),
+        use_python_runtime=True,
+        enabled_precisions={torch.float},
+        min_block_size=1,
+        immutable_weights=False,
+    )
+
+    new_trt_gm = refit_module_weights(
+        compiled_module=trt_gm,
+        new_weight_module=exp_program2,
+        arg_inputs=inputs,
+        use_weight_map_cache=True,
+        verify_output=True,
+    )
+
+    expected_output = exp_program2.module()(*inputs)
+    refitted_output = new_trt_gm(*inputs)
+
+    assertions.assertTrue(
+        torch.allclose(expected_output, refitted_output, atol=1e-2, rtol=1e-2),
+        "Refit with complex buffer + real param failed: output mismatch",
+    )
+
+    torch._dynamo.reset()
+
+
+@unittest.skipIf(
+    not torch_trt.ENABLED_FEATURES.torch_tensorrt_runtime,
+    "TorchScript Frontend is not available",
+)
+@unittest.skipIf(
+    not torch_trt.ENABLED_FEATURES.refit,
+    "Refit feature is not supported in Python 3.13 or higher",
+)
+@pytest.mark.unit
+def test_dual_complex_buffer_refit():
+    """Refit a model with two independent complex buffers.
+
+    Ensures Stage 3 value-based matching correctly distinguishes the real and
+    imaginary slices of freqs_a from those of freqs_b when both are unpacked to
+    separate _unpacked_complex state-dict entries with the same shape.
+    """
+
+    SEQ, DIM = 8, 32
+
+    class DualComplexFreqModel(nn.Module):
+        def __init__(self, freqs_a: torch.Tensor, freqs_b: torch.Tensor):
+            super().__init__()
+            self.register_buffer("freqs_a", freqs_a.cuda())
+            self.register_buffer("freqs_b", freqs_b.cuda())
+
+        def forward(self, z: torch.Tensor) -> torch.Tensor:
+            ra = torch.view_as_real(z * self.freqs_a)  # (SEQ, DIM//2, 2)
+            rb = torch.view_as_real(z * self.freqs_b)  # (SEQ, DIM//2, 2)
+            return ra + rb  # real output
+
+    def make_freqs() -> torch.Tensor:
+        angles = torch.rand(SEQ, DIM // 2)
+        return torch.polar(torch.ones_like(angles), angles).cuda()
+
+    model1 = DualComplexFreqModel(make_freqs(), make_freqs()).eval()
+    model2 = DualComplexFreqModel(make_freqs(), make_freqs()).eval()
+
+    z = torch.randn(SEQ, DIM // 2, dtype=torch.complex64).cuda()
+    inputs = [z]
+
+    exp_program1 = torch.export.export(model1, tuple(inputs))
+    exp_program2 = torch.export.export(model2, tuple(inputs))
+
+    trt_gm = torchtrt.dynamo.compile(
+        exp_program1,
+        tuple(inputs),
+        use_python_runtime=True,
+        enabled_precisions={torch.float},
+        min_block_size=1,
+        immutable_weights=False,
+    )
+
+    new_trt_gm = refit_module_weights(
+        compiled_module=trt_gm,
+        new_weight_module=exp_program2,
+        arg_inputs=inputs,
+        use_weight_map_cache=True,
+        verify_output=True,
+    )
+
+    expected_output = exp_program2.module()(*inputs)
+    refitted_output = new_trt_gm(*inputs)
+
+    assertions.assertTrue(
+        torch.allclose(expected_output, refitted_output, atol=1e-2, rtol=1e-2),
+        "Refit with dual complex buffers failed: output mismatch",
+    )
+
+    torch._dynamo.reset()