From 886ea478b1739c1f5d2a0798827a5877f5246d46 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Thu, 10 Jul 2025 08:46:38 +0530
Subject: [PATCH 1/3] unify the quant compile + offloading tests.

---
 tests/quantization/bnb/test_4bit.py           |  9 +----
 tests/quantization/bnb/test_mixed_int8.py     | 10 ++---
 tests/quantization/gguf/test_gguf.py          |  9 -----
 .../quantization/test_torch_compile_utils.py  | 38 +++++++++++--------
 tests/quantization/torchao/test_torchao.py    |  7 +---
 5 files changed, 30 insertions(+), 43 deletions(-)

diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index 98005cfbc810..6eb44d7be990 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -888,12 +888,7 @@ def quantization_config(self):
 
     def test_torch_compile(self):
         torch._dynamo.config.capture_dynamic_output_shape_ops = True
-        super()._test_torch_compile(quantization_config=self.quantization_config)
-
-    def test_torch_compile_with_cpu_offload(self):
-        super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)
+        super().test_torch_compile()
 
     def test_torch_compile_with_group_offload_leaf(self):
-        super()._test_torch_compile_with_group_offload_leaf(
-            quantization_config=self.quantization_config, use_stream=True
-        )
+        super()._test_torch_compile_with_group_offload_leaf(use_stream=True)
diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index f3bbc34e8b2c..3dd1f2244a7e 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -849,15 +849,11 @@ def quantization_config(self):
 
     def test_torch_compile(self):
         torch._dynamo.config.capture_dynamic_output_shape_ops = True
-        super()._test_torch_compile(quantization_config=self.quantization_config, torch_dtype=torch.float16)
+        super()._test_torch_compile(torch_dtype=torch.float16)
 
     def test_torch_compile_with_cpu_offload(self):
-        super()._test_torch_compile_with_cpu_offload(
-            quantization_config=self.quantization_config, torch_dtype=torch.float16
-        )
+        super()._test_torch_compile_with_cpu_offload(torch_dtype=torch.float16)
 
     @pytest.mark.xfail(reason="Test fails because of an offloading problem from Accelerate with confusion in hooks.")
     def test_torch_compile_with_group_offload_leaf(self):
-        super()._test_torch_compile_with_group_offload_leaf(
-            quantization_config=self.quantization_config, torch_dtype=torch.float16, use_stream=True
-        )
+        super()._test_torch_compile_with_group_offload_leaf(torch_dtype=torch.float16, use_stream=True)
diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py
index fe56f890ee8c..1228ad6cb3e4 100644
--- a/tests/quantization/gguf/test_gguf.py
+++ b/tests/quantization/gguf/test_gguf.py
@@ -662,15 +662,6 @@ class GGUFCompileTests(QuantCompileTests):
     def quantization_config(self):
         return GGUFQuantizationConfig(compute_dtype=self.torch_dtype)
 
-    def test_torch_compile(self):
-        super()._test_torch_compile(quantization_config=self.quantization_config)
-
-    def test_torch_compile_with_cpu_offload(self):
-        super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)
-
-    def test_torch_compile_with_group_offload_leaf(self):
-        super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)
-
     def _init_pipeline(self, *args, **kwargs):
         transformer = FluxTransformer2DModel.from_single_file(
             self.gguf_ckpt, quantization_config=self.quantization_config, torch_dtype=self.torch_dtype
diff --git a/tests/quantization/test_torch_compile_utils.py b/tests/quantization/test_torch_compile_utils.py
index 99bb8980ef9f..755fb57134b0 100644
--- a/tests/quantization/test_torch_compile_utils.py
+++ b/tests/quantization/test_torch_compile_utils.py
@@ -50,30 +50,29 @@ def _init_pipeline(self, quantization_config, torch_dtype):
         )
         return pipe
 
-    def _test_torch_compile(self, quantization_config, torch_dtype=torch.bfloat16):
-        pipe = self._init_pipeline(quantization_config, torch_dtype).to("cuda")
-        # import to ensure fullgraph True
+    def _test_torch_compile(self, torch_dtype=torch.bfloat16):
+        pipe = self._init_pipeline(self.quantization_config, torch_dtype).to("cuda")
+        # `fullgraph=True` ensures no graph breaks
         pipe.transformer.compile(fullgraph=True)
 
-        for _ in range(2):
-            # small resolutions to ensure speedy execution.
-            pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
+        with torch._dynamo.config.patch(error_on_recompile=True):
+            for _ in range(2):
+                # small resolutions to ensure speedy execution.
+                pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)
 
-    def _test_torch_compile_with_cpu_offload(self, quantization_config, torch_dtype=torch.bfloat16):
-        pipe = self._init_pipeline(quantization_config, torch_dtype)
+    def _test_torch_compile_with_cpu_offload(self, torch_dtype=torch.bfloat16):
+        pipe = self._init_pipeline(self.quantization_config, torch_dtype)
         pipe.enable_model_cpu_offload()
         pipe.transformer.compile()
 
         for _ in range(2):
             # small resolutions to ensure speedy execution.
-            pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
+            pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)
 
-    def _test_torch_compile_with_group_offload_leaf(
-        self, quantization_config, torch_dtype=torch.bfloat16, *, use_stream: bool = False
-    ):
-        torch._dynamo.config.cache_size_limit = 10000
+    def _test_torch_compile_with_group_offload_leaf(self, torch_dtype=torch.bfloat16, *, use_stream: bool = False):
+        torch._dynamo.config.cache_size_limit = 1000
 
-        pipe = self._init_pipeline(quantization_config, torch_dtype)
+        pipe = self._init_pipeline(self.quantization_config, torch_dtype)
         group_offload_kwargs = {
             "onload_device": torch.device("cuda"),
             "offload_device": torch.device("cpu"),
@@ -89,4 +88,13 @@ def _test_torch_compile_with_group_offload_leaf(
 
         for _ in range(2):
             # small resolutions to ensure speedy execution.
-            pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
+            pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)
+
+    def test_torch_compile(self):
+        self._test_torch_compile()
+
+    def test_torch_compile_with_cpu_offload(self):
+        self._test_torch_compile_with_cpu_offload()
+
+    def test_torch_compile_with_group_offload_leaf(self):
+        self._test_torch_compile_with_group_offload_leaf()
diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index c4cfc8eb87fb..90534ed25352 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -639,16 +639,13 @@ def quantization_config(self):
             },
         )
 
-    def test_torch_compile(self):
-        super()._test_torch_compile(quantization_config=self.quantization_config)
-
     @unittest.skip(
         "Changing the device of AQT tensor with module._apply (called from doing module.to() in accelerate) does not work "
         "when compiling."
     )
     def test_torch_compile_with_cpu_offload(self):
         # RuntimeError: _apply(): Couldn't swap Linear.weight
-        super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)
+        super()._test_torch_compile_with_cpu_offload()
 
     @unittest.skip(
         """
@@ -673,7 +670,7 @@ def test_torch_compile_with_group_offload_leaf(self):
 
         # For use_stream=True:
         # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=, types=(,), arg_types=(,), kwarg_types={}
-        super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)
+        super()._test_torch_compile_with_group_offload_leaf()
 
 
 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners

From e59f957963816689ac98d414c0772854e4889dd1 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Thu, 10 Jul 2025 12:33:04 +0530
Subject: [PATCH 2/3] fix

---
 tests/quantization/bnb/test_4bit.py            | 2 +-
 tests/quantization/bnb/test_mixed_int8.py      | 2 +-
 tests/quantization/gguf/test_gguf.py           | 2 +-
 tests/quantization/test_torch_compile_utils.py | 3 +--
 tests/quantization/torchao/test_torchao.py     | 2 +-
 5 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index 6eb44d7be990..f2381ef8f542 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -873,7 +873,7 @@ def test_fp4_double_safe(self):
 
 @require_torch_version_greater("2.7.1")
 @require_bitsandbytes_version_greater("0.45.5")
-class Bnb4BitCompileTests(QuantCompileTests):
+class Bnb4BitCompileTests(QuantCompileTests, unittest.TestCase):
     @property
     def quantization_config(self):
         return PipelineQuantizationConfig(
diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index 3dd1f2244a7e..64f56b02b0dd 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -838,7 +838,7 @@ def test_serialization_sharded(self):
 
 @require_torch_version_greater_equal("2.6.0")
 @require_bitsandbytes_version_greater("0.45.5")
-class Bnb8BitCompileTests(QuantCompileTests):
+class Bnb8BitCompileTests(QuantCompileTests, unittest.TestCase):
     @property
     def quantization_config(self):
         return PipelineQuantizationConfig(
diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py
index 1228ad6cb3e4..ba41678eaa64 100644
--- a/tests/quantization/gguf/test_gguf.py
+++ b/tests/quantization/gguf/test_gguf.py
@@ -654,7 +654,7 @@ def get_dummy_inputs(self):
 
 
 @require_torch_version_greater("2.7.1")
-class GGUFCompileTests(QuantCompileTests):
+class GGUFCompileTests(QuantCompileTests, unittest.TestCase):
     torch_dtype = torch.bfloat16
     gguf_ckpt = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"
 
diff --git a/tests/quantization/test_torch_compile_utils.py b/tests/quantization/test_torch_compile_utils.py
index 755fb57134b0..bfa89fc6ee97 100644
--- a/tests/quantization/test_torch_compile_utils.py
+++ b/tests/quantization/test_torch_compile_utils.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import gc
-import unittest
 
 import torch
 
@@ -23,7 +22,7 @@
 
 @require_torch_gpu
 @slow
-class QuantCompileTests(unittest.TestCase):
+class QuantCompileTests:
     @property
     def quantization_config(self):
         raise NotImplementedError(
diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index 90534ed25352..581ee9e7c59c 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -630,7 +630,7 @@ def test_int_a16w8_cpu(self):
 
 
 @require_torchao_version_greater_or_equal("0.7.0")
-class TorchAoCompileTest(QuantCompileTests):
+class TorchAoCompileTest(QuantCompileTests, unittest.TestCase):
     @property
     def quantization_config(self):
         return PipelineQuantizationConfig(

From 49b0f5db132028e4de131ea9ec88d6a0ae1afe73 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Thu, 10 Jul 2025 15:11:39 +0530
Subject: [PATCH 3/3] update

---
 tests/quantization/bnb/test_4bit.py           |  2 +-
 .../quantization/test_torch_compile_utils.py  | 24 +++++++++----------
 tests/quantization/torchao/test_torchao.py    |  8 +++----
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index f2381ef8f542..8e2a8515c662 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -877,7 +877,7 @@ class Bnb4BitCompileTests(QuantCompileTests, unittest.TestCase):
     @property
     def quantization_config(self):
         return PipelineQuantizationConfig(
-            quant_backend="bitsandbytes_8bit",
+            quant_backend="bitsandbytes_4bit",
             quant_kwargs={
                 "load_in_4bit": True,
                 "bnb_4bit_quant_type": "nf4",
diff --git a/tests/quantization/test_torch_compile_utils.py b/tests/quantization/test_torch_compile_utils.py
index bfa89fc6ee97..cfe2339e2b56 100644
--- a/tests/quantization/test_torch_compile_utils.py
+++ b/tests/quantization/test_torch_compile_utils.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import gc
+import inspect
 
 import torch
 
@@ -54,19 +55,16 @@ def _test_torch_compile(self, torch_dtype=torch.bfloat16):
         # `fullgraph=True` ensures no graph breaks
         pipe.transformer.compile(fullgraph=True)
 
-        with torch._dynamo.config.patch(error_on_recompile=True):
-            for _ in range(2):
-                # small resolutions to ensure speedy execution.
-                pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)
+        # small resolutions to ensure speedy execution.
+        pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)
 
     def _test_torch_compile_with_cpu_offload(self, torch_dtype=torch.bfloat16):
         pipe = self._init_pipeline(self.quantization_config, torch_dtype)
         pipe.enable_model_cpu_offload()
         pipe.transformer.compile()
 
-        for _ in range(2):
-            # small resolutions to ensure speedy execution.
-            pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)
+        # small resolutions to ensure speedy execution.
+        pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)
 
     def _test_torch_compile_with_group_offload_leaf(self, torch_dtype=torch.bfloat16, *, use_stream: bool = False):
         torch._dynamo.config.cache_size_limit = 1000
@@ -85,9 +83,8 @@ def _test_torch_compile_with_group_offload_leaf(self, torch_dtype=torch.bfloat16
             if torch.device(component.device).type == "cpu":
                 component.to("cuda")
 
-        for _ in range(2):
-            # small resolutions to ensure speedy execution.
-            pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)
+        # small resolutions to ensure speedy execution.
+        pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)
 
     def test_torch_compile(self):
         self._test_torch_compile()
@@ -95,5 +92,8 @@ def test_torch_compile(self):
     def test_torch_compile_with_cpu_offload(self):
         self._test_torch_compile_with_cpu_offload()
 
-    def test_torch_compile_with_group_offload_leaf(self):
-        self._test_torch_compile_with_group_offload_leaf()
+    def test_torch_compile_with_group_offload_leaf(self, use_stream=False):
+        for cls in inspect.getmro(self.__class__):
+            if "test_torch_compile_with_group_offload_leaf" in cls.__dict__ and cls is not QuantCompileTests:
+                return
+        self._test_torch_compile_with_group_offload_leaf(use_stream=use_stream)
diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index 581ee9e7c59c..9d09fd2f1bab 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -645,8 +645,9 @@ def quantization_config(self):
     )
     def test_torch_compile_with_cpu_offload(self):
         # RuntimeError: _apply(): Couldn't swap Linear.weight
-        super()._test_torch_compile_with_cpu_offload()
+        super().test_torch_compile_with_cpu_offload()
 
+    @parameterized.expand([False, True])
     @unittest.skip(
         """
         For `use_stream=False`:
@@ -656,8 +657,7 @@ def test_torch_compile_with_cpu_offload(self):
         Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO.
         """
     )
-    @parameterized.expand([False, True])
-    def test_torch_compile_with_group_offload_leaf(self):
+    def test_torch_compile_with_group_offload_leaf(self, use_stream):
         # For use_stream=False:
         # If we run group offloading without compilation, we will see:
         # RuntimeError: Attempted to set the storage of a tensor on device "cpu" to a storage on different device "cuda:0". This is no longer allowed; the devices must match.
@@ -670,7 +670,7 @@ def test_torch_compile_with_group_offload_leaf(self):
 
         # For use_stream=True:
        # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=, types=(,), arg_types=(,), kwarg_types={}
-        super()._test_torch_compile_with_group_offload_leaf()
+        super()._test_torch_compile_with_group_offload_leaf(use_stream=use_stream)
 
 
 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners