**Summary:** We have a guard preventing users from using a
cuda-quantized model on cpu and vice versa. However, this also
blocks users who load their checkpoints on cpu first and
then move them to cuda later, which is what torchtune does:
```python
quantize_(model.cuda(), Int4WeightOnlyConfig())
# save checkpoint in cuda
torch.save(model.state_dict(), "my_checkpoint.pt")
# load checkpoint on cpu
# This is what torchtune does: https://github.com/pytorch/torchtune/blob/v0.6.1/torchtune/training/checkpointing/_utils.py#L253
sd = torch.load("my_checkpoint.pt", weights_only=False, map_location="cpu")
# move checkpoint to cuda
for k, v in sd.items():
    sd[k] = v.to("cuda")
# load state_dict in cuda
model.load_state_dict(sd, assign=True)
```
This use case is safe in that the model was quantized on
cuda and is ultimately used on cuda. This commit relaxes the
error to allow the above use case. More details here:
#1117.
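To illustrate the relaxation, here is a minimal sketch of the idea: instead of raising on *any* cross-device access, only compute ops are blocked when devices mismatch, while device-movement ops (like `.to()`) are always allowed, so a cpu-loaded checkpoint can be moved back to cuda before use. All names below (`check_device`, `MOVEMENT_OPS`) are illustrative assumptions, not torchao's actual internals.

```python
# Hypothetical sketch of the relaxed guard; not torchao's real code.
MOVEMENT_OPS = {"to", "cuda", "cpu"}  # ops that only move data between devices

def check_device(op_name: str, quantized_on: str, running_on: str) -> None:
    """Raise only when a *compute* op runs on a device other than the one
    the weights were quantized for; movement ops are always permitted."""
    if op_name in MOVEMENT_OPS:
        return  # moving a cpu-loaded checkpoint back to cuda is safe
    if quantized_on != running_on:
        raise RuntimeError(
            f"op {op_name!r} ran on {running_on}, but weights were "
            f"quantized for {quantized_on}"
        )
```

Under this sketch, the torchtune pattern above passes: loading on cpu and calling `.to("cuda")` is a movement op, and every subsequent compute op runs on cuda, matching the device the model was quantized on.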
**Test Plan:**
python test/quantization/test_quant_api.py -k test_int4wo_cuda_serialization