pytorch · Xia-Weiwen · Apr 25, 2025 · Apr 25, 2025 · Apr 25, 2025 · Apr 28, 2025
diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
@@ -29,6 +29,7 @@
     AffineQuantizedTensor,
     Int4CPULayout,
     Int4XPULayout,
+    Int8DynamicActInt4WeightCPULayout,
     PlainLayout,
     QDQLayout,
     TensorCoreTiledLayout,
@@ -875,6 +876,43 @@ def test_int4wo_cpu(self, dtype, x_dim, use_hqq):
             assert "_weight_int4pack_mm_for_cpu" in code[0]
             assert "aten.mm.default" not in code[0]
 
+    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_6, "Test only enabled for 2.6+")
+    @common_utils.parametrize("dtype", [torch.float, torch.bfloat16, torch.half])
+    @common_utils.parametrize("x_dim", [2, 3])
+    def test_8da4w_cpu(self, dtype, x_dim):
+        device = "cpu"
+        m = ToyLinearModel().eval().to(dtype).to(device)
+        m2 = copy.deepcopy(m)
+        example_inputs = m.example_inputs(dtype=dtype, device=device)
+        if x_dim == 3:
+            example_inputs = (example_inputs[0].unsqueeze(0),)
+
+        with torch.no_grad():
+            # Currently, the difference between Int8DynamicActInt4WeightCPULayout and PlainLayout
+            # is that the former packs two int4 weights into one int8, while the latter does not.
+            quantize_(
+                m,
+                int8_dynamic_activation_int4_weight(
+                    group_size=32, layout=Int8DynamicActInt4WeightCPULayout()
+                ),
+            )
+            y, code = torch._inductor.utils.run_and_get_code(
+                torch.compile(m, fullgraph=True, dynamic=True),
+                *example_inputs,
+            )
+            # ensure the expected op is in the code
+            assert "shift" in code[0]  # unpacking int4 values
+            assert "extern_kernels.mm" in code[0]
+            quantize_(
+                m2,
+                int8_dynamic_activation_int4_weight(
+                    group_size=32, layout=PlainLayout()
+                ),
+            )
+            torch._dynamo.reset()  # may segfault without this
+            y2 = torch.compile(m2, fullgraph=True, dynamic=True)(*example_inputs)
+            assert torch.allclose(y, y2)
+
     # TODO(#1690): move to new config names
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")

diff --git a/torchao/dtypes/__init__.py b/torchao/dtypes/__init__.py
@@ -18,6 +18,7 @@
     CutlassInt4PackedLayout,
     Int4CPULayout,
     Int4XPULayout,
+    Int8DynamicActInt4WeightCPULayout,
     MarlinQQQLayout,
     MarlinQQQTensor,
     MarlinSparseLayout,
@@ -61,4 +62,5 @@
     "PackedLinearInt8DynamicActivationIntxWeightLayout",
     "to_affine_quantized_packed_linear_int8_dynamic_activation_intx_weight",
     "Int4XPULayout",
+    "Int8DynamicActInt4WeightCPULayout",
 ]
diff --git a/torchao/dtypes/uintx/__init__.py b/torchao/dtypes/uintx/__init__.py
@@ -6,6 +6,7 @@
 )
 from .int4_cpu_layout import (
     Int4CPULayout,
+    Int8DynamicActInt4WeightCPULayout,
 )
 from .int4_xpu_layout import (
     Int4XPULayout,
@@ -48,4 +49,5 @@
     "PackedLinearInt8DynamicActivationIntxWeightLayout",
     "QDQLayout",
     "Int4XPULayout",
+    "Int8DynamicActInt4WeightCPULayout",
 ]
diff --git a/torchao/dtypes/uintx/int4_cpu_layout.py b/torchao/dtypes/uintx/int4_cpu_layout.py
@@ -147,7 +147,7 @@ def to(self, *args, **kwargs):
         device = kwargs["device"]
         if not is_device(torch.device(self.device).type, device):
             raise ValueError(
-                f"Int4CPUAQTTensorImpl does not support conversion from {self.device} to {device}"
+                f"{self.__class__.__name__} does not support conversion from {self.device} to {device}"
             )
         return self.__class__(
             self.packed_weight.to(device),
@@ -214,11 +214,11 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
                 return return_and_correct_aliasing(func, args, kwargs, sliced)
             else:
                 raise NotImplementedError(
-                    f"Int4CPUAQTTensorImpl dispatch: attempting to run {func}, with dim={dim}, that is not supported"
+                    f"{cls.__name__} dispatch: attempting to run {func}, with dim={dim}, that is not supported"
                 )
 
         raise NotImplementedError(
-            f"Int4CPUAQTTensorImpl dispatch: attempting to run {func}, this is not supported"
+            f"{cls.__name__} dispatch: attempting to run {func}, this is not supported"
         )
 
     __torch_function__ = torch._C._disabled_torch_function_impl
@@ -352,3 +352,134 @@ def _linear_fp_act_uint4_weight_cpu_impl(input_tensor, weight_tensor, bias):
     if bias is not None:
         y += bias
     return y.to(orig_dtype)
+
+
+@dataclass(frozen=True)
+class Int8DynamicActInt4WeightCPULayout(Layout):
+    """Layout class for da8w4 CPU layout for affine quantized tensor"""
+
+    pass
+
+
+@register_layout(Int8DynamicActInt4WeightCPULayout)
+class DA8W4CPUAQTTensorImpl(Int4CPUAQTTensorImpl):
+    """TensorImpl for da8w4 CPU layout for affine quantized tensor
+    It stores the original tensor of dimension [n][k] (int32 dtype) as packed weight of 2-d tensor of
+    dimension: [n][k / 2] (uint8 dtype)
+    It is similar to Int4CPUAQTTensorImpl but with a different memory layout of weight data
+    fields:
+      packed_weight (torch.Tensor): the 2-d packed tensor in a Int4 CPU layout
+      scales (torch.Tensor): the scales Tensor used to map between floating point tensor to quantized tensor
+      qzeros (torch.Tensor): the zero_point Tensor used to map between floating point tensor to quantized tensor
+    """
+
+    def __new__(
+        cls,
+        packed_weight: torch.Tensor,
+        scales: torch.Tensor,
+        qzeros: torch.Tensor,
+        transposed: bool,
+        _layout: Layout,
+    ):
+        kwargs = {}
+        kwargs["device"] = packed_weight.device
+        kwargs["layout"] = (
+            kwargs.get("layout")
+            if kwargs.get("layout", False)
+            else packed_weight.layout
+        )
+        kwargs["dtype"] = packed_weight.dtype
+        kwargs["requires_grad"] = False
+        shape = packed_weight.shape
+        return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs)  # type: ignore[attr-defined]
+
+    def __init__(
+        self,
+        packed_weight: torch.Tensor,
+        scales: torch.Tensor,
+        qzeros: torch.Tensor,
+        transposed: bool,
+        _layout: Layout,
+    ):
+        self.packed_weight = packed_weight
+        self.scales = scales
+        self.qzeros = qzeros
+        self.transposed = transposed
+        self._layout = _layout
+
+    def __tensor_flatten__(self):
+        return ["packed_weight", "scales", "qzeros"], [self.transposed, self._layout]
+
+    @classmethod
+    def __tensor_unflatten__(
+        cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride
+    ):
+        packed_weight, scales, qzeros = (
+            tensor_data_dict["packed_weight"],
+            tensor_data_dict["scales"],
+            tensor_data_dict["qzeros"],
+        )
+        (
+            transposed,
+            _layout,
+        ) = tensor_attributes
+        return cls(packed_weight, scales, qzeros, transposed, _layout)
+
+    @classmethod
+    def from_plain(
+        cls,
+        int_data: torch.Tensor,
+        scale: torch.Tensor,
+        zero_point: Optional[torch.Tensor],
+        _layout: Layout,
+    ):
+        assert isinstance(_layout, Int8DynamicActInt4WeightCPULayout)
+        assert int_data.dtype == torch.int8, "DA8W4 CPU: expects int8 weight"
+        assert int_data.shape[1] % 2 == 0, "DA8W4 CPU: expects even number of columns"
+        weight_int4 = ((int_data[..., 1::2] & 0xF) << 4) | (int_data[..., 0::2] & 0xF)
+        return cls(weight_int4, scale, zero_point, False, _layout)
+
+    def _apply_fn_to_data(self, fn):
+        return self.__class__(
+            fn(self.packed_weight),
+            fn(self.scales),
+            fn(self.qzeros),
+            self.transposed,
+            self._layout,
+        )
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args, kwargs):
+        kwargs = {} if kwargs is None else kwargs
+        if func is aten.t.default:
+            """we don't need to repack the weight and just rely on external
+            shape being changed and record the status of transpose/no-transpose
+            """
+            transposed = DA8W4CPUAQTTensorImpl(
+                args[0].packed_weight,
+                args[0].scales,
+                args[0].qzeros,
+                not args[0].transposed,
+                args[0]._layout,
+            )
+            return return_and_correct_aliasing(func, args, kwargs, transposed)
+        else:
+            return super().__torch_dispatch__(func, types, args, kwargs)
+
+    __torch_function__ = torch._C._disabled_torch_function_impl
+
+    @property
+    def block_size(self):
+        assert len(self.packed_weight.shape) == 2
+        weight_shape = self.packed_weight.shape
+        N = weight_shape[0]
+        K = weight_shape[1] * 2
+        groups = self.scales.numel() // N
+        group_size = K // groups
+        return (1, group_size)
+
+    def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        plain_weight = torch.stack(
+            ((self.packed_weight << 4) >> 4, self.packed_weight >> 4), dim=-1
+        ).view(self.packed_weight.shape[:-1] + (2 * self.packed_weight.shape[-1],))
+        return plain_weight, self.scales, self.qzeros