Enable AWQ on Intel GPU. #2248

Open · wants to merge 7 commits into base: main
Changes from 2 commits

4 changes: 4 additions & 0 deletions test/quantization/test_quant_primitives.py
@@ -135,6 +135,8 @@ def _groupwise_affine_quantize_tensor_from_qparams(
if TORCH_VERSION_AT_LEAST_2_5:
if (not (check_cpu_version(w.device))) and (not (check_xpu_version(w.device))):
w_int4x8 = (w_int4x8[::, ::2] << 4 | w_int4x8[::, 1::2]).to(torch.uint8)
if (check_xpu_version(w.device)):
w_int4x8 = (w_int4x8[::, 1::2] << 4 | w_int4x8[::, ::2]).to(torch.uint8)

return w_int4x8

@@ -730,6 +732,8 @@ def test_groupwise_affine_dequantize_tensor_from_qparams(self):
not (check_xpu_version(input.device))
):
input_tmp = (input[::, ::2] << 4 | input[::, 1::2]).to(torch.uint8)
if (check_xpu_version(input.device)):
input_tmp = (input[::, 1::2] << 4 | input[::, ::2]).to(torch.uint8)
w_bf16 = groupwise_affine_dequantize_tensor_from_qparams(
input_tmp, scales, zeros, n_bit, groupsize, zero_point_domain
)
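Note: the two hunks above encode the nibble order each backend expects: the CUDA/CPU packing puts the even column of every int4 pair into the high nibble, while the XPU packing puts the odd column there. A self-contained round trip of both orders, for illustration only (not part of the test suite):

import torch

w = torch.randint(0, 16, (2, 8), dtype=torch.int32)  # toy int4 values stored as int32

# CUDA/CPU order: even columns become the high nibble.
packed_cuda = (w[::, ::2] << 4 | w[::, 1::2]).to(torch.uint8)
# XPU order: odd columns become the high nibble.
packed_xpu = (w[::, 1::2] << 4 | w[::, ::2]).to(torch.uint8)

# Unpacking must mirror the packing order to recover the original columns.
assert torch.equal(packed_cuda >> 4, w[::, ::2].to(torch.uint8))
assert torch.equal(packed_cuda & 0xF, w[::, 1::2].to(torch.uint8))
assert torch.equal(packed_xpu >> 4, w[::, 1::2].to(torch.uint8))
assert torch.equal(packed_xpu & 0xF, w[::, ::2].to(torch.uint8))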
9 changes: 5 additions & 4 deletions torchao/dtypes/uintx/int4_xpu_layout.py
@@ -246,14 +246,15 @@ def from_plain(
):
assert isinstance(_layout, Int4XPULayout)

- from torchao.quantization.utils import convert_weight_to_int4pack_xpu
-
if TORCH_VERSION_AT_LEAST_2_8:
assert int_data.dtype == torch.int32, (
"torch.ops.aten._convert_weight_to_int4pack_for_cpu expects `int32` dtype"
)
- packed_weight = convert_weight_to_int4pack_xpu(
- int_data, zero_point.dtype != scale.dtype
+ packed_weight = (int_data[::, 1::2] << 4 | int_data[::, ::2]).to(
+ torch.uint8
)
+ packed_weight = torch.ops.aten._convert_weight_to_int4pack(
+ packed_weight.contiguous(), 8
+ )
else:
assert False, "INT4 not supported on XPU until 2.8"
16 changes: 12 additions & 4 deletions torchao/prototype/awq/api.py
@@ -5,6 +5,7 @@
# LICENSE file in the root directory of this source tree.
import types
from dataclasses import dataclass
from typing import Any, Callable, Dict, Optional, Tuple, Union
Contributor (suggested change):
- from typing import Any, Callable, Dict, Optional, Tuple, Union
+ from typing import Optional

Author: done


import torch

@@ -13,6 +14,7 @@
from torchao.dtypes import (
TensorCoreTiledLayout,
to_affine_quantized_intx,
Int4XPULayout,
)
from torchao.dtypes.uintx.uintx_layout import _DTYPE_TO_BIT_WIDTH, UintxLayout
from torchao.quantization import to_weight_tensor_with_linear_activation_scale_metadata
@@ -114,6 +116,7 @@ class AWQUIntXConfig(AOBaseConfig):
group_size: int = 64
use_hqq: bool = False
set_inductor_config: bool = True
zero_point_domain: Optional[ZeroPointDomain] = ZeroPointDomain.FLOAT
Contributor: Can this be removed if we have layout?

Contributor: Yes, I agree. Following the logic of #2149, preserve_zero and zero_point_domain are too complex to expose in the user-facing UX; it is better to let the layout decide the zero_point_domain information.

Author: Yes, modified. Done.
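For illustration, one way the layout-driven approach described above could look. The helper name and the domain chosen for each layout are assumptions for this sketch, not the code adopted in the PR:

from torchao.dtypes import Int4XPULayout, TensorCoreTiledLayout
from torchao.quantization.quant_primitives import ZeroPointDomain


def _zero_point_domain_for_layout(layout) -> ZeroPointDomain:
    # Hypothetical helper: derive the zero-point domain from the layout so the
    # user-facing config does not need a separate zero_point_domain knob.
    if isinstance(layout, Int4XPULayout):
        return ZeroPointDomain.INT  # assumed default for the XPU int4 kernel
    if isinstance(layout, TensorCoreTiledLayout):
        return ZeroPointDomain.FLOAT  # tinygemm-style kernels use float zero points
    return ZeroPointDomain.INT  # common default elsewhere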



# for bc
@@ -135,16 +138,21 @@ def _awq_uintx_transform(
assert quant_dtype in _DTYPE_TO_BIT_WIDTH or quant_dtype == torch.uint8, (
"Invalid quant_dtype. Please use torch.uint1 .. torch.uint8"
)


device = observed_linear.weight.device
equalization_scale = observed_linear.act_obs.calculate_qparams()
# AQT config
if quant_dtype == torch.uint4:
target_dtype = torch.int32
eps = 1e-6
preserve_zero = False
- zero_point_dtype = torch.bfloat16
- zero_point_domain = ZeroPointDomain.FLOAT
- _layout = TensorCoreTiledLayout(inner_k_tiles=8)
+ zero_point_dtype = torch.bfloat16 if config.zero_point_domain != ZeroPointDomain.INT else torch.int8
+ zero_point_domain = config.zero_point_domain
+
+ if "xpu" in device.type:
+ _layout = Int4XPULayout()
+ else:
+ _layout = TensorCoreTiledLayout(inner_k_tiles=8)
Contributor: Can the layout be explicitly passed in instead of being inferred from the device?

Contributor: I think that should be OK. We should follow Int4WeightOnlyConfig and let the user specify the layout information.

Author: Yes, modified. Done.
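A sketch of what an explicitly passed layout could look like on the config, mirroring Int4WeightOnlyConfig. The class name, field types, and defaults below are illustrative assumptions rather than the final API:

from dataclasses import dataclass, field

import torch

from torchao.dtypes import Int4XPULayout, TensorCoreTiledLayout


@dataclass
class AWQUIntXConfigSketch:
    quant_dtype: torch.dtype = torch.uint4
    group_size: int = 64
    use_hqq: bool = False
    set_inductor_config: bool = True
    # The caller picks the packing layout directly instead of having it
    # inferred from the device inside the transform.
    layout: object = field(default_factory=lambda: TensorCoreTiledLayout(inner_k_tiles=8))


# On an Intel GPU the caller would pass the XPU layout explicitly:
xpu_config = AWQUIntXConfigSketch(layout=Int4XPULayout())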

else:
target_dtype = torch.uint8
eps = torch.finfo(torch.float32).eps
28 changes: 26 additions & 2 deletions torchao/prototype/awq/example.py
@@ -13,6 +13,13 @@

from torchao.prototype.awq import AWQObservedLinear, awq_uintx, insert_awq_observer_
from torchao.quantization import int4_weight_only, quantize_
from torchao.quantization.quant_primitives import (
ZeroPointDomain,
)
from torchao.dtypes import Int4XPULayout


zero_point_domain_dict = {"float":ZeroPointDomain.FLOAT, "int":ZeroPointDomain.INT, "none":ZeroPointDomain.NONE}
Contributor: FYI, we used to use this to distinguish between different types of kernels, but now we keep the integer zero point and preserve_zero as the common default path and split out the other q/dq ops for specific kernels like tinygemm: #2149

I think these are just different ways to implement things, and we don't necessarily need categorizations like zero_point_domain and preserve_zero, since they can complicate the UX.

Author: Yes, modified. Done.
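For reference, a compact sketch of how the CLI choice is wired through this mapping into the AWQ config. It mirrors the quantize_ call in the hunk below, assumes the zero_point_domain argument that awq_uintx accepts at this commit of the PR, and hard-codes example values:

import torch

from torchao.prototype.awq import awq_uintx
from torchao.quantization.quant_primitives import ZeroPointDomain

zero_point_domain_dict = {
    "float": ZeroPointDomain.FLOAT,
    "int": ZeroPointDomain.INT,
    "none": ZeroPointDomain.NONE,
}

# e.g. the user ran the example with --zero_point_domin int on an XPU machine
awq_config = awq_uintx(
    quant_dtype=torch.uint4,
    group_size=64,
    use_hqq=False,
    zero_point_domain=zero_point_domain_dict["int"],
)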



# adapted from: https://github.com/mit-han-lab/llm-awq/blob/main/awq/entry.py#L255
@@ -71,6 +78,8 @@ def wiki2_eval(
log_likelihood = model(input_ids, labels=target_ids).loss * trg_len
if device.startswith("cuda"):
torch.cuda.synchronize()
if device.startswith("xpu"):
torch.xpu.synchronize()
t2 = time.time()
t.append((t2 - t1))
lls.append(log_likelihood)
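Because CUDA and XPU kernels execute asynchronously, the added synchronize call is what makes the wall-clock measurement meaningful. A minimal version of the same timing pattern, with an illustrative helper name (torch.xpu.synchronize requires a PyTorch build with XPU support):

import time

import torch


def timed_forward(fn, device: str):
    t1 = time.time()
    out = fn()
    # Block until queued device work finishes so t2 - t1 reflects real latency.
    if device.startswith("cuda"):
        torch.cuda.synchronize()
    elif device.startswith("xpu"):
        torch.xpu.synchronize()
    t2 = time.time()
    return out, t2 - t1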
@@ -190,6 +199,7 @@ def wikitext2_ppl(
precision: torch.dtype,
sequence_length: int,
compile: bool,
zero_point_domin: str,
model_save_path: str,
):
print(f"Loading model on {device}...")
@@ -231,8 +241,9 @@
t0 = time.time()
quantize_(
model,
- awq_uintx(quant_dtype=quant_dtype, group_size=group_size, use_hqq=use_hqq),
+ awq_uintx(quant_dtype=quant_dtype, group_size=group_size, use_hqq=use_hqq, zero_point_domain=zero_point_domain_dict[zero_point_domin]),
is_observed_linear,
+ torch.device(device),
)
print(f"time for quantization: {time.time() - t0:.02f} seconds")
if model_save_path is not None:
@@ -242,10 +253,15 @@
group_size = int(quant.split("-")[1])
use_hqq = "hqq" in quant
print(f"running {quant} quantization with group size {group_size}")
- quantize_(model, int4_weight_only(group_size=group_size, use_hqq=use_hqq))
+ int4_weight_only_config = int4_weight_only(group_size=group_size, use_hqq=use_hqq)
+ if "xpu" in device:
+ int4_weight_only_config.layout = Int4XPULayout()
+ int4_weight_only_config.zero_point_domain = zero_point_domain_dict[zero_point_domin]
+ quantize_(model, int4_weight_only_config)
if compile:
model = torch.compile(model)

+ print("model:", model)
return benchmark(model, tokenizer, sequence_length, tasks=tasks, device=device)


@@ -299,6 +315,13 @@ def wikitext2_ppl(
action="store_true",
help="Flag to indicate if compilation is required.",
)
parser.add_argument(
"--zero_point_domin",
type=str,
default="float",
choices=['float', 'int', 'none'],
help="Zero point type. Default is 'float'.",
)
parser.add_argument(
"--model_save_path",
type=str,
@@ -320,6 +343,7 @@
args.precision,
args.seq_len,
args.compile,
args.zero_point_domin,
args.model_save_path,
)

7 changes: 0 additions & 7 deletions torchao/quantization/subclass.py
@@ -697,13 +697,6 @@ def to_qtensor_components(
int_data = aten._convert_weight_to_int4pack_for_cpu(
input_int4x8, inner_k_tiles
)
- if check_xpu_version(input_float.device):
- from torchao.quantization.utils import convert_weight_to_int4pack_xpu
-
- int_data = convert_weight_to_int4pack_xpu(
- input_int4x8,
- zero_point_domain_is_int=zero_point_domain == ZeroPointDomain.INT,
- )
else:
int_data = aten._convert_weight_to_int4pack(input_int4x8, inner_k_tiles)
return int_data, scales_and_zeros, False, groupsize, inner_k_tiles
36 changes: 14 additions & 22 deletions torchao/quantization/utils.py
@@ -127,6 +127,11 @@ def cuda(self):
val.cuda() if isinstance(val, torch.Tensor) else val for val in self.values
]

def xpu(self):
self.values = [
val.xpu() if isinstance(val, torch.Tensor) else val for val in self.values
]


def guard_dtype_size(tensor_arg, arg_name, dtype=None, size=None):
if dtype is not None and tensor_arg.dtype != dtype:
@@ -415,25 +420,6 @@ def unpack_tinygemm_scales_and_zeros(scales_and_zeros):
return torch.split(scales_and_zeros.transpose(-3, -2), 1, -1)


- def convert_weight_to_int4pack_xpu(weight, zero_point_domain_is_int=False):
- assert weight.device.type == "xpu"
-
- if zero_point_domain_is_int:
- # int_data = weight.to(dtype=torch.uint8)
- int_data = (weight[::, 1::2] << 4 | weight[::, ::2]).to(torch.uint8)
- packed_weight = torch.ops.aten._convert_weight_to_int4pack(
- int_data,
- 8, # TODO:remove
- )
- else:
- out = weight.to(dtype=torch.uint8)
- out = (out[::, 1::2] << 4 | out[::, ::2]).to(torch.uint8)
- packed_weight = out.view(torch.int32)
-
- # Second, N * K/2 uint8 -> N * K/8 int32
- return packed_weight


def groupwise_affine_quantize_tensor_from_qparams(
w, scales, zeros, n_bit=4, groupsize=128, zero_point_domain=ZeroPointDomain.FLOAT
):
@@ -473,6 +459,8 @@ def groupwise_affine_quantize_tensor_from_qparams(
not (check_xpu_version(int_data.device))
):
int_data = (int_data[::, ::2] << 4 | int_data[::, 1::2]).to(torch.uint8)
if check_xpu_version(int_data.device):
Contributor: We should probably encapsulate these better when we have a better design for layout conversions: #2249 (see the sketch after this hunk).

int_data = (int_data[::, 1::2] << 4 | int_data[::, ::2]).to(torch.uint8)
return int_data
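Responding to the encapsulation comment above: a hypothetical helper (not part of this PR or #2249) that keeps the device-dependent nibble order in one place:

import torch


def pack_int4_rows(int_data: torch.Tensor, device_type: str) -> torch.Tensor:
    # Hypothetical helper: a single place that knows which column of each int4
    # pair goes into the high nibble, instead of repeating the bit-twiddling
    # at every call site.
    if device_type == "xpu":
        return (int_data[::, 1::2] << 4 | int_data[::, ::2]).to(torch.uint8)
    return (int_data[::, ::2] << 4 | int_data[::, 1::2]).to(torch.uint8)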


@@ -491,7 +479,6 @@ def groupwise_affine_dequantize_tensor_from_qparams(
TORCH_VERSION_AT_LEAST_2_5
and (w_int4x8.dtype == torch.uint8 or w_int4x8.shape[-1] > 1)
and not (check_cpu_version(w_int4x8.device))
- and not (check_xpu_version(w_int4x8.device))
):
data = w_int4x8.to(torch.int32)
high_bits = data >> 4
@@ -501,8 +488,13 @@
dtype=torch.int32,
device=w_int4x8.device,
)
- w_int32[::, ::2] = high_bits
- w_int32[::, 1::2] = low_bits
+ if (not (check_xpu_version(w_int4x8.device))
+ ):
+ w_int32[::, ::2] = high_bits
+ w_int32[::, 1::2] = low_bits
+ else:
+ w_int32[::, ::2] = low_bits
+ w_int32[::, 1::2] = high_bits
else:
w_int32 = w_int4x8
