
Commit ea628af

Port a few commits from release/2.7 to release/2.8 (#3769)
* WOQ: reduce memory usage for DeepSeek R1 DA8W8 with TP (#3645)
* WOQ: fix phi3 issue with latest DeepSpeed and TP=6 (#3655)
* LLM BKC: Always set config.use_cache to true (#3657)

Co-authored-by: Chunyuan WU <[email protected]>
1 parent 6e23534 · commit ea628af

File tree: 5 files changed, +209 additions, −25 deletions

csrc/cpu/aten/kernels/WoqUtilKrnl.cpp

Lines changed: 4 additions & 2 deletions
@@ -36,9 +36,9 @@ at::Tensor qlinear_woq_pack(
     size_t block_k,
     int64_t lowp_mode,
     int64_t weight_format) {
-  TLA_ASSERT(qw.is_contiguous(), "qw must be contiguous");
   bool is_4bit_flag = is_4bit(qw_type);
   auto sizes = qw.sizes();
+  auto strides = qw.strides();
   auto N = sizes[0];
   auto K = is_4bit_flag ? sizes[1] * 2 : sizes[1];
   if (weight_format == GPTQ_WEIGHT_FORMAT) {
@@ -66,6 +66,7 @@ at::Tensor qlinear_woq_pack(
   const int Nc = N / block_n;
   const int Kc = K / block_k;
   if (is_4bit_flag) {
+    TORCH_CHECK(qw.is_contiguous(), "qw must be contiguous");
     auto result = at::empty(
         {Nc, Kc, block_k, block_n / 2}, qw.options().dtype(at::kByte));
     // Pack weight in [N,K] to [N/block_n, K/block_k, block_k, block_n]
@@ -228,7 +229,8 @@ at::Tensor qlinear_woq_pack(
     // Pack weight in [N,K] to [N/block_n, K/block_k, block_k, block_n]
     int8_t* src_data = (int8_t*)qw.data_ptr();
     int8_t* dst_data = (int8_t*)result.data_ptr();
-    auto psrc = GetVLAPtr<int8_t>(src_data, {block_n, Kc, block_k});
+    auto real_Kc = strides[0] / block_k;
+    auto psrc = GetVLAPtr<int8_t>(src_data, {block_n, real_Kc, block_k});
     auto pdst = GetVLAPtr<int8_t>(dst_data, {Kc, block_k, block_n});
     auto pack_loop =
         ThreadedLoop<3>({{Nc}, {Kc}, {0, block_n, N_GROUP_SIZE, false}}, "ABc");
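The stride-based real_Kc above is what lets the INT8 path accept a non-contiguous weight: when a tensor-parallel rank holds a slice of a larger [N, K] weight, the slice is a view whose row stride is still the parent's K, so indexing the source with Kc = K / block_k would walk the wrong rows. A minimal PyTorch sketch of the arithmetic (shapes are hypothetical, not taken from the kernel):

import torch

# Hypothetical shapes for illustration only.
N, K_full, K, block_k = 64, 256, 128, 32
full_qw = torch.randint(-128, 127, (N, K_full), dtype=torch.int8)

qw = full_qw[:, :K]                # TP shard as a view: shape [N, K], row stride K_full
print(qw.is_contiguous())          # False
print(qw.stride(0))                # 256 (K_full), not 128 (K)

Kc = K // block_k                  # logical K-blocks per row of the shard: 4
real_Kc = qw.stride(0) // block_k  # K-blocks per row in the parent layout: 8
print(Kc, real_Kc)                 # 4 8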

examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py

Lines changed: 2 additions & 2 deletions
@@ -421,6 +421,8 @@ def get_checkpoint_files(model_name_or_path):
     kv_cache_dtype = torch.float8_e5m2
     config.kv_cache_dtype = kv_cache_dtype
 
+config.use_cache = True  # For inference, it should always be True
+
 # For DeepSeek models
 if not args.ipex_weight_only_quantization and args.ipex and args.dtype == "bfloat16":
     config.use_fused_moe = True
@@ -435,8 +437,6 @@ def get_checkpoint_files(model_name_or_path):
     config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens)
 if model_type == "whisper":
     config.text_max_length = config.max_source_positions + config.max_target_positions
-if model_type == "llava":
-    config.use_cache = True
 if model_type == "jamba":
     config.use_mamba_kernels = False
 if not hasattr(config, "lm_head_generation"):

examples/cpu/llm/inference/single_instance/run_quantization.py

Lines changed: 2 additions & 0 deletions
@@ -362,6 +362,8 @@ def str_to_kwargs(s):
     args.config_file, torchscript=True, trust_remote_code=True
 )
 
+config.use_cache = True  # For inference, it should always be True
+
 # For DeepSeek models
 if args.ipex_weight_only_quantization and args.weight_dtype == "INT8":
     config.use_fused_moe = True
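Both hunks above apply the same BKC from #3657: force use_cache = True on the config before the model is instantiated, instead of special-casing llava. A minimal sketch of the pattern outside these scripts (the model id and prompt are placeholders):

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"  # placeholder model for illustration
config = AutoConfig.from_pretrained(model_id)
config.use_cache = True  # For inference, it should always be True

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, config=config)
model.eval()

inputs = tokenizer("Hello", return_tensors="pt")
with torch.inference_mode():
    # With use_cache=True, generate() reuses past key/values instead of
    # recomputing attention over the whole prefix at every decoding step.
    out = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(out[0]))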

intel_extension_for_pytorch/llm/utils.py

Lines changed: 201 additions & 17 deletions
@@ -911,14 +911,12 @@ def shard_low_precision_checkpoint(
 
     """
     assert tp_grain_size % 8 == 0, "tp_grain_size must be a multiple of 8"
-    if isinstance(model_config, dict):
-        num_heads = model_config["num_attention_heads"]
-        if "num_key_value_heads" in model_config:
-            num_heads = model_config["num_key_value_heads"]
-    else:
-        num_heads = model_config.num_attention_heads
-        if "num_key_value_heads" in model_config:
-            num_heads = model_config.num_key_value_heads
+    if not isinstance(model_config, dict):
+        model_config = model_config.to_dict()
+    num_heads = model_config["num_attention_heads"]
+    num_kv_heads = num_heads
+    if "num_key_value_heads" in model_config:
+        num_kv_heads = model_config["num_key_value_heads"]
     local_rank = rank
 
     mha_layers_split_by_N = [
@@ -928,6 +926,9 @@ def shard_low_precision_checkpoint(
         "q_b_proj",
         "kv_b_proj",
     ]
+    qkv_proj_layers = [
+        "qkv_proj",
+    ]
     # mlp is split with grain size = tp_grain_size
     mlp_layers_split_by_N = [
         "gate_proj",
@@ -938,6 +939,9 @@ def shard_low_precision_checkpoint(
         "w1",
         "w3",
     ]
+    gate_up_proj_layers = [
+        "gate_up_proj",
+    ]
     mha_layers_split_by_K = [
         "o_proj",
         "out_proj",
@@ -952,20 +956,28 @@ def shard_low_precision_checkpoint(
         "w2",
     ]
     lm_head_layers = ["lm_head"]  # split by K but not quantized
+
+    def _key_belongs_to(key, layer_group):
+        key_split = key.split(".")
+        for layer in layer_group:
+            if layer in key_split:
+                return True
+        return False
+
     low_precision_checkpoint_dict = low_precision_checkpoint.copy()
     head_range = [0]
-    head_per_rank = num_heads // world_size
+    head_per_rank = num_kv_heads // world_size
     for i in range(0, world_size):
         head_this_rank = head_per_rank
-        if i < num_heads % world_size:
+        if i < num_kv_heads % world_size:
             head_this_rank += 1
         head_range.append(head_range[-1] + head_this_rank)
     for key in low_precision_checkpoint.keys():
         q_head_start = head_range[rank]
         q_head_end = q_head_start + (head_range[rank + 1] - head_range[rank])
         if "bias" in key:
             continue
-        if any(substring in key for substring in mha_layers_split_by_N):
+        if _key_belongs_to(key, mha_layers_split_by_N):
             data = low_precision_checkpoint_dict[key]
             if quantization_method == "awq":
                 # qweight shape: [K, N // 8]
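The new _key_belongs_to helper matches whole dot-separated key components instead of raw substrings, and the head range is now built from num_kv_heads so GQA checkpoints are sharded by key/value heads. Component matching matters once fused projections get their own branches: with substring matching, "v_proj" is a substring of "qkv_proj", so a fused QKV tensor would fall into the per-head MHA branch. A small standalone check (the checkpoint key is hypothetical):

def _key_belongs_to(key, layer_group):
    key_split = key.split(".")
    for layer in layer_group:
        if layer in key_split:
            return True
    return False

mha_layers_split_by_N = ["q_proj", "k_proj", "v_proj"]
fused_key = "model.layers.0.self_attn.qkv_proj.qweight"

# Substring matching misroutes the fused layer ("v_proj" is inside "qkv_proj"):
print(any(s in fused_key for s in mha_layers_split_by_N))  # True
# Component matching does not:
print(_key_belongs_to(fused_key, mha_layers_split_by_N))   # False
print(_key_belongs_to(fused_key, ["qkv_proj"]))            # True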
@@ -1046,7 +1058,91 @@ def shard_low_precision_checkpoint(
                 ].contiguous()
             else:
                 raise AssertionError(f"{quantization_method} is not supported yet.")
-        elif any(substring in key for substring in mlp_layers_split_by_N):
+        elif _key_belongs_to(key, qkv_proj_layers):
+            # need to split q, k and v proj then shard them separately
+            # finally concat them together
+            # mha layer split by N
+            data = low_precision_checkpoint_dict[key]
+            hidden_size = model_config["hidden_size"]
+            head_dim = hidden_size // num_heads
+            if quantization_method == "awq":
+                # qweight shape: [K, N // 8]
+                # scales shape: [K // G, N]
+                # qzeros shape: [K // G, N // 8]
+                N_pack_factor = 1 if "scales" in key else 8
+                N = data.shape[-1] * N_pack_factor
+                q_pos = N - 2 * num_kv_heads * head_dim
+                k_pos = q_pos + num_kv_heads * head_dim
+                v_pos = k_pos + num_kv_heads * head_dim
+                q_pos //= N_pack_factor
+                k_pos //= N_pack_factor
+                v_pos //= N_pack_factor
+                data_list = [
+                    data[:, :q_pos],
+                    data[:, q_pos:k_pos],
+                    data[:, k_pos:v_pos],
+                ]
+                for i in range(len(data_list)):
+                    data = data_list[i].contiguous()
+                    if data.shape[-1] % head_range[-1] == 0:
+                        dim = data.shape[-1] // head_range[-1]
+                    else:
+                        assert data.shape[-1] % world_size == 0
+                        dim = data.shape[-1] // world_size
+                        q_head_start = local_rank
+                        q_head_end = local_rank + 1
+                    data_list[i] = data[
+                        :, q_head_start * dim : q_head_end * dim
+                    ].contiguous()
+                low_precision_checkpoint_dict[key] = torch.cat(
+                    data_list, dim=-1
+                ).contiguous()
+            elif quantization_method == "gptq" or (
+                quantization_method == "rtn" and bits == 4
+            ):
+                # qweight shape: [K // 8, N]
+                # scales shape: [K // G, N]
+                # qzeros shape: [K // G, N // 8]
+                # g_idx shape: [K]
+                data_list = []
+                if "g_idx" not in key:
+                    N_pack_factor = 8 if "qzeros" in key else 1
+                    N = data.shape[-1] * N_pack_factor
+                    q_pos = N - 2 * num_kv_heads * head_dim
+                    k_pos = q_pos + num_kv_heads * head_dim
+                    v_pos = k_pos + num_kv_heads * head_dim
+                    q_pos //= N_pack_factor
+                    k_pos //= N_pack_factor
+                    v_pos //= N_pack_factor
+                    data_list = [
+                        data[:, :q_pos],
+                        data[:, q_pos:k_pos],
+                        data[:, k_pos:v_pos],
+                    ]
+                for i in range(len(data_list)):
+                    if "g_idx" in key:
+                        continue
+                    data = data_list[i]
+                    if data.shape[-1] % head_range[-1] == 0:
+                        dim = data.shape[-1] // head_range[-1]
+                    else:
+                        assert data.shape[-1] % world_size == 0
+                        dim = data.shape[-1] // world_size
+                        q_head_start = local_rank
+                        q_head_end = local_rank + 1
+                    data_list[i] = data[
+                        :, q_head_start * dim : q_head_end * dim
+                    ].contiguous()
+                if "g_idx" in key:
+                    if not desc_act:
+                        low_precision_checkpoint_dict.pop(key)
+                else:
+                    low_precision_checkpoint_dict[key] = torch.cat(
+                        data_list, dim=-1
+                    ).contiguous()
+            else:
+                raise AssertionError(f"{quantization_method} is not supported yet.")
+        elif _key_belongs_to(key, mlp_layers_split_by_N):
             data = low_precision_checkpoint_dict[key]
             if quantization_method == "awq":
                 # qweight shape: [K, N // 8]
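The fused-QKV branch assumes the checkpoint lays the projection out as [Q | K | V] along N, with K and V each spanning num_kv_heads * head_dim columns and Q taking the rest; the boundaries are then rescaled by the packing factor of the particular tensor, and each slice is sharded by KV-head range (or evenly by world_size when its width is not head-divisible) before being re-concatenated. A worked example of the position arithmetic with hypothetical GQA sizes:

# Hypothetical GQA configuration for illustration.
hidden_size, num_heads, num_kv_heads = 4096, 32, 8
head_dim = hidden_size // num_heads                      # 128

# AWQ qweight/qzeros pack 8 INT4 values per int32 along N; scales are unpacked.
N_pack_factor = 8
N = num_heads * head_dim + 2 * num_kv_heads * head_dim   # 4096 + 2048 = 6144

q_pos = N - 2 * num_kv_heads * head_dim                  # 4096: end of Q columns
k_pos = q_pos + num_kv_heads * head_dim                  # 5120: end of K columns
v_pos = k_pos + num_kv_heads * head_dim                  # 6144: end of V columns

# Convert logical column positions to packed-tensor columns.
q_pos //= N_pack_factor                                  # 512
k_pos //= N_pack_factor                                  # 640
v_pos //= N_pack_factor                                  # 768
print(q_pos, k_pos, v_pos)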
@@ -1183,7 +1279,95 @@ def shard_low_precision_checkpoint(
                 ].contiguous()
             else:
                 raise AssertionError(f"{quantization_method} is not supported yet.")
-        elif any(substring in key for substring in mha_layers_split_by_K):
+        elif _key_belongs_to(key, gate_up_proj_layers):
+            # need to split gate and up proj then shard them separately
+            # finally concat them together
+            # mlp layer split by N
+            data = low_precision_checkpoint_dict[key]
+            if quantization_method == "awq":
+                # qweight shape: [K, N // 8]
+                # scales shape: [K // G, N]
+                # qzeros shape: [K // G, N // 8]
+                data_list = list(data.chunk(2, dim=-1))
+                for i in range(len(data_list)):
+                    data = data_list[i].contiguous()
+                    if "scales" in key:
+                        assert (
+                            data.shape[1] % tp_grain_size == 0
+                        ), "N must be divisible by tp_grain_size"
+                        grains = data.shape[1] // tp_grain_size
+                        dim = tp_grain_size
+                    else:
+                        assert (
+                            data.shape[1] * 8
+                        ) % tp_grain_size == 0, "N must be divisible by tp_grain_size"
+                        grains = data.shape[1] // (tp_grain_size // 8)
+                        dim = tp_grain_size // 8
+                    grains_per_rank = grains // world_size
+                    grains_rem = grains % world_size
+                    grains_start = grains_per_rank * local_rank + min(
+                        local_rank, grains_rem
+                    )
+                    grains_end = (
+                        grains_start
+                        + grains_per_rank
+                        + (1 if local_rank < grains_rem else 0)
+                    )
+                    data_list[i] = data[
+                        :, grains_start * dim : grains_end * dim
+                    ].contiguous()
+                low_precision_checkpoint_dict[key] = torch.cat(
+                    data_list, dim=-1
+                ).contiguous()
+            elif quantization_method == "gptq" or (
+                quantization_method == "rtn" and bits == 4
+            ):
+                # qweight shape: [K // 8, N]
+                # scales shape: [K // G, N]
+                # qzeros shape: [K // G, N // 8]
+                # g_idx shape: [K]
+                data_list = list(data.chunk(2, dim=-1))
+                for i in range(len(data_list)):
+                    if "g_idx" in key:
+                        continue
+                    data = data_list[i]
+                    if "qzeros" in key:
+                        assert (
+                            data.shape[-1] * 8
+                        ) % tp_grain_size == 0, "N must be divisible by tp_grain_size"
+                        grains = data.shape[-1] // (tp_grain_size // 8)
+                        dim = tp_grain_size // 8
+                    elif "g_idx" not in key:  # qweight, scales
+                        assert (
+                            data.shape[-1] % tp_grain_size == 0
+                        ), "N must be divisible by tp_grain_size"
+                        grains = data.shape[-1] // tp_grain_size
+                        dim = tp_grain_size
+                    grains_per_rank = grains // world_size
+                    grains_rem = grains % world_size
+                    grains_start = grains_per_rank * local_rank + min(
+                        local_rank, grains_rem
+                    )
+                    grains_end = (
+                        grains_start
+                        + grains_per_rank
+                        + (1 if local_rank < grains_rem else 0)
+                    )
+                    data_list[i] = data[
+                        :, grains_start * dim : grains_end * dim
+                    ].contiguous()
+                if "g_idx" in key:
+                    if not desc_act:
+                        low_precision_checkpoint_dict.pop(key)
+                else:
+                    low_precision_checkpoint_dict[key] = torch.cat(
+                        data_list, dim=-1
+                    ).contiguous()
+            else:
+                raise AssertionError(f"{quantization_method} is not supported yet.")
+        elif _key_belongs_to(key, mha_layers_split_by_K):
+            if "bias" in key:
+                continue
             data = low_precision_checkpoint_dict[key]
             if ("scales" in key or "qzeros" in key) and data.shape[0] == 1:
                 continue
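The fused gate_up_proj branch chunks the tensor into its gate and up halves and shards each half along N in grains of tp_grain_size columns, handing any remainder grains to the lowest ranks. The slice bounds reduce to the arithmetic below (sizes are illustrative only):

# Hypothetical sizes for illustration.
N_half = 11008            # columns of one half (gate or up) of the fused weight
tp_grain_size = 64
world_size = 6

grains = N_half // tp_grain_size        # 172 grains of 64 columns each
grains_per_rank = grains // world_size  # 28
grains_rem = grains % world_size        # 4 extra grains go to ranks 0..3

for local_rank in range(world_size):
    grains_start = grains_per_rank * local_rank + min(local_rank, grains_rem)
    grains_end = grains_start + grains_per_rank + (1 if local_rank < grains_rem else 0)
    cols = (grains_end - grains_start) * tp_grain_size
    print(local_rank, grains_start, grains_end, cols)
# Ranks 0-3 get 29 grains (1856 columns); ranks 4-5 get 28 grains (1792 columns).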
@@ -1271,10 +1455,10 @@ def shard_low_precision_checkpoint(
                     q_head_end = local_rank + 1
                 low_precision_checkpoint_dict[key] = data[
                     :, q_head_start * dim : q_head_end * dim
-                ].contiguous()
+                ]
             else:
                 raise AssertionError(f"{quantization_method} is not supported yet.")
-        elif any(substring in key for substring in mlp_layers_split_by_K):
+        elif _key_belongs_to(key, mlp_layers_split_by_K):
             data = low_precision_checkpoint_dict[key]
             if ("scales" in key or "qzeros" in key) and data.shape[0] == 1:
                 continue
@@ -1424,10 +1608,10 @@ def shard_low_precision_checkpoint(
                 )
                 low_precision_checkpoint_dict[key] = data[
                     :, grains_start * dim : grains_end * dim
-                ].contiguous()
+                ]
             else:
                 raise AssertionError(f"{quantization_method} is not supported yet.")
-        elif any(substring in key for substring in lm_head_layers):
+        elif _key_belongs_to(key, lm_head_layers):
             # lm_head: [N, K] (not quantized)
             # Same for all quantization methods
             data = low_precision_checkpoint_dict[key]
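The two hunks above drop the trailing .contiguous() on the K-split shards, so each rank keeps a view into the original checkpoint tensor rather than materializing a private copy; this is in line with the memory reduction for DeepSeek R1 DA8W8 with TP noted in the commit message. A minimal illustration of the difference (shapes are arbitrary):

import torch

full = torch.empty(1024, 8192, dtype=torch.int8)

view_shard = full[:, :1024]                # no new allocation, shares storage
copy_shard = full[:, :1024].contiguous()   # allocates and copies a fresh buffer

print(view_shard.data_ptr() == full.data_ptr())  # True: same underlying buffer
print(copy_shard.data_ptr() == full.data_ptr())  # False: separate allocation
print(copy_shard.untyped_storage().nbytes())     # 1048576 (new 1 MiB buffer)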

intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py

Lines changed: 0 additions & 4 deletions
@@ -414,10 +414,6 @@ def __init__(self, module, config, tpp=False, woq=False):
                 if len(w2_shared_compensation_list) > 0
                 else None
             )
-
-            print(
-                "[INFO] Using fused shared MOE WOQ INT8 lowbit weights path..."
-            )
         else:
             if (
                 (self.use_fused_moe or self.use_fused_moe_woq)
