
Commit a07c604

add alibi param on varlen_attention to align API (#3750)
* add alibi param on varlen_attention to align API
* add ut
* update ut

Co-authored-by: jianan-gu <[email protected]>
1 parent 2978fd6 commit a07c604

File tree (4 files changed, +103 −0 lines changed):

  intel_extension_for_pytorch/llm/functional/fusions.py
  intel_extension_for_pytorch/llm/modules/mha_fusion.py
  intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py
  tests/cpu/test_ipex_llm_module.py


intel_extension_for_pytorch/llm/functional/fusions.py

Lines changed: 2 additions & 0 deletions

@@ -192,6 +192,7 @@ def varlen_attention(
     out: torch.Tensor,
     seqlen_q: torch.Tensor,
     seqlen_k: torch.Tensor,
+    alibi_slopes: torch.Tensor,
     max_seqlen_q: int,
     max_seqlen_k: int,
     pdropout: float,
@@ -240,6 +241,7 @@ def varlen_attention(
         out,
         seqlen_q,
         seqlen_k,
+        alibi_slopes,
         max_seqlen_q,
         max_seqlen_k,
         pdropout,
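
With this change the public ipex.llm.functional.varlen_attention takes alibi_slopes right after seqlen_k and before max_seqlen_q. Below is a minimal call sketch in the new positional order, mirroring the unit test added in this commit; the shapes and values are illustrative assumptions, and the CPU backend in this commit only accepts alibi_slopes=None.

import math
import torch
import intel_extension_for_pytorch as ipex

# Illustrative packed-sequence setup: two sequences of length 4 (8 tokens total),
# 8 heads, head_size 64; cu_seqlen holds the cumulative sequence boundaries.
dtype = torch.float16
query = torch.randn([8, 8, 64], dtype=dtype)
key = torch.randn([8, 8, 64], dtype=dtype)
value = torch.randn([8, 8, 64], dtype=dtype)
out = query.clone()
cu_seqlen = torch.tensor([0, 4, 8], dtype=torch.int32)

ipex.llm.functional.varlen_attention(
    query, key, value, out,
    cu_seqlen, cu_seqlen,
    None,                # alibi_slopes -- the parameter added by this commit
    4, 4,                # max_seqlen_q, max_seqlen_k
    0.0,                 # pdropout
    1 / math.sqrt(64),   # softmax_scale
    False,               # same flag position as in the commit's unit test
    True,                # is_causal
    False,               # return_softmax
    None,                # gen_
    window_size_left=-1,
    window_size_right=-1,
)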

intel_extension_for_pytorch/llm/modules/mha_fusion.py

Lines changed: 10 additions & 0 deletions

@@ -362,6 +362,7 @@ def apply_function(
         out: torch.Tensor,
         seqlen_q: torch.Tensor,
         seqlen_k: torch.Tensor,
+        alibi_slopes: torch.Tensor,
         max_seqlen_q: int,
         max_seqlen_k: int,
         pdropout: float,
@@ -383,6 +384,7 @@ def apply_function(
             out,
             seqlen_q,
             seqlen_k,
+            alibi_slopes,
             max_seqlen_q,
             max_seqlen_k,
             pdropout,
@@ -404,6 +406,7 @@ def forward(
         out: torch.Tensor,
         seqlen_q: torch.Tensor,
         seqlen_k: torch.Tensor,
+        alibi_slopes: torch.Tensor,
         max_seqlen_q: int,
         max_seqlen_k: int,
         pdropout: float,
@@ -412,6 +415,9 @@ def forward(
         is_causal: bool,
         return_softmax: bool,
         gen_: torch.Generator,
+        window_size_left: int,
+        window_size_right: int,
+        softcap: float,
     ):
         runtime_module = self.runtime_ops.get_module_from_device(
             query.device.type, IPEXCustomOpType.VARLEN_ATTENTION, True
@@ -423,6 +429,7 @@ def forward(
             out,
             seqlen_q,
             seqlen_k,
+            alibi_slopes,
             max_seqlen_q,
             max_seqlen_k,
             pdropout,
@@ -431,6 +438,9 @@ def forward(
             is_causal,
             return_softmax,
             gen_,
+            window_size_left,
+            window_size_right,
+            softcap,
         )
intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py

Lines changed: 4 additions & 0 deletions

@@ -510,6 +510,7 @@ def apply_function(
         out,  # [total_q, num_head, head_size]
         seqlen_q,  # [batch_size + 1]
         seqlen_k,  # [batch_size + 1]
+        alibi_slopes,
         max_seqlen_q,
         max_seqlen_k,
         pdropout=0.0,
@@ -528,6 +529,7 @@ def apply_function(
         assert window_size_left == -1, "ipex do not support window_size_left option"
         assert window_size_right == -1, "ipex do not support window_size_right option"
         assert softcap == -1.0, "ipex do not support softcap option"
+        assert alibi_slopes is None, "ipex do not support alibi_slopes"

         # Repeat kv if it is GQA.
         key = cls.repeat_kv(key, int(query.shape[1] / key.shape[1]))
@@ -600,6 +602,7 @@ def forward(
             out,
             seqlen_q,
             seqlen_k,
+            alibi_slopes,
             max_seqlen_q,
             max_seqlen_k,
             pdropout,
@@ -619,6 +622,7 @@ def forward(
             out,
             seqlen_q,
             seqlen_k,
+            alibi_slopes,
             max_seqlen_q,
             max_seqlen_k,
             pdropout,
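
On the CPU backend the new argument is accepted only to keep the signature aligned with other backends: apply_function asserts that alibi_slopes is None, alongside the existing window_size_left / window_size_right / softcap checks. A small sketch of what this means for callers; the head count is an illustrative assumption, and the slopes tensor is built the same way as the commit's unit test would when USE_ALIBI is enabled.

import torch

# Flash-attn style ALiBi slopes: one value per query head (8 heads assumed here).
alibi_slopes = torch.tensor([2 ** (-1 - i) for i in range(8)], dtype=torch.float)

# With the CPU fusion in this commit:
#   alibi_slopes=None      -> accepted, behaves exactly as before this change
#   alibi_slopes=<tensor>  -> AssertionError: "ipex do not support alibi_slopes"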

tests/cpu/test_ipex_llm_module.py

Lines changed: 87 additions & 0 deletions

@@ -1425,6 +1425,93 @@ def selective_scan_ipex(
         self.assertEqual(out_ref, out_ipex, rtol=rtol, atol=atol)
         self.assertEqual(state_ref, state_ipex, rtol=rtolw, atol=atolw)

+    @skipIfNoEINPOS
+    def test_varlen_fwd(self):
+        HEAD_DIM = [64, 70]
+        NUM_HEADS = [(32, 32), (32, 8)]
+        BATCH_SIZE = [1, 3, 8]
+        DTYPE = [torch.float16]
+        USE_ALIBI = [False]
+        SEQLEN_RANGE = [10]
+        IS_CAUSAL = [False, True]
+        WINDOW_SIZE = [(-1, -1)]
+
+        for (
+            head_dim,
+            num_heads,
+            batch_size,
+            dtype,
+            use_alibi,
+            seqlen_range,
+            is_causal,
+            window_size,
+        ) in itertools.product(
+            HEAD_DIM,
+            NUM_HEADS,
+            BATCH_SIZE,
+            DTYPE,
+            USE_ALIBI,
+            SEQLEN_RANGE,
+            IS_CAUSAL,
+            WINDOW_SIZE,
+        ):
+            torch.manual_seed(15)
+            seqlen_list = torch.randint(
+                1, seqlen_range, [batch_size], dtype=torch.int32
+            )
+            max_seqlen = torch.max(seqlen_list)
+            cu_seqlen = torch.cumsum(seqlen_list, dim=0)
+            num_heads_query, num_heads_kv = num_heads
+            cu_seqlen = (
+                torch.cat([torch.tensor([0]), cu_seqlen], dim=0)
+                .to(torch.int32)
+                .to("cpu")
+            )
+
+            query = torch.randn(
+                [cu_seqlen[-1], num_heads_query, head_dim], dtype=dtype, device="cpu"
+            )
+            key = torch.randn(
+                [cu_seqlen[-1], num_heads_kv, head_dim], dtype=dtype, device="cpu"
+            )
+            value = torch.randn(
+                [cu_seqlen[-1], num_heads_kv, head_dim], dtype=dtype, device="cpu"
+            )
+            alibi_slopes = None
+            softmax_scale = 1 / math.sqrt(head_dim)
+            if use_alibi:
+                alibi_slopes = torch.tensor(
+                    [2 ** (-1 - i) for i in range(num_heads_query)],
+                    dtype=torch.float,
+                    device="cpu",
+                )
+                alibi_slopes = (
+                    alibi_slopes.unsqueeze(0)
+                    .expand(batch_size, num_heads_query)
+                    .contiguous()
+                )
+            out = query.clone()
+
+            ipex.llm.functional.varlen_attention(
+                query,
+                key,
+                value,
+                out,
+                cu_seqlen,
+                cu_seqlen,
+                alibi_slopes,
+                max_seqlen,
+                max_seqlen,
+                0.0,
+                softmax_scale,
+                False,
+                is_causal,
+                False,
+                None,
+                window_size_left=window_size[0],
+                window_size_right=window_size[1],
+            )
+

 if __name__ == "__main__":
     test = unittest.main()
