@@ -13,7 +13,6 @@
                                                UnquantizedLinearMethod)
 from vllm.utils import cdiv, round_down
 
-from vllm_ascend import envs
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
@@ -998,35 +997,12 @@ def _forward_decode(
                 actual_seq_lengths_kv=decode_meta.seq_lens_list,
             )
         else:
-            # The MLA_PA path will become the default path in the future;
-            # `_npu_paged_attention_mla` will be removed once
-            # `torch_npu.atb.npu_multi_head_latent_attention` becomes publicly available in torch_npu.
             assert len(kv_c_and_k_pe_cache) > 1
-            if envs.VLLM_ASCEND_MLA_PA:
-                attn_output = torch_npu.atb.npu_multi_head_latent_attention(
-                    q_nope, q_pe, kv_c_and_k_pe_cache[0],
-                    kv_c_and_k_pe_cache[1], attn_metadata.decode.block_table,
-                    attn_metadata.decode.seq_lens, self.num_heads, self.scale,
-                    self.num_kv_heads)
-            else:
-                q = torch.cat([q_nope, q_pe], dim=-1)
-                attn_output = torch.empty(
-                    [num_tokens, self.num_heads, self.kv_lora_rank],
-                    dtype=q.dtype,
-                    device=q.device)
-                k_cache = torch.cat(
-                    [kv_c_and_k_pe_cache[0], kv_c_and_k_pe_cache[1]], dim=-1)
-                torch_npu._npu_paged_attention_mla(
-                    query=q,
-                    key_cache=k_cache,
-                    num_kv_heads=self.num_kv_heads,
-                    num_heads=self.num_heads,
-                    scale_value=self.scale,
-                    block_table=attn_metadata.decode.
-                    block_table,  # type:ignore
-                    context_lens=attn_metadata.decode.seq_lens,  # type:ignore
-                    mla_vheadsize=self.kv_lora_rank,
-                    out=attn_output)
+            attn_output = torch_npu.atb.npu_multi_head_latent_attention(
+                q_nope, q_pe, kv_c_and_k_pe_cache[0],
+                kv_c_and_k_pe_cache[1], attn_metadata.decode.block_table,
+                attn_metadata.decode.seq_lens, self.num_heads, self.scale,
+                self.num_kv_heads)
         current_ms_metadata = get_multistream_comm_context()
         if current_ms_metadata is None:
             return self._v_up_proj_and_o_proj(attn_output)