Use build_for_cudagraph_capture for metadata

ProExpertProg · ProExpertProg · commit 40e72489842d · 2025-05-30T20:21:33.000Z
Signed-off-by: luka <luka@neuralmagic.com>
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
@@ -322,9 +322,16 @@ def reorder_batch(self, input_batch: "InputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
         return False
 
-    def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata):
+    def build_for_cudagraph_capture(
+        self, num_tokens: int, common_attn_metadata: CommonAttentionMetadata
+    ) -> FlashAttentionMetadata:
+        return self.build(num_tokens, num_tokens, num_tokens, 0,
+                          common_attn_metadata)
+
+    def build(
+        self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
+        common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata
+    ) -> FlashAttentionMetadata:
         max_seq_len = self.runner.seq_lens_np[:num_reqs].max()
         query_start_loc = common_attn_metadata.query_start_loc
         seq_lens = common_attn_metadata.seq_lens
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
@@ -449,19 +449,18 @@ def _build_decode(self, block_table_tensor: torch.Tensor,
             seq_lens=seq_lens,
         )
 
-    # TODO maybe use this?
     def build_for_cudagraph_capture(
-            self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
-            common_prefix_len: int,
+            self, num_tokens: int,
             common_attn_metadata: CommonAttentionMetadata) -> M:
-        # decode-only cudagraph capture
-        assert num_reqs == num_actual_tokens
-        self._num_decodes = num_reqs
-        self._num_decode_tokens = num_reqs
+        """
+        This method builds the metadata for full cudagraph capture.
+        Currently, only decode is supported for full cudagraphs with MLA.
+        """
+        self._num_decodes = num_tokens
+        self._num_decode_tokens = num_tokens
         self._num_prefills = 0
         self._num_prefill_tokens = 0
-        return self.build(num_reqs, num_actual_tokens, max_query_len,
-                          common_prefix_len, common_attn_metadata)
+        return self.build(num_tokens, num_tokens, 1, 0, common_attn_metadata)
 
     def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
               common_prefix_len: int,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -13,7 +13,7 @@
 
 from vllm.attention import AttentionType, get_attn_backend
 from vllm.attention.backends.abstract import (AttentionBackend,
-                                              AttentionMetadataBuilder, AttentionMetadata)
+                                              AttentionMetadataBuilder)
 from vllm.attention.layer import Attention
 from vllm.attention.utils.fa_utils import get_flash_attn_version
 from vllm.config import (CompilationLevel, VllmConfig,
@@ -191,8 +191,8 @@ def __init__(
         # The convention is different.
         # self.cudagraph_batch_sizes sorts in ascending order.
         # The batch sizes in the config are in descending order.
-        self.cudagraph_batch_sizes = list(reversed(
-            self.compilation_config.cudagraph_capture_sizes))
+        self.cudagraph_batch_sizes = list(
+            reversed(self.compilation_config.cudagraph_capture_sizes))
 
         self.full_cuda_graph = self.compilation_config.full_cuda_graph
 
@@ -1726,19 +1726,12 @@ def _dummy_run(
             attn_metadata = {}
             for kv_cache_group_id, kv_cache_group_spec in enumerate(
                     self.kv_cache_config.kv_cache_groups):
-                # hack for flashMLA state
-                self.attn_metadata_builders[kv_cache_group_id]._num_decodes = num_tokens
-                self.attn_metadata_builders[kv_cache_group_id]._num_decode_tokens = num_tokens
-                self.attn_metadata_builders[kv_cache_group_id]._num_prefills = 0
-
-                attn_metadata_i = (
-                    self.attn_metadata_builders[kv_cache_group_id].build(
-                        num_reqs=num_tokens,
-                        num_actual_tokens=num_tokens,
-                        max_query_len=num_tokens,
-                        common_prefix_len=0,
+
+                attn_metadata_i = self.attn_metadata_builders[
+                    kv_cache_group_id].build_for_cudagraph_capture(
+                        num_tokens=num_tokens,
                         common_attn_metadata=common_attn_metadata,
-                    ))
+                    )
                 for layer_name in kv_cache_group_spec.layer_names:
                     attn_metadata[layer_name] = attn_metadata_i
 
@@ -2095,10 +2088,9 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
             # group
             self.drafter.validate_same_kv_cache_group(kv_cache_config)
 
-        bind_kv_cache(
-            kv_caches,
-            self.compilation_config.static_forward_context,
-            self.kv_caches)
+        bind_kv_cache(kv_caches,
+                      self.compilation_config.static_forward_context,
+                      self.kv_caches)
 
         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(kv_caches)