@@ -49,8 +49,8 @@
 from vllm.multimodal.utils import group_mm_inputs_by_modality
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        LayerBlockType, LazyLoader, cdiv)
+from vllm.utils import (DeviceMemoryProfiler, LazyLoader, cdiv,
+                        is_pin_memory_available)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
@@ -143,7 +143,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         else:
             self.chunked_prefill_enabled = True
         self.device = device
-
+        self.pin_memory = is_pin_memory_available()
         self.is_multimodal_model = self.model_config.is_multimodal_model
         self.block_size = vllm_config.cache_config.block_size
 
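Note on the new `self.pin_memory` flag: `is_pin_memory_available()` from `vllm.utils` reports whether page-locked (pinned) host memory can be allocated in the current environment, and the flag is later forwarded to `MultiModalKwargs.batch(..., pin_memory=self.pin_memory)` in `profile_run` so the dummy multimodal batch is staged in pinned buffers before being moved to `self.device`. A rough illustration of why that matters, as a standalone sketch rather than code from this PR (the `stage_to_gpu` helper is hypothetical):

```python
import torch

def stage_to_gpu(host_tensor: torch.Tensor, device: torch.device,
                 pin_memory: bool) -> torch.Tensor:
    # Page-locked (pinned) host memory allows the host-to-device copy
    # to be performed asynchronously via DMA.
    staging = host_tensor.pin_memory() if pin_memory else host_tensor
    # non_blocking=True only overlaps the copy with compute when the
    # source buffer is pinned; otherwise it degrades to a blocking copy.
    return staging.to(device, non_blocking=pin_memory)
```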
@@ -1607,89 +1607,6 @@ def execute_model(
 
         return model_runner_output
 
-    def _profile_multimodal(self) -> None:
-        # TODO: handle encoder-decoder models once we support them.
-        # NOTE: Currently model is profiled with a single non-text
-        # modality with the max possible input tokens even when
-        # it supports multiple.
-
-        if (not self.is_multimodal_model
-                or self.max_num_encoder_input_tokens <= 0
-                or self.encoder_cache_size <= 0):
-            return
-
-        max_tokens_by_modality_dict = (
-            MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality(
-                self.model_config))
-        dummy_data_modality, max_tokens_per_mm_item = max(
-            max_tokens_by_modality_dict.items(), key=lambda item: item[1])
-
-        # Check how many items of this modality can be supported by
-        # the encoder budget.
-        encoder_budget = min(self.max_num_encoder_input_tokens,
-                             self.encoder_cache_size)
-
-        max_num_mm_items_encoder_budget = cdiv(encoder_budget,
-                                               max_tokens_per_mm_item)
-
-        # Check how many items of this modality can be supported by
-        # the decoder budget.
-        max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt(
-            self.model_config)[dummy_data_modality]
-
-        # NOTE: We do not consider max_num_batched_tokens on purpose
-        # because the multimodal embeddings can be generated in advance
-        # and chunked prefilled.
-        max_num_mm_items_decoder_budget = self.max_num_reqs * \
-            max_mm_items_per_req
-
-        max_num_mm_items = min(max_num_mm_items_encoder_budget,
-                               max_num_mm_items_decoder_budget)
-
-        logger.info(
-            "Encoder cache will be initialized with a budget of %s tokens,"
-            " and profiled with %s %s items of the maximum feature size.",
-            encoder_budget, max_num_mm_items, dummy_data_modality)
-
-        # Create dummy batch of multimodal inputs.
-        dummy_request_data = self.input_registry.dummy_data_for_profiling(
-            model_config=self.model_config,
-            seq_len=self.max_num_tokens,
-            mm_registry=self.mm_registry,
-        )
-        dummy_mm_data = dummy_request_data.multi_modal_data
-
-        if not isinstance(dummy_mm_data, MultiModalKwargs):
-            # TODO: Delete this check once input mapper is fully removed.
-            raise RuntimeError("Legacy input mapper is not supported in V1")
-
-        # Dummy data definition in V0 may contain multiple multimodal items
-        # (e.g, multiple images) for a single request, therefore here we
-        # always replicate first item by max_num_mm_items times since in V1
-        # they are scheduled to be processed separately.
-
-        dummy_mm_item = dummy_mm_data.get_item(modality=dummy_data_modality,
-                                               item_index=0)
-        dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item])
-
-        batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] *
-                                                         max_num_mm_items)
-        batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
-            batched_dummy_mm_inputs, device=self.device)
-
-        # Run multimodal encoder.
-        dummy_encoder_outputs = self.model.get_multimodal_embeddings(
-            **batched_dummy_mm_inputs)
-        assert len(dummy_encoder_outputs) == max_num_mm_items, (
-            "Expected dimension 0 of encoder outputs to match the number "
-            f"of multimodal data items: {max_num_mm_items}, got "
-            f"{len(dummy_encoder_outputs)=} instead. This is most likely "
-            "due to the 'get_multimodal_embeddings' method of the model "
-            "not implemented correctly.")
-
-        # Cache the dummy encoder outputs.
-        self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
-
     @torch.inference_mode()
     def _dummy_run(
         self,
@@ -1796,12 +1713,76 @@ def _dummy_run(
             self.drafter.dummy_run(num_tokens)
         return hidden_states
 
+    @torch.inference_mode()
     def profile_run(self) -> None:
-        # FIXME Profile with multimodal encoder & encoder cache.
-        # current _profile_multimodal() using PyTorch SDPA backend method not
-        # support for window/full attn to reduce Memcpy operations, so will cause
-        # Out Of Memory problem, so we currently don't use self._profile_multimodal()
-        # self._profile_multimodal()
+        # Profile with multimodal encoder & encoder cache.
+        # TODO: handle encoder-decoder models once we support them.
+        if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0
+                and self.encoder_cache_size > 0):
+
+            # NOTE: Currently model is profiled with a single non-text
+            # modality with the max possible input tokens even when
+            # it supports multiple.
+            max_tokens_by_modality_dict = self.mm_registry \
+                .get_max_tokens_per_item_by_nonzero_modality(self.model_config)
+            dummy_data_modality, max_tokens_per_mm_item = max(
+                max_tokens_by_modality_dict.items(), key=lambda item: item[1])
+
+            # Check how many items of this modality can be supported by
+            # the encoder budget.
+            encoder_budget = min(self.max_num_encoder_input_tokens,
+                                 self.encoder_cache_size)
+
+            max_num_mm_items_encoder_budget = cdiv(encoder_budget,
+                                                   max_tokens_per_mm_item)
+
+            # Check how many items of this modality can be supported by
+            # the decoder budget.
+            max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt(
+                self.model_config)[dummy_data_modality]
+
+            # NOTE: We do not consider max_num_batched_tokens on purpose
+            # because the multimodal embeddings can be generated in advance
+            # and chunked prefilled.
+            max_num_mm_items_decoder_budget = self.max_num_reqs * \
+                max_mm_items_per_req
+
+            max_num_mm_items = min(max_num_mm_items_encoder_budget,
+                                   max_num_mm_items_decoder_budget)
+
+            logger.info(
+                "Encoder cache will be initialized with a budget of %s tokens,"
+                " and profiled with %s %s items of the maximum feature size.",
+                encoder_budget, max_num_mm_items, dummy_data_modality)
+
+            # Create dummy batch of multimodal inputs.
+            dummy_mm_kwargs = self.mm_registry.get_decoder_dummy_data(
+                model_config=self.model_config,
+                seq_len=self.max_num_tokens,
+                mm_counts={
+                    dummy_data_modality: 1
+                },
+            ).multi_modal_data
+
+            batched_dummy_mm_inputs = MultiModalKwargs.batch(
+                [dummy_mm_kwargs] * max_num_mm_items,
+                pin_memory=self.pin_memory)
+            batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
+                batched_dummy_mm_inputs,
+                device=self.device,
+            )
+
+            # Run multimodal encoder.
+            dummy_encoder_outputs = self.model.get_multimodal_embeddings(
+                **batched_dummy_mm_inputs)
+
+            sanity_check_mm_encoder_outputs(
+                dummy_encoder_outputs,
+                expected_num_items=max_num_mm_items,
+            )
+
+            # Cache the dummy encoder outputs.
+            self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
 
         # For profile, have maximum num_reqs and that collectively have
         # maximum num_tokens.
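For intuition on the budget arithmetic above: the number of items used for the dummy encoder run is the minimum of what the encoder budget can hold (ceiling division of the token budget by the largest single item of the chosen modality) and what the decoder side could ever reference at once (`max_num_reqs * max_mm_items_per_req`). A worked example with made-up numbers, not values taken from this PR:

```python
from math import ceil

# Hypothetical configuration values, chosen only for illustration.
max_num_encoder_input_tokens = 8192  # scheduler-side encoder token budget
encoder_cache_size = 8192            # encoder cache capacity, in tokens
max_tokens_per_mm_item = 2048        # largest single item of the chosen modality
max_num_reqs = 256                   # maximum number of concurrent requests
max_mm_items_per_req = 1             # per-prompt limit for that modality

encoder_budget = min(max_num_encoder_input_tokens, encoder_cache_size)  # 8192
# cdiv in vllm.utils is ceiling division.
items_encoder_budget = ceil(encoder_budget / max_tokens_per_mm_item)    # 4
items_decoder_budget = max_num_reqs * max_mm_items_per_req              # 256
max_num_mm_items = min(items_encoder_budget, items_decoder_budget)      # 4
# The dummy encoder run is therefore batched with 4 maximum-size items.
```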