diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml
index 1d40199f0..e3473f6b7 100644
--- a/.github/workflows/accuracy_test.yaml
+++ b/.github/workflows/accuracy_test.yaml
@@ -115,10 +115,6 @@ jobs:
           contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') &&
           '["Qwen/Qwen2.5-VL-7B-Instruct"]'
           ) }}
-        # Remove exclude after https://github.com/vllm-project/vllm-ascend/issues/1044 resolved
-        exclude:
-          - model_name: Qwen/Qwen2.5-VL-7B-Instruct
-            vllm_use_version: 1
       fail-fast: false
 
     name: ${{ matrix.model_name }} accuracy V${{ matrix.vllm_use_version }}
@@ -164,7 +160,7 @@ jobs:
           repository: vllm-project/vllm
           path: ./vllm-empty
           # Please also update this when bump matched version
-          ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }}
+          ref: ${{ github.event.inputs.vllm-version || 'main' }}
 
       - name: Install vllm-project/vllm from source
         working-directory: ./vllm-empty
@@ -246,6 +242,7 @@ jobs:
         env:
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
           VLLM_USE_V1: ${{ matrix.vllm_use_version }}
+          VLLM_LOGGING_LEVEL: DEBUG
         run: |
           model_base_name=$(basename ${{ matrix.model_name }})
           markdown_name="${model_base_name}-V${{ matrix.vllm_use_version }}"
diff --git a/benchmarks/scripts/run_accuracy.py b/benchmarks/scripts/run_accuracy.py
index 6b320ff1f..1be95a985 100644
--- a/benchmarks/scripts/run_accuracy.py
+++ b/benchmarks/scripts/run_accuracy.py
@@ -73,7 +73,7 @@ def run_accuracy_unimodal(queue, model, dataset):
 
     try:
-        model_args = f"pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6"
+        model_args = f"pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.85"
         results = lm_eval.simple_evaluate(
             model="vllm",
             model_args=model_args,
@@ -97,7 +97,7 @@ def run_accuracy_unimodal(queue, model, dataset):
 
 def run_accuracy_multimodal(queue, model, dataset):
     try:
-        model_args = f"pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2"
+        model_args = f"pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2,gpu_memory_utilization=0.95"
         results = lm_eval.simple_evaluate(
             model="vllm-vlm",
             model_args=model_args,
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 45bce5833..f498c228c 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -51,7 +51,7 @@ from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        LazyLoader, cdiv)
+                        LazyLoader, cdiv, is_pin_memory_available)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
@@ -148,6 +148,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.dp_size = vllm_config.parallel_config.data_parallel_size
         self.dp_rank = vllm_config.parallel_config.data_parallel_rank
         self.device = device
+        self.pin_memory = is_pin_memory_available()
         self.dtype = self.model_config.dtype
         self.sampler = Sampler()
         # Multi-modal data support
@@ -1616,89 +1617,6 @@ def execute_model(
 
         return model_runner_output
 
-    def _profile_multimodal(self) -> None:
-        # TODO: handle encoder-decoder models once we support them.
-        # NOTE: Currently model is profiled with a single non-text
-        # modality with the max possible input tokens even when
-        # it supports multiple.
-
-        if (not self.is_multimodal_model
-                or self.max_num_encoder_input_tokens <= 0
-                or self.encoder_cache_size <= 0):
-            return
-
-        max_tokens_by_modality_dict = (
-            MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality(
-                self.model_config))
-        dummy_data_modality, max_tokens_per_mm_item = max(
-            max_tokens_by_modality_dict.items(), key=lambda item: item[1])
-
-        # Check how many items of this modality can be supported by
-        # the encoder budget.
-        encoder_budget = min(self.max_num_encoder_input_tokens,
-                             self.encoder_cache_size)
-
-        max_num_mm_items_encoder_budget = cdiv(encoder_budget,
-                                               max_tokens_per_mm_item)
-
-        # Check how many items of this modality can be supported by
-        # the decoder budget.
-        max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt(
-            self.model_config)[dummy_data_modality]
-
-        # NOTE: We do not consider max_num_batched_tokens on purpose
-        # because the multimodal embeddings can be generated in advance
-        # and chunked prefilled.
-        max_num_mm_items_decoder_budget = self.max_num_reqs * \
-            max_mm_items_per_req
-
-        max_num_mm_items = min(max_num_mm_items_encoder_budget,
-                               max_num_mm_items_decoder_budget)
-
-        logger.info(
-            "Encoder cache will be initialized with a budget of %s tokens,"
-            " and profiled with %s %s items of the maximum feature size.",
-            encoder_budget, max_num_mm_items, dummy_data_modality)
-
-        # Create dummy batch of multimodal inputs.
-        dummy_request_data = self.input_registry.dummy_data_for_profiling(
-            model_config=self.model_config,
-            seq_len=self.max_num_tokens,
-            mm_registry=self.mm_registry,
-        )
-        dummy_mm_data = dummy_request_data.multi_modal_data
-
-        if not isinstance(dummy_mm_data, MultiModalKwargs):
-            # TODO: Delete this check once input mapper is fully removed.
-            raise RuntimeError("Legacy input mapper is not supported in V1")
-
-        # Dummy data definition in V0 may contain multiple multimodal items
-        # (e.g, multiple images) for a single request, therefore here we
-        # always replicate first item by max_num_mm_items times since in V1
-        # they are scheduled to be processed separately.
-
-        dummy_mm_item = dummy_mm_data.get_item(modality=dummy_data_modality,
-                                               item_index=0)
-        dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item])
-
-        batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] *
-                                                         max_num_mm_items)
-        batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
-            batched_dummy_mm_inputs, device=self.device)
-
-        # Run multimodal encoder.
-        dummy_encoder_outputs = self.model.get_multimodal_embeddings(
-            **batched_dummy_mm_inputs)
-        assert len(dummy_encoder_outputs) == max_num_mm_items, (
-            "Expected dimension 0 of encoder outputs to match the number "
-            f"of multimodal data items: {max_num_mm_items}, got "
-            f"{len(dummy_encoder_outputs)=} instead. This is most likely "
-            "due to the 'get_multimodal_embeddings' method of the model "
-            "not implemented correctly.")
-
-        # Cache the dummy encoder outputs.
-        self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
-
     @torch.inference_mode()
     def _dummy_run(
         self,
@@ -1804,12 +1722,76 @@ def _dummy_run(
             self.drafter.dummy_run(num_tokens)
         return hidden_states
 
+    @torch.inference_mode()
     def profile_run(self) -> None:
-        # FIXME Profile with multimodal encoder & encoder cache.
-        # current _profile_multimodal() using PyTorch SDPA backend method not
-        # support for window/full attn to reduce Memcpy operations, so will cause
-        # Out Of Memory problem, so we currently don't use self._profile_multimodal()
-        # self._profile_multimodal()
+        # Profile with multimodal encoder & encoder cache.
+        # TODO: handle encoder-decoder models once we support them.
+        if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0
+                and self.encoder_cache_size > 0):
+
+            # NOTE: Currently model is profiled with a single non-text
+            # modality with the max possible input tokens even when
+            # it supports multiple.
+            max_tokens_by_modality_dict = self.mm_registry \
+                .get_max_tokens_per_item_by_nonzero_modality(self.model_config)
+            dummy_data_modality, max_tokens_per_mm_item = max(
+                max_tokens_by_modality_dict.items(), key=lambda item: item[1])
+
+            # Check how many items of this modality can be supported by
+            # the encoder budget.
+            encoder_budget = min(self.max_num_encoder_input_tokens,
+                                 self.encoder_cache_size)
+
+            max_num_mm_items_encoder_budget = cdiv(encoder_budget,
+                                                   max_tokens_per_mm_item)
+
+            # Check how many items of this modality can be supported by
+            # the decoder budget.
+            max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt(
+                self.model_config)[dummy_data_modality]
+
+            # NOTE: We do not consider max_num_batched_tokens on purpose
+            # because the multimodal embeddings can be generated in advance
+            # and chunked prefilled.
+            max_num_mm_items_decoder_budget = self.max_num_reqs * \
+                max_mm_items_per_req
+
+            max_num_mm_items = min(max_num_mm_items_encoder_budget,
+                                   max_num_mm_items_decoder_budget)
+
+            logger.info(
+                "Encoder cache will be initialized with a budget of %s tokens,"
+                " and profiled with %s %s items of the maximum feature size.",
+                encoder_budget, max_num_mm_items, dummy_data_modality)
+
+            # Create dummy batch of multimodal inputs.
+            dummy_mm_kwargs = self.mm_registry.get_decoder_dummy_data(
+                model_config=self.model_config,
+                seq_len=self.max_num_tokens,
+                mm_counts={
+                    dummy_data_modality: 1
+                },
+            ).multi_modal_data
+
+            batched_dummy_mm_inputs = MultiModalKwargs.batch(
+                [dummy_mm_kwargs] * max_num_mm_items,
+                pin_memory=self.pin_memory)
+            batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
+                batched_dummy_mm_inputs,
+                device=self.device,
+            )
+
+            # Run multimodal encoder.
+            dummy_encoder_outputs = self.model.get_multimodal_embeddings(
+                **batched_dummy_mm_inputs)
+
+            sanity_check_mm_encoder_outputs(
+                dummy_encoder_outputs,
+                expected_num_items=max_num_mm_items,
+            )
+
+            # Cache the dummy encoder outputs.
+            self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
 
         # For profile, have maximum num_reqs and that collectively have
         # maximum num_tokens.
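
Reviewer note: the new profile_run branch sizes the dummy multimodal batch from two independent budgets (encoder token budget vs. decoder request budget) and takes the smaller one. A minimal standalone sketch of that arithmetic follows; all concrete numbers here are illustrative assumptions, not values from the patch.

    from math import ceil

    # Illustrative inputs -- assumed values for this sketch only.
    max_num_encoder_input_tokens = 16384  # scheduler budget for encoder input tokens
    encoder_cache_size = 16384            # encoder cache budget, in tokens
    max_tokens_per_mm_item = 2048         # largest single image/video feature size
    max_num_reqs = 128                    # maximum number of concurrent requests
    max_mm_items_per_req = 2              # per-request multimodal item limit

    # Encoder-side limit: how many maximum-size items fit into the encoder budget.
    encoder_budget = min(max_num_encoder_input_tokens, encoder_cache_size)
    max_num_mm_items_encoder_budget = ceil(encoder_budget / max_tokens_per_mm_item)  # cdiv

    # Decoder-side limit: the most items the running requests could ever reference.
    max_num_mm_items_decoder_budget = max_num_reqs * max_mm_items_per_req

    # profile_run pushes this many dummy items through the multimodal encoder.
    max_num_mm_items = min(max_num_mm_items_encoder_budget,
                           max_num_mm_items_decoder_budget)
    print(max_num_mm_items)  # 8 with the numbers above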