
Add profiling multimodal model step and fix the OOM bug when profilin… #1408

Open · wants to merge 1 commit into base: main
7 changes: 2 additions & 5 deletions .github/workflows/accuracy_test.yaml
@@ -115,10 +115,6 @@ jobs:
contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') &&
'["Qwen/Qwen2.5-VL-7B-Instruct"]'
) }}
# Remove exclude after https://github.com/vllm-project/vllm-ascend/issues/1044 resolved
exclude:
- model_name: Qwen/Qwen2.5-VL-7B-Instruct
vllm_use_version: 1

fail-fast: false
name: ${{ matrix.model_name }} accuracy V${{ matrix.vllm_use_version }}
@@ -164,7 +160,7 @@ jobs:
repository: vllm-project/vllm
path: ./vllm-empty
# Please also update this when bump matched version
ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }}
ref: ${{ github.event.inputs.vllm-version || 'main' }}

- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
@@ -246,6 +242,7 @@ jobs:
env:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
VLLM_USE_V1: ${{ matrix.vllm_use_version }}
VLLM_LOGGING_LEVEL: DEBUG
run: |
model_base_name=$(basename ${{ matrix.model_name }})
markdown_name="${model_base_name}-V${{ matrix.vllm_use_version }}"
4 changes: 2 additions & 2 deletions benchmarks/scripts/run_accuracy.py
@@ -73,7 +73,7 @@

def run_accuracy_unimodal(queue, model, dataset):
try:
model_args = f"pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6"
model_args = f"pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.85"
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
@@ -97,7 +97,7 @@ def run_accuracy_unimodal(queue, model, dataset):

def run_accuracy_multimodal(queue, model, dataset):
try:
model_args = f"pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2"
model_args = f"pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2,gpu_memory_utilization=0.95"
results = lm_eval.simple_evaluate(
model="vllm-vlm",
model_args=model_args,
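For context, below is a minimal sketch of how the model_args strings above are consumed. Each evaluation runs in its own process so device memory is fully released between models, and gpu_memory_utilization is simply forwarded to vLLM inside the argument string. Only the model_args line is taken from this diff; the task name, batch size, and process wiring are illustrative assumptions rather than the full script.

import multiprocessing as mp

import lm_eval


def run_accuracy_multimodal(queue, model, dataset):
    try:
        # gpu_memory_utilization is part of the argument string that lm_eval
        # forwards to vLLM; 0.95 matches the value set in this PR.
        model_args = (
            f"pretrained={model},max_model_len=8192,dtype=auto,"
            "tensor_parallel_size=4,max_images=2,gpu_memory_utilization=0.95")
        results = lm_eval.simple_evaluate(
            model="vllm-vlm",
            model_args=model_args,
            tasks=[dataset],      # illustrative: a single lm-eval task name
            batch_size="auto",
        )
        queue.put(results)
    except Exception as e:
        # Surface the failure to the parent process instead of hanging.
        queue.put(e)


if __name__ == "__main__":
    queue = mp.Queue()
    p = mp.Process(target=run_accuracy_multimodal,
                   args=(queue, "Qwen/Qwen2.5-VL-7B-Instruct", "mmmu_val"))
    p.start()
    result = queue.get()  # read before join() to avoid blocking on a full pipe
    p.join()
    print(result)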
160 changes: 71 additions & 89 deletions vllm_ascend/worker/model_runner_v1.py
@@ -51,7 +51,7 @@
from vllm.sampling_params import SamplingType
from vllm.sequence import IntermediateTensors
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
LazyLoader, cdiv)
LazyLoader, cdiv, is_pin_memory_available)
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheSpec)
@@ -148,6 +148,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
self.dp_size = vllm_config.parallel_config.data_parallel_size
self.dp_rank = vllm_config.parallel_config.data_parallel_rank
self.device = device
self.pin_memory = is_pin_memory_available()
self.dtype = self.model_config.dtype
self.sampler = Sampler()
# Multi-modal data support
@@ -1616,89 +1617,6 @@ def execute_model(

return model_runner_output

def _profile_multimodal(self) -> None:
# TODO: handle encoder-decoder models once we support them.
# NOTE: Currently model is profiled with a single non-text
# modality with the max possible input tokens even when
# it supports multiple.

if (not self.is_multimodal_model
or self.max_num_encoder_input_tokens <= 0
or self.encoder_cache_size <= 0):
return

max_tokens_by_modality_dict = (
MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality(
self.model_config))
dummy_data_modality, max_tokens_per_mm_item = max(
max_tokens_by_modality_dict.items(), key=lambda item: item[1])

# Check how many items of this modality can be supported by
# the encoder budget.
encoder_budget = min(self.max_num_encoder_input_tokens,
self.encoder_cache_size)

max_num_mm_items_encoder_budget = cdiv(encoder_budget,
max_tokens_per_mm_item)

# Check how many items of this modality can be supported by
# the decoder budget.
max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt(
self.model_config)[dummy_data_modality]

# NOTE: We do not consider max_num_batched_tokens on purpose
# because the multimodal embeddings can be generated in advance
# and chunked prefilled.
max_num_mm_items_decoder_budget = self.max_num_reqs * \
max_mm_items_per_req

max_num_mm_items = min(max_num_mm_items_encoder_budget,
max_num_mm_items_decoder_budget)

logger.info(
"Encoder cache will be initialized with a budget of %s tokens,"
" and profiled with %s %s items of the maximum feature size.",
encoder_budget, max_num_mm_items, dummy_data_modality)

# Create dummy batch of multimodal inputs.
dummy_request_data = self.input_registry.dummy_data_for_profiling(
model_config=self.model_config,
seq_len=self.max_num_tokens,
mm_registry=self.mm_registry,
)
dummy_mm_data = dummy_request_data.multi_modal_data

if not isinstance(dummy_mm_data, MultiModalKwargs):
# TODO: Delete this check once input mapper is fully removed.
raise RuntimeError("Legacy input mapper is not supported in V1")

# Dummy data definition in V0 may contain multiple multimodal items
# (e.g, multiple images) for a single request, therefore here we
# always replicate first item by max_num_mm_items times since in V1
# they are scheduled to be processed separately.

dummy_mm_item = dummy_mm_data.get_item(modality=dummy_data_modality,
item_index=0)
dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item])

batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] *
max_num_mm_items)
batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
batched_dummy_mm_inputs, device=self.device)

# Run multimodal encoder.
dummy_encoder_outputs = self.model.get_multimodal_embeddings(
**batched_dummy_mm_inputs)
assert len(dummy_encoder_outputs) == max_num_mm_items, (
"Expected dimension 0 of encoder outputs to match the number "
f"of multimodal data items: {max_num_mm_items}, got "
f"{len(dummy_encoder_outputs)=} instead. This is most likely "
"due to the 'get_multimodal_embeddings' method of the model "
"not implemented correctly.")

# Cache the dummy encoder outputs.
self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))

@torch.inference_mode()
def _dummy_run(
self,
@@ -1804,12 +1722,76 @@ def _dummy_run(
self.drafter.dummy_run(num_tokens)
return hidden_states

@torch.inference_mode()
def profile_run(self) -> None:
# FIXME Profile with multimodal encoder & encoder cache.
# current _profile_multimodal() using PyTorch SDPA backend method not
# support for window/full attn to reduce Memcpy operations, so will cause
# Out Of Memory problem, so we currently don't use self._profile_multimodal()
# self._profile_multimodal()
# Profile with multimodal encoder & encoder cache.
# TODO: handle encoder-decoder models once we support them.
if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0
and self.encoder_cache_size > 0):

# NOTE: Currently model is profiled with a single non-text
# modality with the max possible input tokens even when
# it supports multiple.
max_tokens_by_modality_dict = self.mm_registry \
.get_max_tokens_per_item_by_nonzero_modality(self.model_config)
dummy_data_modality, max_tokens_per_mm_item = max(
max_tokens_by_modality_dict.items(), key=lambda item: item[1])

# Check how many items of this modality can be supported by
# the encoder budget.
encoder_budget = min(self.max_num_encoder_input_tokens,
self.encoder_cache_size)

max_num_mm_items_encoder_budget = cdiv(encoder_budget,
max_tokens_per_mm_item)

# Check how many items of this modality can be supported by
# the decoder budget.
max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt(
self.model_config)[dummy_data_modality]

# NOTE: We do not consider max_num_batched_tokens on purpose
# because the multimodal embeddings can be generated in advance
# and chunked prefilled.
max_num_mm_items_decoder_budget = self.max_num_reqs * \
max_mm_items_per_req

max_num_mm_items = min(max_num_mm_items_encoder_budget,
max_num_mm_items_decoder_budget)

logger.info(
"Encoder cache will be initialized with a budget of %s tokens,"
" and profiled with %s %s items of the maximum feature size.",
encoder_budget, max_num_mm_items, dummy_data_modality)

# Create dummy batch of multimodal inputs.
dummy_mm_kwargs = self.mm_registry.get_decoder_dummy_data(
model_config=self.model_config,
seq_len=self.max_num_tokens,
mm_counts={
dummy_data_modality: 1
},
).multi_modal_data

batched_dummy_mm_inputs = MultiModalKwargs.batch(
[dummy_mm_kwargs] * max_num_mm_items,
pin_memory=self.pin_memory)
batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
batched_dummy_mm_inputs,
device=self.device,
)

# Run multimodal encoder.
dummy_encoder_outputs = self.model.get_multimodal_embeddings(
**batched_dummy_mm_inputs)

sanity_check_mm_encoder_outputs(
dummy_encoder_outputs,
expected_num_items=max_num_mm_items,
)

# Cache the dummy encoder outputs.
self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))

# For profile, have maximum num_reqs and that collectively have
# maximum num_tokens.
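For reference, the budget arithmetic in the new profile_run path can be illustrated with a small standalone sketch. The numbers are made up; in the real code they come from the scheduler and model config, and cdiv is the ceiling-division helper imported from vllm.utils.

def cdiv(a: int, b: int) -> int:
    # Ceiling division, same behaviour as vllm.utils.cdiv.
    return -(-a // b)


# Illustrative values only.
max_num_encoder_input_tokens = 16384
encoder_cache_size = 16384
max_tokens_per_mm_item = 6000   # e.g. one large image for a VL model
max_num_reqs = 4                # assumed max number of batched requests
max_mm_items_per_req = 2        # assumed per-request image limit

# How many max-size items fit in the encoder budget.
encoder_budget = min(max_num_encoder_input_tokens, encoder_cache_size)
max_num_mm_items_encoder_budget = cdiv(encoder_budget, max_tokens_per_mm_item)  # -> 3

# How many items the decoder side could ever need at once.
max_num_mm_items_decoder_budget = max_num_reqs * max_mm_items_per_req  # -> 8

# The dummy batch used for profiling takes the smaller of the two.
max_num_mm_items = min(max_num_mm_items_encoder_budget,
                       max_num_mm_items_decoder_budget)  # -> 3
print(encoder_budget, max_num_mm_items)  # 16384 3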