[V1 Scheduler] BatchScheduler to balance token-based microbatches and reduce GPU pipeline bubbles #19873

Open · wants to merge 3 commits into `main`
12 changes: 6 additions & 6 deletions docs/usage/v1_guide.md
@@ -39,9 +39,9 @@ This living user guide outlines a few known **important changes and limitations*
For each item, our progress towards V1 support falls into one of the following states:

- **🚀 Optimized**: Nearly fully optimized, with no further work currently planned.
- **🟢 Functional**: Fully operational, with ongoing optimizations.
- **🚧 WIP**: Under active development.
- **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs).
- **🟠 Delayed**: Temporarily dropped in V1 but planned to be re-introduced later.
- **🔴 Deprecated**: Not planned for V1 unless there is strong demand.

@@ -70,7 +70,7 @@ For each item, our progress towards V1 support falls into one of the following s
|-----------------------------|------------------------------------------------------------------------------------|
| **Decoder-only Models** | <nobr>🚀 Optimized</nobr> |
| **Encoder-Decoder Models** | <nobr>🟠 Delayed</nobr> |
-| **Embedding Models**       | <nobr>🚧 WIP ([PR #16188](https://github.com/vllm-project/vllm/pull/16188))</nobr>  |
+| **Embedding Models**       | <nobr>🟢 Functional</nobr>                                                           |
| **Mamba Models** | <nobr>🚧 WIP ([PR #19327](https://github.com/vllm-project/vllm/pull/19327))</nobr> |
| **Multimodal Models** | <nobr>🟢 Functional</nobr> |

@@ -80,11 +80,11 @@ vLLM V1 currently excludes model architectures with the `SupportsV0Only` protoco

This corresponds to the V1 column in our [list of supported models][supported-models].

-See below for the status of models that are still not yet supported in V1.
+See below for the status of models that are not yet supported or have more features planned in V1.

#### Embedding Models

-The initial support will be provided by [PR #16188](https://github.com/vllm-project/vllm/pull/16188).
+The initial basic support is now functional.

Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249),
which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360)
14 changes: 14 additions & 0 deletions vllm/config.py
@@ -2119,6 +2119,20 @@ class SchedulerConfig:
default scheduler. Can be a class directly or the path to a class of form
"mod.custom_class"."""

use_batch_scheduler: bool = False
"""Whether to use the BatchScheduler instead of the default scheduler.

If set to True, the engine will use
"vllm.v1.core.sched.scheduler.BatchScheduler" as the scheduler class unless
a custom `scheduler_cls` is explicitly provided.

If both `use_batch_scheduler=True` and a non-default `scheduler_cls` are
specified, the `scheduler_cls` will take precedence and
`use_batch_scheduler` will be ignored.

Default is False.
"""

disable_hybrid_kv_cache_manager: bool = False
"""If set to True, KV cache manager will allocate the same size of KV cache
for all attention layers even if there are multiple type of attention layers
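The precedence rule described in the `use_batch_scheduler` docstring above can be summarized with a small standalone sketch (illustrative only; `resolve_scheduler_cls` and the default class path are hypothetical stand-ins, not code from this PR):

```python
# Illustrative sketch of the documented precedence between `use_batch_scheduler`
# and an explicitly supplied `scheduler_cls`. Not part of the PR.
DEFAULT_SCHEDULER_CLS = "vllm.core.scheduler.Scheduler"  # placeholder for the default
BATCH_SCHEDULER_CLS = "vllm.v1.core.sched.scheduler.BatchScheduler"


def resolve_scheduler_cls(use_batch_scheduler: bool, scheduler_cls: str) -> str:
    """Return the scheduler class path the engine would end up using."""
    if use_batch_scheduler and scheduler_cls == DEFAULT_SCHEDULER_CLS:
        # No custom scheduler requested: swap in the BatchScheduler.
        return BATCH_SCHEDULER_CLS
    # A custom scheduler_cls always wins; use_batch_scheduler is then ignored.
    return scheduler_cls


assert resolve_scheduler_cls(True, DEFAULT_SCHEDULER_CLS) == BATCH_SCHEDULER_CLS
assert resolve_scheduler_cls(True, "my_pkg.MyScheduler") == "my_pkg.MyScheduler"
assert resolve_scheduler_cls(False, DEFAULT_SCHEDULER_CLS) == DEFAULT_SCHEDULER_CLS
```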
16 changes: 16 additions & 0 deletions vllm/engine/arg_utils.py
@@ -411,6 +411,7 @@ class EngineArgs:
disable_async_output_proc: bool = not ModelConfig.use_async_output_proc
scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls
use_batch_scheduler: bool = SchedulerConfig.use_batch_scheduler

override_neuron_config: dict[str, Any] = \
get_field(ModelConfig, "override_neuron_config")
@@ -855,6 +856,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
**scheduler_kwargs["disable_chunked_mm_input"])
scheduler_group.add_argument("--scheduler-cls",
**scheduler_kwargs["scheduler_cls"])
scheduler_group.add_argument("--use-batch-scheduler",
**scheduler_kwargs["use_batch_scheduler"])
scheduler_group.add_argument(
"--disable-hybrid-kv-cache-manager",
**scheduler_kwargs["disable_hybrid_kv_cache_manager"])
@@ -1182,6 +1185,7 @@ def create_engine_config(
and parallel_config.use_ray),
policy=self.scheduling_policy,
scheduler_cls=self.scheduler_cls,
use_batch_scheduler=self.use_batch_scheduler,
max_num_partial_prefills=self.max_num_partial_prefills,
max_long_partial_prefills=self.max_long_partial_prefills,
long_prefill_token_threshold=self.long_prefill_token_threshold,
@@ -1550,6 +1554,18 @@ def _set_default_args_v1(self, usage_context: UsageContext,
if not self.enable_chunked_prefill:
self.max_num_batched_tokens = model_config.max_model_len

if self.use_batch_scheduler:
if self.scheduler_cls == EngineArgs.scheduler_cls:
self.scheduler_cls = \
"vllm.v1.core.sched.scheduler.BatchScheduler"
else:
logger.warning(
"use_batch_scheduler is set to True, "
"but a custom scheduler_cls is also provided. "
"The specified scheduler_cls (%s) will take precedence, "
"and use_batch_scheduler will be ignored.",
self.scheduler_cls)

# V1 should use the new scheduler by default.
# Swap it only if this arg is set to the original V0 default
if self.scheduler_cls == EngineArgs.scheduler_cls:
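Assuming this PR is applied, the new scheduler could be enabled either via the `--use-batch-scheduler` CLI option added above or, for offline inference, through the corresponding `EngineArgs` field. A hedged sketch (the model name is just a placeholder; the kwargs passthrough from `LLM` to `EngineArgs` is standard vLLM behavior):

```python
# Hypothetical usage once the PR is merged; not an official example.
# CLI equivalent: vllm serve <model> --use-batch-scheduler
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",   # placeholder model
    use_batch_scheduler=True,    # resolves to vllm.v1.core.sched.scheduler.BatchScheduler
)
outputs = llm.generate(["Hello, my name is"])
print(outputs[0].outputs[0].text)
```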
12 changes: 11 additions & 1 deletion vllm/v1/core/sched/output.py
@@ -4,7 +4,7 @@
from __future__ import annotations

from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Optional, Union

if TYPE_CHECKING:
import numpy as np
@@ -155,3 +155,13 @@ class SchedulerOutput:

# KV Cache Connector metadata.
kv_connector_metadata: Optional[KVConnectorMetadata] = None


@dataclass
class ScheduledRequest:
request_id: str
num_new_tokens: int
encoder_inputs_to_schedule: list[int] | None
num_scheduled_spec_tokens: int
spec_token_ids: list[int] | None
request_data: Union[NewRequestData, CachedRequestData]
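The excerpt above only adds the `ScheduledRequest` record; the BatchScheduler logic itself is not shown in this diff. As a rough illustration of the token-balancing idea in the PR title, the following sketch greedily packs per-request token counts into microbatches under a token budget (hypothetical, not the PR's actual algorithm; `_Req` is a simplified stand-in for `ScheduledRequest`):

```python
from dataclasses import dataclass


# Simplified stand-in for ScheduledRequest: only the fields this sketch needs.
@dataclass
class _Req:
    request_id: str
    num_new_tokens: int


def split_into_microbatches(reqs: list[_Req],
                            max_tokens_per_batch: int) -> list[list[_Req]]:
    """Greedily pack requests so that each microbatch stays within a token
    budget, avoiding one oversized batch that would stall the pipeline."""
    batches: list[list[_Req]] = []
    current: list[_Req] = []
    current_tokens = 0
    for req in sorted(reqs, key=lambda r: r.num_new_tokens, reverse=True):
        if current and current_tokens + req.num_new_tokens > max_tokens_per_batch:
            batches.append(current)
            current, current_tokens = [], 0
        current.append(req)
        current_tokens += req.num_new_tokens
    if current:
        batches.append(current)
    return batches


# Example: a 512-token budget splits these requests into two microbatches
# carrying 400 and 484 scheduled tokens respectively.
reqs = [_Req("a", 400), _Req("b", 120), _Req("c", 300), _Req("d", 64)]
print([[r.request_id for r in b] for b in split_into_microbatches(reqs, 512)])
```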