
[Doc] Convert Sphinx directives ( {class}, {meth}, {attr}, ...) to MkDocs format for better documentation linking #18663

Merged
merged 15 commits on May 27, 2025
Changes from 13 commits
3 changes: 2 additions & 1 deletion vllm/compilation/compiler_interface.py
@@ -39,7 +39,8 @@ def compute_hash(self, vllm_config: VllmConfig) -> str:
Gather all the relevant information from the vLLM config,
to compute a hash so that we can cache the compiled model.

See {meth}`VllmConfig.compute_hash` to check what information
See [`VllmConfig.compute_hash`][vllm.config.VllmConfig.compute_hash]
to check what information
is already considered by default. This function should only
consider the information that is specific to the compiler.
"""
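For reference, the pattern applied throughout this diff replaces Sphinx roles with mkdocstrings-style cross-references. A rough, hypothetical sketch of that rewrite as a regex pass (the actual edits in this PR were made by hand and expand targets to fully qualified paths, which a regex alone cannot infer):

```python
import re

# Matches Sphinx roles such as {meth}`pkg.mod.Cls.method` or {class}`~pkg.Cls`.
SPHINX_ROLE = re.compile(r"\{(?:class|meth|attr|exc|func)\}`~?([\w\.]+)`")

def to_mkdocs_xref(docstring: str) -> str:
    """Rewrite Sphinx roles into mkdocstrings cross-references of the form
    [`name`][fully.qualified.path]."""
    def _repl(match: re.Match) -> str:
        path = match.group(1)
        short = path.rsplit(".", 1)[-1]
        return f"[`{short}`][{path}]"
    return SPHINX_ROLE.sub(_repl, docstring)

print(to_mkdocs_xref("See {meth}`vllm.config.VllmConfig.compute_hash` for details."))
# See [`compute_hash`][vllm.config.VllmConfig.compute_hash] for details.
```

When the display text already is the fully qualified path, mkdocstrings also accepts an empty label (the `PoolingType` reference in vllm/config.py below uses this form).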
36 changes: 20 additions & 16 deletions vllm/config.py
@@ -2980,7 +2980,7 @@ class PoolerConfig:
pooling_type: Optional[str] = None
"""
The pooling method of the pooling model. This should be a key in
{class}`vllm.model_executor.layers.pooler.PoolingType`.
[`vllm.model_executor.layers.pooler.PoolingType`][].
"""

normalize: Optional[bool] = None
@@ -3691,23 +3691,27 @@ class CompilationConfig:
"""Configuration for compilation. It has three parts:

- Top-level Compilation control:
- {attr}`level`
- {attr}`debug_dump_path`
- {attr}`cache_dir`
- {attr}`backend`
- {attr}`custom_ops`
- {attr}`splitting_ops`
- [`level`][vllm.config.CompilationConfig.level]
- [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path]
- [`cache_dir`][vllm.config.CompilationConfig.cache_dir]
- [`backend`][vllm.config.CompilationConfig.backend]
- [`custom_ops`][vllm.config.CompilationConfig.custom_ops]
- [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
- CudaGraph capture:
- {attr}`use_cudagraph`
- {attr}`cudagraph_capture_sizes`
- {attr}`cudagraph_num_of_warmups`
- {attr}`cudagraph_copy_inputs`
- {attr}`full_cuda_graph`
- [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph]
- [`cudagraph_capture_sizes`]
[vllm.config.CompilationConfig.cudagraph_capture_sizes]
- [`cudagraph_num_of_warmups`]
[vllm.config.CompilationConfig.cudagraph_num_of_warmups]
- [`cudagraph_copy_inputs`]
[vllm.config.CompilationConfig.cudagraph_copy_inputs]
- [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph]
- Inductor compilation:
- {attr}`use_inductor`
- {attr}`compile_sizes`
- {attr}`inductor_compile_config`
- {attr}`inductor_passes`
- [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
- [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
- [`inductor_compile_config`]
[vllm.config.CompilationConfig.inductor_compile_config]
- [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes]
- custom inductor passes

Why we have different sizes for cudagraph and inductor:
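As a usage note for the attribute list above: later in this diff, the `LLM` docstring in vllm/entrypoints/llm.py states that `compilation_config` accepts either an integer level or a full dictionary. A minimal sketch assuming that interface (the model name is a placeholder):

```python
from vllm import LLM

# CompilationConfig fields documented above can be passed through as a dict;
# an integer would instead be treated as just the compilation level.
llm = LLM(
    model="facebook/opt-125m",  # placeholder model
    compilation_config={"level": 3, "use_cudagraph": True},
)
```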
5 changes: 4 additions & 1 deletion vllm/connections.py
@@ -167,4 +167,7 @@ async def async_download_file(


global_http_connection = HTTPConnection()
"""The global {class}`HTTPConnection` instance used by vLLM."""
"""
The global [`HTTPConnection`][vllm.connections.HTTPConnection] instance used
by vLLM.
"""
50 changes: 27 additions & 23 deletions vllm/engine/async_llm_engine.py
@@ -475,7 +475,8 @@ async def add_request_async(
*,
inputs: Optional[PromptType] = None, # DEPRECATED
) -> None:
"""Async version of {meth}`add_request`."""
"""Async version of
[`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]."""
if inputs is not None:
prompt = inputs
assert prompt is not None and params is not None
@@ -582,20 +583,21 @@ async def build_guided_decoding_logits_processor_async(


class AsyncLLMEngine(EngineClient):
"""An asynchronous wrapper for {class}`LLMEngine`.
"""An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine].

This class is used to wrap the {class}`LLMEngine` class to make it
asynchronous. It uses asyncio to create a background loop that keeps
processing incoming requests. The {class}`LLMEngine` is kicked by the
generate method when there are requests in the waiting queue. The generate
method yields the outputs from the {class}`LLMEngine` to the caller.
This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to
make it asynchronous. It uses asyncio to create a background loop that keeps
processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked
by the generate method when there are requests in the waiting queue. The
generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine]
to the caller.

Args:
log_requests: Whether to log the requests.
start_engine_loop: If True, the background task to run the engine
will be automatically started in the generate call.
*args: Arguments for {class}`LLMEngine`.
**kwargs: Arguments for {class}`LLMEngine`.
*args: Arguments for [`LLMEngine`][vllm.LLMEngine].
**kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine].
"""

_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
@@ -985,8 +987,9 @@ async def generate(
from the LLMEngine to the caller.

Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
prompt: The prompt to the LLM. See
[`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
sampling_params: The sampling parameters of the request.
request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any.
@@ -1003,7 +1006,7 @@
Details:
- If the engine is not running, start the background loop,
which iteratively invokes
{meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
[`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step]
to process the waiting requests.
- Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to
@@ -1075,8 +1078,9 @@ async def encode(
from the LLMEngine to the caller.

Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
prompt: The prompt to the LLM. See
[`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
pooling_params: The pooling parameters of the request.
request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any.
@@ -1089,15 +1093,15 @@
for the request.

Details:
- If the engine is not running, start the background loop,
which iteratively invokes
{meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
to process the waiting requests.
- Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to
the underlying engine.
Also, a corresponding `AsyncStream` will be created.
- Wait for the request outputs from `AsyncStream` and yield them.
- If the engine is not running, start the background loop,
which iteratively invokes
[`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][]
to process the waiting requests.
- Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to
the underlying engine.
Also, a corresponding `AsyncStream` will be created.
- Wait for the request outputs from `AsyncStream` and yield them.

Example:
```
8 changes: 4 additions & 4 deletions vllm/engine/llm_engine.py
@@ -130,11 +130,11 @@ class LLMEngine:
iteration-level scheduling and efficient memory management to maximize the
serving throughput.

The [LLM][vllm.LLM] class wraps this class for offline batched inference
and the [AsyncLLMEngine][] class wraps this class for online serving.
The [`LLM`][vllm.LLM] class wraps this class for offline batched inference
and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine]
class wraps this class for online serving.

The config arguments are derived from [EngineArgs][vllm.EngineArgs]. (See
[engine-args][])
The config arguments are derived from [`EngineArgs`][vllm.EngineArgs].

Args:
vllm_config: The configuration for initializing and running vLLM.
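The docstring above notes that the `LLM` class wraps this engine for offline batched inference while `AsyncLLMEngine` wraps it for online serving. Two minimal sketches of those paths, assuming a placeholder model name (each would normally live in its own script):

```python
from vllm import LLM, SamplingParams

# Offline path: LLM builds and drives an LLMEngine internally.
llm = LLM(model="facebook/opt-125m")  # placeholder model
params = SamplingParams(temperature=0.8, max_tokens=32)

for output in llm.generate(["Hello, my name is"], params):
    print(output.outputs[0].text)
```

```python
import asyncio

from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams

async def main() -> None:
    # Online path: AsyncLLMEngine runs LLMEngine behind a background asyncio
    # loop; generate() is an async generator yielding RequestOutputs.
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))  # placeholder model
    params = SamplingParams(max_tokens=32)

    final = None
    async for out in engine.generate("Hello, my name is", params,
                                     request_id="request-0"):
        final = out
    if final is not None:
        print(final.outputs[0].text)

asyncio.run(main())
```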
10 changes: 6 additions & 4 deletions vllm/engine/multiprocessing/client.py
@@ -492,8 +492,9 @@ def generate(
from the LLMEngine to the caller.

Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
prompt: The prompt to the LLM. See
[`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
sampling_params: The sampling parameters of the request.
request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any.
@@ -561,8 +562,9 @@ def encode(
from the LLMEngine to the caller.

Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
prompt: The prompt to the LLM. See
[`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
pooling_params: The pooling parameters of the request.
request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any.
19 changes: 11 additions & 8 deletions vllm/engine/multiprocessing/engine.py
@@ -42,19 +42,22 @@


class MQLLMEngine:
"""A multiprocessing wrapper for {class}`LLMEngine`.
"""A multiprocessing wrapper for
[`LLMEngine`][vllm.engine.llm_engine.LLMEngine].

This class is used to wrap the {class}`LLMEngine` class to enable use
This class is used to wrap the
[`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use
in a concurrent manner. It runs a background loop and uses zeromq to
receive new requests and stream outputs incrementally via ipc.

The {class}`LLMEngine` generate or encode process is kicked off when a new
RPCProcessRequest is received by the input_socket.
The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode
process is kicked off when a new RPCProcessRequest is received by the
input_socket.

The self.engine_loop checks the input_socket for new requests,
adds them to the LLMEngine if there are any, calls the internal
{class}`LLMEngine.step()`, and sends the RequestOutputs back over
the output_socket.
[`LLMEngine.step()`][vllm.engine.llm_engine.LLMEngine.step], and sends
the RequestOutputs back over the output_socket.

If use_async_sockets is set, the logic associated with reading new
requests from the socket and sending data to the socket is passed
@@ -65,8 +68,8 @@ class MQLLMEngine:
ipc_path: Base path for zeromq interprocess messaging
use_async_sockets: Whether to make send/recv async with GPU
log_requests: Whether to log the requests.
*args: Arguments for {class}`LLMEngine`.
**kwargs: Arguments for {class}`LLMEngine`.
*args: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
**kwargs: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
"""

def __init__(self,
7 changes: 5 additions & 2 deletions vllm/engine/output_processor/multi_step.py
@@ -56,8 +56,11 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
scheduled computation.

Args:
seq_group: the outputs are associated with this {class}`SequenceGroup`
outputs: the {class}`SequenceGroupOutput`s for all scheduler steps
seq_group: the outputs are associated with this
[`SequenceGroup`][vllm.sequence.SequenceGroup]
outputs: the
[`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]s
for all scheduler steps
"""
for output in outputs:
# Concatenate single-step prompt logprob processing results.
21 changes: 14 additions & 7 deletions vllm/engine/output_processor/single_step.py
@@ -19,17 +19,21 @@
def single_step_process_prompt_logprob(
sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup,
output: CompletionSequenceGroupOutput) -> None:
"""Process prompt logprobs associated with the {class}`SequenceGroupOutput`
for a given step.
"""Process prompt logprobs associated with the
[`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step.

Do nothing if the output has no prompt logprobs.

Account for the fact that transformers do not compute first-token logprobs.

Args:
sg_output_proc: {class}`SequenceGroupOutputProcessor` instance
seq_group: the output is associated with this {class}`SequenceGroup`
output: the {class}`SequenceGroupOutput` for a single scheduler step
sg_output_proc:
[`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor]
instance
seq_group: the output is associated with this
[`SequenceGroup`][vllm.sequence.SequenceGroup]
output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
for a single scheduler step
"""
prompt_logprobs = output.prompt_logprobs

@@ -103,8 +107,11 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
scheduled computation.

Args:
seq_group: the output is associated with this {class}`SequenceGroup`
outputs: the {class}`SequenceGroupOutput` for a single scheduler step
seq_group: the output is associated with this
[`SequenceGroup`][vllm.sequence.SequenceGroup]
outputs: the
[`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
for a single scheduler step
"""
assert len(outputs) == 1, "Single step should only have 1 output."
output = outputs[0]
5 changes: 2 additions & 3 deletions vllm/entrypoints/llm.py
@@ -129,8 +129,7 @@ class LLM:
compilation_config: Either an integer or a dictionary. If it is an
integer, it is used as the level of compilation optimization. If it
is a dictionary, it can specify the full compilation configuration.
**kwargs: Arguments for [EngineArgs][vllm.EngineArgs]. (See
[engine-args][])
**kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs].

Note:
This class is intended to be used for offline inference. For online
@@ -494,7 +493,7 @@ def collective_rpc(self,
`self` argument, in addition to the arguments passed in `args`
and `kwargs`. The `self` argument will be the worker object.
timeout: Maximum time in seconds to wait for execution. Raises a
{exc}`TimeoutError` on timeout. `None` means wait indefinitely.
[`TimeoutError`][] on timeout. `None` means wait indefinitely.
args: Positional arguments to pass to the worker method.
kwargs: Keyword arguments to pass to the worker method.

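The `collective_rpc` docstring above mentions the plain `TimeoutError` raised on timeout; a hypothetical sketch of calling it (the worker method name `echo_rank` is assumed for illustration and does not appear in this diff):

```python
from vllm import LLM

llm = LLM(model="facebook/opt-125m")  # placeholder model

try:
    # Run a (hypothetical) worker method on every worker; each worker's
    # return value is collected into the results list.
    results = llm.collective_rpc("echo_rank", timeout=30.0)
except TimeoutError:
    print("workers did not respond within 30 seconds")
```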
6 changes: 4 additions & 2 deletions vllm/entrypoints/openai/serving_engine.py
@@ -582,7 +582,8 @@ def _tokenize_prompt_input(
add_special_tokens: bool = True,
) -> TextTokensPrompt:
"""
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
A simpler implementation of
[`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
that assumes single input.
"""
return next(
@@ -603,7 +604,8 @@ def _tokenize_prompt_inputs(
add_special_tokens: bool = True,
) -> Iterator[TextTokensPrompt]:
"""
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
A simpler implementation of
[`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
that assumes multiple inputs.
"""
for text in prompt_inputs:
2 changes: 1 addition & 1 deletion vllm/executor/executor_base.py
@@ -74,7 +74,7 @@ def collective_rpc(self,
`self` argument, in addition to the arguments passed in `args`
and `kwargs`. The `self` argument will be the worker object.
timeout: Maximum time in seconds to wait for execution. Raises a
{exc}`TimeoutError` on timeout. `None` means wait indefinitely.
[`TimeoutError`][] on timeout. `None` means wait indefinitely.
args: Positional arguments to pass to the worker method.
kwargs: Keyword arguments to pass to the worker method.

5 changes: 3 additions & 2 deletions vllm/inputs/__init__.py
@@ -10,8 +10,9 @@

INPUT_REGISTRY = InputRegistry()
"""
The global {class}`~InputRegistry` which is used by {class}`~vllm.LLMEngine`
to dispatch data processing according to the target model.
The global [`InputRegistry`][vllm.inputs.registry.InputRegistry] which is used
by [`LLMEngine`][vllm.LLMEngine] to dispatch data processing according to the
target model.
"""

__all__ = [