Skip to content

Commit 2908094

Browse files
authored
Add /v1/chat/completions/batch endpoint for batched chat completions (vllm-project#38011)
Signed-off-by: Matej Rojec <64556640+MatejRojec@users.noreply.github.com>
1 parent e6bf9f1 commit 2908094

File tree

8 files changed

+771
-21
lines changed

8 files changed

+771
-21
lines changed
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
"""Examples of batched chat completions via the vLLM OpenAI-compatible API.
4+
5+
The /v1/chat/completions/batch endpoint accepts ``messages`` as a list of
6+
conversations. Each conversation is processed independently and the response
7+
contains one choice per conversation, indexed 0, 1, ..., N-1.
8+
9+
Start a server first, e.g.:
10+
vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000
11+
12+
Current limitations compared to /v1/chat/completions:
13+
- Streaming is not supported.
14+
- Tool use is not supported.
15+
- Beam search is not supported.
16+
"""
17+
18+
import json
19+
import os
20+
21+
import httpx
22+
23+
# Base URL of a running vLLM OpenAI-compatible server; override via VLLM_BASE_URL.
BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8000")
# Model name sent in every request body; override via VLLM_MODEL.
MODEL = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-1.5B-Instruct")
# Batched chat-completions endpoint exercised by the examples below.
BATCH_URL = f"{BASE_URL}/v1/chat/completions/batch"
26+
27+
28+
def post_batch(payload: dict) -> dict:
    """POST *payload* to the batch endpoint and return the decoded JSON body.

    Raises:
        httpx.HTTPStatusError: if the server responds with a non-2xx status.
    """
    resp = httpx.post(BATCH_URL, json=payload, timeout=60)
    resp.raise_for_status()
    return resp.json()
32+
33+
34+
def main() -> None:
    """Run four demonstration requests against a live vLLM server."""

    def show_text(result: dict) -> None:
        # Print each choice's raw text content, prefixed by its batch index.
        for choice in result["choices"]:
            print(f" [{choice['index']}] {choice['message']['content']}")

    def show_parsed(result: dict) -> None:
        # Decode each choice's content as JSON before printing it.
        for choice in result["choices"]:
            obj = json.loads(choice["message"]["content"])
            print(f" [{choice['index']}] {obj}")

    # --- Example 1a: the ordinary single-conversation endpoint, for contrast.
    print("=== Example 1a: single conversation (standard endpoint) ===")
    resp = httpx.post(
        f"{BASE_URL}/v1/chat/completions",
        json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "What is the capital of Japan?"}],
        },
        timeout=60,
    )
    resp.raise_for_status()
    show_text(resp.json())

    # --- Example 1b: two independent conversations in one batch request.
    print("\n=== Example 1b: batched plain text (2 conversations) ===")
    show_text(
        post_batch(
            {
                "model": MODEL,
                "messages": [
                    [{"role": "user", "content": "What is the capital of France?"}],
                    [{"role": "user", "content": "What is the capital of Japan?"}],
                ],
            }
        )
    )

    # --- Example 2: a structured-output regex constraint applies to every
    # conversation in the batch.
    print("\n=== Example 2: batch with regex constraint (yes|no) ===")
    show_text(
        post_batch(
            {
                "model": MODEL,
                "messages": [
                    [{"role": "user", "content": "Is the sky blue? Answer yes or no."}],
                    [{"role": "user", "content": "Is fire cold? Answer yes or no."}],
                ],
                "structured_outputs": {"regex": "(yes|no)"},
            }
        )
    )

    # --- Example 3: OpenAI-style response_format with a JSON schema.
    print("\n=== Example 3: batch with json_schema ===")
    person_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string", "description": "Full name of the person"},
            "age": {"type": "integer", "description": "Age in years"},
        },
        "required": ["name", "age"],
    }
    show_parsed(
        post_batch(
            {
                "model": MODEL,
                "messages": [
                    [
                        {
                            "role": "user",
                            "content": "Describe the person: name Alice, age 30.",
                        }
                    ],
                    [
                        {
                            "role": "user",
                            "content": "Describe the person: name Bob, age 25.",
                        }
                    ],
                ],
                "response_format": {
                    "type": "json_schema",
                    "json_schema": {
                        "name": "person",
                        "strict": True,
                        "schema": person_schema,
                    },
                },
            }
        )
    )

    # --- Example 4: a richer schema plus a shared system message reused
    # across both conversations.
    print("\n=== Example 4: batch book summaries ===")
    book_schema = {
        "type": "object",
        "properties": {
            "author": {
                "type": "string",
                "description": "Full name of the author",
            },
            "num_pages": {
                "type": "integer",
                "description": "Number of pages in the book",
            },
            "short_summary": {
                "type": "string",
                "description": "A one-sentence summary of the book",
            },
            "long_summary": {
                "type": "string",
                "description": (
                    "A detailed two to three sentence summary covering "
                    "the main themes and plot"
                ),
            },
        },
        "required": ["author", "num_pages", "short_summary", "long_summary"],
    }
    system_msg = {
        "role": "system",
        "content": (
            "You are a literary analyst. Extract structured information "
            "from book descriptions."
        ),
    }
    show_parsed(
        post_batch(
            {
                "model": MODEL,
                "messages": [
                    [
                        system_msg,
                        {
                            "role": "user",
                            "content": (
                                "Extract information from this book: '1984' by George"
                                " Orwell, published in 1949, 328 pages. A dystopian"
                                " novel set in a totalitarian society ruled by Big"
                                " Brother, following Winston Smith as he secretly"
                                " rebels against the oppressive Party that surveils"
                                " and controls every aspect of life."
                            ),
                        },
                    ],
                    [
                        system_msg,
                        {
                            "role": "user",
                            "content": (
                                "Extract information from this book: 'The Hitchhiker's"
                                " Guide to the Galaxy' by Douglas Adams, published in"
                                " 1979, 193 pages. A comedic science fiction novel"
                                " following Arthur Dent, an ordinary Englishman who is"
                                " whisked off Earth moments before it is demolished to"
                                " make way for a hyperspace bypass, and his subsequent"
                                " absurd adventures across the universe."
                            ),
                        },
                    ],
                ],
                "response_format": {
                    "type": "json_schema",
                    "json_schema": {
                        "name": "book_summary",
                        "strict": True,
                        "schema": book_schema,
                    },
                },
            }
        )
    )


if __name__ == "__main__":
    main()
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
4+
import json
5+
6+
import httpx
7+
import pytest
8+
9+
from tests.utils import RemoteOpenAIServer
10+
11+
# Any model whose tokenizer_config defines a chat template should work here;
# this one is small enough to load quickly in CI.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
13+
14+
15+
@pytest.fixture(scope="module")
def default_server_args():
    """CLI arguments for the test server, kept small for CI speed and memory."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
    ]
    return args
25+
26+
27+
@pytest.fixture(scope="module")
def server(default_server_args):
    """Yield a module-scoped vLLM OpenAI-compatible server for the tests below."""
    srv = RemoteOpenAIServer(MODEL_NAME, default_server_args)
    with srv as remote_server:
        yield remote_server
31+
32+
33+
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_batched_chat_completions(
    server: RemoteOpenAIServer, model_name: str
) -> None:
    """A two-conversation batch returns one choice per conversation."""
    batch = [
        [{"role": "user", "content": "Reply with exactly the word: alpha"}],
        [{"role": "user", "content": "Reply with exactly the word: beta"}],
    ]

    async with httpx.AsyncClient() as client:
        resp = await client.post(
            server.url_for("v1/chat/completions/batch"),
            json={
                "model": model_name,
                "messages": batch,
            },
            timeout=60,
        )

    assert resp.status_code == 200, resp.text
    body = resp.json()

    choices = body["choices"]
    assert len(choices) == 2

    # Choices must be indexed 0..N-1, one per conversation.
    assert {c["index"] for c in choices} == {0, 1}

    # Each conversation should produce a non-empty text response.
    for c in choices:
        assert c["message"]["content"]
68+
69+
70+
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_batched_chat_completions_with_json_schema(
    server: RemoteOpenAIServer, model_name: str
) -> None:
    """A response_format json_schema constraint applies to every batch entry."""
    answer_schema = {
        "type": "object",
        "properties": {
            "answer": {"type": "string", "enum": ["yes", "no"]},
        },
        "required": ["answer"],
    }
    batch = [
        [{"role": "user", "content": "Is the sky blue? Answer in JSON."}],
        [{"role": "user", "content": "Is fire cold? Answer in JSON."}],
    ]

    async with httpx.AsyncClient() as client:
        resp = await client.post(
            server.url_for("v1/chat/completions/batch"),
            json={
                "model": model_name,
                "messages": batch,
                "response_format": {
                    "type": "json_schema",
                    "json_schema": {
                        "name": "answer",
                        "schema": answer_schema,
                        "strict": True,
                    },
                },
            },
            timeout=60,
        )

    assert resp.status_code == 200, resp.text
    body = resp.json()

    choices = body["choices"]
    assert len(choices) == 2

    # Every choice must decode as JSON that satisfies the schema.
    for c in choices:
        parsed = json.loads(c["message"]["content"])
        assert "answer" in parsed
        assert parsed["answer"] in ("yes", "no")

tests/entrypoints/openai/test_openai_schema.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ def test_openapi_stateless(case: Case):
174174
timeout = {
175175
# requires a longer timeout
176176
("POST", "/v1/chat/completions"): LONG_TIMEOUT_SECONDS,
177+
("POST", "/v1/chat/completions/batch"): LONG_TIMEOUT_SECONDS,
177178
("POST", "/v1/completions"): LONG_TIMEOUT_SECONDS,
178179
("POST", "/v1/messages"): LONG_TIMEOUT_SECONDS,
179180
}.get(key, DEFAULT_TIMEOUT_SECONDS)

vllm/entrypoints/openai/chat_completion/api_router.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
from fastapi import APIRouter, Depends, FastAPI, Request
88
from fastapi.responses import JSONResponse, StreamingResponse
99

10+
from vllm.entrypoints.openai.chat_completion.batch_serving import OpenAIServingChatBatch
1011
from vllm.entrypoints.openai.chat_completion.protocol import (
12+
BatchChatCompletionRequest,
1113
ChatCompletionRequest,
1214
ChatCompletionResponse,
1315
)
@@ -31,6 +33,10 @@ def chat(request: Request) -> OpenAIServingChat | None:
3133
return request.app.state.openai_serving_chat
3234

3335

36+
def batch_chat(request: Request) -> OpenAIServingChatBatch | None:
    """Return the app's batch chat-completion handler, or None if unset."""
    return request.app.state.openai_serving_chat_batch
38+
39+
3440
@router.post(
3541
"/v1/chat/completions",
3642
dependencies=[Depends(validate_json_request)],
@@ -68,5 +74,33 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
6874
return StreamingResponse(content=generator, media_type="text/event-stream")
6975

7076

77+
@router.post(
    "/v1/chat/completions/batch",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.OK.value: {},
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_batch_chat_completion(
    request: BatchChatCompletionRequest, raw_request: Request
):
    """Serve POST /v1/chat/completions/batch.

    Delegates to the app-level batch handler; error responses are returned
    with the status code carried inside the ErrorResponse payload.
    """
    handler = batch_chat(raw_request)
    if handler is None:
        raise NotImplementedError("The model does not support Chat Completions API")

    result = await handler.create_batch_chat_completion(request, raw_request)

    payload = result.model_dump()
    if isinstance(result, ErrorResponse):
        return JSONResponse(content=payload, status_code=result.error.code)
    return JSONResponse(content=payload)
103+
104+
71105
def attach_router(app: FastAPI):
72106
app.include_router(router)

0 commit comments

Comments
 (0)